1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs UTF-2000. */
40 #include "file-coding.h"
42 Lisp_Object Qcoding_system_error;
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
65 #if defined(MULE) && !defined(UTF2000)
66 Lisp_Object ucs_to_mule_table[65536];
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
90 Lisp_Object Qmnemonic, Qeol_type;
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
97 Lisp_Object Qucs4, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
108 Lisp_Object Qdisable_composition;
109 Lisp_Object Quse_entity_reference;
110 Lisp_Object Qd, Qx, QX;
112 Lisp_Object Qencode, Qdecode;
114 Lisp_Object Vcoding_system_hash_table;
116 int enable_multibyte_characters;
119 /* Additional information used by the ISO2022 decoder and detector. */
120 struct iso2022_decoder
122 /* CHARSET holds the character sets currently assigned to the G0
123 through G3 variables. It is initialized from the array
124 INITIAL_CHARSET in CODESYS. */
125 Lisp_Object charset[4];
127 /* Which registers are currently invoked into the left (GL) and
128 right (GR) halves of the 8-bit encoding space? */
129 int register_left, register_right;
131 /* ISO_ESC holds a value indicating part of an escape sequence
132 that has already been seen. */
133 enum iso_esc_flag esc;
135 /* This records the bytes we've seen so far in an escape sequence,
136 in case the sequence is invalid (we spit out the bytes unchanged). */
137 unsigned char esc_bytes[8];
139 /* Index for next byte to store in ISO escape sequence. */
142 #ifdef ENABLE_COMPOSITE_CHARS
143 /* Stuff seen so far when composing a string. */
144 unsigned_char_dynarr *composite_chars;
147 /* If we saw an invalid designation sequence for a particular
148 register, we flag it here and switch to ASCII. The next time we
149 see a valid designation for this register, we turn off the flag
150 and do the designation normally, but pretend the sequence was
151 invalid. The effect of all this is that (most of the time) the
152 escape sequences for both the switch to the unknown charset, and
153 the switch back to the known charset, get inserted literally into
154 the buffer and saved out as such. The hope is that we can
155 preserve the escape sequences so that the resulting written out
156 file makes sense. If we don't do any of this, the designation
157 to the invalid charset will be preserved but that switch back
158 to the known charset will probably get eaten because it was
159 the same charset that was already present in the register. */
160 unsigned char invalid_designated[4];
162 /* We try to do similar things as above for direction-switching
163 sequences. If we encountered a direction switch while an
164 invalid designation was present, or an invalid designation
165 just after a direction switch (i.e. no valid designation
166 encountered yet), we insert the direction-switch escape
167 sequence literally into the output stream, and later on
168 insert the corresponding direction-restoring escape sequence
170 unsigned int switched_dir_and_no_valid_charset_yet :1;
171 unsigned int invalid_switch_dir :1;
173 /* Tells the decoder to output the escape sequence literally
174 even though it was valid. Used in the games we play to
175 avoid lossage when we encounter invalid designations. */
176 unsigned int output_literally :1;
177 /* We encountered a direction switch followed by an invalid
178 designation. We didn't output the direction switch
179 literally because we didn't know about the invalid designation;
180 but we have to do so now. */
181 unsigned int output_direction_sequence :1;
184 EXFUN (Fcopy_coding_system, 2);
186 struct detection_state;
189 text_encode_generic (Lstream *encoding, const Bufbyte *src,
190 unsigned_char_dynarr *dst, size_t n);
192 static int detect_coding_sjis (struct detection_state *st,
193 const Extbyte *src, size_t n);
194 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
195 unsigned_char_dynarr *dst, size_t n);
196 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
197 unsigned_char_dynarr *dst, unsigned int *flags);
198 void char_finish_shift_jis (struct encoding_stream *str,
199 unsigned_char_dynarr *dst, unsigned int *flags);
201 static int detect_coding_big5 (struct detection_state *st,
202 const Extbyte *src, size_t n);
203 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
204 unsigned_char_dynarr *dst, size_t n);
205 void char_encode_big5 (struct encoding_stream *str, Emchar c,
206 unsigned_char_dynarr *dst, unsigned int *flags);
207 void char_finish_big5 (struct encoding_stream *str,
208 unsigned_char_dynarr *dst, unsigned int *flags);
210 static int detect_coding_ucs4 (struct detection_state *st,
211 const Extbyte *src, size_t n);
212 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
213 unsigned_char_dynarr *dst, size_t n);
214 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
215 unsigned_char_dynarr *dst, unsigned int *flags);
216 void char_finish_ucs4 (struct encoding_stream *str,
217 unsigned_char_dynarr *dst, unsigned int *flags);
219 static int detect_coding_utf8 (struct detection_state *st,
220 const Extbyte *src, size_t n);
221 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
222 unsigned_char_dynarr *dst, size_t n);
223 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
224 unsigned_char_dynarr *dst, unsigned int *flags);
225 void char_finish_utf8 (struct encoding_stream *str,
226 unsigned_char_dynarr *dst, unsigned int *flags);
228 static int postprocess_iso2022_mask (int mask);
229 static void reset_iso2022 (Lisp_Object coding_system,
230 struct iso2022_decoder *iso);
231 static int detect_coding_iso2022 (struct detection_state *st,
232 const Extbyte *src, size_t n);
233 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
234 unsigned_char_dynarr *dst, size_t n);
235 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
236 unsigned_char_dynarr *dst, unsigned int *flags);
237 void char_finish_iso2022 (struct encoding_stream *str,
238 unsigned_char_dynarr *dst, unsigned int *flags);
240 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
241 unsigned_char_dynarr *dst, size_t n);
242 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
243 unsigned_char_dynarr *dst, size_t n);
244 static void mule_decode (Lstream *decoding, const Extbyte *src,
245 unsigned_char_dynarr *dst, size_t n);
246 static void mule_encode (Lstream *encoding, const Bufbyte *src,
247 unsigned_char_dynarr *dst, size_t n);
249 typedef struct codesys_prop codesys_prop;
258 Dynarr_declare (codesys_prop);
259 } codesys_prop_dynarr;
261 static const struct lrecord_description codesys_prop_description_1[] = {
262 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
266 static const struct struct_description codesys_prop_description = {
267 sizeof (codesys_prop),
268 codesys_prop_description_1
271 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
272 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
276 static const struct struct_description codesys_prop_dynarr_description = {
277 sizeof (codesys_prop_dynarr),
278 codesys_prop_dynarr_description_1
281 codesys_prop_dynarr *the_codesys_prop_dynarr;
283 enum codesys_prop_enum
286 CODESYS_PROP_ISO2022,
291 /************************************************************************/
292 /* Coding system functions */
293 /************************************************************************/
295 static Lisp_Object mark_coding_system (Lisp_Object);
296 static void print_coding_system (Lisp_Object, Lisp_Object, int);
297 static void finalize_coding_system (void *header, int for_disksave);
300 static const struct lrecord_description ccs_description_1[] = {
301 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
302 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
306 static const struct struct_description ccs_description = {
307 sizeof (charset_conversion_spec),
311 static const struct lrecord_description ccsd_description_1[] = {
312 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
316 static const struct struct_description ccsd_description = {
317 sizeof (charset_conversion_spec_dynarr),
322 static const struct lrecord_description coding_system_description[] = {
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
324 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
325 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
326 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
327 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
330 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
332 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
333 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
334 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
335 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
344 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
345 mark_coding_system, print_coding_system,
346 finalize_coding_system,
347 0, 0, coding_system_description,
351 mark_coding_system (Lisp_Object obj)
353 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
355 mark_object (CODING_SYSTEM_NAME (codesys));
356 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
357 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
358 mark_object (CODING_SYSTEM_EOL_LF (codesys));
359 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
360 mark_object (CODING_SYSTEM_EOL_CR (codesys));
362 switch (CODING_SYSTEM_TYPE (codesys))
366 case CODESYS_ISO2022:
367 for (i = 0; i < 4; i++)
368 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
369 if (codesys->iso2022.input_conv)
371 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
373 struct charset_conversion_spec *ccs =
374 Dynarr_atp (codesys->iso2022.input_conv, i);
375 mark_object (ccs->from_charset);
376 mark_object (ccs->to_charset);
379 if (codesys->iso2022.output_conv)
381 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
383 struct charset_conversion_spec *ccs =
384 Dynarr_atp (codesys->iso2022.output_conv, i);
385 mark_object (ccs->from_charset);
386 mark_object (ccs->to_charset);
393 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
394 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
399 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
400 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
407 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
409 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
411 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
415 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
418 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
420 error ("printing unreadable object #<coding_system 0x%x>",
423 write_c_string ("#<coding_system ", printcharfun);
424 print_internal (c->name, printcharfun, 1);
425 write_c_string (">", printcharfun);
429 finalize_coding_system (void *header, int for_disksave)
431 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
432 /* Since coding systems never go away, this function is not
433 necessary. But it would be necessary if we changed things
434 so that coding systems could go away. */
435 if (!for_disksave) /* see comment in lstream.c */
437 switch (CODING_SYSTEM_TYPE (c))
440 case CODESYS_ISO2022:
441 if (c->iso2022.input_conv)
443 Dynarr_free (c->iso2022.input_conv);
444 c->iso2022.input_conv = 0;
446 if (c->iso2022.output_conv)
448 Dynarr_free (c->iso2022.output_conv);
449 c->iso2022.output_conv = 0;
460 symbol_to_eol_type (Lisp_Object symbol)
462 CHECK_SYMBOL (symbol);
463 if (NILP (symbol)) return EOL_AUTODETECT;
464 if (EQ (symbol, Qlf)) return EOL_LF;
465 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
466 if (EQ (symbol, Qcr)) return EOL_CR;
468 signal_simple_error ("Unrecognized eol type", symbol);
469 return EOL_AUTODETECT; /* not reached */
473 eol_type_to_symbol (eol_type_t type)
478 case EOL_LF: return Qlf;
479 case EOL_CRLF: return Qcrlf;
480 case EOL_CR: return Qcr;
481 case EOL_AUTODETECT: return Qnil;
486 setup_eol_coding_systems (Lisp_Coding_System *codesys)
488 Lisp_Object codesys_obj;
489 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
490 char *codesys_name = (char *) alloca (len + 7);
492 char *codesys_mnemonic=0;
494 Lisp_Object codesys_name_sym, sub_codesys_obj;
498 XSETCODING_SYSTEM (codesys_obj, codesys);
500 memcpy (codesys_name,
501 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
503 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
505 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
506 codesys_mnemonic = (char *) alloca (mlen + 7);
507 memcpy (codesys_mnemonic,
508 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
511 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
512 strcpy (codesys_name + len, "-" op_sys); \
514 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
515 codesys_name_sym = intern (codesys_name); \
516 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
517 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
519 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
520 build_string (codesys_mnemonic); \
521 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
524 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
525 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
526 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
529 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
530 Return t if OBJECT is a coding system.
531 A coding system is an object that defines how text containing multiple
532 character sets is encoded into a stream of (typically 8-bit) bytes.
533 The coding system is used to decode the stream into a series of
534 characters (which may be from multiple charsets) when the text is read
535 from a file or process, and is used to encode the text back into the
536 same format when it is written out to a file or process.
538 For example, many ISO2022-compliant coding systems (such as Compound
539 Text, which is used for inter-client data under the X Window System)
540 use escape sequences to switch between different charsets -- Japanese
541 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
542 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
543 `make-coding-system' for more information.
545 Coding systems are normally identified using a symbol, and the
546 symbol is accepted in place of the actual coding system object whenever
547 a coding system is called for. (This is similar to how faces work.)
551 return CODING_SYSTEMP (object) ? Qt : Qnil;
554 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
555 Retrieve the coding system of the given name.
557 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
558 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
559 If there is no such coding system, nil is returned. Otherwise the
560 associated coding system object is returned.
562 (coding_system_or_name))
564 if (NILP (coding_system_or_name))
565 coding_system_or_name = Qbinary;
566 else if (CODING_SYSTEMP (coding_system_or_name))
567 return coding_system_or_name;
569 CHECK_SYMBOL (coding_system_or_name);
573 coding_system_or_name =
574 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
576 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
577 return coding_system_or_name;
581 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
582 Retrieve the coding system of the given name.
583 Same as `find-coding-system' except that if there is no such
584 coding system, an error is signaled instead of returning nil.
588 Lisp_Object coding_system = Ffind_coding_system (name);
590 if (NILP (coding_system))
591 signal_simple_error ("No such coding system", name);
592 return coding_system;
595 /* We store the coding systems in hash tables with the names as the key and the
596 actual coding system object as the value. Occasionally we need to use them
597 in a list format. These routines provide us with that. */
598 struct coding_system_list_closure
600 Lisp_Object *coding_system_list;
604 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
605 void *coding_system_list_closure)
607 /* This function can GC */
608 struct coding_system_list_closure *cscl =
609 (struct coding_system_list_closure *) coding_system_list_closure;
610 Lisp_Object *coding_system_list = cscl->coding_system_list;
612 *coding_system_list = Fcons (key, *coding_system_list);
616 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
617 Return a list of the names of all defined coding systems.
621 Lisp_Object coding_system_list = Qnil;
623 struct coding_system_list_closure coding_system_list_closure;
625 GCPRO1 (coding_system_list);
626 coding_system_list_closure.coding_system_list = &coding_system_list;
627 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
628 &coding_system_list_closure);
631 return coding_system_list;
634 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
635 Return the name of the given coding system.
639 coding_system = Fget_coding_system (coding_system);
640 return XCODING_SYSTEM_NAME (coding_system);
643 static Lisp_Coding_System *
644 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
646 Lisp_Coding_System *codesys =
647 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
649 zero_lcrecord (codesys);
650 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
651 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
652 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
653 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
654 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
655 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
656 CODING_SYSTEM_TYPE (codesys) = type;
657 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
660 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
662 if (type == CODESYS_ISO2022)
665 for (i = 0; i < 4; i++)
666 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
669 if (type == CODESYS_BIG5)
671 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
673 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
674 = Vcharset_chinese_big5;
675 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
677 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
681 else if (type == CODESYS_CCL)
683 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
684 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
687 CODING_SYSTEM_NAME (codesys) = name;
693 /* Given a list of charset conversion specs as specified in a Lisp
694 program, parse it into STORE_HERE. */
697 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
698 Lisp_Object spec_list)
702 EXTERNAL_LIST_LOOP (rest, spec_list)
704 Lisp_Object car = XCAR (rest);
705 Lisp_Object from, to;
706 struct charset_conversion_spec spec;
708 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
709 signal_simple_error ("Invalid charset conversion spec", car);
710 from = Fget_charset (XCAR (car));
711 to = Fget_charset (XCAR (XCDR (car)));
712 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
713 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
714 signal_simple_error_2
715 ("Attempted conversion between different charset types",
717 spec.from_charset = from;
718 spec.to_charset = to;
720 Dynarr_add (store_here, spec);
724 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
725 specs, return the equivalent as the Lisp programmer would see it.
727 If LOAD_HERE is 0, return Qnil. */
730 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
737 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
739 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
740 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
743 return Fnreverse (result);
748 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
749 Register symbol NAME as a coding system.
751 TYPE describes the conversion method used and should be one of
754 Automatic conversion. XEmacs attempts to detect the coding system
757 No conversion. Use this for binary files and such. On output,
758 graphic characters that are not in ASCII or Latin-1 will be
759 replaced by a ?. (For a no-conversion-encoded buffer, these
760 characters will only be present if you explicitly insert them.)
762 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
764 ISO 10646 UCS-4 encoding.
766 ISO 10646 UTF-8 encoding.
768 Any ISO2022-compliant encoding. Among other things, this includes
769 JIS (the Japanese encoding commonly used for e-mail), EUC (the
770 standard Unix encoding for Japanese and other languages), and
771 Compound Text (the encoding used in X11). You can specify more
772 specific information about the conversion with the PROPS argument.
774 Big5 (the encoding commonly used for Taiwanese).
776 The conversion is performed using a user-written pseudo-code
777 program. CCL (Code Conversion Language) is the name of this
780 Write out or read in the raw contents of the memory representing
781 the buffer's text. This is primarily useful for debugging
782 purposes, and is only enabled when XEmacs has been compiled with
783 DEBUG_XEMACS defined (via the --debug configure option).
784 WARNING: Reading in a file using 'internal conversion can result
785 in an internal inconsistency in the memory representing a
786 buffer's text, which will produce unpredictable results and may
787 cause XEmacs to crash. Under normal circumstances you should
788 never use 'internal conversion.
790 DOC-STRING is a string describing the coding system.
792 PROPS is a property list, describing the specific nature of the
793 character set. Recognized properties are:
796 String to be displayed in the modeline when this coding system is
800 End-of-line conversion to be used. It should be one of
803 Automatically detect the end-of-line type (LF, CRLF,
804 or CR). Also generate subsidiary coding systems named
805 `NAME-unix', `NAME-dos', and `NAME-mac', that are
806 identical to this coding system but have an EOL-TYPE
807 value of 'lf, 'crlf, and 'cr, respectively.
809 The end of a line is marked externally using ASCII LF.
810 Since this is also the way that XEmacs represents an
811 end-of-line internally, specifying this option results
812 in no end-of-line conversion. This is the standard
813 format for Unix text files.
815 The end of a line is marked externally using ASCII
816 CRLF. This is the standard format for MS-DOS text
819 The end of a line is marked externally using ASCII CR.
820 This is the standard format for Macintosh text files.
822 Automatically detect the end-of-line type but do not
823 generate subsidiary coding systems. (This value is
824 converted to nil when stored internally, and
825 `coding-system-property' will return nil.)
828 If non-nil, composition/decomposition for combining characters
831 'use-entity-reference
832 If non-nil, SGML style entity-reference is used for non-system-characters.
834 'post-read-conversion
835 Function called after a file has been read in, to perform the
836 decoding. Called with two arguments, START and END, denoting
837 a region of the current buffer to be decoded.
839 'pre-write-conversion
840 Function called before a file is written out, to perform the
841 encoding. Called with two arguments, START and END, denoting
842 a region of the current buffer to be encoded.
845 The following additional properties are recognized if TYPE is 'iso2022:
851 The character set initially designated to the G0 - G3 registers.
852 The value should be one of
854 -- A charset object (designate that character set)
855 -- nil (do not ever use this register)
856 -- t (no character set is initially designated to
857 the register, but may be later on; this automatically
858 sets the corresponding `force-g*-on-output' property)
864 If non-nil, send an explicit designation sequence on output before
865 using the specified register.
868 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
869 "ESC $ B" on output in place of the full designation sequences
870 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
873 If non-nil, don't designate ASCII to G0 at each end of line on output.
874 Setting this to non-nil also suppresses other state-resetting that
875 normally happens at the end of a line.
878 If non-nil, don't designate ASCII to G0 before control chars on output.
881 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
885 If non-nil, use locking-shift (SO/SI) instead of single-shift
886 or designation by escape sequence.
889 If non-nil, don't use ISO6429's direction specification.
892 If non-nil, literal control characters that are the same as
893 the beginning of a recognized ISO2022 or ISO6429 escape sequence
894 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
895 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
896 so that they can be properly distinguished from an escape sequence.
897 (Note that doing this results in a non-portable encoding.) This
898 encoding flag is used for byte-compiled files. Note that ESC
899 is a good choice for a quoting character because there are no
900 escape sequences whose second byte is a character from the Control-0
901 or Control-1 character sets; this is explicitly disallowed by the
904 'input-charset-conversion
905 A list of conversion specifications, specifying conversion of
906 characters in one charset to another when decoding is performed.
907 Each specification is a list of two elements: the source charset,
908 and the destination charset.
910 'output-charset-conversion
911 A list of conversion specifications, specifying conversion of
912 characters in one charset to another when encoding is performed.
913 The form of each specification is the same as for
914 'input-charset-conversion.
917 The following additional properties are recognized (and required)
921 CCL program used for decoding (converting to internal format).
924 CCL program used for encoding (converting to external format).
926 (name, type, doc_string, props))
928 Lisp_Coding_System *codesys;
929 enum coding_system_type ty;
930 int need_to_setup_eol_systems = 1;
932 /* Convert type to constant */
933 if (NILP (type) || EQ (type, Qundecided))
934 { ty = CODESYS_AUTODETECT; }
936 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
937 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
938 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
939 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
940 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
941 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
943 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
945 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
948 signal_simple_error ("Invalid coding system type", type);
952 codesys = allocate_coding_system (ty, name);
954 if (NILP (doc_string))
955 doc_string = build_string ("");
957 CHECK_STRING (doc_string);
958 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
961 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
963 if (EQ (key, Qmnemonic))
966 CHECK_STRING (value);
967 CODING_SYSTEM_MNEMONIC (codesys) = value;
970 else if (EQ (key, Qeol_type))
972 need_to_setup_eol_systems = NILP (value);
975 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
978 else if (EQ (key, Qpost_read_conversion))
979 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
980 else if (EQ (key, Qpre_write_conversion))
981 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
983 else if (EQ (key, Qdisable_composition))
984 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
985 else if (EQ (key, Quse_entity_reference))
986 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
989 else if (ty == CODESYS_ISO2022)
991 #define FROB_INITIAL_CHARSET(charset_num) \
992 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
993 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
995 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
996 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
997 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
998 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1000 #define FROB_FORCE_CHARSET(charset_num) \
1001 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1003 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1004 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1005 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1006 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1008 #define FROB_BOOLEAN_PROPERTY(prop) \
1009 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1011 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1012 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1013 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1014 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1015 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1016 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1017 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1019 else if (EQ (key, Qinput_charset_conversion))
1021 codesys->iso2022.input_conv =
1022 Dynarr_new (charset_conversion_spec);
1023 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1026 else if (EQ (key, Qoutput_charset_conversion))
1028 codesys->iso2022.output_conv =
1029 Dynarr_new (charset_conversion_spec);
1030 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1034 signal_simple_error ("Unrecognized property", key);
1037 else if (ty == CODESYS_BIG5)
1039 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1040 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1042 signal_simple_error ("Unrecognized property", key);
1045 else if (EQ (type, Qccl))
1048 struct ccl_program test_ccl;
1051 /* Check key first. */
1052 if (EQ (key, Qdecode))
1053 suffix = "-ccl-decode";
1054 else if (EQ (key, Qencode))
1055 suffix = "-ccl-encode";
1057 signal_simple_error ("Unrecognized property", key);
1059 /* If value is vector, register it as a ccl program
1060 associated with an newly created symbol for
1061 backward compatibility. */
1062 if (VECTORP (value))
1064 sym = Fintern (concat2 (Fsymbol_name (name),
1065 build_string (suffix)),
1067 Fregister_ccl_program (sym, value);
1071 CHECK_SYMBOL (value);
1074 /* check if the given ccl programs are valid. */
1075 if (setup_ccl_program (&test_ccl, sym) < 0)
1076 signal_simple_error ("Invalid CCL program", value);
1078 if (EQ (key, Qdecode))
1079 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1080 else if (EQ (key, Qencode))
1081 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1086 signal_simple_error ("Unrecognized property", key);
1090 if (need_to_setup_eol_systems)
1091 setup_eol_coding_systems (codesys);
1094 Lisp_Object codesys_obj;
1095 XSETCODING_SYSTEM (codesys_obj, codesys);
1096 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1101 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1102 Copy OLD-CODING-SYSTEM to NEW-NAME.
1103 If NEW-NAME does not name an existing coding system, a new one will
1106 (old_coding_system, new_name))
1108 Lisp_Object new_coding_system;
1109 old_coding_system = Fget_coding_system (old_coding_system);
1110 new_coding_system = Ffind_coding_system (new_name);
1111 if (NILP (new_coding_system))
1113 XSETCODING_SYSTEM (new_coding_system,
1114 allocate_coding_system
1115 (XCODING_SYSTEM_TYPE (old_coding_system),
1117 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1121 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1122 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1123 memcpy (((char *) to ) + sizeof (to->header),
1124 ((char *) from) + sizeof (from->header),
1125 sizeof (*from) - sizeof (from->header));
1126 to->name = new_name;
1128 return new_coding_system;
1131 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1132 Return t if OBJECT names a coding system, and is not a coding system alias.
1136 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1140 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1141 Return t if OBJECT is a coding system alias.
1142 All coding system aliases are created by `define-coding-system-alias'.
1146 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1150 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1151 Return the coding-system symbol for which symbol ALIAS is an alias.
1155 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1156 if (SYMBOLP (aliasee))
1159 signal_simple_error ("Symbol is not a coding system alias", alias);
1160 return Qnil; /* To keep the compiler happy */
1164 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1166 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1170 /* A maphash function, for removing dangling coding system aliases. */
1172 dangling_coding_system_alias_p (Lisp_Object alias,
1173 Lisp_Object aliasee,
1174 void *dangling_aliases)
1176 if (SYMBOLP (aliasee)
1177 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1179 (*(int *) dangling_aliases)++;
1186 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1187 Define symbol ALIAS as an alias for coding system ALIASEE.
1189 You can use this function to redefine an alias that has already been defined,
1190 but you cannot redefine a name which is the canonical name for a coding system.
1191 \(a canonical name of a coding system is what is returned when you call
1192 `coding-system-name' on a coding system).
1194 ALIASEE itself can be an alias, which allows you to define nested aliases.
1196 You are forbidden, however, from creating alias loops or `dangling' aliases.
1197 These will be detected, and an error will be signaled if you attempt to do so.
1199 If ALIASEE is nil, then ALIAS will simply be undefined.
1201 See also `coding-system-alias-p', `coding-system-aliasee',
1202 and `coding-system-canonical-name-p'.
1206 Lisp_Object real_coding_system, probe;
1208 CHECK_SYMBOL (alias);
1210 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1212 ("Symbol is the canonical name of a coding system and cannot be redefined",
1217 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1218 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1219 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1221 Fremhash (alias, Vcoding_system_hash_table);
1223 /* Undefine subsidiary aliases,
1224 presumably created by a previous call to this function */
1225 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1226 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1227 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1229 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1230 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1231 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1234 /* Undefine dangling coding system aliases. */
1236 int dangling_aliases;
1239 dangling_aliases = 0;
1240 elisp_map_remhash (dangling_coding_system_alias_p,
1241 Vcoding_system_hash_table,
1243 } while (dangling_aliases > 0);
1249 if (CODING_SYSTEMP (aliasee))
1250 aliasee = XCODING_SYSTEM_NAME (aliasee);
1252 /* Checks that aliasee names a coding-system */
1253 real_coding_system = Fget_coding_system (aliasee);
1255 /* Check for coding system alias loops */
1256 if (EQ (alias, aliasee))
1257 alias_loop: signal_simple_error_2
1258 ("Attempt to create a coding system alias loop", alias, aliasee);
1260 for (probe = aliasee;
1262 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1264 if (EQ (probe, alias))
1268 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1270 /* Set up aliases for subsidiaries.
1271 #### There must be a better way to handle subsidiary coding systems. */
1273 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1275 for (i = 0; i < countof (suffixes); i++)
1277 Lisp_Object alias_subsidiary =
1278 append_suffix_to_symbol (alias, suffixes[i]);
1279 Lisp_Object aliasee_subsidiary =
1280 append_suffix_to_symbol (aliasee, suffixes[i]);
1282 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1283 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1286 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1287 but it doesn't look intentional, so I'd rather return something
1288 meaningful or nothing at all. */
1293 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1295 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1296 Lisp_Object new_coding_system;
1298 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1299 return coding_system;
1303 case EOL_AUTODETECT: return coding_system;
1304 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1305 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1306 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1307 default: abort (); return Qnil;
1310 return NILP (new_coding_system) ? coding_system : new_coding_system;
1313 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1314 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1316 (coding_system, eol_type))
1318 coding_system = Fget_coding_system (coding_system);
1320 return subsidiary_coding_system (coding_system,
1321 symbol_to_eol_type (eol_type));
1325 /************************************************************************/
1326 /* Coding system accessors */
1327 /************************************************************************/
1329 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1330 Return the doc string for CODING-SYSTEM.
1334 coding_system = Fget_coding_system (coding_system);
1335 return XCODING_SYSTEM_DOC_STRING (coding_system);
1338 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1339 Return the type of CODING-SYSTEM.
1343 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1346 case CODESYS_AUTODETECT: return Qundecided;
1348 case CODESYS_SHIFT_JIS: return Qshift_jis;
1349 case CODESYS_ISO2022: return Qiso2022;
1350 case CODESYS_BIG5: return Qbig5;
1351 case CODESYS_UCS4: return Qucs4;
1352 case CODESYS_UTF8: return Qutf8;
1353 case CODESYS_CCL: return Qccl;
1355 case CODESYS_NO_CONVERSION: return Qno_conversion;
1357 case CODESYS_INTERNAL: return Qinternal;
1364 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1367 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1369 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1372 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1373 Return initial charset of CODING-SYSTEM designated to GNUM.
1376 (coding_system, gnum))
1378 coding_system = Fget_coding_system (coding_system);
1381 return coding_system_charset (coding_system, XINT (gnum));
1385 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1386 Return the PROP property of CODING-SYSTEM.
1388 (coding_system, prop))
1391 enum coding_system_type type;
1393 coding_system = Fget_coding_system (coding_system);
1394 CHECK_SYMBOL (prop);
1395 type = XCODING_SYSTEM_TYPE (coding_system);
1397 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1398 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1401 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1403 case CODESYS_PROP_ALL_OK:
1406 case CODESYS_PROP_ISO2022:
1407 if (type != CODESYS_ISO2022)
1409 ("Property only valid in ISO2022 coding systems",
1413 case CODESYS_PROP_CCL:
1414 if (type != CODESYS_CCL)
1416 ("Property only valid in CCL coding systems",
1426 signal_simple_error ("Unrecognized property", prop);
1428 if (EQ (prop, Qname))
1429 return XCODING_SYSTEM_NAME (coding_system);
1430 else if (EQ (prop, Qtype))
1431 return Fcoding_system_type (coding_system);
1432 else if (EQ (prop, Qdoc_string))
1433 return XCODING_SYSTEM_DOC_STRING (coding_system);
1434 else if (EQ (prop, Qmnemonic))
1435 return XCODING_SYSTEM_MNEMONIC (coding_system);
1436 else if (EQ (prop, Qeol_type))
1437 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1438 else if (EQ (prop, Qeol_lf))
1439 return XCODING_SYSTEM_EOL_LF (coding_system);
1440 else if (EQ (prop, Qeol_crlf))
1441 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1442 else if (EQ (prop, Qeol_cr))
1443 return XCODING_SYSTEM_EOL_CR (coding_system);
1444 else if (EQ (prop, Qpost_read_conversion))
1445 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1446 else if (EQ (prop, Qpre_write_conversion))
1447 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1450 else if (EQ (prop, Qdisable_composition))
1451 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1452 else if (EQ (prop, Quse_entity_reference))
1453 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1455 else if (type == CODESYS_ISO2022)
1457 if (EQ (prop, Qcharset_g0))
1458 return coding_system_charset (coding_system, 0);
1459 else if (EQ (prop, Qcharset_g1))
1460 return coding_system_charset (coding_system, 1);
1461 else if (EQ (prop, Qcharset_g2))
1462 return coding_system_charset (coding_system, 2);
1463 else if (EQ (prop, Qcharset_g3))
1464 return coding_system_charset (coding_system, 3);
1466 #define FORCE_CHARSET(charset_num) \
1467 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1468 (coding_system, charset_num) ? Qt : Qnil)
1470 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1471 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1472 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1473 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1475 #define LISP_BOOLEAN(prop) \
1476 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1478 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1479 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1480 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1481 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1482 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1483 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1484 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1486 else if (EQ (prop, Qinput_charset_conversion))
1488 unparse_charset_conversion_specs
1489 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1490 else if (EQ (prop, Qoutput_charset_conversion))
1492 unparse_charset_conversion_specs
1493 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1497 else if (type == CODESYS_CCL)
1499 if (EQ (prop, Qdecode))
1500 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1501 else if (EQ (prop, Qencode))
1502 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1510 return Qnil; /* not reached */
1514 /************************************************************************/
1515 /* Coding category functions */
1516 /************************************************************************/
1519 decode_coding_category (Lisp_Object symbol)
1523 CHECK_SYMBOL (symbol);
1524 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1525 if (EQ (coding_category_symbol[i], symbol))
1528 signal_simple_error ("Unrecognized coding category", symbol);
1529 return 0; /* not reached */
1532 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1533 Return a list of all recognized coding categories.
1538 Lisp_Object list = Qnil;
1540 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1541 list = Fcons (coding_category_symbol[i], list);
1545 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1546 Change the priority order of the coding categories.
1547 LIST should be list of coding categories, in descending order of
1548 priority. Unspecified coding categories will be lower in priority
1549 than all specified ones, in the same relative order they were in
1554 int category_to_priority[CODING_CATEGORY_LAST];
1558 /* First generate a list that maps coding categories to priorities. */
1560 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1561 category_to_priority[i] = -1;
1563 /* Highest priority comes from the specified list. */
1565 EXTERNAL_LIST_LOOP (rest, list)
1567 int cat = decode_coding_category (XCAR (rest));
1569 if (category_to_priority[cat] >= 0)
1570 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1571 category_to_priority[cat] = i++;
1574 /* Now go through the existing categories by priority to retrieve
1575 the categories not yet specified and preserve their priority
1577 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1579 int cat = fcd->coding_category_by_priority[j];
1580 if (category_to_priority[cat] < 0)
1581 category_to_priority[cat] = i++;
1584 /* Now we need to construct the inverse of the mapping we just
1587 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1588 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1590 /* Phew! That was confusing. */
1594 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1595 Return a list of coding categories in descending order of priority.
1600 Lisp_Object list = Qnil;
1602 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1603 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1608 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1609 Change the coding system associated with a coding category.
1611 (coding_category, coding_system))
1613 int cat = decode_coding_category (coding_category);
1615 coding_system = Fget_coding_system (coding_system);
1616 fcd->coding_category_system[cat] = coding_system;
1620 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1621 Return the coding system associated with a coding category.
1625 int cat = decode_coding_category (coding_category);
1626 Lisp_Object sys = fcd->coding_category_system[cat];
1629 return XCODING_SYSTEM_NAME (sys);
1634 /************************************************************************/
1635 /* Detecting the encoding of data */
1636 /************************************************************************/
1638 struct detection_state
1640 eol_type_t eol_type;
1676 struct iso2022_decoder iso;
1678 int high_byte_count;
1679 unsigned int saw_single_shift:1;
1692 acceptable_control_char_p (int c)
1696 /* Allow and ignore control characters that you might
1697 reasonably see in a text file */
1702 case 8: /* backspace */
1703 case 11: /* vertical tab */
1704 case 12: /* form feed */
1705 case 26: /* MS-DOS C-z junk */
1706 case 31: /* '^_' -- for info */
1714 mask_has_at_most_one_bit_p (int mask)
1716 /* Perhaps the only thing useful you learn from intensive Microsoft
1717 technical interviews */
1718 return (mask & (mask - 1)) == 0;
1722 detect_eol_type (struct detection_state *st, const Extbyte *src,
1727 unsigned char c = *(unsigned char *)src++;
1730 if (st->eol.just_saw_cr)
1732 else if (st->eol.seen_anything)
1735 else if (st->eol.just_saw_cr)
1738 st->eol.just_saw_cr = 1;
1740 st->eol.just_saw_cr = 0;
1741 st->eol.seen_anything = 1;
1744 return EOL_AUTODETECT;
1747 /* Attempt to determine the encoding and EOL type of the given text.
1748 Before calling this function for the first type, you must initialize
1749 st->eol_type as appropriate and initialize st->mask to ~0.
1751 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1754 st->mask holds the determined coding category mask, or ~0 if only
1755 ASCII has been seen so far.
1759 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1760 is present in st->mask
1761 1 == definitive answers are here for both st->eol_type and st->mask
1765 detect_coding_type (struct detection_state *st, const Extbyte *src,
1766 size_t n, int just_do_eol)
1768 if (st->eol_type == EOL_AUTODETECT)
1769 st->eol_type = detect_eol_type (st, src, n);
1772 return st->eol_type != EOL_AUTODETECT;
1774 if (!st->seen_non_ascii)
1776 for (; n; n--, src++)
1778 unsigned char c = *(unsigned char *) src;
1779 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1781 st->seen_non_ascii = 1;
1783 st->shift_jis.mask = ~0;
1787 st->iso2022.mask = ~0;
1797 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1798 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1799 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1800 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1801 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1802 st->big5.mask = detect_coding_big5 (st, src, n);
1803 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1804 st->utf8.mask = detect_coding_utf8 (st, src, n);
1805 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1806 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1809 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1810 | st->utf8.mask | st->ucs4.mask;
1813 int retval = mask_has_at_most_one_bit_p (st->mask);
1814 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1815 return retval && st->eol_type != EOL_AUTODETECT;
1820 coding_system_from_mask (int mask)
1824 /* If the file was entirely or basically ASCII, use the
1825 default value of `buffer-file-coding-system'. */
1826 Lisp_Object retval =
1827 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1830 retval = Ffind_coding_system (retval);
1834 (Qbad_variable, Qwarning,
1835 "Invalid `default-buffer-file-coding-system', set to nil");
1836 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1840 retval = Fget_coding_system (Qraw_text);
1848 mask = postprocess_iso2022_mask (mask);
1850 /* Look through the coding categories by priority and find
1851 the first one that is allowed. */
1852 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1854 cat = fcd->coding_category_by_priority[i];
1855 if ((mask & (1 << cat)) &&
1856 !NILP (fcd->coding_category_system[cat]))
1860 return fcd->coding_category_system[cat];
1862 return Fget_coding_system (Qraw_text);
1866 /* Given a seekable read stream and potential coding system and EOL type
1867 as specified, do any autodetection that is called for. If the
1868 coding system and/or EOL type are not `autodetect', they will be left
1869 alone; but this function will never return an autodetect coding system
1872 This function does not automatically fetch subsidiary coding systems;
1873 that should be unnecessary with the explicit eol-type argument. */
1875 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1878 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1879 eol_type_t *eol_type_in_out)
1881 struct detection_state decst;
1883 if (*eol_type_in_out == EOL_AUTODETECT)
1884 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1887 decst.eol_type = *eol_type_in_out;
1890 /* If autodetection is called for, do it now. */
1891 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1892 || *eol_type_in_out == EOL_AUTODETECT)
1895 Lisp_Object coding_system = Qnil;
1897 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1900 /* Look for initial "-*-"; mode line prefix */
1902 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1907 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1909 Extbyte *local_vars_beg = p + 3;
1910 /* Look for final "-*-"; mode line suffix */
1911 for (p = local_vars_beg,
1912 scan_end = buf + nread - LENGTH ("-*-");
1917 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1919 Extbyte *suffix = p;
1920 /* Look for "coding:" */
1921 for (p = local_vars_beg,
1922 scan_end = suffix - LENGTH ("coding:?");
1925 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1926 && (p == local_vars_beg
1927 || (*(p-1) == ' ' ||
1933 p += LENGTH ("coding:");
1934 while (*p == ' ' || *p == '\t') p++;
1936 /* Get coding system name */
1937 save = *suffix; *suffix = '\0';
1938 /* Characters valid in a MIME charset name (rfc 1521),
1939 and in a Lisp symbol name. */
1940 n = strspn ( (char *) p,
1941 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1942 "abcdefghijklmnopqrstuvwxyz"
1948 save = p[n]; p[n] = '\0';
1950 Ffind_coding_system (intern ((char *) p));
1960 if (NILP (coding_system))
1963 if (detect_coding_type (&decst, buf, nread,
1964 XCODING_SYSTEM_TYPE (*codesys_in_out)
1965 != CODESYS_AUTODETECT))
1967 nread = Lstream_read (stream, buf, sizeof (buf));
1973 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1974 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1977 if (detect_coding_type (&decst, buf, nread, 1))
1979 nread = Lstream_read (stream, buf, sizeof (buf));
1985 *eol_type_in_out = decst.eol_type;
1986 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1988 if (NILP (coding_system))
1989 *codesys_in_out = coding_system_from_mask (decst.mask);
1991 *codesys_in_out = coding_system;
1995 /* If we absolutely can't determine the EOL type, just assume LF. */
1996 if (*eol_type_in_out == EOL_AUTODETECT)
1997 *eol_type_in_out = EOL_LF;
1999 Lstream_rewind (stream);
2002 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2003 Detect coding system of the text in the region between START and END.
2004 Return a list of possible coding systems ordered by priority.
2005 If only ASCII characters are found, return 'undecided or one of
2006 its subsidiary coding systems according to a detected end-of-line
2007 type. Optional arg BUFFER defaults to the current buffer.
2009 (start, end, buffer))
2011 Lisp_Object val = Qnil;
2012 struct buffer *buf = decode_buffer (buffer, 0);
2014 Lisp_Object instream, lb_instream;
2015 Lstream *istr, *lb_istr;
2016 struct detection_state decst;
2017 struct gcpro gcpro1, gcpro2;
2019 get_buffer_range_char (buf, start, end, &b, &e, 0);
2020 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2021 lb_istr = XLSTREAM (lb_instream);
2022 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2023 istr = XLSTREAM (instream);
2024 GCPRO2 (instream, lb_instream);
2026 decst.eol_type = EOL_AUTODETECT;
2030 Extbyte random_buffer[4096];
2031 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2035 if (detect_coding_type (&decst, random_buffer, nread, 0))
2039 if (decst.mask == ~0)
2040 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2048 decst.mask = postprocess_iso2022_mask (decst.mask);
2050 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2052 int sys = fcd->coding_category_by_priority[i];
2053 if (decst.mask & (1 << sys))
2055 Lisp_Object codesys = fcd->coding_category_system[sys];
2056 if (!NILP (codesys))
2057 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2058 val = Fcons (codesys, val);
2062 Lstream_close (istr);
2064 Lstream_delete (istr);
2065 Lstream_delete (lb_istr);
2070 /************************************************************************/
2071 /* Converting to internal Mule format ("decoding") */
2072 /************************************************************************/
2074 /* A decoding stream is a stream used for decoding text (i.e.
2075 converting from some external format to internal format).
2076 The decoding-stream object keeps track of the actual coding
2077 stream, the stream that is at the other end, and data that
2078 needs to be persistent across the lifetime of the stream. */
2080 /* Handle the EOL stuff related to just-read-in character C.
2081 EOL_TYPE is the EOL type of the coding stream.
2082 FLAGS is the current value of FLAGS in the coding stream, and may
2083 be modified by this macro. (The macro only looks at the
2084 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2085 bytes are to be written. You need to also define a local goto
2086 label "label_continue_loop" that is at the end of the main
2087 character-reading loop.
2089 If C is a CR character, then this macro handles it entirely and
2090 jumps to label_continue_loop. Otherwise, this macro does not add
2091 anything to DST, and continues normally. You should continue
2092 processing C normally after this macro. */
2094 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2098 if (eol_type == EOL_CR) \
2099 Dynarr_add (dst, '\n'); \
2100 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2101 Dynarr_add (dst, c); \
2103 flags |= CODING_STATE_CR; \
2104 goto label_continue_loop; \
2106 else if (flags & CODING_STATE_CR) \
2107 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2109 Dynarr_add (dst, '\r'); \
2110 flags &= ~CODING_STATE_CR; \
2114 /* C should be a binary character in the range 0 - 255; convert
2115 to internal format and add to Dynarr DST. */
2118 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2120 if (BYTE_ASCII_P (c)) \
2121 Dynarr_add (dst, c); \
2124 Dynarr_add (dst, (c >> 6) | 0xc0); \
2125 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2129 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2131 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2135 Dynarr_add (dst, c);
2137 else if ( c <= 0x7ff )
2139 Dynarr_add (dst, (c >> 6) | 0xc0);
2140 Dynarr_add (dst, (c & 0x3f) | 0x80);
2142 else if ( c <= 0xffff )
2144 Dynarr_add (dst, (c >> 12) | 0xe0);
2145 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2146 Dynarr_add (dst, (c & 0x3f) | 0x80);
2148 else if ( c <= 0x1fffff )
2150 Dynarr_add (dst, (c >> 18) | 0xf0);
2151 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2152 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2153 Dynarr_add (dst, (c & 0x3f) | 0x80);
2155 else if ( c <= 0x3ffffff )
2157 Dynarr_add (dst, (c >> 24) | 0xf8);
2158 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2159 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2160 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2161 Dynarr_add (dst, (c & 0x3f) | 0x80);
2165 Dynarr_add (dst, (c >> 30) | 0xfc);
2166 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2167 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2168 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2169 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2170 Dynarr_add (dst, (c & 0x3f) | 0x80);
2174 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2176 if (BYTE_ASCII_P (c)) \
2177 Dynarr_add (dst, c); \
2178 else if (BYTE_C1_P (c)) \
2180 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2181 Dynarr_add (dst, c + 0x20); \
2185 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2186 Dynarr_add (dst, c); \
2191 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2195 DECODE_ADD_BINARY_CHAR (ch, dst); \
2200 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2202 if (flags & CODING_STATE_END) \
2204 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2205 if (flags & CODING_STATE_CR) \
2206 Dynarr_add (dst, '\r'); \
2210 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2212 struct decoding_stream
2214 /* Coding system that governs the conversion. */
2215 Lisp_Coding_System *codesys;
2217 /* Stream that we read the encoded data from or
2218 write the decoded data to. */
2221 /* If we are reading, then we can return only a fixed amount of
2222 data, so if the conversion resulted in too much data, we store it
2223 here for retrieval the next time around. */
2224 unsigned_char_dynarr *runoff;
2226 /* FLAGS holds flags indicating the current state of the decoding.
2227 Some of these flags are dependent on the coding system. */
2230 /* CPOS holds a partially built-up code-point of character. */
2233 /* EOL_TYPE specifies the type of end-of-line conversion that
2234 currently applies. We need to keep this separate from the
2235 EOL type stored in CODESYS because the latter might indicate
2236 automatic EOL-type detection while the former will always
2237 indicate a particular EOL type. */
2238 eol_type_t eol_type;
2240 /* Additional ISO2022 information. We define the structure above
2241 because it's also needed by the detection routines. */
2242 struct iso2022_decoder iso2022;
2244 /* Additional information (the state of the running CCL program)
2245 used by the CCL decoder. */
2246 struct ccl_program ccl;
2248 /* counter for UTF-8 or UCS-4 */
2249 unsigned char counter;
2252 unsigned char er_counter;
2253 unsigned char er_buf[16];
2255 unsigned combined_char_count;
2256 Emchar combined_chars[16];
2257 Lisp_Object combining_table;
2259 struct detection_state decst;
2263 #ifdef HAVE_DATABASE
2264 extern Lisp_Object Qcomposition;
2266 extern Lisp_Object Vcharacter_composition_table;
2270 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
2272 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
2274 if ( str->er_counter > 0)
2276 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2277 str->er_counter = 0;
2281 void decode_add_er_char (struct decoding_stream *str, Emchar character,
2282 unsigned_char_dynarr* dst);
2284 decode_add_er_char (struct decoding_stream *str, Emchar c,
2285 unsigned_char_dynarr* dst)
2287 if (str->er_counter == 0)
2289 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
2292 str->er_buf[0] = '&';
2296 DECODE_ADD_UCS_CHAR (c, dst);
2300 Lisp_Object string = make_string (str->er_buf,
2302 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
2309 while (!NILP (rest))
2313 if (NILP (ccs = Ffind_charset (ccs)))
2328 pat = concat3 (build_string ("^&"),
2329 pat, build_string ("\\([0-9]+\\)$"));
2332 else if (EQ (ret, Qx))
2334 pat = concat3 (build_string ("^&"),
2335 pat, build_string ("\\([0-9a-f]+\\)$"));
2338 else if (EQ (ret, QX))
2340 pat = concat3 (build_string ("^&"),
2341 pat, build_string ("\\([0-9A-F]+\\)$"));
2347 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
2350 = XINT (Fstring_to_number
2351 (Fsubstring (string,
2352 Fmatch_beginning (make_int (1)),
2353 Fmatch_end (make_int (1))),
2356 DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst);
2361 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
2362 string, Qnil, Qnil)))
2365 = XINT (Fstring_to_number
2366 (Fsubstring (string,
2367 Fmatch_beginning (make_int (1)),
2368 Fmatch_end (make_int (1))),
2371 DECODE_ADD_UCS_CHAR (code, dst);
2375 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2376 Dynarr_add (dst, ';');
2379 str->er_counter = 0;
2381 else if ( (str->er_counter >= 16) || (c >= 0x7F) )
2383 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2384 str->er_counter = 0;
2385 DECODE_ADD_UCS_CHAR (c, dst);
2388 str->er_buf[str->er_counter++] = c;
2392 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2394 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2398 for (i = 0; i < str->combined_char_count; i++)
2399 decode_add_er_char (str, str->combined_chars[i], dst);
2400 str->combined_char_count = 0;
2401 str->combining_table = Qnil;
2404 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
2405 unsigned_char_dynarr* dst);
2407 COMPOSE_ADD_CHAR (struct decoding_stream *str,
2408 Emchar character, unsigned_char_dynarr* dst)
2410 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2411 decode_add_er_char (str, character, dst);
2413 #ifdef HAVE_DATABASE
2414 !CONSP (str->combining_table)
2416 !CHAR_TABLEP (str->combining_table)
2421 #ifdef HAVE_DATABASE
2422 Fget_char_attribute (make_char (character), Qcomposition, Qnil)
2424 get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2430 decode_add_er_char (str, character, dst);
2433 str->combined_chars[0] = character;
2434 str->combined_char_count = 1;
2435 str->combining_table = ret;
2441 #ifdef HAVE_DATABASE
2442 Fcdr (Fassq (make_char (character), str->combining_table))
2444 get_char_id_table (XCHAR_TABLE(str->combining_table),
2451 Emchar char2 = XCHARVAL (ret);
2453 #ifdef HAVE_DATABASE
2454 Fget_char_attribute (make_char (character), Qcomposition, Qnil)
2456 get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2462 decode_add_er_char (str, character, dst);
2463 str->combined_char_count = 0;
2464 str->combining_table = Qnil;
2468 str->combined_chars[0] = char2;
2469 str->combined_char_count = 1;
2470 str->combining_table = ret;
2473 #ifndef HAVE_DATABASE
2474 else if (CHAR_TABLEP (ret))
2476 str->combined_chars[str->combined_char_count++] = character;
2477 str->combining_table = ret;
2482 COMPOSE_FLUSH_CHARS (str, dst);
2483 decode_add_er_char (str, character, dst);
2487 #else /* not UTF2000 */
2488 #define COMPOSE_FLUSH_CHARS(str, dst)
2489 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2490 #endif /* UTF2000 */
2492 static ssize_t decoding_reader (Lstream *stream,
2493 unsigned char *data, size_t size);
2494 static ssize_t decoding_writer (Lstream *stream,
2495 const unsigned char *data, size_t size);
2496 static int decoding_rewinder (Lstream *stream);
2497 static int decoding_seekable_p (Lstream *stream);
2498 static int decoding_flusher (Lstream *stream);
2499 static int decoding_closer (Lstream *stream);
2501 static Lisp_Object decoding_marker (Lisp_Object stream);
2503 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2504 sizeof (struct decoding_stream));
2507 decoding_marker (Lisp_Object stream)
2509 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2510 Lisp_Object str_obj;
2512 /* We do not need to mark the coding systems or charsets stored
2513 within the stream because they are stored in a global list
2514 and automatically marked. */
2516 XSETLSTREAM (str_obj, str);
2517 mark_object (str_obj);
2518 if (str->imp->marker)
2519 return (str->imp->marker) (str_obj);
2524 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2525 so we read data from the other end, decode it, and store it into DATA. */
2528 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2530 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2531 unsigned char *orig_data = data;
2533 int error_occurred = 0;
2535 /* We need to interface to mule_decode(), which expects to take some
2536 amount of data and store the result into a Dynarr. We have
2537 mule_decode() store into str->runoff, and take data from there
2540 /* We loop until we have enough data, reading chunks from the other
2541 end and decoding it. */
2544 /* Take data from the runoff if we can. Make sure to take at
2545 most SIZE bytes, and delete the data from the runoff. */
2546 if (Dynarr_length (str->runoff) > 0)
2548 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2549 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2550 Dynarr_delete_many (str->runoff, 0, chunk);
2556 break; /* No more room for data */
2558 if (str->flags & CODING_STATE_END)
2559 /* This means that on the previous iteration, we hit the EOF on
2560 the other end. We loop once more so that mule_decode() can
2561 output any final stuff it may be holding, or any "go back
2562 to a sane state" escape sequences. (This latter makes sense
2563 during encoding.) */
2566 /* Exhausted the runoff, so get some more. DATA has at least
2567 SIZE bytes left of storage in it, so it's OK to read directly
2568 into it. (We'll be overwriting above, after we've decoded it
2569 into the runoff.) */
2570 read_size = Lstream_read (str->other_end, data, size);
2577 /* There might be some more end data produced in the translation.
2578 See the comment above. */
2579 str->flags |= CODING_STATE_END;
2580 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2583 if (data - orig_data == 0)
2584 return error_occurred ? -1 : 0;
2586 return data - orig_data;
2590 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2592 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2595 /* Decode all our data into the runoff, and then attempt to write
2596 it all out to the other end. Remove whatever chunk we succeeded
2598 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2599 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2600 Dynarr_length (str->runoff));
2602 Dynarr_delete_many (str->runoff, 0, retval);
2603 /* Do NOT return retval. The return value indicates how much
2604 of the incoming data was written, not how many bytes were
2610 reset_decoding_stream (struct decoding_stream *str)
2613 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2615 Lisp_Object coding_system;
2616 XSETCODING_SYSTEM (coding_system, str->codesys);
2617 reset_iso2022 (coding_system, &str->iso2022);
2619 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2621 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2626 str->er_counter = 0;
2627 str->combined_char_count = 0;
2628 str->combining_table = Qnil;
2630 str->flags = str->cpos = 0;
2634 decoding_rewinder (Lstream *stream)
2636 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2637 reset_decoding_stream (str);
2638 Dynarr_reset (str->runoff);
2639 return Lstream_rewind (str->other_end);
2643 decoding_seekable_p (Lstream *stream)
2645 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2646 return Lstream_seekable_p (str->other_end);
2650 decoding_flusher (Lstream *stream)
2652 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2653 return Lstream_flush (str->other_end);
2657 decoding_closer (Lstream *stream)
2659 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2660 if (stream->flags & LSTREAM_FL_WRITE)
2662 str->flags |= CODING_STATE_END;
2663 decoding_writer (stream, 0, 0);
2665 Dynarr_free (str->runoff);
2667 #ifdef ENABLE_COMPOSITE_CHARS
2668 if (str->iso2022.composite_chars)
2669 Dynarr_free (str->iso2022.composite_chars);
2672 return Lstream_close (str->other_end);
2676 decoding_stream_coding_system (Lstream *stream)
2678 Lisp_Object coding_system;
2679 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2681 XSETCODING_SYSTEM (coding_system, str->codesys);
2682 return subsidiary_coding_system (coding_system, str->eol_type);
2686 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2688 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2689 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2691 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2692 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2693 reset_decoding_stream (str);
2696 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2697 stream for writing, no automatic code detection will be performed.
2698 The reason for this is that automatic code detection requires a
2699 seekable input. Things will also fail if you open a decoding
2700 stream for reading using a non-fully-specified coding system and
2701 a non-seekable input stream. */
2704 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2707 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2708 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2712 str->other_end = stream;
2713 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2714 str->eol_type = EOL_AUTODETECT;
2715 if (!strcmp (mode, "r")
2716 && Lstream_seekable_p (stream))
2717 /* We can determine the coding system now. */
2718 determine_real_coding_system (stream, &codesys, &str->eol_type);
2719 set_decoding_stream_coding_system (lstr, codesys);
2720 str->decst.eol_type = str->eol_type;
2721 str->decst.mask = ~0;
2722 XSETLSTREAM (obj, lstr);
2727 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2729 return make_decoding_stream_1 (stream, codesys, "r");
2733 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2735 return make_decoding_stream_1 (stream, codesys, "w");
2738 /* Note: the decode_coding_* functions all take the same
2739 arguments as mule_decode(), which is to say some SRC data of
2740 size N, which is to be stored into dynamic array DST.
2741 DECODING is the stream within which the decoding is
2742 taking place, but no data is actually read from or
2743 written to that stream; that is handled in decoding_reader()
2744 or decoding_writer(). This allows the same functions to
2745 be used for both reading and writing. */
2748 mule_decode (Lstream *decoding, const Extbyte *src,
2749 unsigned_char_dynarr *dst, size_t n)
2751 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2753 /* If necessary, do encoding-detection now. We do this when
2754 we're a writing stream or a non-seekable reading stream,
2755 meaning that we can't just process the whole input,
2756 rewind, and start over. */
2758 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2759 str->eol_type == EOL_AUTODETECT)
2761 Lisp_Object codesys;
2763 XSETCODING_SYSTEM (codesys, str->codesys);
2764 detect_coding_type (&str->decst, src, n,
2765 CODING_SYSTEM_TYPE (str->codesys) !=
2766 CODESYS_AUTODETECT);
2767 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2768 str->decst.mask != ~0)
2769 /* #### This is cheesy. What we really ought to do is
2770 buffer up a certain amount of data so as to get a
2771 less random result. */
2772 codesys = coding_system_from_mask (str->decst.mask);
2773 str->eol_type = str->decst.eol_type;
2774 if (XCODING_SYSTEM (codesys) != str->codesys)
2776 /* Preserve the CODING_STATE_END flag in case it was set.
2777 If we erase it, bad things might happen. */
2778 int was_end = str->flags & CODING_STATE_END;
2779 set_decoding_stream_coding_system (decoding, codesys);
2781 str->flags |= CODING_STATE_END;
2785 switch (CODING_SYSTEM_TYPE (str->codesys))
2788 case CODESYS_INTERNAL:
2789 Dynarr_add_many (dst, src, n);
2792 case CODESYS_AUTODETECT:
2793 /* If we got this far and still haven't decided on the coding
2794 system, then do no conversion. */
2795 case CODESYS_NO_CONVERSION:
2796 decode_coding_no_conversion (decoding, src, dst, n);
2799 case CODESYS_SHIFT_JIS:
2800 decode_coding_sjis (decoding, src, dst, n);
2803 decode_coding_big5 (decoding, src, dst, n);
2806 decode_coding_ucs4 (decoding, src, dst, n);
2809 decode_coding_utf8 (decoding, src, dst, n);
2812 str->ccl.last_block = str->flags & CODING_STATE_END;
2813 /* When applying ccl program to stream, MUST NOT set NULL
2815 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2816 dst, n, 0, CCL_MODE_DECODING);
2818 case CODESYS_ISO2022:
2819 decode_coding_iso2022 (decoding, src, dst, n);
2827 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2828 Decode the text between START and END which is encoded in CODING-SYSTEM.
2829 This is useful if you've read in encoded text from a file without decoding
2830 it (e.g. you read in a JIS-formatted file but used the `binary' or
2831 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2832 Return length of decoded text.
2833 BUFFER defaults to the current buffer if unspecified.
2835 (start, end, coding_system, buffer))
2838 struct buffer *buf = decode_buffer (buffer, 0);
2839 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2840 Lstream *istr, *ostr;
2841 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2843 get_buffer_range_char (buf, start, end, &b, &e, 0);
2845 barf_if_buffer_read_only (buf, b, e);
2847 coding_system = Fget_coding_system (coding_system);
2848 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2849 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2850 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2852 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2853 Fget_coding_system (Qbinary));
2854 istr = XLSTREAM (instream);
2855 ostr = XLSTREAM (outstream);
2856 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2858 /* The chain of streams looks like this:
2860 [BUFFER] <----- send through
2861 ------> [ENCODE AS BINARY]
2862 ------> [DECODE AS SPECIFIED]
2868 char tempbuf[1024]; /* some random amount */
2869 Bufpos newpos, even_newer_pos;
2870 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2871 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2875 newpos = lisp_buffer_stream_startpos (istr);
2876 Lstream_write (ostr, tempbuf, size_in_bytes);
2877 even_newer_pos = lisp_buffer_stream_startpos (istr);
2878 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2881 Lstream_close (istr);
2882 Lstream_close (ostr);
2884 Lstream_delete (istr);
2885 Lstream_delete (ostr);
2886 Lstream_delete (XLSTREAM (de_outstream));
2887 Lstream_delete (XLSTREAM (lb_outstream));
2892 /************************************************************************/
2893 /* Converting to an external encoding ("encoding") */
2894 /************************************************************************/
2896 /* An encoding stream is an output stream. When you create the
2897 stream, you specify the coding system that governs the encoding
2898 and another stream that the resulting encoded data is to be
2899 sent to, and then start sending data to it. */
2901 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2903 struct encoding_stream
2905 /* Coding system that governs the conversion. */
2906 Lisp_Coding_System *codesys;
2908 /* Stream that we read the encoded data from or
2909 write the decoded data to. */
2912 /* If we are reading, then we can return only a fixed amount of
2913 data, so if the conversion resulted in too much data, we store it
2914 here for retrieval the next time around. */
2915 unsigned_char_dynarr *runoff;
2917 /* FLAGS holds flags indicating the current state of the encoding.
2918 Some of these flags are dependent on the coding system. */
2921 /* CH holds a partially built-up character. Since we only deal
2922 with one- and two-byte characters at the moment, we only use
2923 this to store the first byte of a two-byte character. */
2926 /* Additional information used by the ISO2022 encoder. */
2929 /* CHARSET holds the character sets currently assigned to the G0
2930 through G3 registers. It is initialized from the array
2931 INITIAL_CHARSET in CODESYS. */
2932 Lisp_Object charset[4];
2934 /* Which registers are currently invoked into the left (GL) and
2935 right (GR) halves of the 8-bit encoding space? */
2936 int register_left, register_right;
2938 /* Whether we need to explicitly designate the charset in the
2939 G? register before using it. It is initialized from the
2940 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2941 unsigned char force_charset_on_output[4];
2943 /* Other state variables that need to be preserved across
2945 Lisp_Object current_charset;
2947 int current_char_boundary;
2950 void (*encode_char) (struct encoding_stream *str, Emchar c,
2951 unsigned_char_dynarr *dst, unsigned int *flags);
2952 void (*finish) (struct encoding_stream *str,
2953 unsigned_char_dynarr *dst, unsigned int *flags);
2955 /* Additional information (the state of the running CCL program)
2956 used by the CCL encoder. */
2957 struct ccl_program ccl;
2961 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2962 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2964 static int encoding_rewinder (Lstream *stream);
2965 static int encoding_seekable_p (Lstream *stream);
2966 static int encoding_flusher (Lstream *stream);
2967 static int encoding_closer (Lstream *stream);
2969 static Lisp_Object encoding_marker (Lisp_Object stream);
2971 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2972 sizeof (struct encoding_stream));
2975 encoding_marker (Lisp_Object stream)
2977 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2978 Lisp_Object str_obj;
2980 /* We do not need to mark the coding systems or charsets stored
2981 within the stream because they are stored in a global list
2982 and automatically marked. */
2984 XSETLSTREAM (str_obj, str);
2985 mark_object (str_obj);
2986 if (str->imp->marker)
2987 return (str->imp->marker) (str_obj);
2992 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2993 so we read data from the other end, encode it, and store it into DATA. */
2996 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2998 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2999 unsigned char *orig_data = data;
3001 int error_occurred = 0;
3003 /* We need to interface to mule_encode(), which expects to take some
3004 amount of data and store the result into a Dynarr. We have
3005 mule_encode() store into str->runoff, and take data from there
3008 /* We loop until we have enough data, reading chunks from the other
3009 end and encoding it. */
3012 /* Take data from the runoff if we can. Make sure to take at
3013 most SIZE bytes, and delete the data from the runoff. */
3014 if (Dynarr_length (str->runoff) > 0)
3016 int chunk = min ((int) size, Dynarr_length (str->runoff));
3017 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
3018 Dynarr_delete_many (str->runoff, 0, chunk);
3024 break; /* No more room for data */
3026 if (str->flags & CODING_STATE_END)
3027 /* This means that on the previous iteration, we hit the EOF on
3028 the other end. We loop once more so that mule_encode() can
3029 output any final stuff it may be holding, or any "go back
3030 to a sane state" escape sequences. (This latter makes sense
3031 during encoding.) */
3034 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
3035 left of storage in it, so it's OK to read directly into it.
3036 (We'll be overwriting above, after we've encoded it into the
3038 read_size = Lstream_read (str->other_end, data, size);
3045 /* There might be some more end data produced in the translation.
3046 See the comment above. */
3047 str->flags |= CODING_STATE_END;
3048 mule_encode (stream, data, str->runoff, read_size);
3051 if (data == orig_data)
3052 return error_occurred ? -1 : 0;
3054 return data - orig_data;
3058 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
3060 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3063 /* Encode all our data into the runoff, and then attempt to write
3064 it all out to the other end. Remove whatever chunk we succeeded
3066 mule_encode (stream, data, str->runoff, size);
3067 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
3068 Dynarr_length (str->runoff));
3070 Dynarr_delete_many (str->runoff, 0, retval);
3071 /* Do NOT return retval. The return value indicates how much
3072 of the incoming data was written, not how many bytes were
3078 reset_encoding_stream (struct encoding_stream *str)
3081 switch (CODING_SYSTEM_TYPE (str->codesys))
3083 case CODESYS_ISO2022:
3087 str->encode_char = &char_encode_iso2022;
3088 str->finish = &char_finish_iso2022;
3089 for (i = 0; i < 4; i++)
3091 str->iso2022.charset[i] =
3092 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
3093 str->iso2022.force_charset_on_output[i] =
3094 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
3096 str->iso2022.register_left = 0;
3097 str->iso2022.register_right = 1;
3098 str->iso2022.current_charset = Qnil;
3099 str->iso2022.current_half = 0;
3103 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
3106 str->encode_char = &char_encode_utf8;
3107 str->finish = &char_finish_utf8;
3110 str->encode_char = &char_encode_ucs4;
3111 str->finish = &char_finish_ucs4;
3113 case CODESYS_SHIFT_JIS:
3114 str->encode_char = &char_encode_shift_jis;
3115 str->finish = &char_finish_shift_jis;
3118 str->encode_char = &char_encode_big5;
3119 str->finish = &char_finish_big5;
3125 str->iso2022.current_char_boundary = 0;
3126 str->flags = str->ch = 0;
3130 encoding_rewinder (Lstream *stream)
3132 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3133 reset_encoding_stream (str);
3134 Dynarr_reset (str->runoff);
3135 return Lstream_rewind (str->other_end);
3139 encoding_seekable_p (Lstream *stream)
3141 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3142 return Lstream_seekable_p (str->other_end);
3146 encoding_flusher (Lstream *stream)
3148 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3149 return Lstream_flush (str->other_end);
3153 encoding_closer (Lstream *stream)
3155 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3156 if (stream->flags & LSTREAM_FL_WRITE)
3158 str->flags |= CODING_STATE_END;
3159 encoding_writer (stream, 0, 0);
3161 Dynarr_free (str->runoff);
3162 return Lstream_close (str->other_end);
3166 encoding_stream_coding_system (Lstream *stream)
3168 Lisp_Object coding_system;
3169 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3171 XSETCODING_SYSTEM (coding_system, str->codesys);
3172 return coding_system;
3176 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3178 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3179 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3181 reset_encoding_stream (str);
3185 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3188 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3189 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3193 str->runoff = Dynarr_new (unsigned_char);
3194 str->other_end = stream;
3195 set_encoding_stream_coding_system (lstr, codesys);
3196 XSETLSTREAM (obj, lstr);
3201 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3203 return make_encoding_stream_1 (stream, codesys, "r");
3207 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3209 return make_encoding_stream_1 (stream, codesys, "w");
3212 /* Convert N bytes of internally-formatted data stored in SRC to an
3213 external format, according to the encoding stream ENCODING.
3214 Store the encoded data into DST. */
3217 mule_encode (Lstream *encoding, const Bufbyte *src,
3218 unsigned_char_dynarr *dst, size_t n)
3220 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3222 switch (CODING_SYSTEM_TYPE (str->codesys))
3225 case CODESYS_INTERNAL:
3226 Dynarr_add_many (dst, src, n);
3229 case CODESYS_AUTODETECT:
3230 /* If we got this far and still haven't decided on the coding
3231 system, then do no conversion. */
3232 case CODESYS_NO_CONVERSION:
3233 encode_coding_no_conversion (encoding, src, dst, n);
3237 str->ccl.last_block = str->flags & CODING_STATE_END;
3238 /* When applying ccl program to stream, MUST NOT set NULL
3240 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3241 dst, n, 0, CCL_MODE_ENCODING);
3245 text_encode_generic (encoding, src, dst, n);
3249 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3250 Encode the text between START and END using CODING-SYSTEM.
3251 This will, for example, convert Japanese characters into stuff such as
3252 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3253 text. BUFFER defaults to the current buffer if unspecified.
3255 (start, end, coding_system, buffer))
3258 struct buffer *buf = decode_buffer (buffer, 0);
3259 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3260 Lstream *istr, *ostr;
3261 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3263 get_buffer_range_char (buf, start, end, &b, &e, 0);
3265 barf_if_buffer_read_only (buf, b, e);
3267 coding_system = Fget_coding_system (coding_system);
3268 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3269 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3270 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3271 Fget_coding_system (Qbinary));
3272 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3274 istr = XLSTREAM (instream);
3275 ostr = XLSTREAM (outstream);
3276 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3277 /* The chain of streams looks like this:
3279 [BUFFER] <----- send through
3280 ------> [ENCODE AS SPECIFIED]
3281 ------> [DECODE AS BINARY]
3286 char tempbuf[1024]; /* some random amount */
3287 Bufpos newpos, even_newer_pos;
3288 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3289 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3293 newpos = lisp_buffer_stream_startpos (istr);
3294 Lstream_write (ostr, tempbuf, size_in_bytes);
3295 even_newer_pos = lisp_buffer_stream_startpos (istr);
3296 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3302 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3303 Lstream_close (istr);
3304 Lstream_close (ostr);
3306 Lstream_delete (istr);
3307 Lstream_delete (ostr);
3308 Lstream_delete (XLSTREAM (de_outstream));
3309 Lstream_delete (XLSTREAM (lb_outstream));
3310 return make_int (retlen);
3317 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3318 unsigned_char_dynarr *dst, size_t n)
3321 unsigned char char_boundary;
3322 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3323 unsigned int flags = str->flags;
3324 Emchar ch = str->ch;
3326 char_boundary = str->iso2022.current_char_boundary;
3332 if (char_boundary == 0)
3360 (*str->encode_char) (str, c, dst, &flags);
3362 else if (char_boundary == 1)
3364 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3370 ch = (ch << 6) | (c & 0x3f);
3375 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3377 (*str->finish) (str, dst, &flags);
3382 str->iso2022.current_char_boundary = char_boundary;
3386 /************************************************************************/
3387 /* Shift-JIS methods */
3388 /************************************************************************/
3390 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3391 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3392 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3393 encoded by "position-code + 0x80". A character of JISX0208
3394 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3395 position-codes are divided and shifted so that it fit in the range
3398 --- CODE RANGE of Shift-JIS ---
3399 (character set) (range)
3401 JISX0201-Kana 0xA0 .. 0xDF
3402 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3403 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3404 -------------------------------
3408 /* Is this the first byte of a Shift-JIS two-byte char? */
3410 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3411 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3413 /* Is this the second byte of a Shift-JIS two-byte char? */
3415 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3416 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3418 #define BYTE_SJIS_KATAKANA_P(c) \
3419 ((c) >= 0xA1 && (c) <= 0xDF)
3422 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3426 unsigned char c = *(unsigned char *)src++;
3427 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3429 if (st->shift_jis.in_second_byte)
3431 st->shift_jis.in_second_byte = 0;
3435 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3436 st->shift_jis.in_second_byte = 1;
3438 return CODING_CATEGORY_SHIFT_JIS_MASK;
3441 /* Convert Shift-JIS data to internal format. */
3444 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3445 unsigned_char_dynarr *dst, size_t n)
3447 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3448 unsigned int flags = str->flags;
3449 unsigned int cpos = str->cpos;
3450 eol_type_t eol_type = str->eol_type;
3454 unsigned char c = *(unsigned char *)src++;
3458 /* Previous character was first byte of Shift-JIS Kanji char. */
3459 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3461 unsigned char e1, e2;
3463 DECODE_SJIS (cpos, c, e1, e2);
3465 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3469 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3470 Dynarr_add (dst, e1);
3471 Dynarr_add (dst, e2);
3476 DECODE_ADD_BINARY_CHAR (cpos, dst);
3477 DECODE_ADD_BINARY_CHAR (c, dst);
3483 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3484 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3486 else if (BYTE_SJIS_KATAKANA_P (c))
3489 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3492 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3493 Dynarr_add (dst, c);
3498 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3502 DECODE_ADD_BINARY_CHAR (c, dst);
3504 label_continue_loop:;
3507 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3513 /* Convert internal character representation to Shift_JIS. */
3516 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3517 unsigned_char_dynarr *dst, unsigned int *flags)
3519 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3523 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3524 Dynarr_add (dst, '\r');
3525 if (eol_type != EOL_CR)
3526 Dynarr_add (dst, ch);
3530 unsigned int s1, s2;
3532 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3534 if (code_point >= 0)
3535 Dynarr_add (dst, code_point);
3536 else if ((code_point
3537 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3540 ENCODE_SJIS ((code_point >> 8) | 0x80,
3541 (code_point & 0xFF) | 0x80, s1, s2);
3542 Dynarr_add (dst, s1);
3543 Dynarr_add (dst, s2);
3545 else if ((code_point
3546 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3548 Dynarr_add (dst, code_point | 0x80);
3549 else if ((code_point
3550 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3553 ENCODE_SJIS ((code_point >> 8) | 0x80,
3554 (code_point & 0xFF) | 0x80, s1, s2);
3555 Dynarr_add (dst, s1);
3556 Dynarr_add (dst, s2);
3558 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3560 Dynarr_add (dst, code_point);
3562 Dynarr_add (dst, '?');
3564 Lisp_Object charset;
3565 unsigned int c1, c2;
3567 BREAKUP_CHAR (ch, charset, c1, c2);
3569 if (EQ(charset, Vcharset_katakana_jisx0201))
3571 Dynarr_add (dst, c1 | 0x80);
3575 Dynarr_add (dst, c1);
3577 else if (EQ(charset, Vcharset_japanese_jisx0208))
3579 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3580 Dynarr_add (dst, s1);
3581 Dynarr_add (dst, s2);
3584 Dynarr_add (dst, '?');
3590 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3591 unsigned int *flags)
3595 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3596 Decode a JISX0208 character of Shift-JIS coding-system.
3597 CODE is the character code in Shift-JIS as a cons of type bytes.
3598 Return the corresponding character.
3602 unsigned char c1, c2, s1, s2;
3605 CHECK_INT (XCAR (code));
3606 CHECK_INT (XCDR (code));
3607 s1 = XINT (XCAR (code));
3608 s2 = XINT (XCDR (code));
3609 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3610 BYTE_SJIS_TWO_BYTE_2_P (s2))
3612 DECODE_SJIS (s1, s2, c1, c2);
3613 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3614 c1 & 0x7F, c2 & 0x7F));
3620 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3621 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3622 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3626 Lisp_Object charset;
3629 CHECK_CHAR_COERCE_INT (character);
3630 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3631 if (EQ (charset, Vcharset_japanese_jisx0208))
3633 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3634 return Fcons (make_int (s1), make_int (s2));
3641 /************************************************************************/
3643 /************************************************************************/
3645 /* BIG5 is a coding system encoding two character sets: ASCII and
3646 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3647 character set and is encoded in two-byte.
3649 --- CODE RANGE of BIG5 ---
3650 (character set) (range)
3652 Big5 (1st byte) 0xA1 .. 0xFE
3653 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3654 --------------------------
3656 Since the number of characters in Big5 is larger than maximum
3657 characters in Emacs' charset (96x96), it can't be handled as one
3658 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3659 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3660 contains frequently used characters and the latter contains less
3661 frequently used characters. */
3664 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3665 ((c) >= 0x81 && (c) <= 0xFE)
3667 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3668 ((c) >= 0xA1 && (c) <= 0xFE)
3671 /* Is this the second byte of a Shift-JIS two-byte char? */
3673 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3674 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3676 /* Number of Big5 characters which have the same code in 1st byte. */
3678 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3680 /* Code conversion macros. These are macros because they are used in
3681 inner loops during code conversion.
3683 Note that temporary variables in macros introduce the classic
3684 dynamic-scoping problems with variable names. We use capital-
3685 lettered variables in the assumption that XEmacs does not use
3686 capital letters in variables except in a very formalized way
3689 /* Convert Big5 code (b1, b2) into its internal string representation
3692 /* There is a much simpler way to split the Big5 charset into two.
3693 For the moment I'm going to leave the algorithm as-is because it
3694 claims to separate out the most-used characters into a single
3695 charset, which perhaps will lead to optimizations in various
3698 The way the algorithm works is something like this:
3700 Big5 can be viewed as a 94x157 charset, where the row is
3701 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3702 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3703 the split between low and high column numbers is apparently
3704 meaningless; ascending rows produce less and less frequent chars.
3705 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3706 the first charset, and the upper half (0xC9 .. 0xFE) to the
3707 second. To do the conversion, we convert the character into
3708 a single number where 0 .. 156 is the first row, 157 .. 313
3709 is the second, etc. That way, the characters are ordered by
3710 decreasing frequency. Then we just chop the space in two
3711 and coerce the result into a 94x94 space.
3714 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3716 int B1 = b1, B2 = b2; \
3718 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3722 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3726 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3727 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3729 c1 = I / (0xFF - 0xA1) + 0xA1; \
3730 c2 = I % (0xFF - 0xA1) + 0xA1; \
3733 /* Convert the internal string representation of a Big5 character
3734 (lb, c1, c2) into Big5 code (b1, b2). */
3736 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3738 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3740 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3742 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3744 b1 = I / BIG5_SAME_ROW + 0xA1; \
3745 b2 = I % BIG5_SAME_ROW; \
3746 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3750 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3754 unsigned char c = *(unsigned char *)src++;
3755 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3757 || (c >= 0x80 && c <= 0xA0)
3761 if (st->big5.in_second_byte)
3763 st->big5.in_second_byte = 0;
3764 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3774 st->big5.in_second_byte = 1;
3776 return CODING_CATEGORY_BIG5_MASK;
3779 /* Convert Big5 data to internal format. */
3782 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3783 unsigned_char_dynarr *dst, size_t n)
3785 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3786 unsigned int flags = str->flags;
3787 unsigned int cpos = str->cpos;
3788 eol_type_t eol_type = str->eol_type;
3791 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3792 (decoding)->codesys, 1);
3797 unsigned char c = *(unsigned char *)src++;
3800 /* Previous character was first byte of Big5 char. */
3801 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3804 int code_point = (cpos << 8) | c;
3805 Emchar char_id = decode_defined_char (ccs, code_point);
3808 char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
3809 DECODE_ADD_UCS_CHAR (char_id, dst);
3811 unsigned char b1, b2, b3;
3812 DECODE_BIG5 (cpos, c, b1, b2, b3);
3813 Dynarr_add (dst, b1);
3814 Dynarr_add (dst, b2);
3815 Dynarr_add (dst, b3);
3820 DECODE_ADD_BINARY_CHAR (cpos, dst);
3821 DECODE_ADD_BINARY_CHAR (c, dst);
3827 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3828 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3831 DECODE_ADD_BINARY_CHAR (c, dst);
3833 label_continue_loop:;
3836 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3842 /* Convert internally-formatted data to Big5. */
3845 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3846 unsigned_char_dynarr *dst, unsigned int *flags)
3848 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3852 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3853 Dynarr_add (dst, '\r');
3854 if (eol_type != EOL_CR)
3855 Dynarr_add (dst, ch);
3862 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
3864 if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3865 Dynarr_add (dst, code_point);
3866 else if ((code_point = charset_code_point (ccs, ch)) >= 0)
3868 Dynarr_add (dst, code_point >> 8);
3869 Dynarr_add (dst, code_point & 0xFF);
3871 else if ((code_point
3872 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3874 Dynarr_add (dst, code_point >> 8);
3875 Dynarr_add (dst, code_point & 0xFF);
3877 else if ((code_point
3878 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3881 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3882 + ((code_point & 0xFF) - 33);
3883 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3884 unsigned char b2 = I % BIG5_SAME_ROW;
3886 b2 += b2 < 0x3F ? 0x40 : 0x62;
3887 Dynarr_add (dst, b1);
3888 Dynarr_add (dst, b2);
3890 else if ((code_point
3891 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3894 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3895 + ((code_point & 0xFF) - 33);
3896 unsigned char b1, b2;
3898 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3899 b1 = I / BIG5_SAME_ROW + 0xA1;
3900 b2 = I % BIG5_SAME_ROW;
3901 b2 += b2 < 0x3F ? 0x40 : 0x62;
3902 Dynarr_add (dst, b1);
3903 Dynarr_add (dst, b2);
3906 Dynarr_add (dst, '?');
3913 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3914 unsigned int *flags)
3919 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3920 Decode a Big5 character CODE of BIG5 coding-system.
3921 CODE is the character code in BIG5, a cons of two integers.
3922 Return the corresponding character.
3926 unsigned char c1, c2, b1, b2;
3929 CHECK_INT (XCAR (code));
3930 CHECK_INT (XCDR (code));
3931 b1 = XINT (XCAR (code));
3932 b2 = XINT (XCDR (code));
3933 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3934 BYTE_BIG5_TWO_BYTE_2_P (b2))
3936 Charset_ID leading_byte;
3937 Lisp_Object charset;
3938 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3939 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3940 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3946 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3947 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3948 Return the corresponding character code in Big5.
3952 Lisp_Object charset;
3955 CHECK_CHAR_COERCE_INT (character);
3956 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3957 if (EQ (charset, Vcharset_chinese_big5_1) ||
3958 EQ (charset, Vcharset_chinese_big5_2))
3960 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3962 return Fcons (make_int (b1), make_int (b2));
3969 /************************************************************************/
3971 /************************************************************************/
3974 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3978 unsigned char c = *(unsigned char *)src++;
3979 switch (st->ucs4.in_byte)
3988 st->ucs4.in_byte = 0;
3994 return CODING_CATEGORY_UCS4_MASK;
3998 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3999 unsigned_char_dynarr *dst, size_t n)
4001 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4002 unsigned int flags = str->flags;
4003 unsigned int cpos = str->cpos;
4004 unsigned char counter = str->counter;
4008 unsigned char c = *(unsigned char *)src++;
4016 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4021 cpos = ( cpos << 8 ) | c;
4025 if (counter & CODING_STATE_END)
4026 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4030 str->counter = counter;
4034 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4035 unsigned_char_dynarr *dst, unsigned int *flags)
4037 Dynarr_add (dst, ch >> 24);
4038 Dynarr_add (dst, ch >> 16);
4039 Dynarr_add (dst, ch >> 8);
4040 Dynarr_add (dst, ch );
4044 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4045 unsigned int *flags)
4050 /************************************************************************/
4052 /************************************************************************/
4055 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
4059 unsigned char c = *(unsigned char *)src++;
4060 switch (st->utf8.in_byte)
4063 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4066 st->utf8.in_byte = 5;
4068 st->utf8.in_byte = 4;
4070 st->utf8.in_byte = 3;
4072 st->utf8.in_byte = 2;
4074 st->utf8.in_byte = 1;
4079 if ((c & 0xc0) != 0x80)
4085 return CODING_CATEGORY_UTF8_MASK;
4089 decode_output_utf8_partial_char (unsigned char counter,
4091 unsigned_char_dynarr *dst)
4094 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4095 else if (counter == 4)
4097 if (cpos < (1 << 6))
4098 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4101 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4102 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4105 else if (counter == 3)
4107 if (cpos < (1 << 6))
4108 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4109 else if (cpos < (1 << 12))
4111 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4112 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4116 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4117 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4118 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4121 else if (counter == 2)
4123 if (cpos < (1 << 6))
4124 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4125 else if (cpos < (1 << 12))
4127 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4128 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4130 else if (cpos < (1 << 18))
4132 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4133 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4134 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4138 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4139 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4140 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4141 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4146 if (cpos < (1 << 6))
4147 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4148 else if (cpos < (1 << 12))
4150 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4151 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4153 else if (cpos < (1 << 18))
4155 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4156 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4157 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4159 else if (cpos < (1 << 24))
4161 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4162 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4163 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4164 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4168 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4169 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4170 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4171 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4172 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4178 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4179 unsigned_char_dynarr *dst, size_t n)
4181 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4182 unsigned int flags = str->flags;
4183 unsigned int cpos = str->cpos;
4184 eol_type_t eol_type = str->eol_type;
4185 unsigned char counter = str->counter;
4189 unsigned char c = *(unsigned char *)src++;
4194 COMPOSE_FLUSH_CHARS (str, dst);
4195 decode_flush_er_chars (str, dst);
4196 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4197 DECODE_ADD_UCS_CHAR (c, dst);
4199 else if ( c < 0xC0 )
4200 /* decode_add_er_char (str, c, dst); */
4201 COMPOSE_ADD_CHAR (str, c, dst);
4204 /* decode_flush_er_chars (str, dst); */
4210 else if ( c < 0xF0 )
4215 else if ( c < 0xF8 )
4220 else if ( c < 0xFC )
4232 else if ( (c & 0xC0) == 0x80 )
4234 cpos = ( cpos << 6 ) | ( c & 0x3f );
4237 /* DECODE_ADD_UCS_CHAR (cpos, dst); */
4238 COMPOSE_ADD_CHAR (str, cpos, dst);
4247 COMPOSE_FLUSH_CHARS (str, dst);
4248 decode_flush_er_chars (str, dst);
4249 decode_output_utf8_partial_char (counter, cpos, dst);
4250 DECODE_ADD_BINARY_CHAR (c, dst);
4254 label_continue_loop:;
4257 if (flags & CODING_STATE_END)
4259 COMPOSE_FLUSH_CHARS (str, dst);
4260 decode_flush_er_chars (str, dst);
4263 decode_output_utf8_partial_char (counter, cpos, dst);
4270 str->counter = counter;
4274 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4275 unsigned_char_dynarr *dst, unsigned int *flags)
4277 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4281 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4282 Dynarr_add (dst, '\r');
4283 if (eol_type != EOL_CR)
4284 Dynarr_add (dst, ch);
4286 else if (ch <= 0x7f)
4288 Dynarr_add (dst, ch);
4292 int code_point = charset_code_point (Vcharset_ucs, ch);
4294 if ( (code_point < 0) || (code_point > 0x10FFFF) )
4296 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4298 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
4302 int format_columns, idx;
4303 char buf[16], format[16];
4305 while (!NILP (rest))
4309 if (!NILP (ccs = Ffind_charset (ccs)))
4312 = charset_code_point (ccs, ch)) >= 0 )
4317 && ((idx =XSTRING_LENGTH (ret)) <= 6))
4319 strncpy (format, XSTRING_DATA (ret), idx);
4328 format [idx++] = '%';
4329 format_columns = XINT (ret);
4330 if ( (2 <= format_columns)
4331 && (format_columns <= 8) )
4333 format [idx++] = '0';
4334 format [idx++] = '0' + format_columns;
4341 format [idx++] = 'd';
4342 else if (EQ (ret, Qx))
4343 format [idx++] = 'x';
4344 else if (EQ (ret, QX))
4345 format [idx++] = 'X';
4350 sprintf (buf, format, code_point);
4351 Dynarr_add (dst, '&');
4352 Dynarr_add_many (dst, buf, strlen (buf));
4353 Dynarr_add (dst, ';');
4359 sprintf (buf, "&MCS-%08X;", ch);
4360 Dynarr_add_many (dst, buf, strlen (buf));
4366 if (code_point <= 0x7ff)
4368 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4369 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4371 else if (code_point <= 0xffff)
4373 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4374 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4375 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4377 else if (code_point <= 0x1fffff)
4379 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4380 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4381 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4382 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4384 else if (code_point <= 0x3ffffff)
4386 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4387 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4388 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4389 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4390 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4394 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4395 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4396 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4397 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4398 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4399 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4405 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4406 unsigned int *flags)
4411 /************************************************************************/
4412 /* ISO2022 methods */
4413 /************************************************************************/
4415 /* The following note describes the coding system ISO2022 briefly.
4416 Since the intention of this note is to help understand the
4417 functions in this file, some parts are NOT ACCURATE or OVERLY
4418 SIMPLIFIED. For thorough understanding, please refer to the
4419 original document of ISO2022.
4421 ISO2022 provides many mechanisms to encode several character sets
4422 in 7-bit and 8-bit environments. For 7-bit environments, all text
4423 is encoded using bytes less than 128. This may make the encoded
4424 text a little bit longer, but the text passes more easily through
4425 several gateways, some of which strip off MSB (Most Signigant Bit).
4427 There are two kinds of character sets: control character set and
4428 graphic character set. The former contains control characters such
4429 as `newline' and `escape' to provide control functions (control
4430 functions are also provided by escape sequences). The latter
4431 contains graphic characters such as 'A' and '-'. Emacs recognizes
4432 two control character sets and many graphic character sets.
4434 Graphic character sets are classified into one of the following
4435 four classes, according to the number of bytes (DIMENSION) and
4436 number of characters in one dimension (CHARS) of the set:
4437 - DIMENSION1_CHARS94
4438 - DIMENSION1_CHARS96
4439 - DIMENSION2_CHARS94
4440 - DIMENSION2_CHARS96
4442 In addition, each character set is assigned an identification tag,
4443 unique for each set, called "final character" (denoted as <F>
4444 hereafter). The <F> of each character set is decided by ECMA(*)
4445 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4446 (0x30..0x3F are for private use only).
4448 Note (*): ECMA = European Computer Manufacturers Association
4450 Here are examples of graphic character set [NAME(<F>)]:
4451 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4452 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4453 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4454 o DIMENSION2_CHARS96 -- none for the moment
4456 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4457 C0 [0x00..0x1F] -- control character plane 0
4458 GL [0x20..0x7F] -- graphic character plane 0
4459 C1 [0x80..0x9F] -- control character plane 1
4460 GR [0xA0..0xFF] -- graphic character plane 1
4462 A control character set is directly designated and invoked to C0 or
4463 C1 by an escape sequence. The most common case is that:
4464 - ISO646's control character set is designated/invoked to C0, and
4465 - ISO6429's control character set is designated/invoked to C1,
4466 and usually these designations/invocations are omitted in encoded
4467 text. In a 7-bit environment, only C0 can be used, and a control
4468 character for C1 is encoded by an appropriate escape sequence to
4469 fit into the environment. All control characters for C1 are
4470 defined to have corresponding escape sequences.
4472 A graphic character set is at first designated to one of four
4473 graphic registers (G0 through G3), then these graphic registers are
4474 invoked to GL or GR. These designations and invocations can be
4475 done independently. The most common case is that G0 is invoked to
4476 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4477 these invocations and designations are omitted in encoded text.
4478 In a 7-bit environment, only GL can be used.
4480 When a graphic character set of CHARS94 is invoked to GL, codes
4481 0x20 and 0x7F of the GL area work as control characters SPACE and
4482 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4485 There are two ways of invocation: locking-shift and single-shift.
4486 With locking-shift, the invocation lasts until the next different
4487 invocation, whereas with single-shift, the invocation affects the
4488 following character only and doesn't affect the locking-shift
4489 state. Invocations are done by the following control characters or
4492 ----------------------------------------------------------------------
4493 abbrev function cntrl escape seq description
4494 ----------------------------------------------------------------------
4495 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4496 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4497 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4498 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4499 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4500 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4501 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4502 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4503 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4504 ----------------------------------------------------------------------
4505 (*) These are not used by any known coding system.
4507 Control characters for these functions are defined by macros
4508 ISO_CODE_XXX in `coding.h'.
4510 Designations are done by the following escape sequences:
4511 ----------------------------------------------------------------------
4512 escape sequence description
4513 ----------------------------------------------------------------------
4514 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4515 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4516 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4517 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4518 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4519 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4520 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4521 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4522 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4523 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4524 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4525 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4526 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4527 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4528 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4529 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4530 ----------------------------------------------------------------------
4532 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4533 of dimension 1, chars 94, and final character <F>, etc...
4535 Note (*): Although these designations are not allowed in ISO2022,
4536 Emacs accepts them on decoding, and produces them on encoding
4537 CHARS96 character sets in a coding system which is characterized as
4538 7-bit environment, non-locking-shift, and non-single-shift.
4540 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4541 '(' can be omitted. We refer to this as "short-form" hereafter.
4543 Now you may notice that there are a lot of ways for encoding the
4544 same multilingual text in ISO2022. Actually, there exist many
4545 coding systems such as Compound Text (used in X11's inter client
4546 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4547 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4548 localized platforms), and all of these are variants of ISO2022.
4550 In addition to the above, Emacs handles two more kinds of escape
4551 sequences: ISO6429's direction specification and Emacs' private
4552 sequence for specifying character composition.
4554 ISO6429's direction specification takes the following form:
4555 o CSI ']' -- end of the current direction
4556 o CSI '0' ']' -- end of the current direction
4557 o CSI '1' ']' -- start of left-to-right text
4558 o CSI '2' ']' -- start of right-to-left text
4559 The control character CSI (0x9B: control sequence introducer) is
4560 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4562 Character composition specification takes the following form:
4563 o ESC '0' -- start character composition
4564 o ESC '1' -- end character composition
4565 Since these are not standard escape sequences of any ISO standard,
4566 their use with these meanings is restricted to Emacs only. */
4569 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4573 for (i = 0; i < 4; i++)
4575 if (!NILP (coding_system))
4577 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4579 iso->charset[i] = Qt;
4580 iso->invalid_designated[i] = 0;
4582 iso->esc = ISO_ESC_NOTHING;
4583 iso->esc_bytes_index = 0;
4584 iso->register_left = 0;
4585 iso->register_right = 1;
4586 iso->switched_dir_and_no_valid_charset_yet = 0;
4587 iso->invalid_switch_dir = 0;
4588 iso->output_direction_sequence = 0;
4589 iso->output_literally = 0;
4590 #ifdef ENABLE_COMPOSITE_CHARS
4591 if (iso->composite_chars)
4592 Dynarr_reset (iso->composite_chars);
4597 fit_to_be_escape_quoted (unsigned char c)
4614 /* Parse one byte of an ISO2022 escape sequence.
4615 If the result is an invalid escape sequence, return 0 and
4616 do not change anything in STR. Otherwise, if the result is
4617 an incomplete escape sequence, update ISO2022.ESC and
4618 ISO2022.ESC_BYTES and return -1. Otherwise, update
4619 all the state variables (but not ISO2022.ESC_BYTES) and
4622 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4623 or invocation of an invalid character set and treat that as
4624 an unrecognized escape sequence. */
4627 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4628 unsigned char c, unsigned int *flags,
4629 int check_invalid_charsets)
4631 /* (1) If we're at the end of a designation sequence, CS is the
4632 charset being designated and REG is the register to designate
4635 (2) If we're at the end of a locking-shift sequence, REG is
4636 the register to invoke and HALF (0 == left, 1 == right) is
4637 the half to invoke it into.
4639 (3) If we're at the end of a single-shift sequence, REG is
4640 the register to invoke. */
4641 Lisp_Object cs = Qnil;
4644 /* NOTE: This code does goto's all over the fucking place.
4645 The reason for this is that we're basically implementing
4646 a state machine here, and hierarchical languages like C
4647 don't really provide a clean way of doing this. */
4649 if (! (*flags & CODING_STATE_ESCAPE))
4650 /* At beginning of escape sequence; we need to reset our
4651 escape-state variables. */
4652 iso->esc = ISO_ESC_NOTHING;
4654 iso->output_literally = 0;
4655 iso->output_direction_sequence = 0;
4659 case ISO_ESC_NOTHING:
4660 iso->esc_bytes_index = 0;
4663 case ISO_CODE_ESC: /* Start escape sequence */
4664 *flags |= CODING_STATE_ESCAPE;
4668 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4669 *flags |= CODING_STATE_ESCAPE;
4670 iso->esc = ISO_ESC_5_11;
4673 case ISO_CODE_SO: /* locking shift 1 */
4676 case ISO_CODE_SI: /* locking shift 0 */
4680 case ISO_CODE_SS2: /* single shift */
4683 case ISO_CODE_SS3: /* single shift */
4687 default: /* Other control characters */
4694 /**** single shift ****/
4696 case 'N': /* single shift 2 */
4699 case 'O': /* single shift 3 */
4703 /**** locking shift ****/
4705 case '~': /* locking shift 1 right */
4708 case 'n': /* locking shift 2 */
4711 case '}': /* locking shift 2 right */
4714 case 'o': /* locking shift 3 */
4717 case '|': /* locking shift 3 right */
4721 #ifdef ENABLE_COMPOSITE_CHARS
4722 /**** composite ****/
4725 iso->esc = ISO_ESC_START_COMPOSITE;
4726 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4727 CODING_STATE_COMPOSITE;
4731 iso->esc = ISO_ESC_END_COMPOSITE;
4732 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4733 ~CODING_STATE_COMPOSITE;
4735 #endif /* ENABLE_COMPOSITE_CHARS */
4737 /**** directionality ****/
4740 iso->esc = ISO_ESC_5_11;
4743 /**** designation ****/
4745 case '$': /* multibyte charset prefix */
4746 iso->esc = ISO_ESC_2_4;
4750 if (0x28 <= c && c <= 0x2F)
4752 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4756 /* This function is called with CODESYS equal to nil when
4757 doing coding-system detection. */
4759 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4760 && fit_to_be_escape_quoted (c))
4762 iso->esc = ISO_ESC_LITERAL;
4763 *flags &= CODING_STATE_ISO2022_LOCK;
4773 /**** directionality ****/
4775 case ISO_ESC_5_11: /* ISO6429 direction control */
4778 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4779 goto directionality;
4781 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4782 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4783 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4787 case ISO_ESC_5_11_0:
4790 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4791 goto directionality;
4795 case ISO_ESC_5_11_1:
4798 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4799 goto directionality;
4803 case ISO_ESC_5_11_2:
4806 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4807 goto directionality;
4812 iso->esc = ISO_ESC_DIRECTIONALITY;
4813 /* Various junk here to attempt to preserve the direction sequences
4814 literally in the text if they would otherwise be swallowed due
4815 to invalid designations that don't show up as actual charset
4816 changes in the text. */
4817 if (iso->invalid_switch_dir)
4819 /* We already inserted a direction switch literally into the
4820 text. We assume (#### this may not be right) that the
4821 next direction switch is the one going the other way,
4822 and we need to output that literally as well. */
4823 iso->output_literally = 1;
4824 iso->invalid_switch_dir = 0;
4830 /* If we are in the thrall of an invalid designation,
4831 then stick the directionality sequence literally into the
4832 output stream so it ends up in the original text again. */
4833 for (jj = 0; jj < 4; jj++)
4834 if (iso->invalid_designated[jj])
4838 iso->output_literally = 1;
4839 iso->invalid_switch_dir = 1;
4842 /* Indicate that we haven't yet seen a valid designation,
4843 so that if a switch-dir is directly followed by an
4844 invalid designation, both get inserted literally. */
4845 iso->switched_dir_and_no_valid_charset_yet = 1;
4850 /**** designation ****/
4853 if (0x28 <= c && c <= 0x2F)
4855 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4858 if (0x40 <= c && c <= 0x42)
4861 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4862 *flags & CODING_STATE_R2L ?
4863 CHARSET_RIGHT_TO_LEFT :
4864 CHARSET_LEFT_TO_RIGHT);
4875 if (c < '0' || c > '~')
4876 return 0; /* bad final byte */
4878 if (iso->esc >= ISO_ESC_2_8 &&
4879 iso->esc <= ISO_ESC_2_15)
4881 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4882 single = 1; /* single-byte */
4883 reg = (iso->esc - ISO_ESC_2_8) & 3;
4885 else if (iso->esc >= ISO_ESC_2_4_8 &&
4886 iso->esc <= ISO_ESC_2_4_15)
4888 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4889 single = -1; /* multi-byte */
4890 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4894 /* Can this ever be reached? -slb */
4898 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4899 *flags & CODING_STATE_R2L ?
4900 CHARSET_RIGHT_TO_LEFT :
4901 CHARSET_LEFT_TO_RIGHT);
4907 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4911 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4912 /* can't invoke something that ain't there. */
4914 iso->esc = ISO_ESC_SINGLE_SHIFT;
4915 *flags &= CODING_STATE_ISO2022_LOCK;
4917 *flags |= CODING_STATE_SS2;
4919 *flags |= CODING_STATE_SS3;
4923 if (check_invalid_charsets &&
4924 !CHARSETP (iso->charset[reg]))
4925 /* can't invoke something that ain't there. */
4928 iso->register_right = reg;
4930 iso->register_left = reg;
4931 *flags &= CODING_STATE_ISO2022_LOCK;
4932 iso->esc = ISO_ESC_LOCKING_SHIFT;
4936 if (NILP (cs) && check_invalid_charsets)
4938 iso->invalid_designated[reg] = 1;
4939 iso->charset[reg] = Vcharset_ascii;
4940 iso->esc = ISO_ESC_DESIGNATE;
4941 *flags &= CODING_STATE_ISO2022_LOCK;
4942 iso->output_literally = 1;
4943 if (iso->switched_dir_and_no_valid_charset_yet)
4945 /* We encountered a switch-direction followed by an
4946 invalid designation. Ensure that the switch-direction
4947 gets outputted; otherwise it will probably get eaten
4948 when the text is written out again. */
4949 iso->switched_dir_and_no_valid_charset_yet = 0;
4950 iso->output_direction_sequence = 1;
4951 /* And make sure that the switch-dir going the other
4952 way gets outputted, as well. */
4953 iso->invalid_switch_dir = 1;
4957 /* This function is called with CODESYS equal to nil when
4958 doing coding-system detection. */
4959 if (!NILP (codesys))
4961 charset_conversion_spec_dynarr *dyn =
4962 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4968 for (i = 0; i < Dynarr_length (dyn); i++)
4970 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4971 if (EQ (cs, spec->from_charset))
4972 cs = spec->to_charset;
4977 iso->charset[reg] = cs;
4978 iso->esc = ISO_ESC_DESIGNATE;
4979 *flags &= CODING_STATE_ISO2022_LOCK;
4980 if (iso->invalid_designated[reg])
4982 iso->invalid_designated[reg] = 0;
4983 iso->output_literally = 1;
4985 if (iso->switched_dir_and_no_valid_charset_yet)
4986 iso->switched_dir_and_no_valid_charset_yet = 0;
4991 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4995 /* #### There are serious deficiencies in the recognition mechanism
4996 here. This needs to be much smarter if it's going to cut it.
4997 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4998 it should be detected as Latin-1.
4999 All the ISO2022 stuff in this file should be synced up with the
5000 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5001 Perhaps we should wait till R2L works in FSF Emacs? */
5003 if (!st->iso2022.initted)
5005 reset_iso2022 (Qnil, &st->iso2022.iso);
5006 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5007 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5008 CODING_CATEGORY_ISO_8_1_MASK |
5009 CODING_CATEGORY_ISO_8_2_MASK |
5010 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5011 st->iso2022.flags = 0;
5012 st->iso2022.high_byte_count = 0;
5013 st->iso2022.saw_single_shift = 0;
5014 st->iso2022.initted = 1;
5017 mask = st->iso2022.mask;
5021 unsigned char c = *(unsigned char *)src++;
5024 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5025 st->iso2022.high_byte_count++;
5029 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5031 if (st->iso2022.high_byte_count & 1)
5032 /* odd number of high bytes; assume not iso-8-2 */
5033 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5035 st->iso2022.high_byte_count = 0;
5036 st->iso2022.saw_single_shift = 0;
5038 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5040 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5041 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5042 { /* control chars */
5045 /* Allow and ignore control characters that you might
5046 reasonably see in a text file */
5051 case 8: /* backspace */
5052 case 11: /* vertical tab */
5053 case 12: /* form feed */
5054 case 26: /* MS-DOS C-z junk */
5055 case 31: /* '^_' -- for info */
5056 goto label_continue_loop;
5063 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5066 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5067 &st->iso2022.flags, 0))
5069 switch (st->iso2022.iso.esc)
5071 case ISO_ESC_DESIGNATE:
5072 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5073 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5075 case ISO_ESC_LOCKING_SHIFT:
5076 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5077 goto ran_out_of_chars;
5078 case ISO_ESC_SINGLE_SHIFT:
5079 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5080 st->iso2022.saw_single_shift = 1;
5089 goto ran_out_of_chars;
5092 label_continue_loop:;
5101 postprocess_iso2022_mask (int mask)
5103 /* #### kind of cheesy */
5104 /* If seven-bit ISO is allowed, then assume that the encoding is
5105 entirely seven-bit and turn off the eight-bit ones. */
5106 if (mask & CODING_CATEGORY_ISO_7_MASK)
5107 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5108 CODING_CATEGORY_ISO_8_1_MASK |
5109 CODING_CATEGORY_ISO_8_2_MASK);
5113 /* If FLAGS is a null pointer or specifies right-to-left motion,
5114 output a switch-dir-to-left-to-right sequence to DST.
5115 Also update FLAGS if it is not a null pointer.
5116 If INTERNAL_P is set, we are outputting in internal format and
5117 need to handle the CSI differently. */
5120 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5121 unsigned_char_dynarr *dst,
5122 unsigned int *flags,
5125 if (!flags || (*flags & CODING_STATE_R2L))
5127 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5129 Dynarr_add (dst, ISO_CODE_ESC);
5130 Dynarr_add (dst, '[');
5132 else if (internal_p)
5133 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5135 Dynarr_add (dst, ISO_CODE_CSI);
5136 Dynarr_add (dst, '0');
5137 Dynarr_add (dst, ']');
5139 *flags &= ~CODING_STATE_R2L;
5143 /* If FLAGS is a null pointer or specifies a direction different from
5144 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5145 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5146 sequence to DST. Also update FLAGS if it is not a null pointer.
5147 If INTERNAL_P is set, we are outputting in internal format and
5148 need to handle the CSI differently. */
5151 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5152 unsigned_char_dynarr *dst, unsigned int *flags,
5155 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5156 direction == CHARSET_LEFT_TO_RIGHT)
5157 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5158 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5159 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5160 direction == CHARSET_RIGHT_TO_LEFT)
5162 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5164 Dynarr_add (dst, ISO_CODE_ESC);
5165 Dynarr_add (dst, '[');
5167 else if (internal_p)
5168 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5170 Dynarr_add (dst, ISO_CODE_CSI);
5171 Dynarr_add (dst, '2');
5172 Dynarr_add (dst, ']');
5174 *flags |= CODING_STATE_R2L;
5178 /* Convert ISO2022-format data to internal format. */
5181 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5182 unsigned_char_dynarr *dst, size_t n)
5184 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5185 unsigned int flags = str->flags;
5186 unsigned int cpos = str->cpos;
5187 unsigned char counter = str->counter;
5188 eol_type_t eol_type = str->eol_type;
5189 #ifdef ENABLE_COMPOSITE_CHARS
5190 unsigned_char_dynarr *real_dst = dst;
5192 Lisp_Object coding_system;
5194 XSETCODING_SYSTEM (coding_system, str->codesys);
5196 #ifdef ENABLE_COMPOSITE_CHARS
5197 if (flags & CODING_STATE_COMPOSITE)
5198 dst = str->iso2022.composite_chars;
5199 #endif /* ENABLE_COMPOSITE_CHARS */
5203 unsigned char c = *(unsigned char *)src++;
5204 if (flags & CODING_STATE_ESCAPE)
5205 { /* Within ESC sequence */
5206 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5211 switch (str->iso2022.esc)
5213 #ifdef ENABLE_COMPOSITE_CHARS
5214 case ISO_ESC_START_COMPOSITE:
5215 if (str->iso2022.composite_chars)
5216 Dynarr_reset (str->iso2022.composite_chars);
5218 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5219 dst = str->iso2022.composite_chars;
5221 case ISO_ESC_END_COMPOSITE:
5223 Bufbyte comstr[MAX_EMCHAR_LEN];
5225 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5226 Dynarr_length (dst));
5228 len = set_charptr_emchar (comstr, emch);
5229 Dynarr_add_many (dst, comstr, len);
5232 #endif /* ENABLE_COMPOSITE_CHARS */
5234 case ISO_ESC_LITERAL:
5235 COMPOSE_FLUSH_CHARS (str, dst);
5236 decode_flush_er_chars (str, dst);
5237 DECODE_ADD_BINARY_CHAR (c, dst);
5241 /* Everything else handled already */
5246 /* Attempted error recovery. */
5247 if (str->iso2022.output_direction_sequence)
5248 ensure_correct_direction (flags & CODING_STATE_R2L ?
5249 CHARSET_RIGHT_TO_LEFT :
5250 CHARSET_LEFT_TO_RIGHT,
5251 str->codesys, dst, 0, 1);
5252 /* More error recovery. */
5253 if (!retval || str->iso2022.output_literally)
5255 /* Output the (possibly invalid) sequence */
5257 COMPOSE_FLUSH_CHARS (str, dst);
5258 decode_flush_er_chars (str, dst);
5259 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5260 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5261 flags &= CODING_STATE_ISO2022_LOCK;
5263 n++, src--;/* Repeat the loop with the same character. */
5266 /* No sense in reprocessing the final byte of the
5267 escape sequence; it could mess things up anyway.
5269 COMPOSE_FLUSH_CHARS (str, dst);
5270 decode_flush_er_chars (str, dst);
5271 DECODE_ADD_BINARY_CHAR (c, dst);
5277 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5278 { /* Control characters */
5280 /***** Error-handling *****/
5282 /* If we were in the middle of a character, dump out the
5283 partial character. */
5286 COMPOSE_FLUSH_CHARS (str, dst);
5287 decode_flush_er_chars (str, dst);
5291 DECODE_ADD_BINARY_CHAR
5292 ((unsigned char)(cpos >> (counter * 8)), dst);
5297 /* If we just saw a single-shift character, dump it out.
5298 This may dump out the wrong sort of single-shift character,
5299 but least it will give an indication that something went
5301 if (flags & CODING_STATE_SS2)
5303 COMPOSE_FLUSH_CHARS (str, dst);
5304 decode_flush_er_chars (str, dst);
5305 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5306 flags &= ~CODING_STATE_SS2;
5308 if (flags & CODING_STATE_SS3)
5310 COMPOSE_FLUSH_CHARS (str, dst);
5311 decode_flush_er_chars (str, dst);
5312 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5313 flags &= ~CODING_STATE_SS3;
5316 /***** Now handle the control characters. *****/
5322 COMPOSE_FLUSH_CHARS (str, dst);
5323 decode_flush_er_chars (str, dst);
5324 if (eol_type == EOL_CR)
5325 Dynarr_add (dst, '\n');
5326 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5327 Dynarr_add (dst, c);
5329 flags |= CODING_STATE_CR;
5330 goto label_continue_loop;
5332 else if (flags & CODING_STATE_CR)
5333 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5335 Dynarr_add (dst, '\r');
5336 flags &= ~CODING_STATE_CR;
5339 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5342 flags &= CODING_STATE_ISO2022_LOCK;
5344 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5346 COMPOSE_FLUSH_CHARS (str, dst);
5347 decode_flush_er_chars (str, dst);
5348 DECODE_ADD_BINARY_CHAR (c, dst);
5352 { /* Graphic characters */
5353 Lisp_Object charset;
5362 COMPOSE_FLUSH_CHARS (str, dst);
5363 decode_flush_er_chars (str, dst);
5364 if (eol_type == EOL_CR)
5365 Dynarr_add (dst, '\n');
5366 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5367 Dynarr_add (dst, c);
5369 flags |= CODING_STATE_CR;
5370 goto label_continue_loop;
5372 else if (flags & CODING_STATE_CR)
5373 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5375 Dynarr_add (dst, '\r');
5376 flags &= ~CODING_STATE_CR;
5379 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5382 /* Now determine the charset. */
5383 reg = ((flags & CODING_STATE_SS2) ? 2
5384 : (flags & CODING_STATE_SS3) ? 3
5385 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5386 : str->iso2022.register_left);
5387 charset = str->iso2022.charset[reg];
5389 /* Error checking: */
5390 if (! CHARSETP (charset)
5391 || str->iso2022.invalid_designated[reg]
5392 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5393 && XCHARSET_CHARS (charset) == 94))
5394 /* Mrmph. We are trying to invoke a register that has no
5395 or an invalid charset in it, or trying to add a character
5396 outside the range of the charset. Insert that char literally
5397 to preserve it for the output. */
5399 COMPOSE_FLUSH_CHARS (str, dst);
5400 decode_flush_er_chars (str, dst);
5404 DECODE_ADD_BINARY_CHAR
5405 ((unsigned char)(cpos >> (counter * 8)), dst);
5408 DECODE_ADD_BINARY_CHAR (c, dst);
5413 /* Things are probably hunky-dorey. */
5415 /* Fetch reverse charset, maybe. */
5416 if (((flags & CODING_STATE_R2L) &&
5417 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5419 (!(flags & CODING_STATE_R2L) &&
5420 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5422 Lisp_Object new_charset =
5423 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5424 if (!NILP (new_charset))
5425 charset = new_charset;
5430 if (XCHARSET_DIMENSION (charset) == counter)
5432 COMPOSE_ADD_CHAR (str,
5433 DECODE_CHAR (charset,
5434 ((cpos & 0x7F7F7F) << 8)
5441 cpos = (cpos << 8) | c;
5443 lb = XCHARSET_LEADING_BYTE (charset);
5444 switch (XCHARSET_REP_BYTES (charset))
5447 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5448 Dynarr_add (dst, c & 0x7F);
5451 case 2: /* one-byte official */
5452 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5453 Dynarr_add (dst, lb);
5454 Dynarr_add (dst, c | 0x80);
5457 case 3: /* one-byte private or two-byte official */
5458 if (XCHARSET_PRIVATE_P (charset))
5460 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5461 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5462 Dynarr_add (dst, lb);
5463 Dynarr_add (dst, c | 0x80);
5469 Dynarr_add (dst, lb);
5470 Dynarr_add (dst, ch | 0x80);
5471 Dynarr_add (dst, c | 0x80);
5479 default: /* two-byte private */
5482 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5483 Dynarr_add (dst, lb);
5484 Dynarr_add (dst, ch | 0x80);
5485 Dynarr_add (dst, c | 0x80);
5495 flags &= CODING_STATE_ISO2022_LOCK;
5498 label_continue_loop:;
5501 if (flags & CODING_STATE_END)
5503 COMPOSE_FLUSH_CHARS (str, dst);
5504 decode_flush_er_chars (str, dst);
5505 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5509 str->counter = counter;
5513 /***** ISO2022 encoder *****/
5515 /* Designate CHARSET into register REG. */
5518 iso2022_designate (Lisp_Object charset, unsigned char reg,
5519 struct encoding_stream *str, unsigned_char_dynarr *dst)
5521 static const char inter94[] = "()*+";
5522 static const char inter96[] = ",-./";
5523 unsigned short chars;
5524 unsigned char dimension;
5525 unsigned char final;
5526 Lisp_Object old_charset = str->iso2022.charset[reg];
5528 str->iso2022.charset[reg] = charset;
5529 if (!CHARSETP (charset))
5530 /* charset might be an initial nil or t. */
5532 chars = XCHARSET_CHARS (charset);
5533 dimension = XCHARSET_DIMENSION (charset);
5534 final = XCHARSET_FINAL (charset);
5535 if (!str->iso2022.force_charset_on_output[reg] &&
5536 CHARSETP (old_charset) &&
5537 XCHARSET_CHARS (old_charset) == chars &&
5538 XCHARSET_DIMENSION (old_charset) == dimension &&
5539 XCHARSET_FINAL (old_charset) == final)
5542 str->iso2022.force_charset_on_output[reg] = 0;
5545 charset_conversion_spec_dynarr *dyn =
5546 str->codesys->iso2022.output_conv;
5552 for (i = 0; i < Dynarr_length (dyn); i++)
5554 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5555 if (EQ (charset, spec->from_charset))
5556 charset = spec->to_charset;
5561 Dynarr_add (dst, ISO_CODE_ESC);
5566 Dynarr_add (dst, inter94[reg]);
5569 Dynarr_add (dst, '$');
5571 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5574 Dynarr_add (dst, inter94[reg]);
5579 Dynarr_add (dst, inter96[reg]);
5582 Dynarr_add (dst, '$');
5583 Dynarr_add (dst, inter96[reg]);
5587 Dynarr_add (dst, final);
5591 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5593 if (str->iso2022.register_left != 0)
5595 Dynarr_add (dst, ISO_CODE_SI);
5596 str->iso2022.register_left = 0;
5601 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5603 if (str->iso2022.register_left != 1)
5605 Dynarr_add (dst, ISO_CODE_SO);
5606 str->iso2022.register_left = 1;
5611 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5612 unsigned_char_dynarr *dst, unsigned int *flags)
5614 unsigned char charmask;
5615 Lisp_Coding_System* codesys = str->codesys;
5616 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5618 Lisp_Object charset = str->iso2022.current_charset;
5619 int half = str->iso2022.current_half;
5620 int code_point = -1;
5624 restore_left_to_right_direction (codesys, dst, flags, 0);
5626 /* Make sure G0 contains ASCII */
5627 if ((ch > ' ' && ch < ISO_CODE_DEL)
5628 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5630 ensure_normal_shift (str, dst);
5631 iso2022_designate (Vcharset_ascii, 0, str, dst);
5634 /* If necessary, restore everything to the default state
5636 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5638 restore_left_to_right_direction (codesys, dst, flags, 0);
5640 ensure_normal_shift (str, dst);
5642 for (i = 0; i < 4; i++)
5644 Lisp_Object initial_charset =
5645 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5646 iso2022_designate (initial_charset, i, str, dst);
5651 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5652 Dynarr_add (dst, '\r');
5653 if (eol_type != EOL_CR)
5654 Dynarr_add (dst, ch);
5658 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5659 && fit_to_be_escape_quoted (ch))
5660 Dynarr_add (dst, ISO_CODE_ESC);
5661 Dynarr_add (dst, ch);
5664 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5666 charmask = (half == 0 ? 0x00 : 0x80);
5668 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5669 && fit_to_be_escape_quoted (ch))
5670 Dynarr_add (dst, ISO_CODE_ESC);
5671 /* you asked for it ... */
5672 Dynarr_add (dst, ch);
5678 /* Now determine which register to use. */
5680 for (i = 0; i < 4; i++)
5682 if ((CHARSETP (charset = str->iso2022.charset[i])
5683 && ((code_point = charset_code_point (charset, ch)) >= 0))
5687 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5688 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5696 Lisp_Object original_default_coded_charset_priority_list
5697 = Vdefault_coded_charset_priority_list;
5699 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5701 code_point = ENCODE_CHAR (ch, charset);
5702 if (XCHARSET_FINAL (charset))
5704 Vdefault_coded_charset_priority_list
5705 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5706 Vdefault_coded_charset_priority_list));
5708 code_point = ENCODE_CHAR (ch, charset);
5709 if (!XCHARSET_FINAL (charset))
5711 charset = Vcharset_ascii;
5715 Vdefault_coded_charset_priority_list
5716 = original_default_coded_charset_priority_list;
5718 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5719 codesys, dst, flags, 0);
5723 if (XCHARSET_GRAPHIC (charset) != 0)
5725 if (!NILP (str->iso2022.charset[1]) &&
5726 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5727 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5729 else if (!NILP (str->iso2022.charset[2]))
5731 else if (!NILP (str->iso2022.charset[3]))
5740 iso2022_designate (charset, reg, str, dst);
5742 /* Now invoke that register. */
5746 ensure_normal_shift (str, dst);
5750 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5752 ensure_shift_out (str, dst);
5759 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5761 Dynarr_add (dst, ISO_CODE_ESC);
5762 Dynarr_add (dst, 'N');
5767 Dynarr_add (dst, ISO_CODE_SS2);
5772 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5774 Dynarr_add (dst, ISO_CODE_ESC);
5775 Dynarr_add (dst, 'O');
5780 Dynarr_add (dst, ISO_CODE_SS3);
5788 charmask = (half == 0 ? 0x00 : 0x80);
5790 switch (XCHARSET_DIMENSION (charset))
5793 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5796 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5797 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5800 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5801 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5802 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5805 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5806 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5807 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5808 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5814 str->iso2022.current_charset = charset;
5815 str->iso2022.current_half = half;
5819 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5820 unsigned int *flags)
5822 Lisp_Coding_System* codesys = str->codesys;
5825 restore_left_to_right_direction (codesys, dst, flags, 0);
5826 ensure_normal_shift (str, dst);
5827 for (i = 0; i < 4; i++)
5829 Lisp_Object initial_charset
5830 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5831 iso2022_designate (initial_charset, i, str, dst);
5836 /************************************************************************/
5837 /* No-conversion methods */
5838 /************************************************************************/
5840 /* This is used when reading in "binary" files -- i.e. files that may
5841 contain all 256 possible byte values and that are not to be
5842 interpreted as being in any particular decoding. */
5844 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5845 unsigned_char_dynarr *dst, size_t n)
5847 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5848 unsigned int flags = str->flags;
5849 unsigned int cpos = str->cpos;
5850 eol_type_t eol_type = str->eol_type;
5854 unsigned char c = *(unsigned char *)src++;
5856 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5857 DECODE_ADD_BINARY_CHAR (c, dst);
5858 label_continue_loop:;
5861 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5868 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5869 unsigned_char_dynarr *dst, size_t n)
5872 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5873 unsigned int flags = str->flags;
5874 unsigned int ch = str->ch;
5875 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5877 unsigned char char_boundary = str->iso2022.current_char_boundary;
5884 if (char_boundary == 0)
5890 else if ( c >= 0xf8 )
5895 else if ( c >= 0xf0 )
5900 else if ( c >= 0xe0 )
5905 else if ( c >= 0xc0 )
5915 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5916 Dynarr_add (dst, '\r');
5917 if (eol_type != EOL_CR)
5918 Dynarr_add (dst, c);
5921 Dynarr_add (dst, c);
5924 else if (char_boundary == 1)
5926 ch = ( ch << 6 ) | ( c & 0x3f );
5927 Dynarr_add (dst, ch & 0xff);
5932 ch = ( ch << 6 ) | ( c & 0x3f );
5935 #else /* not UTF2000 */
5938 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5939 Dynarr_add (dst, '\r');
5940 if (eol_type != EOL_CR)
5941 Dynarr_add (dst, '\n');
5944 else if (BYTE_ASCII_P (c))
5947 Dynarr_add (dst, c);
5949 else if (BUFBYTE_LEADING_BYTE_P (c))
5952 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5953 c == LEADING_BYTE_CONTROL_1)
5956 Dynarr_add (dst, '~'); /* untranslatable character */
5960 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5961 Dynarr_add (dst, c);
5962 else if (ch == LEADING_BYTE_CONTROL_1)
5965 Dynarr_add (dst, c - 0x20);
5967 /* else it should be the second or third byte of an
5968 untranslatable character, so ignore it */
5971 #endif /* not UTF2000 */
5977 str->iso2022.current_char_boundary = char_boundary;
5983 /************************************************************************/
5984 /* Initialization */
5985 /************************************************************************/
5988 syms_of_file_coding (void)
5990 INIT_LRECORD_IMPLEMENTATION (coding_system);
5992 deferror (&Qcoding_system_error, "coding-system-error",
5993 "Coding-system error", Qio_error);
5995 DEFSUBR (Fcoding_system_p);
5996 DEFSUBR (Ffind_coding_system);
5997 DEFSUBR (Fget_coding_system);
5998 DEFSUBR (Fcoding_system_list);
5999 DEFSUBR (Fcoding_system_name);
6000 DEFSUBR (Fmake_coding_system);
6001 DEFSUBR (Fcopy_coding_system);
6002 DEFSUBR (Fcoding_system_canonical_name_p);
6003 DEFSUBR (Fcoding_system_alias_p);
6004 DEFSUBR (Fcoding_system_aliasee);
6005 DEFSUBR (Fdefine_coding_system_alias);
6006 DEFSUBR (Fsubsidiary_coding_system);
6008 DEFSUBR (Fcoding_system_type);
6009 DEFSUBR (Fcoding_system_doc_string);
6011 DEFSUBR (Fcoding_system_charset);
6013 DEFSUBR (Fcoding_system_property);
6015 DEFSUBR (Fcoding_category_list);
6016 DEFSUBR (Fset_coding_priority_list);
6017 DEFSUBR (Fcoding_priority_list);
6018 DEFSUBR (Fset_coding_category_system);
6019 DEFSUBR (Fcoding_category_system);
6021 DEFSUBR (Fdetect_coding_region);
6022 DEFSUBR (Fdecode_coding_region);
6023 DEFSUBR (Fencode_coding_region);
6025 DEFSUBR (Fdecode_shift_jis_char);
6026 DEFSUBR (Fencode_shift_jis_char);
6027 DEFSUBR (Fdecode_big5_char);
6028 DEFSUBR (Fencode_big5_char);
6030 defsymbol (&Qcoding_systemp, "coding-system-p");
6031 defsymbol (&Qno_conversion, "no-conversion");
6032 defsymbol (&Qraw_text, "raw-text");
6034 defsymbol (&Qbig5, "big5");
6035 defsymbol (&Qshift_jis, "shift-jis");
6036 defsymbol (&Qucs4, "ucs-4");
6037 defsymbol (&Qutf8, "utf-8");
6038 defsymbol (&Qccl, "ccl");
6039 defsymbol (&Qiso2022, "iso2022");
6041 defsymbol (&Qmnemonic, "mnemonic");
6042 defsymbol (&Qeol_type, "eol-type");
6043 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6044 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6046 defsymbol (&Qcr, "cr");
6047 defsymbol (&Qlf, "lf");
6048 defsymbol (&Qcrlf, "crlf");
6049 defsymbol (&Qeol_cr, "eol-cr");
6050 defsymbol (&Qeol_lf, "eol-lf");
6051 defsymbol (&Qeol_crlf, "eol-crlf");
6053 defsymbol (&Qcharset_g0, "charset-g0");
6054 defsymbol (&Qcharset_g1, "charset-g1");
6055 defsymbol (&Qcharset_g2, "charset-g2");
6056 defsymbol (&Qcharset_g3, "charset-g3");
6057 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6058 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6059 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6060 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6061 defsymbol (&Qno_iso6429, "no-iso6429");
6062 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6063 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6065 defsymbol (&Qshort, "short");
6066 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6067 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6068 defsymbol (&Qseven, "seven");
6069 defsymbol (&Qlock_shift, "lock-shift");
6070 defsymbol (&Qescape_quoted, "escape-quoted");
6073 defsymbol (&Qdisable_composition, "disable-composition");
6074 defsymbol (&Quse_entity_reference, "use-entity-reference");
6075 defsymbol (&Qd, "d");
6076 defsymbol (&Qx, "x");
6077 defsymbol (&QX, "X");
6079 defsymbol (&Qencode, "encode");
6080 defsymbol (&Qdecode, "decode");
6083 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6085 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6087 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6089 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6091 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6093 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6095 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6097 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6099 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6102 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6107 lstream_type_create_file_coding (void)
6109 LSTREAM_HAS_METHOD (decoding, reader);
6110 LSTREAM_HAS_METHOD (decoding, writer);
6111 LSTREAM_HAS_METHOD (decoding, rewinder);
6112 LSTREAM_HAS_METHOD (decoding, seekable_p);
6113 LSTREAM_HAS_METHOD (decoding, flusher);
6114 LSTREAM_HAS_METHOD (decoding, closer);
6115 LSTREAM_HAS_METHOD (decoding, marker);
6117 LSTREAM_HAS_METHOD (encoding, reader);
6118 LSTREAM_HAS_METHOD (encoding, writer);
6119 LSTREAM_HAS_METHOD (encoding, rewinder);
6120 LSTREAM_HAS_METHOD (encoding, seekable_p);
6121 LSTREAM_HAS_METHOD (encoding, flusher);
6122 LSTREAM_HAS_METHOD (encoding, closer);
6123 LSTREAM_HAS_METHOD (encoding, marker);
6127 vars_of_file_coding (void)
6131 fcd = xnew (struct file_coding_dump);
6132 dump_add_root_struct_ptr (&fcd, &fcd_description);
6134 /* Initialize to something reasonable ... */
6135 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6137 fcd->coding_category_system[i] = Qnil;
6138 fcd->coding_category_by_priority[i] = i;
6141 Fprovide (intern ("file-coding"));
6143 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6144 Coding system used for TTY keyboard input.
6145 Not used under a windowing system.
6147 Vkeyboard_coding_system = Qnil;
6149 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6150 Coding system used for TTY display output.
6151 Not used under a windowing system.
6153 Vterminal_coding_system = Qnil;
6155 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6156 Overriding coding system used when reading from a file or process.
6157 You should bind this variable with `let', but do not set it globally.
6158 If this is non-nil, it specifies the coding system that will be used
6159 to decode input on read operations, such as from a file or process.
6160 It overrides `buffer-file-coding-system-for-read',
6161 `insert-file-contents-pre-hook', etc. Use those variables instead of
6162 this one for permanent changes to the environment. */ );
6163 Vcoding_system_for_read = Qnil;
6165 DEFVAR_LISP ("coding-system-for-write",
6166 &Vcoding_system_for_write /*
6167 Overriding coding system used when writing to a file or process.
6168 You should bind this variable with `let', but do not set it globally.
6169 If this is non-nil, it specifies the coding system that will be used
6170 to encode output for write operations, such as to a file or process.
6171 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6172 Use those variables instead of this one for permanent changes to the
6174 Vcoding_system_for_write = Qnil;
6176 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6177 Coding system used to convert pathnames when accessing files.
6179 Vfile_name_coding_system = Qnil;
6181 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6182 &Vcoded_charset_entity_reference_alist /*
6183 Alist of coded-charset vs corresponding entity-reference.
6184 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6185 CCS is coded-charset.
6186 CODE-COLUMNS is columns of code-point of entity-reference.
6187 CODE-TYPE is format type of code-point of entity-reference.
6188 `d' means decimal value and `x' means hexadecimal value.
6190 Vcoded_charset_entity_reference_alist = Qnil;
6192 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6193 Non-nil means the buffer contents are regarded as multi-byte form
6194 of characters, not a binary code. This affects the display, file I/O,
6195 and behaviors of various editing commands.
6197 Setting this to nil does not do anything.
6199 enable_multibyte_characters = 1;
6203 complex_vars_of_file_coding (void)
6205 staticpro (&Vcoding_system_hash_table);
6206 Vcoding_system_hash_table =
6207 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6209 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6210 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6212 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6214 struct codesys_prop csp; \
6216 csp.prop_type = (Prop_Type); \
6217 Dynarr_add (the_codesys_prop_dynarr, csp); \
6220 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6221 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6222 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6223 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6224 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6225 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6226 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6228 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6229 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6230 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6231 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6232 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6233 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6234 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6235 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6236 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6237 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6238 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6239 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6240 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6241 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6242 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6243 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6244 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6246 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6247 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6249 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6250 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6253 /* Need to create this here or we're really screwed. */
6255 (Qraw_text, Qno_conversion,
6256 build_string ("Raw text, which means it converts only line-break-codes."),
6257 list2 (Qmnemonic, build_string ("Raw")));
6260 (Qbinary, Qno_conversion,
6261 build_string ("Binary, which means it does not convert anything."),
6262 list4 (Qeol_type, Qlf,
6263 Qmnemonic, build_string ("Binary")));
6268 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6269 list2 (Qmnemonic, build_string ("UTF8")));
6272 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6274 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6276 Fdefine_coding_system_alias (Qterminal, Qbinary);
6277 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6279 /* Need this for bootstrapping */
6280 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6281 Fget_coding_system (Qraw_text);
6284 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6285 = Fget_coding_system (Qutf8);
6288 #if defined(MULE) && !defined(UTF2000)
6292 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6293 fcd->ucs_to_mule_table[i] = Qnil;
6295 staticpro (&mule_to_ucs_table);
6296 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6297 #endif /* defined(MULE) && !defined(UTF2000) */