1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
11 later version.
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs UTF-2000. */
40 #include "file-coding.h"
/* File-scope Lisp_Object globals for this module.
   NOTE(review): the V-prefixed names look like Lisp-visible variables and
   the Q-prefixed one like an interned symbol -- confirm against the
   initialization code, which is not part of this span. */
42 Lisp_Object Qcoding_system_error;
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
/* Per-coding-category state kept together in one struct: the coding
   system bound to each category, the category priority permutation, and
   (only when MULE is defined and UTF2000 is not) a 65536-entry
   Lisp_Object table named ucs_to_mule_table.  The fcd_description tables
   describe the Lisp_Object arrays in this struct. */
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
65 #if defined(MULE) && !defined(UTF2000)
66 Lisp_Object ucs_to_mule_table[65536];
/* Layout descriptions for struct file_coding_dump.  fcd_description_1
   lists its Lisp_Object arrays by offset and element count
   (coding_category_system always; ucs_to_mule_table only when MULE is
   defined and UTF2000 is not).  fcd_description begins with the size of
   the struct. */
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
/* More file-scope globals.  Several Q-prefixed names below are compared
   with EQ against property keys and coding-system type arguments in
   Fmake_coding_system (e.g. Qmnemonic, Qeol_type, Qshift_jis, Qiso2022,
   Qcharset_g0 .. Qcharset_g3, Qshort, Qencode, Qdecode).
   Vcoding_system_hash_table is the table searched by Ffind_coding_system
   and walked by Fcoding_system_list.
   NOTE(review): where these are initialized is not visible here. */
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
90 Lisp_Object Qmnemonic, Qeol_type;
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
97 Lisp_Object Qucs4, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
108 Lisp_Object Qutf_8_mcs;
109 Lisp_Object Qdisable_composition;
110 Lisp_Object Quse_entity_reference;
111 Lisp_Object Qd, Qx, QX;
113 Lisp_Object Qencode, Qdecode;
115 Lisp_Object Vcoding_system_hash_table;
117 int enable_multibyte_characters;
/* Summary of struct iso2022_decoder as declared below: four charset
   slots (G0-G3), the register numbers currently invoked into GL and GR,
   the escape-sequence progress flag together with a buffer of up to 8
   escape bytes seen so far, one invalid-designation flag per register,
   and single-bit flags that control whether escape and direction-switch
   sequences are emitted literally.  The composite_chars member exists
   only under ENABLE_COMPOSITE_CHARS. */
120 /* Additional information used by the ISO2022 decoder and detector. */
121 struct iso2022_decoder
123 /* CHARSET holds the character sets currently assigned to the G0
124 through G3 variables. It is initialized from the array
125 INITIAL_CHARSET in CODESYS. */
126 Lisp_Object charset[4];
128 /* Which registers are currently invoked into the left (GL) and
129 right (GR) halves of the 8-bit encoding space? */
130 int register_left, register_right;
132 /* ISO_ESC holds a value indicating part of an escape sequence
133 that has already been seen. */
134 enum iso_esc_flag esc;
136 /* This records the bytes we've seen so far in an escape sequence,
137 in case the sequence is invalid (we spit out the bytes unchanged). */
138 unsigned char esc_bytes[8];
140 /* Index for next byte to store in ISO escape sequence. */
143 #ifdef ENABLE_COMPOSITE_CHARS
144 /* Stuff seen so far when composing a string. */
145 unsigned_char_dynarr *composite_chars;
148 /* If we saw an invalid designation sequence for a particular
149 register, we flag it here and switch to ASCII. The next time we
150 see a valid designation for this register, we turn off the flag
151 and do the designation normally, but pretend the sequence was
152 invalid. The effect of all this is that (most of the time) the
153 escape sequences for both the switch to the unknown charset, and
154 the switch back to the known charset, get inserted literally into
155 the buffer and saved out as such. The hope is that we can
156 preserve the escape sequences so that the resulting written out
157 file makes sense. If we don't do any of this, the designation
158 to the invalid charset will be preserved but that switch back
159 to the known charset will probably get eaten because it was
160 the same charset that was already present in the register. */
161 unsigned char invalid_designated[4];
163 /* We try to do similar things as above for direction-switching
164 sequences. If we encountered a direction switch while an
165 invalid designation was present, or an invalid designation
166 just after a direction switch (i.e. no valid designation
167 encountered yet), we insert the direction-switch escape
168 sequence literally into the output stream, and later on
169 insert the corresponding direction-restoring escape sequence
171 unsigned int switched_dir_and_no_valid_charset_yet :1;
172 unsigned int invalid_switch_dir :1;
174 /* Tells the decoder to output the escape sequence literally
175 even though it was valid. Used in the games we play to
176 avoid lossage when we encounter invalid designations. */
177 unsigned int output_literally :1;
178 /* We encountered a direction switch followed by an invalid
179 designation. We didn't output the direction switch
180 literally because we didn't know about the invalid designation;
181 but we have to do so now. */
182 unsigned int output_direction_sequence :1;
/* Forward declarations.
   - EXFUN for Fcopy_coding_system (2 arguments), which
     setup_eol_coding_systems calls, and an incomplete declaration of
     struct detection_state.
   - text_encode_generic, taking an Lstream, a Bufbyte source, an
     unsigned_char_dynarr destination and a byte count.
   - For each of Shift-JIS, Big5, UCS-4, UTF-8 and ISO2022: a
     detect_coding_* and a decode_coding_* routine, plus char_encode_* and
     char_finish_* routines.  ISO2022 additionally has
     postprocess_iso2022_mask and reset_iso2022.
   - decode/encode routines for no-conversion, and mule_decode /
     mule_encode.
   The detect_* routines take (detection state, Extbyte source, length);
   the decode_* routines take (Lstream, Extbyte source, dynarr destination,
   length); char_encode_* take (encoding stream, Emchar, dynarr
   destination, flags pointer). */
185 EXFUN (Fcopy_coding_system, 2);
187 struct detection_state;
190 text_encode_generic (Lstream *encoding, const Bufbyte *src,
191 unsigned_char_dynarr *dst, size_t n);
193 static int detect_coding_sjis (struct detection_state *st,
194 const Extbyte *src, size_t n);
195 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
196 unsigned_char_dynarr *dst, size_t n);
197 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
198 unsigned_char_dynarr *dst, unsigned int *flags);
199 void char_finish_shift_jis (struct encoding_stream *str,
200 unsigned_char_dynarr *dst, unsigned int *flags);
202 static int detect_coding_big5 (struct detection_state *st,
203 const Extbyte *src, size_t n);
204 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
205 unsigned_char_dynarr *dst, size_t n);
206 void char_encode_big5 (struct encoding_stream *str, Emchar c,
207 unsigned_char_dynarr *dst, unsigned int *flags);
208 void char_finish_big5 (struct encoding_stream *str,
209 unsigned_char_dynarr *dst, unsigned int *flags);
211 static int detect_coding_ucs4 (struct detection_state *st,
212 const Extbyte *src, size_t n);
213 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
214 unsigned_char_dynarr *dst, size_t n);
215 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
216 unsigned_char_dynarr *dst, unsigned int *flags);
217 void char_finish_ucs4 (struct encoding_stream *str,
218 unsigned_char_dynarr *dst, unsigned int *flags);
220 static int detect_coding_utf8 (struct detection_state *st,
221 const Extbyte *src, size_t n);
222 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
223 unsigned_char_dynarr *dst, size_t n);
224 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
225 unsigned_char_dynarr *dst, unsigned int *flags);
226 void char_finish_utf8 (struct encoding_stream *str,
227 unsigned_char_dynarr *dst, unsigned int *flags);
229 static int postprocess_iso2022_mask (int mask);
230 static void reset_iso2022 (Lisp_Object coding_system,
231 struct iso2022_decoder *iso);
232 static int detect_coding_iso2022 (struct detection_state *st,
233 const Extbyte *src, size_t n);
234 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
235 unsigned_char_dynarr *dst, size_t n);
236 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
237 unsigned_char_dynarr *dst, unsigned int *flags);
238 void char_finish_iso2022 (struct encoding_stream *str,
239 unsigned_char_dynarr *dst, unsigned int *flags);
241 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
242 unsigned_char_dynarr *dst, size_t n);
243 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
244 unsigned_char_dynarr *dst, size_t n);
245 static void mule_decode (Lstream *decoding, const Extbyte *src,
246 unsigned_char_dynarr *dst, size_t n);
247 static void mule_encode (Lstream *encoding, const Bufbyte *src,
248 unsigned_char_dynarr *dst, size_t n);
/* codesys_prop and its dynamic array type.
   - codesys_prop is a typedef for struct codesys_prop; the description
     table records one Lisp_Object member, `sym'.
   - codesys_prop_dynarr is declared with Dynarr_declare (codesys_prop) and
     described via XD_DYNARR_DESC in terms of codesys_prop_description.
   - the_codesys_prop_dynarr is the single global pointer of that type.
   - enum codesys_prop_enum has (at least) the enumerator
     CODESYS_PROP_ISO2022. */
250 typedef struct codesys_prop codesys_prop;
259 Dynarr_declare (codesys_prop);
260 } codesys_prop_dynarr;
262 static const struct lrecord_description codesys_prop_description_1[] = {
263 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
267 static const struct struct_description codesys_prop_description = {
268 sizeof (codesys_prop),
269 codesys_prop_description_1
272 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
273 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
277 static const struct struct_description codesys_prop_dynarr_description = {
278 sizeof (codesys_prop_dynarr),
279 codesys_prop_dynarr_description_1
282 codesys_prop_dynarr *the_codesys_prop_dynarr;
284 enum codesys_prop_enum
287 CODESYS_PROP_ISO2022,
292 /************************************************************************/
293 /* Coding system functions */
294 /************************************************************************/
/* Prototypes for the mark, print and finalize callbacks that are handed
   to DEFINE_LRECORD_IMPLEMENTATION for the "coding-system" lrecord type. */
296 static Lisp_Object mark_coding_system (Lisp_Object);
297 static void print_coding_system (Lisp_Object, Lisp_Object, int);
298 static void finalize_coding_system (void *header, int for_disksave);
/* Layout descriptions for charset conversion specs.
   ccs_description_1 / ccs_description describe one charset_conversion_spec
   (two Lisp_Object members: from_charset and to_charset).
   ccsd_description_1 / ccsd_description describe the dynarr type
   charset_conversion_spec_dynarr, whose elements are described by
   ccs_description.  ccsd_description is what coding_system_description
   points at for iso2022.input_conv and iso2022.output_conv. */
301 static const struct lrecord_description ccs_description_1[] = {
302 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
303 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
307 static const struct struct_description ccs_description = {
308 sizeof (charset_conversion_spec),
312 static const struct lrecord_description ccsd_description_1[] = {
313 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
317 static const struct struct_description ccsd_description = {
318 sizeof (charset_conversion_spec_dynarr),
/* Field description table for Lisp_Coding_System.  Listed Lisp_Object
   members: name, doc_string, mnemonic, post_read_conversion,
   pre_write_conversion, eol_lf, eol_crlf, eol_cr, ccl.decode, ccl.encode
   and ccs_priority_list.  iso2022.initial_charset is a 4-element
   Lisp_Object array, and iso2022.input_conv / iso2022.output_conv are
   pointers to one struct each, described by ccsd_description.

   The DEFINE_LRECORD_IMPLEMENTATION invocation that follows the table
   defines the "coding-system" lrecord implementation for coding_system,
   passing mark_coding_system, print_coding_system,
   finalize_coding_system and this description table. */
323 static const struct lrecord_description coding_system_description[] = {
324 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
325 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
326 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
327 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
330 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
331 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
333 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
334 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
335 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
345 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
346 mark_coding_system, print_coding_system,
347 finalize_coding_system,
348 0, 0, coding_system_description,
/* Mark callback for coding-system objects.

   OBJ is the coding system.  Calls mark_object on its name, doc string,
   mnemonic and the three EOL variants, then on the type-specific members
   selected by CODING_SYSTEM_TYPE, then on the pre-write conversion and the
   CCS priority list.

   Returns the post-read conversion object rather than marking it here.
   NOTE(review): presumably the caller marks the returned object --
   confirm against the garbage collector's mark-method contract. */
352 mark_coding_system (Lisp_Object obj)
354 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
356 mark_object (CODING_SYSTEM_NAME (codesys));
357 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
358 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
359 mark_object (CODING_SYSTEM_EOL_LF (codesys));
360 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
361 mark_object (CODING_SYSTEM_EOL_CR (codesys));
363 switch (CODING_SYSTEM_TYPE (codesys))
/* ISO2022: mark all four initial charsets, then both charsets of every
   entry in the input and output conversion dynarrs when those exist. */
367 case CODESYS_ISO2022:
368 for (i = 0; i < 4; i++)
369 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
370 if (codesys->iso2022.input_conv)
372 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
374 struct charset_conversion_spec *ccs =
375 Dynarr_atp (codesys->iso2022.input_conv, i);
376 mark_object (ccs->from_charset);
377 mark_object (ccs->to_charset);
380 if (codesys->iso2022.output_conv)
382 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
384 struct charset_conversion_spec *ccs =
385 Dynarr_atp (codesys->iso2022.output_conv, i);
386 mark_object (ccs->from_charset);
387 mark_object (ccs->to_charset);
/* This arm marks only initial charsets 0 and 1. */
394 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
395 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
/* CCL decode and encode programs. */
400 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
401 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
408 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
410 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
412 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print callback for coding-system objects.

   Writes OBJ to PRINTCHARFUN as "#<coding_system " followed by the coding
   system's name (printed with print_internal) and a closing ">".  The
   body also contains an error call with the message
   "printing unreadable object #<coding_system 0x%x>".
   NOTE(review): confirm the condition under which that error is raised. */
416 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
419 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
421 error ("printing unreadable object #<coding_system 0x%x>",
424 write_c_string ("#<coding_system ", printcharfun);
425 print_internal (c->name, printcharfun, 1);
426 write_c_string (">", printcharfun);
/* Finalize callback for coding-system objects.

   HEADER is the Lisp_Coding_System being finalized; FOR_DISKSAVE is
   non-zero when called for a disksave, in which case nothing is freed.
   Otherwise, for ISO2022 coding systems, each of iso2022.input_conv and
   iso2022.output_conv that is non-null is released with Dynarr_free and
   the pointer is reset to 0. */
430 finalize_coding_system (void *header, int for_disksave)
432 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
433 /* Since coding systems never go away, this function is not
434 necessary. But it would be necessary if we changed things
435 so that coding systems could go away. */
436 if (!for_disksave) /* see comment in lstream.c */
438 switch (CODING_SYSTEM_TYPE (c))
441 case CODESYS_ISO2022:
442 if (c->iso2022.input_conv)
444 Dynarr_free (c->iso2022.input_conv);
445 c->iso2022.input_conv = 0;
447 if (c->iso2022.output_conv)
449 Dynarr_free (c->iso2022.output_conv);
450 c->iso2022.output_conv = 0;
/* Convert a Lisp symbol to an EOL type constant.

   SYMBOL is first checked with CHECK_SYMBOL.  Mapping:
     nil   -> EOL_AUTODETECT
     Qlf   -> EOL_LF
     Qcrlf -> EOL_CRLF
     Qcr   -> EOL_CR
   Any other symbol is reported with signal_simple_error
   ("Unrecognized eol type"); the return after it is marked as not
   reached in the code. */
461 symbol_to_eol_type (Lisp_Object symbol)
463 CHECK_SYMBOL (symbol);
464 if (NILP (symbol)) return EOL_AUTODETECT;
465 if (EQ (symbol, Qlf)) return EOL_LF;
466 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
467 if (EQ (symbol, Qcr)) return EOL_CR;
469 signal_simple_error ("Unrecognized eol type", symbol);
470 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: convert an eol_type_t to a Lisp object.
   EOL_LF -> Qlf, EOL_CRLF -> Qcrlf, EOL_CR -> Qcr, and
   EOL_AUTODETECT -> Qnil. */
474 eol_type_to_symbol (eol_type_t type)
479 case EOL_LF: return Qlf;
480 case EOL_CRLF: return Qcrlf;
481 case EOL_CR: return Qcr;
482 case EOL_AUTODETECT: return Qnil;
/* Create the three EOL-specific variants of CODESYS.

   The coding system's symbol name is copied into a stack buffer of
   len + 7 bytes, which leaves room for the suffixes appended below.  If
   the mnemonic is a string, it is copied into a second stack buffer in the
   same way.

   DEFINE_SUB_CODESYS (op_sys, op_sys_abbr, Type) then, for each variant:
     - appends "-" op_sys to the name copy and interns the result;
     - copies CODESYS under that name with Fcopy_coding_system;
     - sets the copy's EOL type to Type;
     - appends op_sys_abbr to the mnemonic copy and installs it as the
       copy's mnemonic with build_string;
     - stores the copy in CODING_SYSTEM_<Type> (codesys).
   It is invoked for ("unix", "", EOL_LF), ("dos", ":T", EOL_CRLF) and
   ("mac", ":t", EOL_CR).

   NOTE(review): codesys_mnemonic starts as 0 and is only allocated when
   the mnemonic is a string; confirm the mnemonic steps in the macro are
   guarded for the non-string case. */
487 setup_eol_coding_systems (Lisp_Coding_System *codesys)
489 Lisp_Object codesys_obj;
490 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
491 char *codesys_name = (char *) alloca (len + 7);
493 char *codesys_mnemonic=0;
495 Lisp_Object codesys_name_sym, sub_codesys_obj;
499 XSETCODING_SYSTEM (codesys_obj, codesys);
501 memcpy (codesys_name,
502 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
504 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
506 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
507 codesys_mnemonic = (char *) alloca (mlen + 7);
508 memcpy (codesys_mnemonic,
509 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
512 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
513 strcpy (codesys_name + len, "-" op_sys); \
515 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
516 codesys_name_sym = intern (codesys_name); \
517 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
518 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
520 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
521 build_string (codesys_mnemonic); \
522 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
525 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
526 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
527 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp primitive `coding-system-p' (exactly one argument).  Returns Qt
   when OBJECT satisfies CODING_SYSTEMP and Qnil otherwise; a symbol that
   merely names a coding system is therefore not accepted here. */
530 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
531 Return t if OBJECT is a coding system.
532 A coding system is an object that defines how text containing multiple
533 character sets is encoded into a stream of (typically 8-bit) bytes.
534 The coding system is used to decode the stream into a series of
535 characters (which may be from multiple charsets) when the text is read
536 from a file or process, and is used to encode the text back into the
537 same format when it is written out to a file or process.
539 For example, many ISO2022-compliant coding systems (such as Compound
540 Text, which is used for inter-client data under the X Window System)
541 use escape sequences to switch between different charsets -- Japanese
542 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
543 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
544 `make-coding-system' for more information.
546 Coding systems are normally identified using a symbol, and the
547 symbol is accepted in place of the actual coding system object whenever
548 a coding system is called for. (This is similar to how faces work.)
552 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* Lisp primitive `find-coding-system' (exactly one argument).

   - nil is replaced by Qbinary before the lookup.
   - A coding-system object is returned unchanged.
   - Anything else must be a symbol (CHECK_SYMBOL) and is looked up with
     Fgethash in Vcoding_system_hash_table, with Qnil as the default.
   The lookup result is stored back into the argument variable and is
   returned when it is a coding system or nil.
   NOTE(review): confirm how a lookup result that is neither a coding
   system nor nil is handled. */
555 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
556 Retrieve the coding system of the given name.
558 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
559 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
560 If there is no such coding system, nil is returned. Otherwise the
561 associated coding system object is returned.
563 (coding_system_or_name))
565 if (NILP (coding_system_or_name))
566 coding_system_or_name = Qbinary;
567 else if (CODING_SYSTEMP (coding_system_or_name))
568 return coding_system_or_name;
570 CHECK_SYMBOL (coding_system_or_name);
574 coding_system_or_name =
575 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
577 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
578 return coding_system_or_name;
/* Lisp primitive `get-coding-system' (exactly one argument).  Calls
   Ffind_coding_system on NAME; if that yields nil, signals
   "No such coding system" with NAME via signal_simple_error, otherwise
   returns the coding system found. */
582 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
583 Retrieve the coding system of the given name.
584 Same as `find-coding-system' except that if there is no such
585 coding system, an error is signaled instead of returning nil.
589 Lisp_Object coding_system = Ffind_coding_system (name);
591 if (NILP (coding_system))
592 signal_simple_error ("No such coding system", name);
593 return coding_system;
596 /* We store the coding systems in hash tables with the names as the key and the
597 actual coding system object as the value. Occasionally we need to use them
598 in a list format. These routines provide us with that. */
/* Closure passed through elisp_maphash: holds a pointer to the Lisp list
   being accumulated. */
599 struct coding_system_list_closure
601 Lisp_Object *coding_system_list;
/* Hash-table mapping callback used by Fcoding_system_list.  Casts the
   opaque closure pointer back to struct coding_system_list_closure and
   conses KEY onto the front of the list it points to.  VALUE is not used
   in the statements shown. */
605 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
606 void *coding_system_list_closure)
608 /* This function can GC */
609 struct coding_system_list_closure *cscl =
610 (struct coding_system_list_closure *) coding_system_list_closure;
611 Lisp_Object *coding_system_list = cscl->coding_system_list;
613 *coding_system_list = Fcons (key, *coding_system_list);
/* Lisp primitive `coding-system-list' (no arguments).  Starts from an
   empty list protected with GCPRO1, points a coding_system_list_closure
   at it, and runs add_coding_system_to_list_mapper over
   Vcoding_system_hash_table with elisp_maphash so every key is consed
   onto the list.  Returns the accumulated list. */
617 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
618 Return a list of the names of all defined coding systems.
622 Lisp_Object coding_system_list = Qnil;
624 struct coding_system_list_closure coding_system_list_closure;
626 GCPRO1 (coding_system_list);
627 coding_system_list_closure.coding_system_list = &coding_system_list;
628 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
629 &coding_system_list_closure);
632 return coding_system_list;
/* Lisp primitive `coding-system-name' (exactly one argument).  Resolves
   the argument with Fget_coding_system, which signals an error when no
   such coding system exists, and returns XCODING_SYSTEM_NAME of the
   result. */
635 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
636 Return the name of the given coding system.
640 coding_system = Fget_coding_system (coding_system);
641 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate and initialize a coding-system record.

   TYPE is stored as the coding system type and NAME as its name.  The
   record is obtained with alloc_lcrecord_type and cleared with
   zero_lcrecord, then:
     - pre-write and post-read conversion, the three EOL variants, the
       mnemonic and the CCS priority list are set to Qnil;
     - the EOL type is set to EOL_AUTODETECT;
     - CODESYS_ISO2022: all four initial charsets are set to Qnil;
     - CODESYS_UTF8 and CODESYS_BIG5: initial charsets 0 through 3 are
       assigned (for Big5, slot 1 is Vcharset_chinese_big5);
     - CODESYS_CCL: the decode and encode programs are set to Qnil.

   The declared return type is Lisp_Coding_System *. */
644 static Lisp_Coding_System *
645 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
647 Lisp_Coding_System *codesys =
648 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
650 zero_lcrecord (codesys);
651 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
652 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
653 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
654 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
655 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
656 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
657 CODING_SYSTEM_TYPE (codesys) = type;
658 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
661 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
663 if (type == CODESYS_ISO2022)
666 for (i = 0; i < 4; i++)
667 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
670 if (type == CODESYS_UTF8)
672 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
674 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
676 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
678 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
681 else if (type == CODESYS_BIG5)
683 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
685 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
686 = Vcharset_chinese_big5;
687 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
689 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
693 else if (type == CODESYS_CCL)
695 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
696 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
699 CODING_SYSTEM_NAME (codesys) = name;
705 /* Given a list of charset conversion specs as specified in a Lisp
706 program, parse it into STORE_HERE. */
/* Details: every element of SPEC_LIST must be a proper two-element list
   (FROM TO); anything else signals "Invalid charset conversion spec".
   Both entries are resolved with Fget_charset.  The two charsets must
   agree in XCHARSET_CHARS and XCHARSET_DIMENSION, otherwise
   "Attempted conversion between different charset types" is signaled.
   Each accepted pair is appended to STORE_HERE with Dynarr_add. */
709 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
710 Lisp_Object spec_list)
714 EXTERNAL_LIST_LOOP (rest, spec_list)
716 Lisp_Object car = XCAR (rest);
717 Lisp_Object from, to;
718 struct charset_conversion_spec spec;
720 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
721 signal_simple_error ("Invalid charset conversion spec", car);
722 from = Fget_charset (XCAR (car));
723 to = Fget_charset (XCAR (XCDR (car)));
724 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
725 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
726 signal_simple_error_2
727 ("Attempted conversion between different charset types",
729 spec.from_charset = from;
730 spec.to_charset = to;
732 Dynarr_add (store_here, spec);
736 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
737 specs, return the equivalent as the Lisp programmer would see it.
739 If LOAD_HERE is 0, return Qnil. */
/* Details: walks the dynarr in index order, consing a two-element list
   (from_charset to_charset) onto the front of the result for each entry,
   then reverses the result with Fnreverse so the returned list is in the
   same order as the dynarr. */
742 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
749 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
751 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
752 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
755 return Fnreverse (result);
760 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
761 Register symbol NAME as a coding system.
763 TYPE describes the conversion method used and should be one of
766 Automatic conversion. XEmacs attempts to detect the coding system
769 No conversion. Use this for binary files and such. On output,
770 graphic characters that are not in ASCII or Latin-1 will be
771 replaced by a ?. (For a no-conversion-encoded buffer, these
772 characters will only be present if you explicitly insert them.)
774 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
776 ISO 10646 UCS-4 encoding.
778 ISO 10646 UTF-8 encoding.
780 Any ISO2022-compliant encoding. Among other things, this includes
781 JIS (the Japanese encoding commonly used for e-mail), EUC (the
782 standard Unix encoding for Japanese and other languages), and
783 Compound Text (the encoding used in X11). You can specify more
784 specific information about the conversion with the PROPS argument.
786 Big5 (the encoding commonly used for Taiwanese).
788 The conversion is performed using a user-written pseudo-code
789 program. CCL (Code Conversion Language) is the name of this
792 Write out or read in the raw contents of the memory representing
793 the buffer's text. This is primarily useful for debugging
794 purposes, and is only enabled when XEmacs has been compiled with
795 DEBUG_XEMACS defined (via the --debug configure option).
796 WARNING: Reading in a file using 'internal conversion can result
797 in an internal inconsistency in the memory representing a
798 buffer's text, which will produce unpredictable results and may
799 cause XEmacs to crash. Under normal circumstances you should
800 never use 'internal conversion.
802 DOC-STRING is a string describing the coding system.
804 PROPS is a property list, describing the specific nature of the
805 coding system. Recognized properties are:
808 String to be displayed in the modeline when this coding system is
812 End-of-line conversion to be used. It should be one of
815 Automatically detect the end-of-line type (LF, CRLF,
816 or CR). Also generate subsidiary coding systems named
817 `NAME-unix', `NAME-dos', and `NAME-mac', that are
818 identical to this coding system but have an EOL-TYPE
819 value of 'lf, 'crlf, and 'cr, respectively.
821 The end of a line is marked externally using ASCII LF.
822 Since this is also the way that XEmacs represents an
823 end-of-line internally, specifying this option results
824 in no end-of-line conversion. This is the standard
825 format for Unix text files.
827 The end of a line is marked externally using ASCII
828 CRLF. This is the standard format for MS-DOS text
831 The end of a line is marked externally using ASCII CR.
832 This is the standard format for Macintosh text files.
834 Automatically detect the end-of-line type but do not
835 generate subsidiary coding systems. (This value is
836 converted to nil when stored internally, and
837 `coding-system-property' will return nil.)
840 If non-nil, composition/decomposition for combining characters
843 'use-entity-reference
844 If non-nil, SGML style entity-reference is used for non-system-characters.
846 'post-read-conversion
847 Function called after a file has been read in, to perform the
848 decoding. Called with two arguments, START and END, denoting
849 a region of the current buffer to be decoded.
851 'pre-write-conversion
852 Function called before a file is written out, to perform the
853 encoding. Called with two arguments, START and END, denoting
854 a region of the current buffer to be encoded.
857 The following additional properties are recognized if TYPE is 'iso2022:
863 The character set initially designated to the G0 - G3 registers.
864 The value should be one of
866 -- A charset object (designate that character set)
867 -- nil (do not ever use this register)
868 -- t (no character set is initially designated to
869 the register, but may be later on; this automatically
870 sets the corresponding `force-g*-on-output' property)
876 If non-nil, send an explicit designation sequence on output before
877 using the specified register.
880 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
881 "ESC $ B" on output in place of the full designation sequences
882 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
885 If non-nil, don't designate ASCII to G0 at each end of line on output.
886 Setting this to non-nil also suppresses other state-resetting that
887 normally happens at the end of a line.
890 If non-nil, don't designate ASCII to G0 before control chars on output.
893 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
897 If non-nil, use locking-shift (SO/SI) instead of single-shift
898 or designation by escape sequence.
901 If non-nil, don't use ISO6429's direction specification.
904 If non-nil, literal control characters that are the same as
905 the beginning of a recognized ISO2022 or ISO6429 escape sequence
906 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
907 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
908 so that they can be properly distinguished from an escape sequence.
909 (Note that doing this results in a non-portable encoding.) This
910 encoding flag is used for byte-compiled files. Note that ESC
911 is a good choice for a quoting character because there are no
912 escape sequences whose second byte is a character from the Control-0
913 or Control-1 character sets; this is explicitly disallowed by the
916 'input-charset-conversion
917 A list of conversion specifications, specifying conversion of
918 characters in one charset to another when decoding is performed.
919 Each specification is a list of two elements: the source charset,
920 and the destination charset.
922 'output-charset-conversion
923 A list of conversion specifications, specifying conversion of
924 characters in one charset to another when encoding is performed.
925 The form of each specification is the same as for
926 'input-charset-conversion.
929 The following additional properties are recognized (and required)
933 CCL program used for decoding (converting to internal format).
936 CCL program used for encoding (converting to external format).
938 (name, type, doc_string, props))
940 Lisp_Coding_System *codesys;
941 enum coding_system_type ty;
942 int need_to_setup_eol_systems = 1;
944 /* Convert type to constant */
945 if (NILP (type) || EQ (type, Qundecided))
946 { ty = CODESYS_AUTODETECT; }
948 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
949 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
950 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
951 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
952 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
953 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
955 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
957 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
960 signal_simple_error ("Invalid coding system type", type);
964 codesys = allocate_coding_system (ty, name);
966 if (NILP (doc_string))
967 doc_string = build_string ("");
969 CHECK_STRING (doc_string);
970 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
973 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
975 if (EQ (key, Qmnemonic))
978 CHECK_STRING (value);
979 CODING_SYSTEM_MNEMONIC (codesys) = value;
982 else if (EQ (key, Qeol_type))
984 need_to_setup_eol_systems = NILP (value);
987 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
990 else if (EQ (key, Qpost_read_conversion))
991 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
992 else if (EQ (key, Qpre_write_conversion))
993 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
995 else if (EQ (key, Qdisable_composition))
996 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
997 else if (EQ (key, Quse_entity_reference))
998 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1001 else if (ty == CODESYS_ISO2022)
1003 #define FROB_INITIAL_CHARSET(charset_num) \
1004 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1005 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1007 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1008 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1009 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1010 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1012 #define FROB_FORCE_CHARSET(charset_num) \
1013 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1015 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1016 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1017 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1018 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1020 #define FROB_BOOLEAN_PROPERTY(prop) \
1021 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1023 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1024 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1025 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1026 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1027 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1028 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1029 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1031 else if (EQ (key, Qinput_charset_conversion))
1033 codesys->iso2022.input_conv =
1034 Dynarr_new (charset_conversion_spec);
1035 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1038 else if (EQ (key, Qoutput_charset_conversion))
1040 codesys->iso2022.output_conv =
1041 Dynarr_new (charset_conversion_spec);
1042 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1046 signal_simple_error ("Unrecognized property", key);
1049 else if (ty == CODESYS_UTF8)
1051 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1052 else if (EQ (key, Qcharset_g1))
1053 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1054 else if (EQ (key, Qcharset_g2))
1055 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1057 signal_simple_error ("Unrecognized property", key);
1059 else if (ty == CODESYS_BIG5)
1061 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1062 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1064 signal_simple_error ("Unrecognized property", key);
1067 else if (EQ (type, Qccl))
1070 struct ccl_program test_ccl;
1073 /* Check key first. */
1074 if (EQ (key, Qdecode))
1075 suffix = "-ccl-decode";
1076 else if (EQ (key, Qencode))
1077 suffix = "-ccl-encode";
1079 signal_simple_error ("Unrecognized property", key);
1081 /* If value is vector, register it as a ccl program
1082 associated with a newly created symbol for
1083 backward compatibility. */
1084 if (VECTORP (value))
1086 sym = Fintern (concat2 (Fsymbol_name (name),
1087 build_string (suffix)),
1089 Fregister_ccl_program (sym, value);
1093 CHECK_SYMBOL (value);
1096 /* check if the given ccl programs are valid. */
1097 if (setup_ccl_program (&test_ccl, sym) < 0)
1098 signal_simple_error ("Invalid CCL program", value);
1100 if (EQ (key, Qdecode))
1101 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1102 else if (EQ (key, Qencode))
1103 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1108 signal_simple_error ("Unrecognized property", key);
1112 if (need_to_setup_eol_systems)
1113 setup_eol_coding_systems (codesys);
1116 Lisp_Object codesys_obj;
1117 XSETCODING_SYSTEM (codesys_obj, codesys);
1118 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* Lisp primitive `copy-coding-system'.
   OLD-CODING-SYSTEM is resolved with Fget_coding_system and NEW-NAME is
   looked up with Ffind_coding_system.  When that lookup yields nil, a
   coding system of the old system's type is allocated and entered into
   Vcoding_system_hash_table under NEW-NAME.  Everything in the old object
   that follows its `header' member is then memcpy'd into the new object,
   and the new object's `name' is set to NEW-NAME.  The new coding system
   object is returned.
   NOTE(review): the copy is bytewise, so any pointer-valued members of
   Lisp_Coding_System would end up shared by both objects -- confirm that
   this is intended.  */
1123 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1124 Copy OLD-CODING-SYSTEM to NEW-NAME.
1125 If NEW-NAME does not name an existing coding system, a new one will
1128 (old_coding_system, new_name))
1130 Lisp_Object new_coding_system;
1131 old_coding_system = Fget_coding_system (old_coding_system);
1132 new_coding_system = Ffind_coding_system (new_name);
1133 if (NILP (new_coding_system))
1135 XSETCODING_SYSTEM (new_coding_system,
1136 allocate_coding_system
1137 (XCODING_SYSTEM_TYPE (old_coding_system),
1139 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1143 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1144 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1145 memcpy (((char *) to ) + sizeof (to->header),
1146 ((char *) from) + sizeof (from->header),
1147 sizeof (*from) - sizeof (from->header));
1148 to->name = new_name;
1150 return new_coding_system;
/* Lisp primitive `coding-system-canonical-name-p'.
   Fetches OBJECT's entry from Vcoding_system_hash_table (nil when absent)
   and tests it with CODING_SYSTEMP.  Canonical names are stored with the
   coding system object itself as value, whereas aliases are stored with
   the aliasee symbol as value.  */
1153 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1154 Return t if OBJECT names a coding system, and is not a coding system alias.
1158 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
/* Lisp primitive `coding-system-alias-p'.
   An alias is a Vcoding_system_hash_table entry whose value is a symbol
   (the aliasee).  The lookup default is Qzero rather than Qnil --
   presumably because nil is itself a symbol, so a missing entry would
   otherwise pass the SYMBOLP test.  */
1162 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1163 Return t if OBJECT is a coding system alias.
1164 All coding system aliases are created by `define-coding-system-alias'.
1168 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
/* Lisp primitive `coding-system-aliasee'.
   Looks ALIAS up in Vcoding_system_hash_table.  An entry that is a symbol
   is taken to be the aliasee; anything else leads to the error
   "Symbol is not a coding system alias".
   NOTE(review): the lookup default here is Qnil, unlike the Qzero used by
   `coding-system-alias-p'.  If nil satisfies SYMBOLP, a symbol with no
   table entry would take the symbol branch instead of signalling --
   confirm whether that is intended.  */
1172 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1173 Return the coding-system symbol for which symbol ALIAS is an alias.
1177 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1178 if (SYMBOLP (aliasee))
1181 signal_simple_error ("Symbol is not a coding system alias", alias);
1182 return Qnil; /* To keep the compiler happy */
/* Build a symbol whose name is SYMBOL's name with ASCII_STRING appended.
   The two strings are joined with concat2 and the result is handed to
   Fintern.  Fdefine_coding_system_alias uses this to derive the
   "-unix" / "-dos" / "-mac" subsidiary names.  */
1186 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1188 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1192 /* A maphash function, for removing dangling coding system aliases.
   Fdefine_coding_system_alias passes it to elisp_map_remhash over
   Vcoding_system_hash_table, so ALIAS and ALIASEE are one key/value pair
   of that table.  An entry is dangling when its value is a symbol that
   has no entry of its own in the table.  Each dangling entry found
   increments the int that DANGLING_ALIASES points to, which lets the
   caller repeat the sweep until a pass finds none.  */
1194 dangling_coding_system_alias_p (Lisp_Object alias,
1195 Lisp_Object aliasee,
1196 void *dangling_aliases)
1198 if (SYMBOLP (aliasee)
1199 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1201 (*(int *) dangling_aliases)++;
/* Lisp primitive `define-coding-system-alias'.

   Outline of the code below:
   - ALIAS must be a symbol and must not be the canonical name of a
     coding system.
   - Removal path (the statements from the subsidiary_unix/dos/mac locals
     through the do/while sweep; per the doc string this is the case of a
     nil ALIASEE): ALIAS is removed from Vcoding_system_hash_table, the
     ALIAS-unix / ALIAS-dos / ALIAS-mac aliases are undefined recursively
     when all three are aliases, and the table is swept with
     dangling_coding_system_alias_p until a pass counts no dangling entry.
   - Definition path: a coding system object given as ALIASEE is replaced
     by its name, and Fget_coding_system checks that it names a coding
     system.  ALIAS equal to ALIASEE is rejected as a loop, and the chain
     of aliases starting at ALIASEE is followed (Qzero is the lookup
     default) looking for ALIAS.  Finally ALIAS is stored with ALIASEE as
     its value, and for every "-unix" / "-dos" / "-mac" subsidiary of
     ALIASEE that Ffind_coding_system can find, the matching subsidiary of
     ALIAS is defined through a recursive call.  */
1208 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1209 Define symbol ALIAS as an alias for coding system ALIASEE.
1211 You can use this function to redefine an alias that has already been defined,
1212 but you cannot redefine a name which is the canonical name for a coding system.
1213 \(a canonical name of a coding system is what is returned when you call
1214 `coding-system-name' on a coding system).
1216 ALIASEE itself can be an alias, which allows you to define nested aliases.
1218 You are forbidden, however, from creating alias loops or `dangling' aliases.
1219 These will be detected, and an error will be signaled if you attempt to do so.
1221 If ALIASEE is nil, then ALIAS will simply be undefined.
1223 See also `coding-system-alias-p', `coding-system-aliasee',
1224 and `coding-system-canonical-name-p'.
1228 Lisp_Object real_coding_system, probe;
1230 CHECK_SYMBOL (alias);
1232 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1234 ("Symbol is the canonical name of a coding system and cannot be redefined",
1239 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1240 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1241 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1243 Fremhash (alias, Vcoding_system_hash_table);
1245 /* Undefine subsidiary aliases,
1246 presumably created by a previous call to this function */
1247 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1248 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1249 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1251 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1252 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1253 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1256 /* Undefine dangling coding system aliases. */
1258 int dangling_aliases;
1261 dangling_aliases = 0;
1262 elisp_map_remhash (dangling_coding_system_alias_p,
1263 Vcoding_system_hash_table,
1265 } while (dangling_aliases > 0);
1271 if (CODING_SYSTEMP (aliasee))
1272 aliasee = XCODING_SYSTEM_NAME (aliasee);
1274 /* Checks that aliasee names a coding-system */
1275 real_coding_system = Fget_coding_system (aliasee);
1277 /* Check for coding system alias loops */
1278 if (EQ (alias, aliasee))
1279 alias_loop: signal_simple_error_2
1280 ("Attempt to create a coding system alias loop", alias, aliasee);
1282 for (probe = aliasee;
1284 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1286 if (EQ (probe, alias))
1290 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1292 /* Set up aliases for subsidiaries.
1293 #### There must be a better way to handle subsidiary coding systems. */
1295 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1297 for (i = 0; i < countof (suffixes); i++)
1299 Lisp_Object alias_subsidiary =
1300 append_suffix_to_symbol (alias, suffixes[i]);
1301 Lisp_Object aliasee_subsidiary =
1302 append_suffix_to_symbol (aliasee, suffixes[i]);
1304 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1305 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1308 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1309 but it doesn't look intentional, so I'd rather return something
1310 meaningful or nothing at all. */
/* Return the variant of CODING_SYSTEM that corresponds to eol type TYPE.
   A coding system whose own eol type is already fixed (anything other
   than EOL_AUTODETECT) is returned unchanged, and so is any coding system
   when TYPE is EOL_AUTODETECT.  For EOL_LF, EOL_CR and EOL_CRLF the
   matching CODING_SYSTEM_EOL_* slot is used, falling back to
   CODING_SYSTEM itself when that slot is nil.  Any other TYPE value
   aborts.  */
1315 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1317 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1318 Lisp_Object new_coding_system;
1320 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1321 return coding_system;
1325 case EOL_AUTODETECT: return coding_system;
1326 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1327 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1328 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1329 default: abort (); return Qnil;
1332 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* Lisp primitive `subsidiary-coding-system'.
   Resolves CODING-SYSTEM with Fget_coding_system, converts the EOL-TYPE
   symbol with symbol_to_eol_type and delegates to
   subsidiary_coding_system.  */
1335 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1336 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1338 (coding_system, eol_type))
1340 coding_system = Fget_coding_system (coding_system);
1342 return subsidiary_coding_system (coding_system,
1343 symbol_to_eol_type (eol_type));
1347 /************************************************************************/
1348 /* Coding system accessors */
1349 /************************************************************************/
/* Lisp primitive `coding-system-doc-string'.
   Resolves CODING-SYSTEM with Fget_coding_system and returns the doc
   string stored in the resulting coding system object.  */
1351 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1352 Return the doc string for CODING-SYSTEM.
1356 coding_system = Fget_coding_system (coding_system);
1357 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* Lisp primitive `coding-system-type'.
   Resolves CODING-SYSTEM with Fget_coding_system and maps its
   coding_system_type value to the corresponding Lisp symbol, e.g.
   CODESYS_AUTODETECT to Qundecided and CODESYS_ISO2022 to Qiso2022.  */
1360 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1361 Return the type of CODING-SYSTEM.
1365 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1368 case CODESYS_AUTODETECT: return Qundecided;
1370 case CODESYS_SHIFT_JIS: return Qshift_jis;
1371 case CODESYS_ISO2022: return Qiso2022;
1372 case CODESYS_BIG5: return Qbig5;
1373 case CODESYS_UCS4: return Qucs4;
1374 case CODESYS_UTF8: return Qutf8;
1375 case CODESYS_CCL: return Qccl;
1377 case CODESYS_NO_CONVERSION: return Qno_conversion;
1379 case CODESYS_INTERNAL: return Qinternal;
/* Return the name of the charset stored in the initial-charset slot GNUM
   of CODING_SYSTEM (XCODING_SYSTEM_ISO2022_INITIAL_CHARSET), or Qnil when
   that slot does not hold a charset object.  No range check on GNUM is
   visible here, so callers are presumably expected to pass a valid slot
   number -- confirm.  */
1386 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1389 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1391 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp primitive `coding-system-charset'.
   Resolves CODING-SYSTEM with Fget_coding_system and forwards the integer
   value of GNUM (XINT) to coding_system_charset.  */
1394 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1395 Return initial charset of CODING-SYSTEM designated to GNUM.
1398 (coding_system, gnum))
1400 coding_system = Fget_coding_system (coding_system);
1403 return coding_system_charset (coding_system, XINT (gnum));
/* Lisp primitive `coding-system-property'.
   After resolving CODING-SYSTEM and checking that PROP is a symbol, the
   function scans the_codesys_prop_dynarr for PROP.  Depending on the
   matching entry's prop_type it signals when an ISO2022-only or CCL-only
   property is requested from a coding system of another type; a PROP that
   is not accepted leads to "Unrecognized property".  The if/else chain
   that follows returns the stored value: first the properties common to
   all types (name, type, doc-string, mnemonic, eol-type and the eol
   subsidiaries, post-read / pre-write conversion, ...), then the
   ISO2022-specific and the CCL-specific ones.  Boolean slots are returned
   as t or nil.  */
1407 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1408 Return the PROP property of CODING-SYSTEM.
1410 (coding_system, prop))
1413 enum coding_system_type type;
1415 coding_system = Fget_coding_system (coding_system);
1416 CHECK_SYMBOL (prop);
1417 type = XCODING_SYSTEM_TYPE (coding_system);
1419 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1420 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1423 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1425 case CODESYS_PROP_ALL_OK:
1428 case CODESYS_PROP_ISO2022:
1429 if (type != CODESYS_ISO2022)
1431 ("Property only valid in ISO2022 coding systems",
1435 case CODESYS_PROP_CCL:
1436 if (type != CODESYS_CCL)
1438 ("Property only valid in CCL coding systems",
1448 signal_simple_error ("Unrecognized property", prop);
1450 if (EQ (prop, Qname))
1451 return XCODING_SYSTEM_NAME (coding_system);
1452 else if (EQ (prop, Qtype))
1453 return Fcoding_system_type (coding_system);
1454 else if (EQ (prop, Qdoc_string))
1455 return XCODING_SYSTEM_DOC_STRING (coding_system);
1456 else if (EQ (prop, Qmnemonic))
1457 return XCODING_SYSTEM_MNEMONIC (coding_system);
1458 else if (EQ (prop, Qeol_type))
1459 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1460 else if (EQ (prop, Qeol_lf))
1461 return XCODING_SYSTEM_EOL_LF (coding_system);
1462 else if (EQ (prop, Qeol_crlf))
1463 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1464 else if (EQ (prop, Qeol_cr))
1465 return XCODING_SYSTEM_EOL_CR (coding_system);
1466 else if (EQ (prop, Qpost_read_conversion))
1467 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1468 else if (EQ (prop, Qpre_write_conversion))
1469 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1472 else if (EQ (prop, Qdisable_composition))
1473 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1474 else if (EQ (prop, Quse_entity_reference))
1475 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1477 else if (type == CODESYS_ISO2022)
1479 if (EQ (prop, Qcharset_g0))
1480 return coding_system_charset (coding_system, 0);
1481 else if (EQ (prop, Qcharset_g1))
1482 return coding_system_charset (coding_system, 1);
1483 else if (EQ (prop, Qcharset_g2))
1484 return coding_system_charset (coding_system, 2);
1485 else if (EQ (prop, Qcharset_g3))
1486 return coding_system_charset (coding_system, 3);
1488 #define FORCE_CHARSET(charset_num) \
1489 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1490 (coding_system, charset_num) ? Qt : Qnil)
1492 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1493 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1494 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1495 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1497 #define LISP_BOOLEAN(prop) \
1498 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1500 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1501 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1502 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1503 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1504 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1505 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1506 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1508 else if (EQ (prop, Qinput_charset_conversion))
1510 unparse_charset_conversion_specs
1511 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1512 else if (EQ (prop, Qoutput_charset_conversion))
1514 unparse_charset_conversion_specs
1515 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1519 else if (type == CODESYS_CCL)
1521 if (EQ (prop, Qdecode))
1522 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1523 else if (EQ (prop, Qencode))
1524 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1532 return Qnil; /* not reached */
1536 /************************************************************************/
1537 /* Coding category functions */
1538 /************************************************************************/
/* Translate the coding-category SYMBOL into its position in
   coding_category_symbol[].  SYMBOL must be a symbol (CHECK_SYMBOL); one
   that matches no table entry signals "Unrecognized coding category".
   Callers use the result to index arrays of CODING_CATEGORY_LAST
   elements.  */
1541 decode_coding_category (Lisp_Object symbol)
1545 CHECK_SYMBOL (symbol);
1546 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1547 if (EQ (coding_category_symbol[i], symbol))
1550 signal_simple_error ("Unrecognized coding category", symbol);
1551 return 0; /* not reached */
/* Lisp primitive `coding-category-list'.
   Conses the entries of coding_category_symbol[] onto LIST from the last
   index down to 0, so the resulting list is in ascending index order.  */
1554 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1555 Return a list of all recognized coding categories.
1560 Lisp_Object list = Qnil;
1562 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1563 list = Fcons (coding_category_symbol[i], list);
/* Lisp primitive `set-coding-priority-list'.
   category_to_priority[] maps a category index to its new priority and
   starts out as -1 (unassigned) everywhere.  The categories named in LIST
   receive consecutive priorities in list order; a category that appears
   twice signals "Duplicate coding category in list".  The categories not
   mentioned are then appended in the order in which they currently occur
   in fcd->coding_category_by_priority.  Finally
   fcd->coding_category_by_priority is rebuilt as the inverse of that
   mapping (priority to category).  */
1567 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1568 Change the priority order of the coding categories.
1569 LIST should be list of coding categories, in descending order of
1570 priority. Unspecified coding categories will be lower in priority
1571 than all specified ones, in the same relative order they were in
1576 int category_to_priority[CODING_CATEGORY_LAST];
1580 /* First generate a list that maps coding categories to priorities. */
1582 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1583 category_to_priority[i] = -1;
1585 /* Highest priority comes from the specified list. */
1587 EXTERNAL_LIST_LOOP (rest, list)
1589 int cat = decode_coding_category (XCAR (rest));
1591 if (category_to_priority[cat] >= 0)
1592 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1593 category_to_priority[cat] = i++;
1596 /* Now go through the existing categories by priority to retrieve
1597 the categories not yet specified and preserve their priority
1599 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1601 int cat = fcd->coding_category_by_priority[j];
1602 if (category_to_priority[cat] < 0)
1603 category_to_priority[cat] = i++;
1606 /* Now we need to construct the inverse of the mapping we just
1609 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1610 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1612 /* Phew! That was confusing. */
/* Lisp primitive `coding-priority-list'.
   Walks fcd->coding_category_by_priority from its last slot to its first,
   consing each category's symbol onto LIST, so the list starts with the
   category held in slot 0.  */
1616 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1617 Return a list of coding categories in descending order of priority.
1622 Lisp_Object list = Qnil;
1624 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1625 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
/* Lisp primitive `set-coding-category-system'.
   Decodes CODING-CATEGORY to its index, resolves CODING-SYSTEM with
   Fget_coding_system and stores the resulting object in
   fcd->coding_category_system[] at that index.  */
1630 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1631 Change the coding system associated with a coding category.
1633 (coding_category, coding_system))
1635 int cat = decode_coding_category (coding_category);
1637 coding_system = Fget_coding_system (coding_system);
1638 fcd->coding_category_system[cat] = coding_system;
/* Lisp primitive `coding-category-system'.
   Decodes CODING-CATEGORY to its index, fetches the coding system object
   stored for it in fcd->coding_category_system[] and reports it by name
   (XCODING_SYSTEM_NAME).  */
1642 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1643 Return the coding system associated with a coding category.
1647 int cat = decode_coding_category (coding_category);
1648 Lisp_Object sys = fcd->coding_category_system[cat];
1651 return XCODING_SYSTEM_NAME (sys);
1656 /************************************************************************/
1657 /* Detecting the encoding of data */
1658 /************************************************************************/
/* Working state of the coding-system and eol detection routines
   (detect_eol_type, detect_coding_type and the per-category detectors
   they call).  A caller keeps one instance and passes it to
   detect_coding_type for each successive chunk of input, so the
   information gathered so far is carried from call to call.  */
1660 struct detection_state
1662 eol_type_t eol_type;
1698 struct iso2022_decoder iso;
1700 int high_byte_count;
1701 unsigned int saw_single_shift:1;
/* Classify control character C for the ASCII scan in detect_coding_type.
   The codes listed in the switch are ones that may reasonably occur in a
   text file; detect_coding_type does not treat them as evidence of
   non-ASCII data.  */
1714 acceptable_control_char_p (int c)
1718 /* Allow and ignore control characters that you might
1719 reasonably see in a text file */
1724 case 8: /* backspace */
1725 case 11: /* vertical tab */
1726 case 12: /* form feed */
1727 case 26: /* MS-DOS C-z junk */
1728 case 31: /* '^_' -- for info */
/* Return nonzero when MASK has zero or one bit set.  MASK & (MASK - 1)
   clears the lowest set bit of MASK, so the result is 0 exactly when no
   further bit remains.  Note that a MASK of 0 also qualifies.  */
1736 mask_has_at_most_one_bit_p (int mask)
1738 /* Perhaps the only thing useful you learn from intensive Microsoft
1739 technical interviews */
1740 return (mask & (mask - 1)) == 0;
/* Inspect the bytes at SRC for end-of-line evidence.  Progress is kept
   in st->eol (the just_saw_cr and seen_anything flags), i.e. in the
   caller's detection state rather than in locals, so it is still there on
   the next call.  EOL_AUTODETECT is returned when the end of the function
   is reached without a decision.  */
1744 detect_eol_type (struct detection_state *st, const Extbyte *src,
1749 unsigned char c = *(unsigned char *)src++;
1752 if (st->eol.just_saw_cr)
1754 else if (st->eol.seen_anything)
1757 else if (st->eol.just_saw_cr)
1760 st->eol.just_saw_cr = 1;
1762 st->eol.just_saw_cr = 0;
1763 st->eol.seen_anything = 1;
1766 return EOL_AUTODETECT;
1769 /* Attempt to determine the encoding and EOL type of the given text.
1770 Before calling this function for the first time, you must initialize
1771 st->eol_type as appropriate and initialize st->mask to ~0.
1773 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1776 st->mask holds the determined coding category mask, or ~0 if only
1777 ASCII has been seen so far.
1781 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1782 is present in st->mask
1783 1 == definitive answers are here for both st->eol_type and st->mask
/* Feed N bytes at SRC to the detectors, updating ST.
   - While st->eol_type is still EOL_AUTODETECT, detect_eol_type is run on
     the data.  The early return that follows reports only whether the eol
     type is known; JUST_DO_EOL presumably selects it.
   - Until something non-ASCII has been seen, the data is scanned for the
     first byte that is >= 0x80 or is a control character below 0x20 that
     acceptable_control_char_p rejects.  Finding one sets
     st->seen_non_ascii and resets per-category masks to ~0.
   - Each per-category detector is then run, the per-category masks are
     OR-ed together, and CODING_CATEGORY_NO_CONVERSION_MASK is added to
     st->mask after the "at most one bit" test has been taken.
   The final return value is nonzero only when that test succeeded and the
   eol type is known as well.  */
1787 detect_coding_type (struct detection_state *st, const Extbyte *src,
1788 size_t n, int just_do_eol)
1790 if (st->eol_type == EOL_AUTODETECT)
1791 st->eol_type = detect_eol_type (st, src, n);
1794 return st->eol_type != EOL_AUTODETECT;
1796 if (!st->seen_non_ascii)
1798 for (; n; n--, src++)
1800 unsigned char c = *(unsigned char *) src;
1801 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1803 st->seen_non_ascii = 1;
1805 st->shift_jis.mask = ~0;
1809 st->iso2022.mask = ~0;
/* Only re-run a detector while its mask still has more than one bit
   set.  */
1819 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1820 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1821 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1822 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1823 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1824 st->big5.mask = detect_coding_big5 (st, src, n);
1825 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1826 st->utf8.mask = detect_coding_utf8 (st, src, n);
1827 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1828 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1831 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1832 | st->utf8.mask | st->ucs4.mask;
1835 int retval = mask_has_at_most_one_bit_p (st->mask);
1836 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1837 return retval && st->eol_type != EOL_AUTODETECT;
/* Choose a coding system for the detection result MASK.
   In the ASCII case the default value of `buffer-file-coding-system'
   (taken from Vbuffer_defaults) is looked up with Ffind_coding_system.
   An invalid value produces a warning and is reset to nil, and `raw-text'
   serves as the fallback.  Otherwise MASK is passed through
   postprocess_iso2022_mask and the categories are tried in priority
   order: the first one whose bit is set in MASK and which has a non-nil
   coding system assigned supplies the result.  If none qualifies,
   `raw-text' is returned.  */
1842 coding_system_from_mask (int mask)
1846 /* If the file was entirely or basically ASCII, use the
1847 default value of `buffer-file-coding-system'. */
1848 Lisp_Object retval =
1849 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1852 retval = Ffind_coding_system (retval);
1856 (Qbad_variable, Qwarning,
1857 "Invalid `default-buffer-file-coding-system', set to nil");
1858 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1862 retval = Fget_coding_system (Qraw_text);
1870 mask = postprocess_iso2022_mask (mask);
1872 /* Look through the coding categories by priority and find
1873 the first one that is allowed. */
1874 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1876 cat = fcd->coding_category_by_priority[i];
1877 if ((mask & (1 << cat)) &&
1878 !NILP (fcd->coding_category_system[cat]))
1882 return fcd->coding_category_system[cat];
1884 return Fget_coding_system (Qraw_text);
1888 /* Given a seekable read stream and potential coding system and EOL type
1889 as specified, do any autodetection that is called for. If the
1890 coding system and/or EOL type are not `autodetect', they will be left
1891 alone; but this function will never return an autodetect coding system
1894 This function does not automatically fetch subsidiary coding systems;
1895 that should be unnecessary with the explicit eol-type argument. */
/* Number of characters in the string literal STRING_CONSTANT, not
   counting the terminating NUL.  Only meaningful for an actual array or
   literal, since it relies on sizeof.  */
1897 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
/* Resolve autodetection for STREAM; results are written back through
   CODESYS_IN_OUT and EOL_TYPE_IN_OUT.
   - An EOL_AUTODETECT *EOL_TYPE_IN_OUT is first replaced by the eol type
     of *CODESYS_IN_OUT.
   - If the coding system type or the eol type still asks for
     autodetection, the head of the stream is read into a local buffer and
     searched for a "-*- ... coding: NAME ... -*-" cookie.  NAME is
     looked up with Ffind_coding_system.
   - Without a cookie coding system, detect_coding_type is run on the
     data, reading further chunks with Lstream_read as needed.  With one
     that itself has eol type autodetect, only eol detection is run (the
     last argument is 1).
   - The detected eol type is stored, and an autodetect coding system is
     replaced either by the cookie's coding system or by
     coding_system_from_mask.
   - An eol type that is still undetermined becomes EOL_LF, and the
     stream is rewound.
   NOTE(review): the scan limits are computed as buf + nread - LENGTH(...);
   confirm that short or failed reads (small or negative nread) are
   handled before these pointers are used.  */
1900 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1901 eol_type_t *eol_type_in_out)
1903 struct detection_state decst;
1905 if (*eol_type_in_out == EOL_AUTODETECT)
1906 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1909 decst.eol_type = *eol_type_in_out;
1912 /* If autodetection is called for, do it now. */
1913 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1914 || *eol_type_in_out == EOL_AUTODETECT)
1917 Lisp_Object coding_system = Qnil;
1919 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1922 /* Look for initial "-*-"; mode line prefix */
1924 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1929 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1931 Extbyte *local_vars_beg = p + 3;
1932 /* Look for final "-*-"; mode line suffix */
1933 for (p = local_vars_beg,
1934 scan_end = buf + nread - LENGTH ("-*-");
1939 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1941 Extbyte *suffix = p;
1942 /* Look for "coding:" */
1943 for (p = local_vars_beg,
1944 scan_end = suffix - LENGTH ("coding:?");
1947 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1948 && (p == local_vars_beg
1949 || (*(p-1) == ' ' ||
1955 p += LENGTH ("coding:");
1956 while (*p == ' ' || *p == '\t') p++;
1958 /* Get coding system name */
/* The byte at SUFFIX is overwritten with NUL (the old value is kept in
   `save'), presumably so that strspn stops at the end of the cookie.  */
1959 save = *suffix; *suffix = '\0';
1960 /* Characters valid in a MIME charset name (rfc 1521),
1961 and in a Lisp symbol name. */
1962 n = strspn ( (char *) p,
1963 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1964 "abcdefghijklmnopqrstuvwxyz"
1970 save = p[n]; p[n] = '\0';
1972 Ffind_coding_system (intern ((char *) p));
1982 if (NILP (coding_system))
1985 if (detect_coding_type (&decst, buf, nread,
1986 XCODING_SYSTEM_TYPE (*codesys_in_out)
1987 != CODESYS_AUTODETECT))
1989 nread = Lstream_read (stream, buf, sizeof (buf));
1995 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1996 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1999 if (detect_coding_type (&decst, buf, nread, 1))
2001 nread = Lstream_read (stream, buf, sizeof (buf));
2007 *eol_type_in_out = decst.eol_type;
2008 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2010 if (NILP (coding_system))
2011 *codesys_in_out = coding_system_from_mask (decst.mask);
2013 *codesys_in_out = coding_system;
2017 /* If we absolutely can't determine the EOL type, just assume LF. */
2018 if (*eol_type_in_out == EOL_AUTODETECT)
2019 *eol_type_in_out = EOL_LF;
2021 Lstream_rewind (stream);
/* Lisp primitive `detect-coding-region'.
   The START..END range of BUFFER is wrapped in a lisp-buffer input stream
   and, on top of that, an encoding stream for the `binary' coding system.
   Both stream objects are GC-protected.  Data is read through the
   encoding stream in chunks of at most sizeof (random_buffer) bytes and
   handed to detect_coding_type with eol detection starting from
   EOL_AUTODETECT.
   A mask that is still ~0 afterwards yields a subsidiary of `undecided'.
   Otherwise the mask goes through postprocess_iso2022_mask and the
   categories are walked from the last priority slot to the first.  Every
   category present in the mask conses its coding system (mapped through
   subsidiary_coding_system when non-nil) onto the result, which therefore
   ends up headed by the highest-priority candidate.
   ISTR is closed and both streams are deleted before returning.  */
2024 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2025 Detect coding system of the text in the region between START and END.
2026 Return a list of possible coding systems ordered by priority.
2027 If only ASCII characters are found, return 'undecided or one of
2028 its subsidiary coding systems according to a detected end-of-line
2029 type. Optional arg BUFFER defaults to the current buffer.
2031 (start, end, buffer))
2033 Lisp_Object val = Qnil;
2034 struct buffer *buf = decode_buffer (buffer, 0);
2036 Lisp_Object instream, lb_instream;
2037 Lstream *istr, *lb_istr;
2038 struct detection_state decst;
2039 struct gcpro gcpro1, gcpro2;
2041 get_buffer_range_char (buf, start, end, &b, &e, 0);
2042 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2043 lb_istr = XLSTREAM (lb_instream);
2044 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2045 istr = XLSTREAM (instream);
2046 GCPRO2 (instream, lb_instream);
2048 decst.eol_type = EOL_AUTODETECT;
2052 Extbyte random_buffer[4096];
2053 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2057 if (detect_coding_type (&decst, random_buffer, nread, 0))
2061 if (decst.mask == ~0)
2062 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2070 decst.mask = postprocess_iso2022_mask (decst.mask);
2072 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2074 int sys = fcd->coding_category_by_priority[i];
2075 if (decst.mask & (1 << sys))
2077 Lisp_Object codesys = fcd->coding_category_system[sys];
2078 if (!NILP (codesys))
2079 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2080 val = Fcons (codesys, val);
2084 Lstream_close (istr);
2086 Lstream_delete (istr);
2087 Lstream_delete (lb_istr);
2092 /************************************************************************/
2093 /* Converting to internal Mule format ("decoding") */
2094 /************************************************************************/
2096 /* A decoding stream is a stream used for decoding text (i.e.
2097 converting from some external format to internal format).
2098 The decoding-stream object keeps track of the actual coding
2099 stream, the stream that is at the other end, and data that
2100 needs to be persistent across the lifetime of the stream. */
2102 /* Handle the EOL stuff related to just-read-in character C.
2103 EOL_TYPE is the EOL type of the coding stream.
2104 FLAGS is the current value of FLAGS in the coding stream, and may
2105 be modified by this macro. (The macro only looks at the
2106 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2107 bytes are to be written. You need to also define a local goto
2108 label "label_continue_loop" that is at the end of the main
2109 character-reading loop.
2111 If C is a CR character, then this macro handles it entirely and
2112 jumps to label_continue_loop. Otherwise, this macro does not add
2113 anything to DST, and continues normally. You should continue
2114 processing C normally after this macro. */
/* NOTE(review): the arguments are expanded without parentheses (for
   example `flags & CODING_STATE_CR' and `eol_type == EOL_CR'), FLAGS is
   assigned to, and the expansion jumps to label_continue_loop.  Pass
   plain lvalues / simple expressions only.  */
2116 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2120 if (eol_type == EOL_CR) \
2121 Dynarr_add (dst, '\n'); \
2122 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2123 Dynarr_add (dst, c); \
2125 flags |= CODING_STATE_CR; \
2126 goto label_continue_loop; \
2128 else if (flags & CODING_STATE_CR) \
2129 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2131 Dynarr_add (dst, '\r'); \
2132 flags &= ~CODING_STATE_CR; \
2136 /* C should be a binary character in the range 0 - 255; convert
2137 to internal format and add to Dynarr DST. */
/* This definition adds an ASCII byte unchanged and writes any other byte
   as the two-byte sequence (c >> 6) | 0xc0, (c & 0x3f) | 0x80, which is
   the UTF-8 form of the code points 0x80 - 0xff.  A second definition of
   the same macro appears further down; presumably a preprocessor
   conditional selects between the two -- confirm.  */
2140 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2142 if (BYTE_ASCII_P (c)) \
2143 Dynarr_add (dst, c); \
2146 Dynarr_add (dst, (c >> 6) | 0xc0); \
2147 Dynarr_add (dst, (c & 0x3f) | 0x80); \
/* Append the character code C to DST as a UTF-8 style multi-byte
   sequence.  The first branch adds C as a single byte; codes up to 0x7ff
   take two bytes (lead 0xc0), up to 0xffff three (lead 0xe0), up to
   0x1fffff four (lead 0xf0), up to 0x3ffffff five (lead 0xf8), and
   anything larger six (lead 0xfc).  Every trailing byte carries six
   payload bits in the form 0x80 | bits.  */
2151 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2153 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2157 Dynarr_add (dst, c);
2159 else if ( c <= 0x7ff )
2161 Dynarr_add (dst, (c >> 6) | 0xc0);
2162 Dynarr_add (dst, (c & 0x3f) | 0x80);
2164 else if ( c <= 0xffff )
2166 Dynarr_add (dst, (c >> 12) | 0xe0);
2167 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2168 Dynarr_add (dst, (c & 0x3f) | 0x80);
2170 else if ( c <= 0x1fffff )
2172 Dynarr_add (dst, (c >> 18) | 0xf0);
2173 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2174 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2175 Dynarr_add (dst, (c & 0x3f) | 0x80);
2177 else if ( c <= 0x3ffffff )
2179 Dynarr_add (dst, (c >> 24) | 0xf8);
2180 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2181 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2182 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2183 Dynarr_add (dst, (c & 0x3f) | 0x80);
2187 Dynarr_add (dst, (c >> 30) | 0xfc);
2188 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2189 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2190 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2191 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2192 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Second definition of DECODE_ADD_BINARY_CHAR (presumably the alternative
   branch of a preprocessor conditional -- confirm).  ASCII bytes are
   added unchanged, bytes satisfying BYTE_C1_P are written as
   LEADING_BYTE_CONTROL_1 followed by c + 0x20, and all remaining bytes as
   LEADING_BYTE_LATIN_ISO8859_1 followed by c.  */
2196 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2198 if (BYTE_ASCII_P (c)) \
2199 Dynarr_add (dst, c); \
2200 else if (BYTE_C1_P (c)) \
2202 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2203 Dynarr_add (dst, c + 0x20); \
2207 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2208 Dynarr_add (dst, c); \
/* Emit the pending partial character CH through DECODE_ADD_BINARY_CHAR.
   DST is not a parameter of this macro: the expansion refers to whatever
   `dst' is in scope at the point of use.  */
2213 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2217 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* End-of-input handling.  When FLAGS has CODING_STATE_END set, the
   pending partial character CH is flushed with
   DECODE_OUTPUT_PARTIAL_CHAR, and a carriage return that is still pending
   (CODING_STATE_CR) is added to DST as '\r'.  */
2222 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2224 if (flags & CODING_STATE_END) \
2226 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2227 if (flags & CODING_STATE_CR) \
2228 Dynarr_add (dst, '\r'); \
/* Access the `decoding'-type data of lstream STREAM; the decoding lstream
   methods use the result as a struct decoding_stream pointer.  */
2232 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Per-stream state of a `decoding' lstream; obtained from an Lstream
   with DECODING_STREAM_DATA.  */
2234 struct decoding_stream
2236 /* Coding system that governs the conversion. */
2237 Lisp_Coding_System *codesys;
2239 /* Stream that we read the encoded data from or
2240 write the decoded data to. */
2243 /* If we are reading, then we can return only a fixed amount of
2244 data, so if the conversion resulted in too much data, we store it
2245 here for retrieval the next time around. */
2246 unsigned_char_dynarr *runoff;
2248 /* FLAGS holds flags indicating the current state of the decoding.
2249 Some of these flags are dependent on the coding system. */
2252 /* CPOS holds a partially built-up code-point of character. */
2255 /* EOL_TYPE specifies the type of end-of-line conversion that
2256 currently applies. We need to keep this separate from the
2257 EOL type stored in CODESYS because the latter might indicate
2258 automatic EOL-type detection while the former will always
2259 indicate a particular EOL type. */
2260 eol_type_t eol_type;
2262 /* Additional ISO2022 information. We define the structure above
2263 because it's also needed by the detection routines. */
2264 struct iso2022_decoder iso2022;
2266 /* Additional information (the state of the running CCL program)
2267 used by the CCL decoder. */
2268 struct ccl_program ccl;
2270 /* counter for UTF-8 or UCS-4 */
2271 unsigned char counter;
/* NOTE(review): er_counter / er_buf presumably buffer a partially read
   entity reference (cf. the `use-entity-reference' coding system
   property) -- confirm.  */
2274 unsigned char er_counter;
2275 unsigned char er_buf[16];
/* NOTE(review): combined_char_count / combined_chars / combining_table
   presumably hold characters collected for composition -- confirm.  */
2277 unsigned combined_char_count;
2278 Emchar combined_chars[16];
2279 Lisp_Object combining_table;
/* Detection state; presumably used while the coding system or eol type
   of the stream is still being autodetected -- confirm.  */
2281 struct detection_state decst;
/* Methods of the "decoding" lstream type, defined further down, and
   the definition of the type itself.  Its per-stream data is a
   `struct decoding_stream'. */
2284 static ssize_t decoding_reader (Lstream *stream,
2285 unsigned char *data, size_t size);
2286 static ssize_t decoding_writer (Lstream *stream,
2287 const unsigned char *data, size_t size);
2288 static int decoding_rewinder (Lstream *stream);
2289 static int decoding_seekable_p (Lstream *stream);
2290 static int decoding_flusher (Lstream *stream);
2291 static int decoding_closer (Lstream *stream);
2293 static Lisp_Object decoding_marker (Lisp_Object stream);
2295 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2296 sizeof (struct decoding_stream));
/* Mark method of the decoding stream.  Marks the lstream at the other
   end and, when that stream's implementation provides a marker of its
   own, returns the result of calling it on the other end. */
2299 decoding_marker (Lisp_Object stream)
2301 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2302 Lisp_Object str_obj;
2304 /* We do not need to mark the coding systems or charsets stored
2305 within the stream because they are stored in a global list
2306 and automatically marked. */
2308 XSETLSTREAM (str_obj, str);
2309 mark_object (str_obj);
2310 if (str->imp->marker)
2311 return (str->imp->marker) (str_obj);
2316 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2317 so we read data from the other end, decode it, and store it into DATA. */
/* Returns the number of bytes stored into DATA.  When nothing was
   stored the result is -1 if error_occurred is set and 0 otherwise.
   Decoded output that does not fit into DATA stays in str->runoff and
   is handed out first on the next call. */
2320 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2322 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2323 unsigned char *orig_data = data;
2325 int error_occurred = 0;
2327 /* We need to interface to mule_decode(), which expects to take some
2328 amount of data and store the result into a Dynarr. We have
2329 mule_decode() store into str->runoff, and take data from there
2332 /* We loop until we have enough data, reading chunks from the other
2333 end and decoding it. */
2336 /* Take data from the runoff if we can. Make sure to take at
2337 most SIZE bytes, and delete the data from the runoff. */
2338 if (Dynarr_length (str->runoff) > 0)
2340 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2341 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2342 Dynarr_delete_many (str->runoff, 0, chunk);
2348 break; /* No more room for data */
2350 if (str->flags & CODING_STATE_END)
2351 /* This means that on the previous iteration, we hit the EOF on
2352 the other end. We loop once more so that mule_decode() can
2353 output any final stuff it may be holding, or any "go back
2354 to a sane state" escape sequences. (This latter makes sense
2355 during encoding.) */
2358 /* Exhausted the runoff, so get some more. DATA has at least
2359 SIZE bytes left of storage in it, so it's OK to read directly
2360 into it. (We'll be overwriting above, after we've decoded it
2361 into the runoff.) */
2362 read_size = Lstream_read (str->other_end, data, size);
2369 /* There might be some more end data produced in the translation.
2370 See the comment above. */
2371 str->flags |= CODING_STATE_END;
2372 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2375 if (data - orig_data == 0)
2376 return error_occurred ? -1 : 0;
2378 return data - orig_data;
/* Write method of the decoding stream.  Decodes the SIZE bytes at DATA
   into str->runoff with mule_decode(), offers the whole runoff to the
   other end with Lstream_write(), and removes from the runoff the
   RETVAL bytes that call reports as written.  As the closing comment
   says, the function's result describes how much of the incoming DATA
   was consumed; it is not RETVAL. */
2382 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2384 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2387 /* Decode all our data into the runoff, and then attempt to write
2388 it all out to the other end. Remove whatever chunk we succeeded
2390 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2391 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2392 Dynarr_length (str->runoff));
2394 Dynarr_delete_many (str->runoff, 0, retval);
2395 /* Do NOT return retval. The return value indicates how much
2396 of the incoming data was written, not how many bytes were
// Reset the decoder state kept in STR.  For an ISO2022 coding system the
// ISO2022 decoder is reset with reset_iso2022(); for a CCL coding system
// the CCL decode program is set up again.  The entity-reference counter,
// the composition state, the flags and the partial code point (cpos) are
// cleared as well.
2402 reset_decoding_stream (struct decoding_stream *str)
2405 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2407 Lisp_Object coding_system;
2408 XSETCODING_SYSTEM (coding_system, str->codesys);
2409 reset_iso2022 (coding_system, &str->iso2022);
2411 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2413 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2418 str->er_counter = 0;
2419 str->combined_char_count = 0;
2420 str->combining_table = Qnil;
2422 str->flags = str->cpos = 0;
// Rewind method: resets the decoder state, empties the runoff buffer and
// rewinds the other end, returning the result of Lstream_rewind().
2426 decoding_rewinder (Lstream *stream)
2428 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2429 reset_decoding_stream (str);
2430 Dynarr_reset (str->runoff);
2431 return Lstream_rewind (str->other_end);
// A decoding stream is seekable exactly when the stream at its other end
// reports itself seekable.
2435 decoding_seekable_p (Lstream *stream)
2437 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2438 return Lstream_seekable_p (str->other_end);
// Flush method: forwards the flush to the other end and returns its result.
2442 decoding_flusher (Lstream *stream)
2444 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2445 return Lstream_flush (str->other_end);
// Close method.  For a stream opened for writing, CODING_STATE_END is set
// and decoding_writer() is called with no data so the decoder can emit
// whatever it is still holding.  The runoff (and, with
// ENABLE_COMPOSITE_CHARS, the ISO2022 composite-char dynarr) is freed, and
// the other end is closed; its close result is returned.
2449 decoding_closer (Lstream *stream)
2451 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2452 if (stream->flags & LSTREAM_FL_WRITE)
2454 str->flags |= CODING_STATE_END;
2455 decoding_writer (stream, 0, 0);
2457 Dynarr_free (str->runoff);
2459 #ifdef ENABLE_COMPOSITE_CHARS
2460 if (str->iso2022.composite_chars)
2461 Dynarr_free (str->iso2022.composite_chars);
2464 return Lstream_close (str->other_end);
// Return the coding system of the decoding stream STREAM as a Lisp object:
// the subsidiary of the stream's coding system selected by the stream's
// current EOL type.
2468 decoding_stream_coding_system (Lstream *stream)
2470 Lisp_Object coding_system;
2471 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2473 XSETCODING_SYSTEM (coding_system, str->codesys);
2474 return subsidiary_coding_system (coding_system, str->eol_type);
// Apply coding system CODESYS to the decoding stream LSTR.  The stream's
// EOL type is taken from CODESYS unless CODESYS asks for EOL
// auto-detection, in which case the current EOL type is kept.  The decoder
// state is then reset with reset_decoding_stream().
2478 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2480 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2481 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2483 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2484 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2485 reset_decoding_stream (str);
2488 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2489 stream for writing, no automatic code detection will be performed.
2490 The reason for this is that automatic code detection requires a
2491 seekable input. Things will also fail if you open a decoding
2492 stream for reading using a non-fully-specified coding system and
2493 a non-seekable input stream. */
/* Create a decoding lstream in MODE on top of STREAM.  The new stream
   gets STREAM as its other end, an empty runoff and an EOL type of
   EOL_AUTODETECT.  When MODE is "r" and STREAM is seekable, the real
   coding system and EOL type are determined up front with
   determine_real_coding_system().  CODESYS is then installed, and the
   detection state is seeded with the resulting EOL type and a mask of
   ~0. */
2496 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2499 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2500 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2504 str->other_end = stream;
2505 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2506 str->eol_type = EOL_AUTODETECT;
2507 if (!strcmp (mode, "r")
2508 && Lstream_seekable_p (stream))
2509 /* We can determine the coding system now. */
2510 determine_real_coding_system (stream, &codesys, &str->eol_type);
2511 set_decoding_stream_coding_system (lstr, codesys);
2512 str->decst.eol_type = str->eol_type;
2513 str->decst.mask = ~0;
2514 XSETLSTREAM (obj, lstr);
/* Create a decoding stream that is read from: data read from STREAM is
   decoded according to CODESYS (mode "r"). */
2519 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2521 return make_decoding_stream_1 (stream, codesys, "r");
/* Create a decoding stream that is written to: data written to it is
   decoded according to CODESYS and passed on to STREAM (mode "w"). */
2525 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2527 return make_decoding_stream_1 (stream, codesys, "w");
2530 /* Note: the decode_coding_* functions all take the same
2531 arguments as mule_decode(), which is to say some SRC data of
2532 size N, which is to be stored into dynamic array DST.
2533 DECODING is the stream within which the decoding is
2534 taking place, but no data is actually read from or
2535 written to that stream; that is handled in decoding_reader()
2536 or decoding_writer(). This allows the same functions to
2537 be used for both reading and writing. */
/* Decode the N bytes at SRC into DST.  While the coding system or the
   EOL type is still to be auto-detected, this chunk is first run
   through detect_coding_type(); if that settles on a different coding
   system, the stream is switched to it with CODING_STATE_END carried
   over.  The data is then handed to the decoder matching the coding
   system type (ccl_driver() for CCL). */
2540 mule_decode (Lstream *decoding, const Extbyte *src,
2541 unsigned_char_dynarr *dst, size_t n)
2543 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2545 /* If necessary, do encoding-detection now. We do this when
2546 we're a writing stream or a non-seekable reading stream,
2547 meaning that we can't just process the whole input,
2548 rewind, and start over. */
2550 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2551 str->eol_type == EOL_AUTODETECT)
2553 Lisp_Object codesys;
2555 XSETCODING_SYSTEM (codesys, str->codesys);
2556 detect_coding_type (&str->decst, src, n,
2557 CODING_SYSTEM_TYPE (str->codesys) !=
2558 CODESYS_AUTODETECT);
2559 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2560 str->decst.mask != ~0)
2561 /* #### This is cheesy. What we really ought to do is
2562 buffer up a certain amount of data so as to get a
2563 less random result. */
2564 codesys = coding_system_from_mask (str->decst.mask);
2565 str->eol_type = str->decst.eol_type;
2566 if (XCODING_SYSTEM (codesys) != str->codesys)
2568 /* Preserve the CODING_STATE_END flag in case it was set.
2569 If we erase it, bad things might happen. */
2570 int was_end = str->flags & CODING_STATE_END;
2571 set_decoding_stream_coding_system (decoding, codesys);
2573 str->flags |= CODING_STATE_END;
2577 switch (CODING_SYSTEM_TYPE (str->codesys))
2580 case CODESYS_INTERNAL:
2581 Dynarr_add_many (dst, src, n);
2584 case CODESYS_AUTODETECT:
2585 /* If we got this far and still haven't decided on the coding
2586 system, then do no conversion. */
2587 case CODESYS_NO_CONVERSION:
2588 decode_coding_no_conversion (decoding, src, dst, n);
2591 case CODESYS_SHIFT_JIS:
2592 decode_coding_sjis (decoding, src, dst, n);
2595 decode_coding_big5 (decoding, src, dst, n);
2598 decode_coding_ucs4 (decoding, src, dst, n);
2601 decode_coding_utf8 (decoding, src, dst, n);
2604 str->ccl.last_block = str->flags & CODING_STATE_END;
2605 /* When applying ccl program to stream, MUST NOT set NULL
2607 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2608 dst, n, 0, CCL_MODE_DECODING);
2610 case CODESYS_ISO2022:
2611 decode_coding_iso2022 (decoding, src, dst, n);
// Lisp primitive `decode-coding-region'.  The region is read through a
// buffer input stream in chunks of up to 1024 bytes.  Each chunk is written
// into a chain of output streams (encode as binary, then decode as
// CODING-SYSTEM, then insert into the buffer), and the text that was read
// is deleted from the buffer as the loop advances.  All four streams are
// GC-protected while in use, and are closed and deleted afterwards.
2619 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2620 Decode the text between START and END which is encoded in CODING-SYSTEM.
2621 This is useful if you've read in encoded text from a file without decoding
2622 it (e.g. you read in a JIS-formatted file but used the `binary' or
2623 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2624 Return length of decoded text.
2625 BUFFER defaults to the current buffer if unspecified.
2627 (start, end, coding_system, buffer))
2630 struct buffer *buf = decode_buffer (buffer, 0);
2631 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2632 Lstream *istr, *ostr;
2633 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2635 get_buffer_range_char (buf, start, end, &b, &e, 0);
2637 barf_if_buffer_read_only (buf, b, e);
2639 coding_system = Fget_coding_system (coding_system);
2640 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2641 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2642 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2644 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2645 Fget_coding_system (Qbinary));
2646 istr = XLSTREAM (instream);
2647 ostr = XLSTREAM (outstream);
2648 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2650 /* The chain of streams looks like this:
2652 [BUFFER] <----- send through
2653 ------> [ENCODE AS BINARY]
2654 ------> [DECODE AS SPECIFIED]
2660 char tempbuf[1024]; /* some random amount */
2661 Bufpos newpos, even_newer_pos;
2662 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2663 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2667 newpos = lisp_buffer_stream_startpos (istr);
2668 Lstream_write (ostr, tempbuf, size_in_bytes);
2669 even_newer_pos = lisp_buffer_stream_startpos (istr);
2670 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2673 Lstream_close (istr);
2674 Lstream_close (ostr);
2676 Lstream_delete (istr);
2677 Lstream_delete (ostr);
2678 Lstream_delete (XLSTREAM (de_outstream));
2679 Lstream_delete (XLSTREAM (lb_outstream));
2684 /************************************************************************/
2685 /* Converting to an external encoding ("encoding") */
2686 /************************************************************************/
2688 /* An encoding stream is an output stream. When you create the
2689 stream, you specify the coding system that governs the encoding
2690 and another stream that the resulting encoded data is to be
2691 sent to, and then start sending data to it. */
/* Get the `struct encoding_stream' data attached to the lstream STREAM
   (the "encoding" lstream type defined below). */
2693 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
/* Per-stream state of an "encoding" lstream: the coding system in use,
   the buffered output, and the state the encoders carry from one
   chunk of input to the next. */
2695 struct encoding_stream
2697 /* Coding system that governs the conversion. */
2698 Lisp_Coding_System *codesys;
2700 /* Stream that we read the data to be encoded from or
2701 write the encoded data to. */
2704 /* If we are reading, then we can return only a fixed amount of
2705 data, so if the conversion resulted in too much data, we store it
2706 here for retrieval the next time around. */
2707 unsigned_char_dynarr *runoff;
2709 /* FLAGS holds flags indicating the current state of the encoding.
2710 Some of these flags are dependent on the coding system. */
2713 /* CH holds a partially built-up character. Since we only deal
2714 with one- and two-byte characters at the moment, we only use
2715 this to store the first byte of a two-byte character. */
2718 /* Additional information used by the ISO2022 encoder. */
2721 /* CHARSET holds the character sets currently assigned to the G0
2722 through G3 registers. It is initialized from the array
2723 INITIAL_CHARSET in CODESYS. */
2724 Lisp_Object charset[4];
2726 /* Which registers are currently invoked into the left (GL) and
2727 right (GR) halves of the 8-bit encoding space? */
2728 int register_left, register_right;
2730 /* Whether we need to explicitly designate the charset in the
2731 G? register before using it. It is initialized from the
2732 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2733 unsigned char force_charset_on_output[4];
2735 /* Other state variables that need to be preserved across
2737 Lisp_Object current_charset;
2739 int current_char_boundary;
2742 void (*encode_char) (struct encoding_stream *str, Emchar c,
2743 unsigned_char_dynarr *dst, unsigned int *flags);
2744 void (*finish) (struct encoding_stream *str,
2745 unsigned_char_dynarr *dst, unsigned int *flags);
2747 /* Additional information (the state of the running CCL program)
2748 used by the CCL encoder. */
2749 struct ccl_program ccl;
/* Methods of the "encoding" lstream type, defined further down, and
   the definition of the type itself.  Its per-stream data is a
   `struct encoding_stream'. */
2753 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2754 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2756 static int encoding_rewinder (Lstream *stream);
2757 static int encoding_seekable_p (Lstream *stream);
2758 static int encoding_flusher (Lstream *stream);
2759 static int encoding_closer (Lstream *stream);
2761 static Lisp_Object encoding_marker (Lisp_Object stream);
2763 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2764 sizeof (struct encoding_stream));
/* Mark method of the encoding stream.  Marks the lstream at the other
   end and, when that stream's implementation provides a marker of its
   own, returns the result of calling it on the other end. */
2767 encoding_marker (Lisp_Object stream)
2769 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2770 Lisp_Object str_obj;
2772 /* We do not need to mark the coding systems or charsets stored
2773 within the stream because they are stored in a global list
2774 and automatically marked. */
2776 XSETLSTREAM (str_obj, str);
2777 mark_object (str_obj);
2778 if (str->imp->marker)
2779 return (str->imp->marker) (str_obj);
2784 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2785 so we read data from the other end, encode it, and store it into DATA. */
/* Returns the number of bytes stored into DATA.  When nothing was
   stored the result is -1 if error_occurred is set and 0 otherwise.
   Encoded output that does not fit into DATA stays in str->runoff and
   is handed out first on the next call. */
2788 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2790 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2791 unsigned char *orig_data = data;
2793 int error_occurred = 0;
2795 /* We need to interface to mule_encode(), which expects to take some
2796 amount of data and store the result into a Dynarr. We have
2797 mule_encode() store into str->runoff, and take data from there
2800 /* We loop until we have enough data, reading chunks from the other
2801 end and encoding it. */
2804 /* Take data from the runoff if we can. Make sure to take at
2805 most SIZE bytes, and delete the data from the runoff. */
2806 if (Dynarr_length (str->runoff) > 0)
2808 int chunk = min ((int) size, Dynarr_length (str->runoff));
2809 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2810 Dynarr_delete_many (str->runoff, 0, chunk);
2816 break; /* No more room for data */
2818 if (str->flags & CODING_STATE_END)
2819 /* This means that on the previous iteration, we hit the EOF on
2820 the other end. We loop once more so that mule_encode() can
2821 output any final stuff it may be holding, or any "go back
2822 to a sane state" escape sequences. (This latter makes sense
2823 during encoding.) */
2826 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2827 left of storage in it, so it's OK to read directly into it.
2828 (We'll be overwriting above, after we've encoded it into the
2830 read_size = Lstream_read (str->other_end, data, size);
2837 /* There might be some more end data produced in the translation.
2838 See the comment above. */
2839 str->flags |= CODING_STATE_END;
2840 mule_encode (stream, data, str->runoff, read_size);
2843 if (data == orig_data)
2844 return error_occurred ? -1 : 0;
2846 return data - orig_data;
/* Write method of the encoding stream.  Encodes the SIZE bytes at DATA
   into str->runoff with mule_encode(), offers the whole runoff to the
   other end with Lstream_write(), and removes from the runoff the
   RETVAL bytes that call reports as written.  As the closing comment
   says, the function's result describes how much of the incoming DATA
   was consumed; it is not RETVAL. */
2850 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2852 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2855 /* Encode all our data into the runoff, and then attempt to write
2856 it all out to the other end. Remove whatever chunk we succeeded
2858 mule_encode (stream, data, str->runoff, size);
2859 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2860 Dynarr_length (str->runoff));
2862 Dynarr_delete_many (str->runoff, 0, retval);
2863 /* Do NOT return retval. The return value indicates how much
2864 of the incoming data was written, not how many bytes were
// Reset the encoder state kept in STR according to the type of its coding
// system.  The per-character callbacks (encode_char / finish) are installed
// for the ISO2022, UTF-8, UCS-4, Shift-JIS and Big5 encoders.  For ISO2022
// the G0..G3 charsets and the force-on-output flags are also reloaded from
// the coding system, the left and right registers are set to 0 and 1, and
// the current charset and half are cleared.  For CCL the encode program is
// set up again.  Finally the character-boundary state, the flags and the
// partial character are cleared.
2870 reset_encoding_stream (struct encoding_stream *str)
2873 switch (CODING_SYSTEM_TYPE (str->codesys))
2875 case CODESYS_ISO2022:
2879 str->encode_char = &char_encode_iso2022;
2880 str->finish = &char_finish_iso2022;
2881 for (i = 0; i < 4; i++)
2883 str->iso2022.charset[i] =
2884 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2885 str->iso2022.force_charset_on_output[i] =
2886 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2888 str->iso2022.register_left = 0;
2889 str->iso2022.register_right = 1;
2890 str->iso2022.current_charset = Qnil;
2891 str->iso2022.current_half = 0;
2895 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2898 str->encode_char = &char_encode_utf8;
2899 str->finish = &char_finish_utf8;
2902 str->encode_char = &char_encode_ucs4;
2903 str->finish = &char_finish_ucs4;
2905 case CODESYS_SHIFT_JIS:
2906 str->encode_char = &char_encode_shift_jis;
2907 str->finish = &char_finish_shift_jis;
2910 str->encode_char = &char_encode_big5;
2911 str->finish = &char_finish_big5;
2917 str->iso2022.current_char_boundary = 0;
2918 str->flags = str->ch = 0;
// Rewind method: resets the encoder state, empties the runoff buffer and
// rewinds the other end, returning the result of Lstream_rewind().
2922 encoding_rewinder (Lstream *stream)
2924 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2925 reset_encoding_stream (str);
2926 Dynarr_reset (str->runoff);
2927 return Lstream_rewind (str->other_end);
// An encoding stream is seekable exactly when the stream at its other end
// reports itself seekable.
2931 encoding_seekable_p (Lstream *stream)
2933 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2934 return Lstream_seekable_p (str->other_end);
// Flush method: forwards the flush to the other end and returns its result.
2938 encoding_flusher (Lstream *stream)
2940 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2941 return Lstream_flush (str->other_end);
// Close method.  For a stream opened for writing, CODING_STATE_END is set
// and encoding_writer() is called with no data so the encoder can emit its
// final output.  The runoff is then freed and the other end is closed; its
// close result is returned.
2945 encoding_closer (Lstream *stream)
2947 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2948 if (stream->flags & LSTREAM_FL_WRITE)
2950 str->flags |= CODING_STATE_END;
2951 encoding_writer (stream, 0, 0);
2953 Dynarr_free (str->runoff);
2954 return Lstream_close (str->other_end);
// Return the coding system of the encoding stream STREAM as a Lisp object.
2958 encoding_stream_coding_system (Lstream *stream)
2960 Lisp_Object coding_system;
2961 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2963 XSETCODING_SYSTEM (coding_system, str->codesys);
2964 return coding_system;
// Takes the Lisp_Coding_System behind CODESYS for the encoding stream LSTR
// and resets the encoder state with reset_encoding_stream().
2968 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2970 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2971 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2973 reset_encoding_stream (str);
// Create an encoding lstream in MODE on top of STREAM: allocate an empty
// runoff, record STREAM as the other end, install CODESYS, and wrap the new
// lstream in a Lisp object.
2977 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2980 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2981 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2985 str->runoff = Dynarr_new (unsigned_char);
2986 str->other_end = stream;
2987 set_encoding_stream_coding_system (lstr, codesys);
2988 XSETLSTREAM (obj, lstr);
// Create an encoding stream that is read from: data read from STREAM is
// encoded according to CODESYS (mode "r").
2993 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2995 return make_encoding_stream_1 (stream, codesys, "r");
// Create an encoding stream that is written to: data written to it is
// encoded according to CODESYS and passed on to STREAM (mode "w").
2999 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3001 return make_encoding_stream_1 (stream, codesys, "w");
3004 /* Convert N bytes of internally-formatted data stored in SRC to an
3005 external format, according to the encoding stream ENCODING.
3006 Store the encoded data into DST. */
/* Dispatches on the type of the stream's coding system.  INTERNAL
   data is copied through unchanged.  AUTODETECT and NO_CONVERSION go
   to encode_coding_no_conversion().  The CCL path runs ccl_driver() in
   CCL_MODE_ENCODING, passing an empty string in place of a null SRC
   and flagging the last block when CODING_STATE_END is set.  The
   remaining path uses text_encode_generic(). */
3009 mule_encode (Lstream *encoding, const Bufbyte *src,
3010 unsigned_char_dynarr *dst, size_t n)
3012 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3014 switch (CODING_SYSTEM_TYPE (str->codesys))
3017 case CODESYS_INTERNAL:
3018 Dynarr_add_many (dst, src, n);
3021 case CODESYS_AUTODETECT:
3022 /* If we got this far and still haven't decided on the coding
3023 system, then do no conversion. */
3024 case CODESYS_NO_CONVERSION:
3025 encode_coding_no_conversion (encoding, src, dst, n);
3029 str->ccl.last_block = str->flags & CODING_STATE_END;
3030 /* When applying ccl program to stream, MUST NOT set NULL
3032 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3033 dst, n, 0, CCL_MODE_ENCODING);
3037 text_encode_generic (encoding, src, dst, n);
// Lisp primitive `encode-coding-region'.  The region is read through a
// buffer input stream in chunks of up to 1024 bytes.  Each chunk is written
// into a chain of output streams (encode as CODING-SYSTEM, then decode as
// binary, then insert into the buffer), and the text that was read is
// deleted from the buffer as the loop advances.  All four streams are
// GC-protected while in use, and are closed and deleted before the length
// is returned as a Lisp integer.
3041 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3042 Encode the text between START and END using CODING-SYSTEM.
3043 This will, for example, convert Japanese characters into stuff such as
3044 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3045 text. BUFFER defaults to the current buffer if unspecified.
3047 (start, end, coding_system, buffer))
3050 struct buffer *buf = decode_buffer (buffer, 0);
3051 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3052 Lstream *istr, *ostr;
3053 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3055 get_buffer_range_char (buf, start, end, &b, &e, 0);
3057 barf_if_buffer_read_only (buf, b, e);
3059 coding_system = Fget_coding_system (coding_system);
3060 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3061 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3062 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3063 Fget_coding_system (Qbinary));
3064 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3066 istr = XLSTREAM (instream);
3067 ostr = XLSTREAM (outstream);
3068 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3069 /* The chain of streams looks like this:
3071 [BUFFER] <----- send through
3072 ------> [ENCODE AS SPECIFIED]
3073 ------> [DECODE AS BINARY]
3078 char tempbuf[1024]; /* some random amount */
3079 Bufpos newpos, even_newer_pos;
3080 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3081 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3085 newpos = lisp_buffer_stream_startpos (istr);
3086 Lstream_write (ostr, tempbuf, size_in_bytes);
3087 even_newer_pos = lisp_buffer_stream_startpos (istr);
3088 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3094 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3095 Lstream_close (istr);
3096 Lstream_close (ostr);
3098 Lstream_delete (istr);
3099 Lstream_delete (ostr);
3100 Lstream_delete (XLSTREAM (de_outstream));
3101 Lstream_delete (XLSTREAM (lb_outstream));
3102 return make_int (retlen);
/* Generic driver for the per-character encoders.  It resumes from the
   flags, partial character and char-boundary counter saved in the
   stream.  With the counter at 0 the current unit C is passed straight
   to the stream's encode_char callback.  With the counter at 1 the
   character is completed as (ch << 6) | (c & 0x3f) and passed to
   encode_char.  Otherwise six more bits are folded into CH.  When
   input ends on a character boundary with CODING_STATE_END set, the
   finish callback runs.  The boundary counter is saved back into the
   stream.
   NOTE(review): the 6-bit accumulation looks like reassembly of
   UTF-8-style multibyte internal characters -- confirm. */
3109 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3110 unsigned_char_dynarr *dst, size_t n)
3113 unsigned char char_boundary;
3114 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3115 unsigned int flags = str->flags;
3116 Emchar ch = str->ch;
3118 char_boundary = str->iso2022.current_char_boundary;
3124 if (char_boundary == 0)
3152 (*str->encode_char) (str, c, dst, &flags);
3154 else if (char_boundary == 1)
3156 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3162 ch = (ch << 6) | (c & 0x3f);
3167 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3169 (*str->finish) (str, dst, &flags);
3174 str->iso2022.current_char_boundary = char_boundary;
3179 /************************************************************************/
3180 /* entity reference */
3181 /************************************************************************/
/* If an entity reference is being collected in STR, copy the bytes
   collected so far to DST unchanged and empty the collection buffer. */
3184 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3186 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3188 if ( str->er_counter > 0)
3190 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3191 str->er_counter = 0;
/* Add decoded character C to DST, recognizing entity references on the
   way.  With nothing buffered, a coding system that uses entity
   references starts collecting into str->er_buf beginning with '&';
   otherwise C is emitted with DECODE_ADD_UCS_CHAR.  When a collected
   reference is complete, it is matched against patterns built from
   Vcoded_charset_entity_reference_alist (decimal, lower-case hex or
   upper-case hex digits after the charset's prefix).  On a match the
   number is decoded in that charset and the resulting character is
   emitted.  Failing that, the form "&MCS-<hex>" is tried and its code
   emitted directly.  If nothing matches, the collected bytes are
   copied out followed by ';'.  When the buffer is full (16 bytes) or C
   is 0x7F or above, collecting is abandoned: the buffered bytes are
   copied out and C is emitted.  Any other C is appended to the
   buffer. */
3195 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3196 unsigned_char_dynarr* dst);
3198 decode_add_er_char (struct decoding_stream *str, Emchar c,
3199 unsigned_char_dynarr* dst)
3201 if (str->er_counter == 0)
3203 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3206 str->er_buf[0] = '&';
3210 DECODE_ADD_UCS_CHAR (c, dst);
3214 Lisp_Object string = make_string (str->er_buf,
3216 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3223 while (!NILP (rest))
3227 if (NILP (ccs = Ffind_charset (ccs)))
3242 pat = concat3 (build_string ("^&"),
3243 pat, build_string ("\\([0-9]+\\)$"));
3246 else if (EQ (ret, Qx))
3248 pat = concat3 (build_string ("^&"),
3249 pat, build_string ("\\([0-9a-f]+\\)$"));
3252 else if (EQ (ret, QX))
3254 pat = concat3 (build_string ("^&"),
3255 pat, build_string ("\\([0-9A-F]+\\)$"));
3261 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3264 = XINT (Fstring_to_number
3265 (Fsubstring (string,
3266 Fmatch_beginning (make_int (1)),
3267 Fmatch_end (make_int (1))),
3270 DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst);
3275 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3276 string, Qnil, Qnil)))
3279 = XINT (Fstring_to_number
3280 (Fsubstring (string,
3281 Fmatch_beginning (make_int (1)),
3282 Fmatch_end (make_int (1))),
3285 DECODE_ADD_UCS_CHAR (code, dst);
3289 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3290 Dynarr_add (dst, ';');
3293 str->er_counter = 0;
3295 else if ( (str->er_counter >= 16) || (c >= 0x7F) )
3297 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3298 str->er_counter = 0;
3299 DECODE_ADD_UCS_CHAR (c, dst);
3302 str->er_buf[str->er_counter++] = c;
/* Write an entity reference for character CH into BUF.  The entries
   of Vcoded_charset_entity_reference_alist are tried in order.  For
   the first charset in which CH has a code point, a printf format is
   assembled (the entry's prefix string, an optional zero-padded
   width of 2..8 columns, a conversion of d, x or X, then ';') and
   used to print that code point.  If no charset applies, the
   fallback "&MCS-%08X;" is printed with CH itself.
   NOTE(review): both sprintf calls are unbounded, so the caller must
   provide a BUF large enough for the longest reference -- confirm the
   buffer sizes at the call sites. */
3305 void char_encode_as_entity_reference (Emchar ch, char* buf);
3307 char_encode_as_entity_reference (Emchar ch, char* buf)
3309 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3312 int format_columns, idx;
3315 while (!NILP (rest))
3319 if (!NILP (ccs = Ffind_charset (ccs)))
3321 int code_point = charset_code_point (ccs, ch);
3323 if ( code_point >= 0 )
3329 if (STRINGP (ret) && ((idx = XSTRING_LENGTH (ret)) <= 6))
3332 strncpy (&format[1], XSTRING_DATA (ret), idx);
3342 format[idx++] = '%';
3343 format_columns = XINT (ret);
3344 if ( (2 <= format_columns) && (format_columns <= 8) )
3346 format [idx++] = '0';
3347 format [idx++] = '0' + format_columns;
3354 format [idx++] = 'd';
3355 else if (EQ (ret, Qx))
3356 format [idx++] = 'x';
3357 else if (EQ (ret, QX))
3358 format [idx++] = 'X';
3361 format [idx++] = ';';
3364 sprintf (buf, format, code_point);
3370 sprintf (buf, "&MCS-%08X;", ch);
3374 /************************************************************************/
3375 /* character composition */
3376 /************************************************************************/
3377 extern Lisp_Object Qcomposition;
/* Give up on the composition in progress: pass every held-back
   character to decode_add_er_char() in order, then clear the
   held-back count and the combining table. */
3380 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3382 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3386 for (i = 0; i < str->combined_char_count; i++)
3387 decode_add_er_char (str, str->combined_chars[i], dst);
3388 str->combined_char_count = 0;
3389 str->combining_table = Qnil;
/* Add CHARACTER to the output, composing it with a held-back character
   where the `composition' attribute allows.  If the coding system
   disables composition, CHARACTER goes straight to
   decode_add_er_char().  If no combining table is pending, the
   composition attribute of CHARACTER is looked up: without one the
   character is emitted, otherwise it is held back and its table
   becomes the pending table.  With a table pending, CHARACTER is
   looked up in it.  A hit yields the composed character char2, which
   is either emitted or held back in turn; a miss flushes the held-back
   characters and emits CHARACTER.
   NOTE(review): after a hit, the follow-up attribute query and the
   emit in its first branch are written with `character', not
   `char2' -- confirm that this is intended. */
3392 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3393 unsigned_char_dynarr* dst);
3395 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3396 Emchar character, unsigned_char_dynarr* dst)
3398 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3399 decode_add_er_char (str, character, dst);
3400 else if (!CONSP (str->combining_table))
3403 = Fget_char_attribute (make_char (character), Qcomposition, Qnil);
3406 decode_add_er_char (str, character, dst);
3409 str->combined_chars[0] = character;
3410 str->combined_char_count = 1;
3411 str->combining_table = ret;
3417 = Fcdr (Fassq (make_char (character), str->combining_table));
3421 Emchar char2 = XCHARVAL (ret);
3422 ret = Fget_char_attribute (make_char (character), Qcomposition,
3426 decode_add_er_char (str, character, dst);
3427 str->combined_char_count = 0;
3428 str->combining_table = Qnil;
3432 str->combined_chars[0] = char2;
3433 str->combined_char_count = 1;
3434 str->combining_table = ret;
3439 COMPOSE_FLUSH_CHARS (str, dst);
3440 decode_add_er_char (str, character, dst);
3444 #else /* not UTF2000 */
/* Without UTF2000 there is no composition support: flushing expands
   to nothing and adding a character emits it directly with
   DECODE_ADD_UCS_CHAR. */
3445 #define COMPOSE_FLUSH_CHARS(str, dst)
3446 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3447 #endif /* UTF2000 */
3450 /************************************************************************/
3451 /* Shift-JIS methods */
3452 /************************************************************************/
3454 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3455 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3456 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3457 encoded by "position-code + 0x80". A character of JISX0208
3458 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3459 position-codes are divided and shifted so that they fit in the range
3462 --- CODE RANGE of Shift-JIS ---
3463 (character set) (range)
3465 JISX0201-Kana 0xA0 .. 0xDF
3466 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3467 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3468 -------------------------------
3472 /* Is this the first byte of a Shift-JIS two-byte char? */
/* Lead byte test: 0x81..0x9F or 0xE0..0xEF. */
3474 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3475 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3477 /* Is this the second byte of a Shift-JIS two-byte char? */
3479 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3480 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a single-byte JISX0201 katakana code (0xA1..0xDF)? */
3482 #define BYTE_SJIS_KATAKANA_P(c) \
3483 ((c) >= 0xA1 && (c) <= 0xDF)
/* Detection routine for Shift-JIS: scan the N bytes at SRC, keeping in
   ST->shift_jis.in_second_byte whether the previous byte opened a
   two-byte character.  ESC, SI and SO are tested for specially.  A
   byte in 0x80..0x9F or at 0xE0 and above, seen outside a two-byte
   character, marks the start of one.  When the scan raises no
   objection the result is CODING_CATEGORY_SHIFT_JIS_MASK. */
3486 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3490 unsigned char c = *(unsigned char *)src++;
3491 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3493 if (st->shift_jis.in_second_byte)
3495 st->shift_jis.in_second_byte = 0;
3499 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3500 st->shift_jis.in_second_byte = 1;
3502 return CODING_CATEGORY_SHIFT_JIS_MASK;
3505 /* Convert Shift-JIS data to internal format. */
/* Decodes the N bytes at SRC into DST, resuming from the flags and the
   pending lead byte (cpos) saved in the stream.  A valid trail byte
   after a lead byte is converted with DECODE_SJIS and emitted as a
   JISX0208 character.  An invalid trail byte causes both bytes to be
   emitted as binary characters.  Outside a two-byte character,
   end-of-line handling is applied first; then a lead byte is
   recognized, a katakana byte is emitted as a JISX0201 katakana
   character, and anything else is emitted as a JISX0201 Latin or
   binary character.  End-of-conversion handling runs last. */
3508 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3509 unsigned_char_dynarr *dst, size_t n)
3511 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3512 unsigned int flags = str->flags;
3513 unsigned int cpos = str->cpos;
3514 eol_type_t eol_type = str->eol_type;
3518 unsigned char c = *(unsigned char *)src++;
3522 /* Previous character was first byte of Shift-JIS Kanji char. */
3523 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3525 unsigned char e1, e2;
3527 DECODE_SJIS (cpos, c, e1, e2);
3529 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3533 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3534 Dynarr_add (dst, e1);
3535 Dynarr_add (dst, e2);
3540 DECODE_ADD_BINARY_CHAR (cpos, dst);
3541 DECODE_ADD_BINARY_CHAR (c, dst);
3547 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3548 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3550 else if (BYTE_SJIS_KATAKANA_P (c))
3553 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3556 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3557 Dynarr_add (dst, c);
3562 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3566 DECODE_ADD_BINARY_CHAR (c, dst);
3568 label_continue_loop:;
3571 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3577 /* Convert internal character representation to Shift_JIS. */
/* Encodes the single character CH and appends the resulting bytes to
   DST.  The end-of-line convention is taken from STR->codesys.
   Two lookup strategies are visible: a chain of charset_code_point
   probes, and a BREAKUP_CHAR based variant.
   NOTE(review): presumably these are alternative build configurations
   and the first four Dynarr_add lines handle a newline character; the
   preprocessor lines, the newline test and the comparison that ends
   each multi-line probe are missing from this extract -- confirm. */
3580 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3581 unsigned_char_dynarr *dst, unsigned int *flags)
3583 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Emit '\r' unless the EOL type is LF or autodetect, and emit CH
   itself unless the EOL type is CR. */
3587 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3588 Dynarr_add (dst, '\r');
3589 if (eol_type != EOL_CR)
3590 Dynarr_add (dst, ch);
3594 unsigned int s1, s2;
/* Probe order: latin-jisx0201, japanese-jisx0208-1990,
   katakana-jisx0201, japanese-jisx0208, ascii; '?' is the last resort. */
3596 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3598 if (code_point >= 0)
3599 Dynarr_add (dst, code_point);
3600 else if ((code_point
3601 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
/* Split the two-byte code point, set the high bit of each half and let
   ENCODE_SJIS produce the Shift-JIS pair S1, S2. */
3604 ENCODE_SJIS ((code_point >> 8) | 0x80,
3605 (code_point & 0xFF) | 0x80, s1, s2);
3606 Dynarr_add (dst, s1);
3607 Dynarr_add (dst, s2);
3609 else if ((code_point
3610 = charset_code_point (Vcharset_katakana_jisx0201, ch))
/* JISX0201-Kana is written as a single byte with the high bit set. */
3612 Dynarr_add (dst, code_point | 0x80);
3613 else if ((code_point
3614 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3617 ENCODE_SJIS ((code_point >> 8) | 0x80,
3618 (code_point & 0xFF) | 0x80, s1, s2);
3619 Dynarr_add (dst, s1);
3620 Dynarr_add (dst, s2);
3622 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3624 Dynarr_add (dst, code_point);
3626 Dynarr_add (dst, '?');
/* Second variant: split CH into its charset and position codes. */
3628 Lisp_Object charset;
3629 unsigned int c1, c2;
3631 BREAKUP_CHAR (ch, charset, c1, c2);
3633 if (EQ(charset, Vcharset_katakana_jisx0201))
3635 Dynarr_add (dst, c1 | 0x80);
3639 Dynarr_add (dst, c1);
3641 else if (EQ(charset, Vcharset_japanese_jisx0208))
3643 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3644 Dynarr_add (dst, s1);
3645 Dynarr_add (dst, s2);
3648 Dynarr_add (dst, '?');
/* Companion of char_encode_shift_jis taking the same STR, DST and FLAGS
   arguments (minus the character).
   NOTE(review): presumably the end-of-stream hook of the Shift-JIS
   encoder; its body is not visible in this extract -- confirm. */
3654 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3655 unsigned int *flags)
/* Lisp primitive decode-shift-jis-char.  Both halves of CODE are checked
   to be integers and stored into unsigned chars.  When they satisfy
   BYTE_SJIS_TWO_BYTE_1_P and BYTE_SJIS_TWO_BYTE_2_P they are run
   through DECODE_SJIS, and the result, masked to 7 bits per byte, is
   made into a japanese-jisx0208 character.
   NOTE(review): the value returned for an invalid pair is not visible
   in this extract. */
3659 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3660 Decode a JISX0208 character of Shift-JIS coding-system.
3661 CODE is the character code in Shift-JIS as a cons of two bytes.
3662 Return the corresponding character.
3666 unsigned char c1, c2, s1, s2;
3669 CHECK_INT (XCAR (code));
3670 CHECK_INT (XCDR (code));
3671 s1 = XINT (XCAR (code));
3672 s2 = XINT (XCDR (code));
3673 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3674 BYTE_SJIS_TWO_BYTE_2_P (s2))
3676 DECODE_SJIS (s1, s2, c1, c2);
3677 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3678 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive encode-shift-jis-char.  CHARACTER is coerced/checked as
   a character and split with BREAKUP_CHAR.  Only characters whose
   charset is japanese-jisx0208 are converted: the position codes get
   their high bit set, go through ENCODE_SJIS, and the two resulting
   bytes are returned as a cons of integers.
   NOTE(review): the value returned for other charsets is not visible
   in this extract. */
3684 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3685 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3686 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3690 Lisp_Object charset;
3693 CHECK_CHAR_COERCE_INT (character);
3694 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3695 if (EQ (charset, Vcharset_japanese_jisx0208))
3697 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3698 return Fcons (make_int (s1), make_int (s2));
3705 /************************************************************************/
3707 /************************************************************************/
3709 /* BIG5 is a coding system encoding two character sets: ASCII and
3710 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3711 character set and is encoded in two bytes.
3713 --- CODE RANGE of BIG5 ---
3714 (character set) (range)
3716 Big5 (1st byte) 0xA1 .. 0xFE
3717 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3718 --------------------------
3720 Since the number of characters in Big5 is larger than maximum
3721 characters in Emacs' charset (96x96), it can't be handled as one
3722 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3723 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3724 contains frequently used characters and the latter contains less
3725 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char?
   NOTE(review): two definitions follow (0x81 .. 0xFE and 0xA1 .. 0xFE);
   presumably they sit in alternative preprocessor branches whose
   directives are missing from this extract -- confirm. */
3728 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3729 ((c) >= 0x81 && (c) <= 0xFE)
3731 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3732 ((c) >= 0xA1 && (c) <= 0xFE)
3735 /* Is this the second byte of a Big5 two-byte char? */
3737 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3738 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3740 /* Number of Big5 characters which have the same code in 1st byte.
   The expression evaluates to 94 + 63 = 157. */
3742 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3744 /* Code conversion macros. These are macros because they are used in
3745 inner loops during code conversion.
3747 Note that temporary variables in macros introduce the classic
3748 dynamic-scoping problems with variable names. We use capital-
3749 lettered variables in the assumption that XEmacs does not use
3750 capital letters in variables except in a very formalized way
3753 /* Convert Big5 code (b1, b2) into its internal string representation
3756 /* There is a much simpler way to split the Big5 charset into two.
3757 For the moment I'm going to leave the algorithm as-is because it
3758 claims to separate out the most-used characters into a single
3759 charset, which perhaps will lead to optimizations in various
3762 The way the algorithm works is something like this:
3764 Big5 can be viewed as a 94x157 charset, where the row is
3765 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3766 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3767 the split between low and high column numbers is apparently
3768 meaningless; ascending rows produce less and less frequent chars.
3769 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3770 the first charset, and the upper half (0xC9 .. 0xFE) to the
3771 second. To do the conversion, we convert the character into
3772 a single number where 0 .. 156 is the first row, 157 .. 313
3773 is the second, etc. That way, the characters are ordered by
3774 decreasing frequency. Then we just chop the space in two
3775 and coerce the result into a 94x94 space.
/* DECODE_BIG5 (b1, b2, lb, c1, c2): B1 and B2 are the two Big5 bytes,
   copied once into the locals B1/B2; LB, C1 and C2 are assigned to.
   The pair is linearized as (B1 - 0xA1) * BIG5_SAME_ROW plus a column
   index (B2 - 0x40 when B2 < 0x7F, B2 - 0x62 otherwise).  LB becomes
   one of the two Big5 leading bytes; for LEADING_BYTE_CHINESE_BIG5_2
   the index is rebased by BIG5_SAME_ROW * (0xC9 - 0xA1).  The index is
   finally split into C1/C2 using a row width of 0xFF - 0xA1, each
   offset by 0xA1.
   NOTE(review): the declaration of I, the test that selects between
   the two leading bytes and the closing of the do-block are missing
   from this extract. */
3778 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3780 int B1 = b1, B2 = b2; \
3782 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3786 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3790 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3791 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3793 c1 = I / (0xFF - 0xA1) + 0xA1; \
3794 c2 = I % (0xFF - 0xA1) + 0xA1; \
3797 /* Convert the internal string representation of a Big5 character
3798 (lb, c1, c2) into Big5 code (b1, b2). */
/* Inverse of DECODE_BIG5: C1/C2 (each offset by 0xA1) are combined into
   a linear index using a row width of 0xFF - 0xA1; for
   LEADING_BYTE_CHINESE_BIG5_2 the index is advanced by
   BIG5_SAME_ROW * (0xC9 - 0xA1).  B1 is the row plus 0xA1; B2 is the
   column, shifted by 0x40 when below 0x3F and by 0x62 otherwise.
   B1 and B2 are assigned to (B2 several times), so they must be plain
   lvalues. */
3800 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3802 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3804 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3806 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3808 b1 = I / BIG5_SAME_ROW + 0xA1; \
3809 b2 = I % BIG5_SAME_ROW; \
3810 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Big5 detector.  Examines the bytes starting at SRC (N is the byte
   count); the position inside a two-byte character is carried across
   calls in ST->big5.in_second_byte.
   NOTE(review): the loop header, the statements guarded by the tests
   below, the condition under which a byte is treated as a first byte,
   and any preprocessor lines around the 0x80 .. 0xA0 clause are missing
   from this extract; presumably the guarded statements rule out
   Big5 -- confirm. */
3814 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3818 unsigned char c = *(unsigned char *)src++;
3819 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3821 || (c >= 0x80 && c <= 0xA0)
3825 if (st->big5.in_second_byte)
/* A second byte is being examined: clear the marker, then test whether
   C lies below 0x40 or in 0x80 .. 0xA0. */
3827 st->big5.in_second_byte = 0;
3828 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3838 st->big5.in_second_byte = 1;
3840 return CODING_CATEGORY_BIG5_MASK;
3843 /* Convert Big5 data to internal format. */
/* Reads bytes from SRC (N is the byte count) and appends the decoded
   result to DST.  FLAGS, CPOS and EOL_TYPE come from the decoding
   stream's data; the charset assigned from initial-charset slot 1 of
   the coding system is used as CCS in the code-point lookup.
   NOTE(review): short lines (braces, the loop header, preprocessor
   conditionals, the declaration receiving the initial charset) are
   missing from this extract.  The decode_defined_char path and the
   DECODE_BIG5 path are presumably alternative build configurations --
   confirm against the full file. */
3846 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3847 unsigned_char_dynarr *dst, size_t n)
3849 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3850 unsigned int flags = str->flags;
3851 unsigned int cpos = str->cpos;
3852 eol_type_t eol_type = str->eol_type;
3855 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3856 (decoding)->codesys, 1);
3861 unsigned char c = *(unsigned char *)src++;
3864 /* Previous character was first byte of Big5 char. */
3865 if (BYTE_BIG5_TWO_BYTE_2_P (c))
/* Combine the pending first byte and C into a 16-bit code point and
   look it up in CCS; Vcharset_chinese_big5 is the visible fallback
   (its guarding test is not shown). */
3868 int code_point = (cpos << 8) | c;
3869 Emchar char_id = decode_defined_char (ccs, code_point);
3872 char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
3873 DECODE_ADD_UCS_CHAR (char_id, dst);
/* Other variant: DECODE_BIG5 yields leading byte plus two position
   bytes, which are appended directly. */
3875 unsigned char b1, b2, b3;
3876 DECODE_BIG5 (cpos, c, b1, b2, b3);
3877 Dynarr_add (dst, b1);
3878 Dynarr_add (dst, b2);
3879 Dynarr_add (dst, b3);
/* Apparently the fallback when C is not a valid second byte: the
   pending byte and C are both passed through as binary characters. */
3884 DECODE_ADD_BINARY_CHAR (cpos, dst);
3885 DECODE_ADD_BINARY_CHAR (c, dst);
3891 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3892 if (BYTE_BIG5_TWO_BYTE_1_P (c))
/* Entity-reference characters collected so far are flushed before a
   Big5 first byte and before a byte that needs end-of-line handling. */
3894 decode_flush_er_chars (str, dst);
3899 decode_flush_er_chars (str, dst);
3900 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3901 DECODE_ADD_BINARY_CHAR (c, dst);
3905 /* DECODE_ADD_BINARY_CHAR (c, dst); */
/* Other single bytes go through the entity-reference collector instead
   of being added directly. */
3906 decode_add_er_char (str, c, dst);
3909 label_continue_loop:;
3912 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
/* Hand-expanded end-of-conversion handling: when FLAGS has
   CODING_STATE_END, flush pending entity-reference characters, output
   any partial character in CPOS, and emit '\r' if CODING_STATE_CR is
   set. */
3913 if (flags & CODING_STATE_END)
3915 decode_flush_er_chars (str, dst);
3916 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3917 if (flags & CODING_STATE_CR)
3918 Dynarr_add (dst, '\r');
3925 /* Convert internally-formatted data to Big5. */
/* Encodes the single character CH and appends the resulting bytes to
   DST.  The end-of-line convention and initial-charset slot 1 are read
   from STR->codesys.
   Probe order for the code point: ascii, the coding system's charset
   (CCS), chinese-big5, chinese-big5-1, chinese-big5-2.  If none
   matches, an entity reference is written when the coding system
   enables it; '?' is the last resort.
   NOTE(review): presumably the first four Dynarr_add lines handle a
   newline character; the newline test, braces and several declarations
   are missing from this extract -- confirm. */
3928 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3929 unsigned_char_dynarr *dst, unsigned int *flags)
3931 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Emit '\r' unless the EOL type is LF or autodetect, and emit CH
   itself unless the EOL type is CR. */
3935 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3936 Dynarr_add (dst, '\r');
3937 if (eol_type != EOL_CR)
3938 Dynarr_add (dst, ch);
3945 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
3947 if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3948 Dynarr_add (dst, code_point);
3949 else if ((code_point = charset_code_point (ccs, ch)) >= 0)
/* Two-byte code point: high byte first, then low byte. */
3951 Dynarr_add (dst, code_point >> 8);
3952 Dynarr_add (dst, code_point & 0xFF);
3954 else if ((code_point
3955 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3957 Dynarr_add (dst, code_point >> 8);
3958 Dynarr_add (dst, code_point & 0xFF);
3960 else if ((code_point
3961 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
/* Same arithmetic as ENCODE_BIG5, written out with position codes
   based at 33 instead of 0xA1. */
3964 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3965 + ((code_point & 0xFF) - 33);
3966 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3967 unsigned char b2 = I % BIG5_SAME_ROW;
3969 b2 += b2 < 0x3F ? 0x40 : 0x62;
3970 Dynarr_add (dst, b1);
3971 Dynarr_add (dst, b2);
3973 else if ((code_point
3974 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3977 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3978 + ((code_point & 0xFF) - 33);
3979 unsigned char b1, b2;
/* big5-2 starts at Big5 row 0xC9, so rebase the linear index. */
3981 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3982 b1 = I / BIG5_SAME_ROW + 0xA1;
3983 b2 = I % BIG5_SAME_ROW;
3984 b2 += b2 < 0x3F ? 0x40 : 0x62;
3985 Dynarr_add (dst, b1);
3986 Dynarr_add (dst, b2);
3988 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
/* BUF is filled with a NUL-terminated entity reference for CH (its
   declaration is not visible here) and copied out without the NUL. */
3992 char_encode_as_entity_reference (ch, buf);
3993 Dynarr_add_many (dst, buf, strlen (buf));
3996 Dynarr_add (dst, '?');
/* Companion of char_encode_big5 taking the same STR, DST and FLAGS
   arguments (minus the character).
   NOTE(review): presumably the end-of-stream hook of the Big5 encoder;
   its body is not visible in this extract -- confirm. */
4003 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4004 unsigned int *flags)
/* Lisp primitive decode-big5-char.  Both halves of CODE are checked to
   be integers and stored into unsigned chars.  When they satisfy
   BYTE_BIG5_TWO_BYTE_1_P and BYTE_BIG5_TWO_BYTE_2_P, DECODE_BIG5 yields
   a leading byte and two position bytes; the charset is looked up from
   the leading byte and the position bytes are masked to 7 bits.
   NOTE(review): the value returned for an invalid pair is not visible
   in this extract. */
4009 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4010 Decode a Big5 character CODE of BIG5 coding-system.
4011 CODE is the character code in BIG5, a cons of two integers.
4012 Return the corresponding character.
4016 unsigned char c1, c2, b1, b2;
4019 CHECK_INT (XCAR (code));
4020 CHECK_INT (XCDR (code));
4021 b1 = XINT (XCAR (code));
4022 b2 = XINT (XCDR (code));
4023 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4024 BYTE_BIG5_TWO_BYTE_2_P (b2))
4026 Charset_ID leading_byte;
4027 Lisp_Object charset;
4028 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4029 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4030 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive encode-big5-char.  CHARACTER is coerced/checked as a
   character and split with BREAKUP_CHAR.  Only characters of
   chinese-big5-1 or chinese-big5-2 are converted: ENCODE_BIG5 receives
   the charset's leading byte and the position codes with their high bit
   set, and the two resulting bytes are returned as a cons of integers.
   NOTE(review): the value returned for other charsets is not visible
   in this extract. */
4036 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4037 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4038 Return the corresponding character code in Big5.
4042 Lisp_Object charset;
4045 CHECK_CHAR_COERCE_INT (character);
4046 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4047 if (EQ (charset, Vcharset_chinese_big5_1) ||
4048 EQ (charset, Vcharset_chinese_big5_2))
4050 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4052 return Fcons (make_int (b1), make_int (b2));
4059 /************************************************************************/
4061 /************************************************************************/
/* UCS-4 detector.  Examines the bytes starting at SRC (N is the byte
   count) and dispatches on ST->ucs4.in_byte, which is reset to 0 in one
   of the cases.  The visible return reports CODING_CATEGORY_UCS4_MASK.
   NOTE(review): the loop header, the case labels and the tests applied
   to each byte are missing from this extract; presumably in_byte is
   the position inside the current four-byte unit -- confirm. */
4064 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
4068 unsigned char c = *(unsigned char *)src++;
4069 switch (st->ucs4.in_byte)
4078 st->ucs4.in_byte = 0;
4084 return CODING_CATEGORY_UCS4_MASK;
/* UCS-4 decoder.  Reads bytes from SRC (N is the byte count) and
   appends decoded characters to DST.  FLAGS, CPOS and COUNTER are
   loaded from the decoding stream's data; COUNTER is stored back at the
   end.  Bytes are accumulated most significant first
   (cpos = (cpos << 8) | c) and the completed value is emitted with
   DECODE_ADD_UCS_CHAR.
   NOTE(review): the loop header and the switch that drives COUNTER are
   missing from this extract. */
4088 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4089 unsigned_char_dynarr *dst, size_t n)
4091 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4092 unsigned int flags = str->flags;
4093 unsigned int cpos = str->cpos;
4094 unsigned char counter = str->counter;
4098 unsigned char c = *(unsigned char *)src++;
4106 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4111 cpos = ( cpos << 8 ) | c;
/* NOTE(review): this tests COUNTER, not FLAGS, against
   CODING_STATE_END, whereas decode_coding_big5 and decode_coding_utf8
   test FLAGS.  It looks like a slip that makes the partial-character
   output depend on the byte counter -- confirm and fix in the full
   file. */
4115 if (counter & CODING_STATE_END)
4116 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4120 str->counter = counter;
/* UCS-4 encoder for a single character: appends CH to DST as four
   bytes, most significant first (shifts of 24, 16, 8 and 0 bits).
   STR and FLAGS are not used in the visible lines. */
4124 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4125 unsigned_char_dynarr *dst, unsigned int *flags)
4127 Dynarr_add (dst, ch >> 24);
4128 Dynarr_add (dst, ch >> 16);
4129 Dynarr_add (dst, ch >> 8);
4130 Dynarr_add (dst, ch );
/* Companion of char_encode_ucs4 taking the same STR, DST and FLAGS
   arguments (minus the character).
   NOTE(review): presumably the end-of-stream hook of the UCS-4 encoder;
   its body is not visible in this extract -- confirm. */
4134 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4135 unsigned int *flags)
4140 /************************************************************************/
4142 /************************************************************************/
/* UTF-8 detector.  Examines the bytes starting at SRC (N is the byte
   count) and dispatches on ST->utf8.in_byte.  ESC, SI and SO are
   singled out; in_byte is set to a value from 5 down to 1; and a byte
   is tested for the continuation pattern (c & 0xc0) == 0x80.  The
   visible return reports CODING_CATEGORY_UTF8_MASK.
   NOTE(review): the loop header, case labels, the lead-byte tests that
   choose among 5 .. 1 and the statements guarded by the visible tests
   are missing from this extract; presumably in_byte counts the
   continuation bytes still expected -- confirm. */
4145 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
4149 unsigned char c = *(unsigned char *)src++;
4150 switch (st->utf8.in_byte)
4153 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4156 st->utf8.in_byte = 5;
4158 st->utf8.in_byte = 4;
4160 st->utf8.in_byte = 3;
4162 st->utf8.in_byte = 2;
4164 st->utf8.in_byte = 1;
4169 if ((c & 0xc0) != 0x80)
4175 return CODING_CATEGORY_UTF8_MASK;
/* Re-emit, as binary characters, the bytes of a UTF-8 sequence that was
   begun but not finished.  decode_coding_utf8 passes its COUNTER and
   CPOS.  For each COUNTER value the magnitude of CPOS selects how many
   bytes are written: the first carries a lead-byte marker (0xC0, 0xE0,
   0xF0, 0xF8 or 0xFC) or'ed with the top bits of CPOS, and each
   following byte carries six bits of CPOS tagged with 0x80.
   NOTE(review): presumably COUNTER is the number of continuation bytes
   still expected; the CPOS parameter line, the first test (guarding the
   0xFC-only output) and the final else (apparently COUNTER == 1) are
   missing from this extract -- confirm. */
4179 decode_output_utf8_partial_char (unsigned char counter,
4181 unsigned_char_dynarr *dst)
4184 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4185 else if (counter == 4)
4187 if (cpos < (1 << 6))
4188 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4191 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4192 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4195 else if (counter == 3)
4197 if (cpos < (1 << 6))
4198 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4199 else if (cpos < (1 << 12))
4201 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4202 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4206 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4207 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4208 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4211 else if (counter == 2)
4213 if (cpos < (1 << 6))
4214 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4215 else if (cpos < (1 << 12))
4217 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4218 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4220 else if (cpos < (1 << 18))
4222 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4223 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4224 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4228 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4229 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4230 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4231 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4236 if (cpos < (1 << 6))
4237 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4238 else if (cpos < (1 << 12))
4240 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4241 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4243 else if (cpos < (1 << 18))
4245 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4246 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4247 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4249 else if (cpos < (1 << 24))
4251 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4252 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4253 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4254 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4258 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4259 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4260 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4261 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4262 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
/* UTF-8 decoder.  Reads bytes from SRC (N is the byte count) and
   appends decoded characters to DST.  FLAGS, CPOS, EOL_TYPE and COUNTER
   are loaded from the decoding stream's data, and COUNTER is stored
   back at the end.  The charset assigned from initial-charset slot 0 of
   the coding system is used as CCS when a completed code point is
   looked up.
   NOTE(review): the loop header, braces, the assignments to COUNTER
   and CPOS for each lead-byte class, and the tests around several
   branches are missing from this extract -- confirm against the full
   file. */
4268 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4269 unsigned_char_dynarr *dst, size_t n)
4271 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4272 unsigned int flags = str->flags;
4273 unsigned int cpos = str->cpos;
4274 eol_type_t eol_type = str->eol_type;
4275 unsigned char counter = str->counter;
4278 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4279 (decoding)->codesys, 0);
4284 unsigned char c = *(unsigned char *)src++;
/* First visible branch: pending composition and entity-reference
   characters are flushed, end-of-line handling is applied and C is
   emitted as a character on its own. */
4289 COMPOSE_FLUSH_CHARS (str, dst);
4290 decode_flush_er_chars (str, dst);
4291 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4292 DECODE_ADD_UCS_CHAR (c, dst);
/* Bytes below 0xC0 in this chain are handed to the composition
   machinery as they are. */
4294 else if ( c < 0xC0 )
4295 /* decode_add_er_char (str, c, dst); */
4296 COMPOSE_ADD_CHAR (str, c, dst);
4299 /* decode_flush_er_chars (str, dst); */
/* Lead bytes are classified by the thresholds 0xF0, 0xF8 and 0xFC (the
   statements for each class are not visible). */
4305 else if ( c < 0xF0 )
4310 else if ( c < 0xF8 )
4315 else if ( c < 0xFC )
/* Continuation byte: shift in its low six bits. */
4327 else if ( (c & 0xC0) == 0x80 )
4329 cpos = ( cpos << 6 ) | ( c & 0x3f );
4332 Emchar char_id = decode_defined_char (ccs, cpos);
4336 COMPOSE_ADD_CHAR (str, char_id, dst);
/* Apparently the branch for a byte that cannot continue the pending
   sequence: flush, replay the partial sequence as binary bytes, then
   pass C through as a binary character. */
4345 COMPOSE_FLUSH_CHARS (str, dst);
4346 decode_flush_er_chars (str, dst);
4347 decode_output_utf8_partial_char (counter, cpos, dst);
4348 DECODE_ADD_BINARY_CHAR (c, dst);
4352 label_continue_loop:;
/* At end of input flush everything that is pending; an unfinished
   sequence is replayed as binary bytes. */
4355 if (flags & CODING_STATE_END)
4357 COMPOSE_FLUSH_CHARS (str, dst);
4358 decode_flush_er_chars (str, dst);
4361 decode_output_utf8_partial_char (counter, cpos, dst);
4368 str->counter = counter;
/* UTF-8 encoder for a single character: appends the encoding of CH to
   DST.  The end-of-line convention and the initial charsets are read
   from STR->codesys.
   Characters up to 0x7f are written as one byte.  Otherwise the code
   point is taken from the charset in initial-charset slot 0; when that
   yields a negative value or one above 0x10FFFF, character attributes
   named by other initial-charset slots are tried, then an entity
   reference (if the coding system enables it).  The code point is
   finally written in 2 to 6 bytes depending on its magnitude.
   NOTE(review): presumably the first four Dynarr_add lines handle a
   newline character; the newline test, braces, several declarations
   and the slot number of the second attribute lookup are missing from
   this extract -- confirm. */
4372 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4373 unsigned_char_dynarr *dst, unsigned int *flags)
4375 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Emit '\r' unless the EOL type is LF or autodetect, and emit CH
   itself unless the EOL type is CR. */
4379 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4380 Dynarr_add (dst, '\r');
4381 if (eol_type != EOL_CR)
4382 Dynarr_add (dst, ch);
4384 else if (ch <= 0x7f)
4386 Dynarr_add (dst, ch);
4391 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4392 int code_point = charset_code_point (ucs_ccs, ch);
/* No usable code point from the primary charset: fall back. */
4394 if ( (code_point < 0) || (code_point > 0x10FFFF) )
4397 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4401 && INTP (ret = Fget_char_attribute (make_char (ch),
4403 code_point = XINT (ret);
4404 else if ( !NILP (map =
4405 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4407 && INTP (ret = Fget_char_attribute (make_char (ch),
4409 code_point = XINT (ret);
4410 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4414 char_encode_as_entity_reference (ch, buf);
4415 Dynarr_add_many (dst, buf, strlen (buf));
/* Two bytes: 0xc0 lead marker plus one continuation byte. */
4421 if (code_point <= 0x7ff)
4423 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4424 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
/* Three bytes: 0xe0 lead marker. */
4426 else if (code_point <= 0xffff)
4428 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4429 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4430 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
/* Four bytes: 0xf0 lead marker. */
4432 else if (code_point <= 0x1fffff)
4434 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4435 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4436 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4437 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
/* Five bytes: 0xf8 lead marker. */
4439 else if (code_point <= 0x3ffffff)
4441 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4442 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4443 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4444 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4445 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
/* Six bytes: 0xfc lead marker. */
4449 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4450 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4451 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4452 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4453 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4454 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
/* Companion of char_encode_utf8 taking the same STR, DST and FLAGS
   arguments (minus the character).
   NOTE(review): presumably the end-of-stream hook of the UTF-8 encoder;
   its body is not visible in this extract -- confirm. */
4460 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4461 unsigned int *flags)
4466 /************************************************************************/
4467 /* ISO2022 methods */
4468 /************************************************************************/
4470 /* The following note describes the coding system ISO2022 briefly.
4471 Since the intention of this note is to help understand the
4472 functions in this file, some parts are NOT ACCURATE or OVERLY
4473 SIMPLIFIED. For thorough understanding, please refer to the
4474 original document of ISO2022.
4476 ISO2022 provides many mechanisms to encode several character sets
4477 in 7-bit and 8-bit environments. For 7-bit environments, all text
4478 is encoded using bytes less than 128. This may make the encoded
4479 text a little bit longer, but the text passes more easily through
4480 several gateways, some of which strip off MSB (Most Significant Bit).
4482 There are two kinds of character sets: control character set and
4483 graphic character set. The former contains control characters such
4484 as `newline' and `escape' to provide control functions (control
4485 functions are also provided by escape sequences). The latter
4486 contains graphic characters such as 'A' and '-'. Emacs recognizes
4487 two control character sets and many graphic character sets.
4489 Graphic character sets are classified into one of the following
4490 four classes, according to the number of bytes (DIMENSION) and
4491 number of characters in one dimension (CHARS) of the set:
4492 - DIMENSION1_CHARS94
4493 - DIMENSION1_CHARS96
4494 - DIMENSION2_CHARS94
4495 - DIMENSION2_CHARS96
4497 In addition, each character set is assigned an identification tag,
4498 unique for each set, called "final character" (denoted as <F>
4499 hereafter). The <F> of each character set is decided by ECMA(*)
4500 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4501 (0x30..0x3F are for private use only).
4503 Note (*): ECMA = European Computer Manufacturers Association
4505 Here are examples of graphic character set [NAME(<F>)]:
4506 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4507 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4508 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4509 o DIMENSION2_CHARS96 -- none for the moment
4511 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4512 C0 [0x00..0x1F] -- control character plane 0
4513 GL [0x20..0x7F] -- graphic character plane 0
4514 C1 [0x80..0x9F] -- control character plane 1
4515 GR [0xA0..0xFF] -- graphic character plane 1
4517 A control character set is directly designated and invoked to C0 or
4518 C1 by an escape sequence. The most common case is that:
4519 - ISO646's control character set is designated/invoked to C0, and
4520 - ISO6429's control character set is designated/invoked to C1,
4521 and usually these designations/invocations are omitted in encoded
4522 text. In a 7-bit environment, only C0 can be used, and a control
4523 character for C1 is encoded by an appropriate escape sequence to
4524 fit into the environment. All control characters for C1 are
4525 defined to have corresponding escape sequences.
4527 A graphic character set is at first designated to one of four
4528 graphic registers (G0 through G3), then these graphic registers are
4529 invoked to GL or GR. These designations and invocations can be
4530 done independently. The most common case is that G0 is invoked to
4531 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4532 these invocations and designations are omitted in encoded text.
4533 In a 7-bit environment, only GL can be used.
4535 When a graphic character set of CHARS94 is invoked to GL, codes
4536 0x20 and 0x7F of the GL area work as control characters SPACE and
4537 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4540 There are two ways of invocation: locking-shift and single-shift.
4541 With locking-shift, the invocation lasts until the next different
4542 invocation, whereas with single-shift, the invocation affects the
4543 following character only and doesn't affect the locking-shift
4544 state. Invocations are done by the following control characters or
4547 ----------------------------------------------------------------------
4548 abbrev function cntrl escape seq description
4549 ----------------------------------------------------------------------
4550 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4551 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4552 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4553 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4554 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4555 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4556 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4557 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4558 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4559 ----------------------------------------------------------------------
4560 (*) These are not used by any known coding system.
4562 Control characters for these functions are defined by macros
4563 ISO_CODE_XXX in `coding.h'.
4565 Designations are done by the following escape sequences:
4566 ----------------------------------------------------------------------
4567 escape sequence description
4568 ----------------------------------------------------------------------
4569 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4570 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4571 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4572 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4573 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4574 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4575 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4576 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4577 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4578 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4579 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4580 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4581 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4582 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4583 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4584 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4585 ----------------------------------------------------------------------
4587 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4588 of dimension 1, chars 94, and final character <F>, etc...
4590 Note (*): Although these designations are not allowed in ISO2022,
4591 Emacs accepts them on decoding, and produces them on encoding
4592 CHARS96 character sets in a coding system which is characterized as
4593 7-bit environment, non-locking-shift, and non-single-shift.
4595 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4596 '(' can be omitted. We refer to this as "short-form" hereafter.
4598 Now you may notice that there are a lot of ways for encoding the
4599 same multilingual text in ISO2022. Actually, there exist many
4600 coding systems such as Compound Text (used in X11's inter client
4601 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4602 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4603 localized platforms), and all of these are variants of ISO2022.
4605 In addition to the above, Emacs handles two more kinds of escape
4606 sequences: ISO6429's direction specification and Emacs' private
4607 sequence for specifying character composition.
4609 ISO6429's direction specification takes the following form:
4610 o CSI ']' -- end of the current direction
4611 o CSI '0' ']' -- end of the current direction
4612 o CSI '1' ']' -- start of left-to-right text
4613 o CSI '2' ']' -- start of right-to-left text
4614 The control character CSI (0x9B: control sequence introducer) is
4615 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4617 Character composition specification takes the following form:
4618 o ESC '0' -- start character composition
4619 o ESC '1' -- end character composition
4620 Since these are not standard escape sequences of any ISO standard,
4621 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its starting
   configuration.  For each of the four registers the charset is taken
   from CODING_SYSTEM's initial charsets when CODING_SYSTEM is non-nil,
   and set to Qt otherwise (apparently; the else keyword is not
   visible), and the invalid-designation marker is cleared.  The escape
   parser is returned to ISO_ESC_NOTHING with an empty byte buffer,
   register 0 is selected for the left half and register 1 for the
   right half, and the direction/literal-output bookkeeping is cleared.
   When ENABLE_COMPOSITE_CHARS is defined, an existing composite_chars
   dynarr is reset as well.
   NOTE(review): the declaration of I, the left-hand side of the
   initial-charset assignment and the closing preprocessor line are
   missing from this extract. */
4624 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4628 for (i = 0; i < 4; i++)
4630 if (!NILP (coding_system))
4632 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4634 iso->charset[i] = Qt;
4635 iso->invalid_designated[i] = 0;
4637 iso->esc = ISO_ESC_NOTHING;
4638 iso->esc_bytes_index = 0;
4639 iso->register_left = 0;
4640 iso->register_right = 1;
4641 iso->switched_dir_and_no_valid_charset_yet = 0;
4642 iso->invalid_switch_dir = 0;
4643 iso->output_direction_sequence = 0;
4644 iso->output_literally = 0;
4645 #ifdef ENABLE_COMPOSITE_CHARS
4646 if (iso->composite_chars)
4647 Dynarr_reset (iso->composite_chars);
/* Predicate on the byte C.  parse_iso2022_esc consults it, together
   with the coding system's escape-quoted setting, before switching to
   ISO_ESC_LITERAL.
   NOTE(review): the body is not visible in this extract, so the set of
   accepted bytes cannot be stated here. */
4652 fit_to_be_escape_quoted (unsigned char c)
4669 /* Parse one byte of an ISO2022 escape sequence.
4670 If the result is an invalid escape sequence, return 0 and
4671 do not change anything in STR. Otherwise, if the result is
4672 an incomplete escape sequence, update ISO2022.ESC and
4673 ISO2022.ESC_BYTES and return -1. Otherwise, update
4674 all the state variables (but not ISO2022.ESC_BYTES) and
4677 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4678 or invocation of an invalid character set and treat that as
4679 an unrecognized escape sequence. */
/* Feed one byte C of an ISO 2022 escape or control sequence to the
   state machine held in ISO, updating the coding-state bits in *FLAGS.
   CODESYS is nil when this is called for coding-system detection.
   When CHECK_INVALID_CHARSETS is nonzero, shifts to a register that
   holds no charset are checked for, and a designation of an unknown
   charset is recorded in iso->invalid_designated[] and marked for
   literal output.
   Returns 0 for a bad final byte; the remaining return paths are not
   visible in this listing.  */
4682 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4683 unsigned char c, unsigned int *flags,
4684 int check_invalid_charsets)
4686 /* (1) If we're at the end of a designation sequence, CS is the
4687 charset being designated and REG is the register to designate
4690 (2) If we're at the end of a locking-shift sequence, REG is
4691 the register to invoke and HALF (0 == left, 1 == right) is
4692 the half to invoke it into.
4694 (3) If we're at the end of a single-shift sequence, REG is
4695 the register to invoke. */
4696 Lisp_Object cs = Qnil;
4699 /* NOTE: This code does goto's all over the fucking place.
4700 The reason for this is that we're basically implementing
4701 a state machine here, and hierarchical languages like C
4702 don't really provide a clean way of doing this. */
4704 if (! (*flags & CODING_STATE_ESCAPE))
4705 /* At beginning of escape sequence; we need to reset our
4706 escape-state variables. */
4707 iso->esc = ISO_ESC_NOTHING;
/* Both literal-output requests are cleared on every call; the paths
   below set them again when a sequence must be preserved in the text. */
4709 iso->output_literally = 0;
4710 iso->output_direction_sequence = 0;
4714 case ISO_ESC_NOTHING:
4715 iso->esc_bytes_index = 0;
4718 case ISO_CODE_ESC: /* Start escape sequence */
4719 *flags |= CODING_STATE_ESCAPE;
4723 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4724 *flags |= CODING_STATE_ESCAPE;
4725 iso->esc = ISO_ESC_5_11;
4728 case ISO_CODE_SO: /* locking shift 1 */
4731 case ISO_CODE_SI: /* locking shift 0 */
4735 case ISO_CODE_SS2: /* single shift */
4738 case ISO_CODE_SS3: /* single shift */
4742 default: /* Other control characters */
4749 /**** single shift ****/
4751 case 'N': /* single shift 2 */
4754 case 'O': /* single shift 3 */
4758 /**** locking shift ****/
4760 case '~': /* locking shift 1 right */
4763 case 'n': /* locking shift 2 */
4766 case '}': /* locking shift 2 right */
4769 case 'o': /* locking shift 3 */
4772 case '|': /* locking shift 3 right */
4776 #ifdef ENABLE_COMPOSITE_CHARS
4777 /**** composite ****/
4780 iso->esc = ISO_ESC_START_COMPOSITE;
4781 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4782 CODING_STATE_COMPOSITE;
4786 iso->esc = ISO_ESC_END_COMPOSITE;
4787 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4788 ~CODING_STATE_COMPOSITE;
4790 #endif /* ENABLE_COMPOSITE_CHARS */
4792 /**** directionality ****/
4795 iso->esc = ISO_ESC_5_11;
4798 /**** designation ****/
4800 case '$': /* multibyte charset prefix */
4801 iso->esc = ISO_ESC_2_4;
4805 if (0x28 <= c && c <= 0x2F)
4807 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4811 /* This function is called with CODESYS equal to nil when
4812 doing coding-system detection. */
4814 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4815 && fit_to_be_escape_quoted (c))
4817 iso->esc = ISO_ESC_LITERAL;
4818 *flags &= CODING_STATE_ISO2022_LOCK;
4828 /**** directionality ****/
4830 case ISO_ESC_5_11: /* ISO6429 direction control */
4833 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4834 goto directionality;
4836 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4837 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4838 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4842 case ISO_ESC_5_11_0:
4845 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4846 goto directionality;
4850 case ISO_ESC_5_11_1:
/* Mask rather than assign, as the ISO_ESC_5_11 and ISO_ESC_5_11_0
   cases do: a plain assignment would turn on every other
   CODING_STATE_ISO2022_LOCK bit instead of only clearing the
   right-to-left bit. */
4853 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4854 goto directionality;
4858 case ISO_ESC_5_11_2:
4861 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4862 goto directionality;
4867 iso->esc = ISO_ESC_DIRECTIONALITY;
4868 /* Various junk here to attempt to preserve the direction sequences
4869 literally in the text if they would otherwise be swallowed due
4870 to invalid designations that don't show up as actual charset
4871 changes in the text. */
4872 if (iso->invalid_switch_dir)
4874 /* We already inserted a direction switch literally into the
4875 text. We assume (#### this may not be right) that the
4876 next direction switch is the one going the other way,
4877 and we need to output that literally as well. */
4878 iso->output_literally = 1;
4879 iso->invalid_switch_dir = 0;
4885 /* If we are in the thrall of an invalid designation,
4886 then stick the directionality sequence literally into the
4887 output stream so it ends up in the original text again. */
4888 for (jj = 0; jj < 4; jj++)
4889 if (iso->invalid_designated[jj])
4893 iso->output_literally = 1;
4894 iso->invalid_switch_dir = 1;
4897 /* Indicate that we haven't yet seen a valid designation,
4898 so that if a switch-dir is directly followed by an
4899 invalid designation, both get inserted literally. */
4900 iso->switched_dir_and_no_valid_charset_yet = 1;
4905 /**** designation ****/
4908 if (0x28 <= c && c <= 0x2F)
4910 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4913 if (0x40 <= c && c <= 0x42)
4916 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4917 *flags & CODING_STATE_R2L ?
4918 CHARSET_RIGHT_TO_LEFT :
4919 CHARSET_LEFT_TO_RIGHT);
4930 if (c < '0' || c > '~')
4931 return 0; /* bad final byte */
/* Derive the charset size (94 or 96), the single/multi-byte kind and
   the target register from which intermediate byte was seen. */
4933 if (iso->esc >= ISO_ESC_2_8 &&
4934 iso->esc <= ISO_ESC_2_15)
4936 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4937 single = 1; /* single-byte */
4938 reg = (iso->esc - ISO_ESC_2_8) & 3;
4940 else if (iso->esc >= ISO_ESC_2_4_8 &&
4941 iso->esc <= ISO_ESC_2_4_15)
4943 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4944 single = -1; /* multi-byte */
4945 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4949 /* Can this ever be reached? -slb */
4953 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4954 *flags & CODING_STATE_R2L ?
4955 CHARSET_RIGHT_TO_LEFT :
4956 CHARSET_LEFT_TO_RIGHT);
/* Remember the raw byte so that the caller can replay the sequence
   literally if it has to. */
4962 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4966 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4967 /* can't invoke something that ain't there. */
4969 iso->esc = ISO_ESC_SINGLE_SHIFT;
4970 *flags &= CODING_STATE_ISO2022_LOCK;
4972 *flags |= CODING_STATE_SS2;
4974 *flags |= CODING_STATE_SS3;
4978 if (check_invalid_charsets &&
4979 !CHARSETP (iso->charset[reg]))
4980 /* can't invoke something that ain't there. */
4983 iso->register_right = reg;
4985 iso->register_left = reg;
4986 *flags &= CODING_STATE_ISO2022_LOCK;
4987 iso->esc = ISO_ESC_LOCKING_SHIFT;
4991 if (NILP (cs) && check_invalid_charsets)
4993 iso->invalid_designated[reg] = 1;
4994 iso->charset[reg] = Vcharset_ascii;
4995 iso->esc = ISO_ESC_DESIGNATE;
4996 *flags &= CODING_STATE_ISO2022_LOCK;
4997 iso->output_literally = 1;
4998 if (iso->switched_dir_and_no_valid_charset_yet)
5000 /* We encountered a switch-direction followed by an
5001 invalid designation. Ensure that the switch-direction
5002 gets outputted; otherwise it will probably get eaten
5003 when the text is written out again. */
5004 iso->switched_dir_and_no_valid_charset_yet = 0;
5005 iso->output_direction_sequence = 1;
5006 /* And make sure that the switch-dir going the other
5007 way gets outputted, as well. */
5008 iso->invalid_switch_dir = 1;
5012 /* This function is called with CODESYS equal to nil when
5013 doing coding-system detection. */
5014 if (!NILP (codesys))
5016 charset_conversion_spec_dynarr *dyn =
5017 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5023 for (i = 0; i < Dynarr_length (dyn); i++)
5025 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5026 if (EQ (cs, spec->from_charset))
5027 cs = spec->to_charset;
5032 iso->charset[reg] = cs;
5033 iso->esc = ISO_ESC_DESIGNATE;
5034 *flags &= CODING_STATE_ISO2022_LOCK;
5035 if (iso->invalid_designated[reg])
5037 iso->invalid_designated[reg] = 0;
5038 iso->output_literally = 1;
5040 if (iso->switched_dir_and_no_valid_charset_yet)
5041 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Examine the N bytes at SRC and narrow down which ISO 2022 coding
   categories the data could belong to.  The running state lives in
   ST->iso2022, so the function can be called again on later chunks of
   the same data.  On the first call the decoder state is reset and the
   candidate mask is set to all five ISO categories; bytes seen
   afterwards can only clear bits in it (or, for a locking shift,
   replace it outright).  */
5046 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
5050 /* #### There are serious deficiencies in the recognition mechanism
5051 here. This needs to be much smarter if it's going to cut it.
5052 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5053 it should be detected as Latin-1.
5054 All the ISO2022 stuff in this file should be synced up with the
5055 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5056 Perhaps we should wait till R2L works in FSF Emacs? */
5058 if (!st->iso2022.initted)
5060 reset_iso2022 (Qnil, &st->iso2022.iso);
5061 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5062 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5063 CODING_CATEGORY_ISO_8_1_MASK |
5064 CODING_CATEGORY_ISO_8_2_MASK |
5065 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5066 st->iso2022.flags = 0;
5067 st->iso2022.high_byte_count = 0;
5068 st->iso2022.saw_single_shift = 0;
5069 st->iso2022.initted = 1;
5072 mask = st->iso2022.mask;
5076 unsigned char c = *(unsigned char *)src++;
5079 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5080 st->iso2022.high_byte_count++;
/* A run of high bytes has ended: use its length as a hint, then start
   counting afresh. */
5084 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5086 if (st->iso2022.high_byte_count & 1)
5087 /* odd number of high bytes; assume not iso-8-2 */
5088 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5090 st->iso2022.high_byte_count = 0;
5091 st->iso2022.saw_single_shift = 0;
5093 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5095 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5096 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5097 { /* control chars */
5100 /* Allow and ignore control characters that you might
5101 reasonably see in a text file */
5106 case 8: /* backspace */
5107 case 11: /* vertical tab */
5108 case 12: /* form feed */
5109 case 26: /* MS-DOS C-z junk */
5110 case 31: /* '^_' -- for info */
5111 goto label_continue_loop;
5118 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
/* Detection mode: the escape parser is given a nil coding system and
   is told not to check for invalid charsets. */
5121 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5122 &st->iso2022.flags, 0))
5124 switch (st->iso2022.iso.esc)
5126 case ISO_ESC_DESIGNATE:
5127 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5128 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5130 case ISO_ESC_LOCKING_SHIFT:
/* A locking shift decides the category by itself, so stop scanning. */
5131 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5132 goto ran_out_of_chars;
5133 case ISO_ESC_SINGLE_SHIFT:
5134 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5135 st->iso2022.saw_single_shift = 1;
5144 goto ran_out_of_chars;
5147 label_continue_loop:;
/* Tidy up a detection MASK of ISO 2022 coding categories: when the
   seven-bit category is still a candidate, the three eight-bit
   categories are removed from the mask.  */
5156 postprocess_iso2022_mask (int mask)
5158 /* #### kind of cheesy */
5159 /* If seven-bit ISO is allowed, then assume that the encoding is
5160 entirely seven-bit and turn off the eight-bit ones. */
5161 if (mask & CODING_CATEGORY_ISO_7_MASK)
5162 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5163 CODING_CATEGORY_ISO_8_1_MASK |
5164 CODING_CATEGORY_ISO_8_2_MASK);
5168 /* If FLAGS is a null pointer or specifies right-to-left motion,
5169 output a switch-dir-to-left-to-right sequence to DST.
5170 The sequence is CSI 0 ], with the CSI spelled as ESC [ when CODESYS
5171 is a seven-bit coding system.  Also update FLAGS (clearing
5172 CODING_STATE_R2L) if it is not a null pointer.  If INTERNAL_P is set,
we are outputting in internal format and need to handle the CSI
differently: it is added with DECODE_ADD_BINARY_CHAR instead of as a
raw byte. */
5175 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5176 unsigned_char_dynarr *dst,
5177 unsigned int *flags,
5180 if (!flags || (*flags & CODING_STATE_R2L))
5182 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
/* Seven-bit form of CSI. */
5184 Dynarr_add (dst, ISO_CODE_ESC);
5185 Dynarr_add (dst, '[');
5187 else if (internal_p)
5188 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5190 Dynarr_add (dst, ISO_CODE_CSI);
/* Parameter and final byte of the direction sequence. */
5191 Dynarr_add (dst, '0');
5192 Dynarr_add (dst, ']');
5194 *flags &= ~CODING_STATE_R2L;
5198 /* If FLAGS is a null pointer or specifies a direction different from
5199 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5200 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5201 sequence to DST. Also update FLAGS if it is not a null pointer.
5202 If INTERNAL_P is set, we are outputting in internal format and
5203 need to handle the CSI differently.
Switching to left-to-right is delegated to
restore_left_to_right_direction.  Switching to right-to-left emits
CSI 2 ] (ESC [ 2 ] for a seven-bit CODESYS) and is skipped entirely
when CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys) is true. */
5206 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5207 unsigned_char_dynarr *dst, unsigned int *flags,
5210 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5211 direction == CHARSET_LEFT_TO_RIGHT)
5212 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5213 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5214 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5215 direction == CHARSET_RIGHT_TO_LEFT)
5217 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
/* Seven-bit form of CSI. */
5219 Dynarr_add (dst, ISO_CODE_ESC);
5220 Dynarr_add (dst, '[');
5222 else if (internal_p)
5223 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5225 Dynarr_add (dst, ISO_CODE_CSI);
/* Parameter and final byte of the direction sequence. */
5226 Dynarr_add (dst, '2');
5227 Dynarr_add (dst, ']');
5229 *flags |= CODING_STATE_R2L;
5233 /* Convert ISO2022-format data to internal format.
Reads N bytes from SRC and appends the decoded result to DST.  The
decoder state (flags, cpos, counter, eol_type and the iso2022
sub-state) is taken from the decoding stream behind DECODING, so a
character or escape sequence may span calls.  Each byte falls into
one of three cases: a byte inside an escape sequence, a C0/C1 control
character, or a graphic character.  Sequences and bytes that cannot
be interpreted are copied to DST literally with
DECODE_ADD_BINARY_CHAR so that they are not lost. */
5236 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5237 unsigned_char_dynarr *dst, size_t n)
5239 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5240 unsigned int flags = str->flags;
5241 unsigned int cpos = str->cpos;
5242 unsigned char counter = str->counter;
5243 eol_type_t eol_type = str->eol_type;
5244 #ifdef ENABLE_COMPOSITE_CHARS
5245 unsigned_char_dynarr *real_dst = dst;
5247 Lisp_Object coding_system;
5249 XSETCODING_SYSTEM (coding_system, str->codesys);
5251 #ifdef ENABLE_COMPOSITE_CHARS
/* While a composite character is being collected, output goes to a
   side buffer instead of the real destination. */
5252 if (flags & CODING_STATE_COMPOSITE)
5253 dst = str->iso2022.composite_chars;
5254 #endif /* ENABLE_COMPOSITE_CHARS */
5258 unsigned char c = *(unsigned char *)src++;
5259 if (flags & CODING_STATE_ESCAPE)
5260 { /* Within ESC sequence */
5261 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5266 switch (str->iso2022.esc)
5268 #ifdef ENABLE_COMPOSITE_CHARS
5269 case ISO_ESC_START_COMPOSITE:
5270 if (str->iso2022.composite_chars)
5271 Dynarr_reset (str->iso2022.composite_chars);
5273 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5274 dst = str->iso2022.composite_chars;
5276 case ISO_ESC_END_COMPOSITE:
5278 Bufbyte comstr[MAX_EMCHAR_LEN];
5280 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5281 Dynarr_length (dst));
5283 len = set_charptr_emchar (comstr, emch);
5284 Dynarr_add_many (dst, comstr, len);
5287 #endif /* ENABLE_COMPOSITE_CHARS */
5289 case ISO_ESC_LITERAL:
5290 COMPOSE_FLUSH_CHARS (str, dst);
5291 decode_flush_er_chars (str, dst);
5292 DECODE_ADD_BINARY_CHAR (c, dst);
5296 /* Everything else handled already */
5301 /* Attempted error recovery. */
/* FLAGS is passed as a null pointer here, which makes the callee emit
   the direction sequence unconditionally; INTERNAL_P is 1. */
5302 if (str->iso2022.output_direction_sequence)
5303 ensure_correct_direction (flags & CODING_STATE_R2L ?
5304 CHARSET_RIGHT_TO_LEFT :
5305 CHARSET_LEFT_TO_RIGHT,
5306 str->codesys, dst, 0, 1);
5307 /* More error recovery. */
5308 if (!retval || str->iso2022.output_literally)
5310 /* Output the (possibly invalid) sequence */
5312 COMPOSE_FLUSH_CHARS (str, dst);
5313 decode_flush_er_chars (str, dst);
5314 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5315 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5316 flags &= CODING_STATE_ISO2022_LOCK;
5318 n++, src--;/* Repeat the loop with the same character. */
5321 /* No sense in reprocessing the final byte of the
5322 escape sequence; it could mess things up anyway.
5324 COMPOSE_FLUSH_CHARS (str, dst);
5325 decode_flush_er_chars (str, dst);
5326 DECODE_ADD_BINARY_CHAR (c, dst);
5332 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5333 { /* Control characters */
5335 /***** Error-handling *****/
5337 /* If we were in the middle of a character, dump out the
5338 partial character. */
5341 COMPOSE_FLUSH_CHARS (str, dst);
5342 decode_flush_er_chars (str, dst);
5346 DECODE_ADD_BINARY_CHAR
5347 ((unsigned char)(cpos >> (counter * 8)), dst);
5352 /* If we just saw a single-shift character, dump it out.
5353 This may dump out the wrong sort of single-shift character,
5354 but at least it will give an indication that something went
5356 if (flags & CODING_STATE_SS2)
5358 COMPOSE_FLUSH_CHARS (str, dst);
5359 decode_flush_er_chars (str, dst);
5360 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5361 flags &= ~CODING_STATE_SS2;
5363 if (flags & CODING_STATE_SS3)
5365 COMPOSE_FLUSH_CHARS (str, dst);
5366 decode_flush_er_chars (str, dst);
5367 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5368 flags &= ~CODING_STATE_SS3;
5371 /***** Now handle the control characters. *****/
5377 COMPOSE_FLUSH_CHARS (str, dst);
5378 decode_flush_er_chars (str, dst);
5379 if (eol_type == EOL_CR)
5380 Dynarr_add (dst, '\n');
5381 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5382 Dynarr_add (dst, c);
5384 flags |= CODING_STATE_CR;
5385 goto label_continue_loop;
5387 else if (flags & CODING_STATE_CR)
5388 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5390 Dynarr_add (dst, '\r');
5391 flags &= ~CODING_STATE_CR;
5394 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5397 flags &= CODING_STATE_ISO2022_LOCK;
/* Hand the control character to the escape parser; if it rejects the
   byte, keep the byte in the output as a binary character. */
5399 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5401 COMPOSE_FLUSH_CHARS (str, dst);
5402 decode_flush_er_chars (str, dst);
5403 DECODE_ADD_BINARY_CHAR (c, dst);
5407 { /* Graphic characters */
5408 Lisp_Object charset;
5417 COMPOSE_FLUSH_CHARS (str, dst);
5418 decode_flush_er_chars (str, dst);
5419 if (eol_type == EOL_CR)
5420 Dynarr_add (dst, '\n');
5421 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5422 Dynarr_add (dst, c);
5424 flags |= CODING_STATE_CR;
5425 goto label_continue_loop;
5427 else if (flags & CODING_STATE_CR)
5428 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5430 Dynarr_add (dst, '\r');
5431 flags &= ~CODING_STATE_CR;
5434 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5437 /* Now determine the charset. */
/* A pending single shift selects register 2 or 3; otherwise a byte
   that is not ASCII uses the register invoked into the right half and
   an ASCII byte the one invoked into the left half. */
5438 reg = ((flags & CODING_STATE_SS2) ? 2
5439 : (flags & CODING_STATE_SS3) ? 3
5440 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5441 : str->iso2022.register_left);
5442 charset = str->iso2022.charset[reg];
5444 /* Error checking: */
5445 if (! CHARSETP (charset)
5446 || str->iso2022.invalid_designated[reg]
5447 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5448 && XCHARSET_CHARS (charset) == 94))
5449 /* Mrmph. We are trying to invoke a register that has no
5450 or an invalid charset in it, or trying to add a character
5451 outside the range of the charset. Insert that char literally
5452 to preserve it for the output. */
5454 COMPOSE_FLUSH_CHARS (str, dst);
5455 decode_flush_er_chars (str, dst);
5459 DECODE_ADD_BINARY_CHAR
5460 ((unsigned char)(cpos >> (counter * 8)), dst);
5463 DECODE_ADD_BINARY_CHAR (c, dst);
5468 /* Things are probably hunky-dory. */
5470 /* Fetch reverse charset, maybe. */
5471 if (((flags & CODING_STATE_R2L) &&
5472 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5474 (!(flags & CODING_STATE_R2L) &&
5475 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5477 Lisp_Object new_charset =
5478 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5479 if (!NILP (new_charset))
5480 charset = new_charset;
5485 if (XCHARSET_DIMENSION (charset) == counter)
5487 COMPOSE_ADD_CHAR (str,
5488 DECODE_CHAR (charset,
5489 ((cpos & 0x7F7F7F) << 8)
5496 cpos = (cpos << 8) | c;
5498 lb = XCHARSET_LEADING_BYTE (charset);
5499 switch (XCHARSET_REP_BYTES (charset))
5502 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5503 Dynarr_add (dst, c & 0x7F);
5506 case 2: /* one-byte official */
5507 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5508 Dynarr_add (dst, lb);
5509 Dynarr_add (dst, c | 0x80);
5512 case 3: /* one-byte private or two-byte official */
5513 if (XCHARSET_PRIVATE_P (charset))
5515 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5516 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5517 Dynarr_add (dst, lb);
5518 Dynarr_add (dst, c | 0x80);
5524 Dynarr_add (dst, lb);
5525 Dynarr_add (dst, ch | 0x80);
5526 Dynarr_add (dst, c | 0x80);
5534 default: /* two-byte private */
5537 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5538 Dynarr_add (dst, lb);
5539 Dynarr_add (dst, ch | 0x80);
5540 Dynarr_add (dst, c | 0x80);
5550 flags &= CODING_STATE_ISO2022_LOCK;
5553 label_continue_loop:;
/* At end of input, flush anything still buffered, including a
   partially read character. */
5556 if (flags & CODING_STATE_END)
5558 COMPOSE_FLUSH_CHARS (str, dst);
5559 decode_flush_er_chars (str, dst);
5560 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5564 str->counter = counter;
5568 /***** ISO2022 encoder *****/
5570 /* Designate CHARSET into register REG.
CHARSET is recorded in str->iso2022.charset[reg] in every case.  The
charset's size, dimension and final byte are compared with those of
the charset previously in the register, unless
force_charset_on_output[reg] is set; the force flag is cleared
afterwards.  CHARSET is then mapped through the coding system's
output_conv table, and a designation sequence is added to DST: ESC,
intermediate byte(s) taken from "()*+" for 94-sets or ",-./" for
96-sets (with '$' on some paths), and the final byte.
NOTE(review): the statements taken when CHARSET is not a charset or
when the comparison matches are not visible in this listing --
presumably an early return; confirm. */
5573 iso2022_designate (Lisp_Object charset, unsigned char reg,
5574 struct encoding_stream *str, unsigned_char_dynarr *dst)
/* Intermediate bytes indexed by register number G0..G3. */
5576 static const char inter94[] = "()*+";
5577 static const char inter96[] = ",-./";
5578 unsigned short chars;
5579 unsigned char dimension;
5580 unsigned char final;
5581 Lisp_Object old_charset = str->iso2022.charset[reg];
5583 str->iso2022.charset[reg] = charset;
5584 if (!CHARSETP (charset))
5585 /* charset might be an initial nil or t. */
5587 chars = XCHARSET_CHARS (charset);
5588 dimension = XCHARSET_DIMENSION (charset);
5589 final = XCHARSET_FINAL (charset);
5590 if (!str->iso2022.force_charset_on_output[reg] &&
5591 CHARSETP (old_charset) &&
5592 XCHARSET_CHARS (old_charset) == chars &&
5593 XCHARSET_DIMENSION (old_charset) == dimension &&
5594 XCHARSET_FINAL (old_charset) == final)
5597 str->iso2022.force_charset_on_output[reg] = 0;
5600 charset_conversion_spec_dynarr *dyn =
5601 str->codesys->iso2022.output_conv;
5607 for (i = 0; i < Dynarr_length (dyn); i++)
5609 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5610 if (EQ (charset, spec->from_charset))
5611 charset = spec->to_charset;
5616 Dynarr_add (dst, ISO_CODE_ESC);
5621 Dynarr_add (dst, inter94[reg]);
5624 Dynarr_add (dst, '$');
5626 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5629 Dynarr_add (dst, inter94[reg]);
5634 Dynarr_add (dst, inter96[reg]);
5637 Dynarr_add (dst, '$');
5638 Dynarr_add (dst, inter96[reg]);
5642 Dynarr_add (dst, final);
/* Make register 0 the one invoked into the left half: if another
   register is currently there, add a shift-in (ISO_CODE_SI) to DST
   and record the change in STR.  */
5646 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5648 if (str->iso2022.register_left != 0)
5650 Dynarr_add (dst, ISO_CODE_SI);
5651 str->iso2022.register_left = 0;
/* Make register 1 the one invoked into the left half: if another
   register is currently there, add a shift-out (ISO_CODE_SO) to DST
   and record the change in STR.  */
5656 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5658 if (str->iso2022.register_left != 1)
5660 Dynarr_add (dst, ISO_CODE_SO);
5661 str->iso2022.register_left = 1;
/* Encode the character CH in ISO 2022 form and add the result to DST.
   The charset and half used for the previous character are read from
   STR->iso2022, and the ones used for CH are stored back at the end.
   As needed, the function restores or switches the text direction
   (updating *FLAGS), designates a charset into a register, invokes
   that register with a locking or single shift, and finally adds the
   bytes of the code point, most significant byte first.  */
5666 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5667 unsigned_char_dynarr *dst, unsigned int *flags)
5669 unsigned char charmask;
5670 Lisp_Coding_System* codesys = str->codesys;
5671 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5673 Lisp_Object charset = str->iso2022.current_charset;
5674 int half = str->iso2022.current_half;
5675 int code_point = -1;
5679 restore_left_to_right_direction (codesys, dst, flags, 0);
5681 /* Make sure G0 contains ASCII */
5682 if ((ch > ' ' && ch < ISO_CODE_DEL)
5683 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5685 ensure_normal_shift (str, dst);
5686 iso2022_designate (Vcharset_ascii, 0, str, dst);
5689 /* If necessary, restore everything to the default state
5691 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5693 restore_left_to_right_direction (codesys, dst, flags, 0);
5695 ensure_normal_shift (str, dst);
5697 for (i = 0; i < 4; i++)
5699 Lisp_Object initial_charset =
5700 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5701 iso2022_designate (initial_charset, i, str, dst);
5706 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5707 Dynarr_add (dst, '\r');
5708 if (eol_type != EOL_CR)
5709 Dynarr_add (dst, ch);
5713 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5714 && fit_to_be_escape_quoted (ch))
5715 Dynarr_add (dst, ISO_CODE_ESC);
5716 Dynarr_add (dst, ch);
5719 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5721 charmask = (half == 0 ? 0x00 : 0x80);
5723 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5724 && fit_to_be_escape_quoted (ch))
5725 Dynarr_add (dst, ISO_CODE_ESC);
5726 /* you asked for it ... */
5727 Dynarr_add (dst, ch);
5733 /* Now determine which register to use. */
/* Look for a register whose current charset, or failing that whose
   initial charset, can represent CH. */
5735 for (i = 0; i < 4; i++)
5737 if ((CHARSETP (charset = str->iso2022.charset[i])
5738 && ((code_point = charset_code_point (charset, ch)) >= 0))
5742 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5743 && ((code_point = charset_code_point (charset, ch)) >= 0)))
/* The global priority list is modified while searching below, so keep
   the original value and put it back afterwards. */
5751 Lisp_Object original_default_coded_charset_priority_list
5752 = Vdefault_coded_charset_priority_list;
5754 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5756 code_point = ENCODE_CHAR (ch, charset);
5757 if (XCHARSET_FINAL (charset))
5759 Vdefault_coded_charset_priority_list
5760 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5761 Vdefault_coded_charset_priority_list));
5763 code_point = ENCODE_CHAR (ch, charset);
5764 if (!XCHARSET_FINAL (charset))
5766 charset = Vcharset_ascii;
5770 Vdefault_coded_charset_priority_list
5771 = original_default_coded_charset_priority_list;
5773 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5774 codesys, dst, flags, 0);
5778 if (XCHARSET_GRAPHIC (charset) != 0)
5780 if (!NILP (str->iso2022.charset[1]) &&
5781 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5782 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5784 else if (!NILP (str->iso2022.charset[2]))
5786 else if (!NILP (str->iso2022.charset[3]))
5795 iso2022_designate (charset, reg, str, dst);
5797 /* Now invoke that register. */
5801 ensure_normal_shift (str, dst);
5805 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5807 ensure_shift_out (str, dst);
/* Single shifts: seven-bit coding systems use the two-byte forms
   ESC N and ESC O in place of the SS2 and SS3 control bytes. */
5814 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5816 Dynarr_add (dst, ISO_CODE_ESC);
5817 Dynarr_add (dst, 'N');
5822 Dynarr_add (dst, ISO_CODE_SS2);
5827 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5829 Dynarr_add (dst, ISO_CODE_ESC);
5830 Dynarr_add (dst, 'O');
5835 Dynarr_add (dst, ISO_CODE_SS3);
/* Bytes destined for the right half get their high bit set. */
5843 charmask = (half == 0 ? 0x00 : 0x80);
5845 switch (XCHARSET_DIMENSION (charset))
5848 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5851 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5852 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5855 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5856 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5857 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5860 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5861 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5862 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5863 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5869 str->iso2022.current_charset = charset;
5870 str->iso2022.current_half = half;
/* Return the ISO 2022 encoder to its starting state: restore
   left-to-right direction (updating *FLAGS), shift register 0 back
   into the left half, and re-designate the coding system's initial
   charset into each of the four registers.  Any bytes this needs are
   added to DST.  */
5874 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5875 unsigned int *flags)
5877 Lisp_Coding_System* codesys = str->codesys;
5880 restore_left_to_right_direction (codesys, dst, flags, 0);
5881 ensure_normal_shift (str, dst);
5882 for (i = 0; i < 4; i++)
5884 Lisp_Object initial_charset
5885 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5886 iso2022_designate (initial_charset, i, str, dst);
5891 /************************************************************************/
5892 /* No-conversion methods */
5893 /************************************************************************/
5895 /* This is used when reading in "binary" files -- i.e. files that may
5896 contain all 256 possible byte values and that are not to be
5897 interpreted as being in any particular decoding.
Each of the N bytes at SRC goes through end-of-line handling for the
stream's eol_type and is then added to DST with
DECODE_ADD_BINARY_CHAR.  The flags, cpos and eol_type state come from
the decoding stream behind DECODING. */
5899 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5900 unsigned_char_dynarr *dst, size_t n)
5902 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5903 unsigned int flags = str->flags;
5904 unsigned int cpos = str->cpos;
5905 eol_type_t eol_type = str->eol_type;
5909 unsigned char c = *(unsigned char *)src++;
5911 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5912 DECODE_ADD_BINARY_CHAR (c, dst);
5913 label_continue_loop:;
5916 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
/* Encode N bytes of internal-format text at SRC as raw bytes in DST,
   applying the coding system's end-of-line convention.
   The function has two bodies separated by the "not UTF2000"
   preprocessor branch.  In the first, a multi-byte sequence is
   reassembled six bits at a time into CH, with char_boundary tracking
   the position within the sequence, and the low eight bits of the
   result are output.  In the second, ASCII bytes are copied, bytes
   following a Latin-1 leading byte are copied, bytes following a
   control-1 leading byte are output minus 0x20, and any other leading
   byte is replaced by '~'.
   NOTE(review): several conditions are not visible in this listing;
   the summary above is based on the statements that are.  */
5923 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5924 unsigned_char_dynarr *dst, size_t n)
5927 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5928 unsigned int flags = str->flags;
5929 unsigned int ch = str->ch;
5930 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5932 unsigned char char_boundary = str->iso2022.current_char_boundary;
5939 if (char_boundary == 0)
5945 else if ( c >= 0xf8 )
5950 else if ( c >= 0xf0 )
5955 else if ( c >= 0xe0 )
5960 else if ( c >= 0xc0 )
5970 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5971 Dynarr_add (dst, '\r');
5972 if (eol_type != EOL_CR)
5973 Dynarr_add (dst, c);
5976 Dynarr_add (dst, c);
5979 else if (char_boundary == 1)
/* Last byte of the sequence: fold in its six payload bits and output
   the low eight bits of the assembled value. */
5981 ch = ( ch << 6 ) | ( c & 0x3f );
5982 Dynarr_add (dst, ch & 0xff);
5987 ch = ( ch << 6 ) | ( c & 0x3f );
5990 #else /* not UTF2000 */
5993 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5994 Dynarr_add (dst, '\r');
5995 if (eol_type != EOL_CR)
5996 Dynarr_add (dst, '\n');
5999 else if (BYTE_ASCII_P (c))
6002 Dynarr_add (dst, c);
6004 else if (BUFBYTE_LEADING_BYTE_P (c))
6007 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6008 c == LEADING_BYTE_CONTROL_1)
6011 Dynarr_add (dst, '~'); /* untranslatable character */
6015 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6016 Dynarr_add (dst, c);
6017 else if (ch == LEADING_BYTE_CONTROL_1)
6020 Dynarr_add (dst, c - 0x20);
6022 /* else it should be the second or third byte of an
6023 untranslatable character, so ignore it */
6026 #endif /* not UTF2000 */
6032 str->iso2022.current_char_boundary = char_boundary;
6038 /************************************************************************/
6039 /* Initialization */
6040 /************************************************************************/
/* Set up the Lisp-visible names defined by this file: the
   coding-system lrecord implementation, the coding-system-error
   condition (a child of Qio_error), the Lisp primitives, and the
   symbols used for coding-system types, properties, end-of-line
   types and coding categories.  */
6043 syms_of_file_coding (void)
6045 INIT_LRECORD_IMPLEMENTATION (coding_system);
6047 deferror (&Qcoding_system_error, "coding-system-error",
6048 "Coding-system error", Qio_error);
/* Lisp primitives. */
6050 DEFSUBR (Fcoding_system_p);
6051 DEFSUBR (Ffind_coding_system);
6052 DEFSUBR (Fget_coding_system);
6053 DEFSUBR (Fcoding_system_list);
6054 DEFSUBR (Fcoding_system_name);
6055 DEFSUBR (Fmake_coding_system);
6056 DEFSUBR (Fcopy_coding_system);
6057 DEFSUBR (Fcoding_system_canonical_name_p);
6058 DEFSUBR (Fcoding_system_alias_p);
6059 DEFSUBR (Fcoding_system_aliasee);
6060 DEFSUBR (Fdefine_coding_system_alias);
6061 DEFSUBR (Fsubsidiary_coding_system);
6063 DEFSUBR (Fcoding_system_type);
6064 DEFSUBR (Fcoding_system_doc_string);
6066 DEFSUBR (Fcoding_system_charset);
6068 DEFSUBR (Fcoding_system_property);
6070 DEFSUBR (Fcoding_category_list);
6071 DEFSUBR (Fset_coding_priority_list);
6072 DEFSUBR (Fcoding_priority_list);
6073 DEFSUBR (Fset_coding_category_system);
6074 DEFSUBR (Fcoding_category_system);
6076 DEFSUBR (Fdetect_coding_region);
6077 DEFSUBR (Fdecode_coding_region);
6078 DEFSUBR (Fencode_coding_region);
6080 DEFSUBR (Fdecode_shift_jis_char);
6081 DEFSUBR (Fencode_shift_jis_char);
6082 DEFSUBR (Fdecode_big5_char);
6083 DEFSUBR (Fencode_big5_char);
/* Coding-system type symbols. */
6085 defsymbol (&Qcoding_systemp, "coding-system-p");
6086 defsymbol (&Qno_conversion, "no-conversion");
6087 defsymbol (&Qraw_text, "raw-text");
6089 defsymbol (&Qbig5, "big5");
6090 defsymbol (&Qshift_jis, "shift-jis");
6091 defsymbol (&Qucs4, "ucs-4");
6092 defsymbol (&Qutf8, "utf-8");
6093 defsymbol (&Qccl, "ccl");
6094 defsymbol (&Qiso2022, "iso2022");
/* Property names and end-of-line type symbols. */
6096 defsymbol (&Qmnemonic, "mnemonic");
6097 defsymbol (&Qeol_type, "eol-type");
6098 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6099 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6101 defsymbol (&Qcr, "cr");
6102 defsymbol (&Qlf, "lf");
6103 defsymbol (&Qcrlf, "crlf");
6104 defsymbol (&Qeol_cr, "eol-cr");
6105 defsymbol (&Qeol_lf, "eol-lf");
6106 defsymbol (&Qeol_crlf, "eol-crlf");
/* ISO 2022 property names. */
6108 defsymbol (&Qcharset_g0, "charset-g0");
6109 defsymbol (&Qcharset_g1, "charset-g1");
6110 defsymbol (&Qcharset_g2, "charset-g2");
6111 defsymbol (&Qcharset_g3, "charset-g3");
6112 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6113 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6114 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6115 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6116 defsymbol (&Qno_iso6429, "no-iso6429");
6117 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6118 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6120 defsymbol (&Qshort, "short");
6121 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6122 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6123 defsymbol (&Qseven, "seven");
6124 defsymbol (&Qlock_shift, "lock-shift");
6125 defsymbol (&Qescape_quoted, "escape-quoted");
6128 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6129 defsymbol (&Qdisable_composition, "disable-composition");
6130 defsymbol (&Quse_entity_reference, "use-entity-reference");
6131 defsymbol (&Qd, "d");
6132 defsymbol (&Qx, "x");
6133 defsymbol (&QX, "X");
6135 defsymbol (&Qencode, "encode");
6136 defsymbol (&Qdecode, "decode");
/* One symbol per coding category, stored in coding_category_symbol[].
   The symbol-name arguments are on lines not shown in this listing. */
6139 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6141 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6143 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6145 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6147 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6149 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6151 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6153 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6155 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6158 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare the methods of the two lstream types defined by this file.
   The "decoding" and "encoding" types each get the same seven
   methods: reader, writer, rewinder, seekable_p, flusher, closer and
   marker.
   NOTE(review): LSTREAM_HAS_METHOD presumably installs the
   correspondingly named handler function for each type; confirm
   against its definition.  */
6163 lstream_type_create_file_coding (void)
6165 LSTREAM_HAS_METHOD (decoding, reader);
6166 LSTREAM_HAS_METHOD (decoding, writer);
6167 LSTREAM_HAS_METHOD (decoding, rewinder);
6168 LSTREAM_HAS_METHOD (decoding, seekable_p);
6169 LSTREAM_HAS_METHOD (decoding, flusher);
6170 LSTREAM_HAS_METHOD (decoding, closer);
6171 LSTREAM_HAS_METHOD (decoding, marker);
6173 LSTREAM_HAS_METHOD (encoding, reader);
6174 LSTREAM_HAS_METHOD (encoding, writer);
6175 LSTREAM_HAS_METHOD (encoding, rewinder);
6176 LSTREAM_HAS_METHOD (encoding, seekable_p);
6177 LSTREAM_HAS_METHOD (encoding, flusher);
6178 LSTREAM_HAS_METHOD (encoding, closer);
6179 LSTREAM_HAS_METHOD (encoding, marker);
/* Set up the simple variables of this file: allocate the `fcd'
   structure and register it with dump_add_root_struct_ptr, give every
   coding category a default entry, provide the `file-coding' feature,
   and declare the Lisp variables defined here together with their
   initial values.  */
6183 vars_of_file_coding (void)
/* fcd holds the per-category tables declared in
   struct file_coding_dump.  */
6187 fcd = xnew (struct file_coding_dump);
6188 dump_add_root_struct_ptr (&fcd, &fcd_description);
6190 /* Initialize to something reasonable ... */
6191 for (i = 0; i < CODING_CATEGORY_LAST; i++)
/* No coding system is associated with category i yet.  */
6193 fcd->coding_category_system[i] = Qnil;
/* Identity permutation: category i initially has priority rank i.  */
6194 fcd->coding_category_by_priority[i] = i;
6197 Fprovide (intern ("file-coding"));
/* Each DEFVAR_LISP / DEFVAR_BOOL form below pairs a Lisp variable name
   with the C variable that backs it; the comment attached to the form
   is the variable's docstring.  The assignment that follows each form
   sets the initial value: Qnil for the Lisp_Object variables and 1 for
   enable_multibyte_characters.  */
6199 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6200 Coding system used for TTY keyboard input.
6201 Not used under a windowing system.
6203 Vkeyboard_coding_system = Qnil;
6205 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6206 Coding system used for TTY display output.
6207 Not used under a windowing system.
6209 Vterminal_coding_system = Qnil;
6211 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6212 Overriding coding system used when reading from a file or process.
6213 You should bind this variable with `let', but do not set it globally.
6214 If this is non-nil, it specifies the coding system that will be used
6215 to decode input on read operations, such as from a file or process.
6216 It overrides `buffer-file-coding-system-for-read',
6217 `insert-file-contents-pre-hook', etc. Use those variables instead of
6218 this one for permanent changes to the environment. */ );
6219 Vcoding_system_for_read = Qnil;
6221 DEFVAR_LISP ("coding-system-for-write",
6222 &Vcoding_system_for_write /*
6223 Overriding coding system used when writing to a file or process.
6224 You should bind this variable with `let', but do not set it globally.
6225 If this is non-nil, it specifies the coding system that will be used
6226 to encode output for write operations, such as to a file or process.
6227 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6228 Use those variables instead of this one for permanent changes to the
6230 Vcoding_system_for_write = Qnil;
6232 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6233 Coding system used to convert pathnames when accessing files.
6235 Vfile_name_coding_system = Qnil;
6237 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6238 &Vcoded_charset_entity_reference_alist /*
6239 Alist of coded-charset vs corresponding entity-reference.
6240 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6241 CCS is coded-charset.
6242 CODE-COLUMNS is columns of code-point of entity-reference.
6243 CODE-TYPE is format type of code-point of entity-reference.
6244 `d' means decimal value and `x' means hexadecimal value.
6246 Vcoded_charset_entity_reference_alist = Qnil;
6248 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6249 Non-nil means the buffer contents are regarded as multi-byte form
6250 of characters, not a binary code. This affects the display, file I/O,
6251 and behaviors of various editing commands.
6253 Setting this to nil does not do anything.
6255 enable_multibyte_characters = 1;
/* Set up the state that needs more than a plain assignment: create the
   coding-system hash table and the dynarr of known coding-system
   properties, define the built-in coding systems (raw-text, binary and
   the UTF-8 one with the Multiple Coded-character-Sets extension) and
   their aliases, and install the coding systems needed for
   bootstrapping into fcd->coding_category_system.  When MULE is
   defined without UTF2000, also initialize the UCS <-> Mule tables.  */
6259 complex_vars_of_file_coding (void)
/* Table of coding systems; registered with staticpro before it is
   created.  */
6261 staticpro (&Vcoding_system_hash_table);
6262 Vcoding_system_hash_table =
6263 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
/* Dynarr of codesys_prop records, registered with
   dump_add_root_struct_ptr like fcd.  */
6265 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6266 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
/* DEFINE_CODESYS_PROP appends one codesys_prop record, tagged with
   Prop_Type, to the_codesys_prop_dynarr.  The invocations that follow
   register the properties in groups by tag: CODESYS_PROP_ALL_OK first,
   then CODESYS_PROP_ISO2022, then CODESYS_PROP_CCL, then two more
   CODESYS_PROP_ALL_OK entries.
   NOTE(review): the tags presumably restrict which coding-system types
   accept the property -- confirm where prop_type is consulted.  */
6268 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6270 struct codesys_prop csp; \
6272 csp.prop_type = (Prop_Type); \
6273 Dynarr_add (the_codesys_prop_dynarr, csp); \
6276 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6277 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6278 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6279 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6280 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6281 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6282 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
/* Properties tagged CODESYS_PROP_ISO2022.  */
6284 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6285 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6286 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6287 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6288 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6289 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6290 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6291 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6292 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6293 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6294 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6295 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6296 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6297 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6298 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6299 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6300 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
/* Properties tagged CODESYS_PROP_CCL.  */
6302 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6303 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
/* Two further properties tagged CODESYS_PROP_ALL_OK.  */
6305 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6306 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6309 /* Need to create this here or we're really screwed. */
6311 (Qraw_text, Qno_conversion,
6312 build_string ("Raw text, which means it converts only line-break-codes."),
6313 list2 (Qmnemonic, build_string ("Raw")));
/* binary: same Qno_conversion type as raw-text, but with the eol-type
   property fixed to Qlf.  */
6316 (Qbinary, Qno_conversion,
6317 build_string ("Binary, which means it does not convert anything."),
6318 list4 (Qeol_type, Qlf,
6319 Qmnemonic, build_string ("Binary")));
6325 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6326 list2 (Qmnemonic, build_string ("MTF8")));
/* Aliases relating further names to the coding systems defined
   above.  */
6329 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6331 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6333 Fdefine_coding_system_alias (Qterminal, Qbinary);
6334 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6336 /* Need this for bootstrapping */
6337 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6338 Fget_coding_system (Qraw_text);
/* Likewise associate the UTF-8 category with the utf-8-mcs coding
   system.  */
6341 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6342 = Fget_coding_system (Qutf_8_mcs);
/* Only for MULE builds without UTF2000: clear every slot of the
   65536-entry UCS -> Mule table and create the Mule -> UCS table as a
   generic char table, registered with staticpro.  */
6345 #if defined(MULE) && !defined(UTF2000)
6349 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6350 fcd->ucs_to_mule_table[i] = Qnil;
6352 staticpro (&mule_to_ucs_table);
6353 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6354 #endif /* defined(MULE) && !defined(UTF2000) */