1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs UTF-2000. */
40 #include "file-coding.h"
42 Lisp_Object Qcoding_system_error;
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
65 #if defined(MULE) && !defined(UTF2000)
66 Lisp_Object ucs_to_mule_table[65536];
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
90 Lisp_Object Qmnemonic, Qeol_type;
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
97 Lisp_Object Qucs4, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
108 Lisp_Object Qutf_8_mcs;
109 Lisp_Object Qdisable_composition;
110 Lisp_Object Quse_entity_reference;
111 Lisp_Object Qd, Qx, QX;
113 Lisp_Object Qencode, Qdecode;
115 Lisp_Object Vcoding_system_hash_table;
117 int enable_multibyte_characters;
120 /* Additional information used by the ISO2022 decoder and detector. */
121 struct iso2022_decoder
123 /* CHARSET holds the character sets currently assigned to the G0
124 through G3 variables. It is initialized from the array
125 INITIAL_CHARSET in CODESYS. */
126 Lisp_Object charset[4];
128 /* Which registers are currently invoked into the left (GL) and
129 right (GR) halves of the 8-bit encoding space? */
130 int register_left, register_right;
132 /* ISO_ESC holds a value indicating part of an escape sequence
133 that has already been seen. */
134 enum iso_esc_flag esc;
136 /* This records the bytes we've seen so far in an escape sequence,
137 in case the sequence is invalid (we spit out the bytes unchanged). */
138 unsigned char esc_bytes[8];
140 /* Index for next byte to store in ISO escape sequence. */
143 #ifdef ENABLE_COMPOSITE_CHARS
144 /* Stuff seen so far when composing a string. */
145 unsigned_char_dynarr *composite_chars;
148 /* If we saw an invalid designation sequence for a particular
149 register, we flag it here and switch to ASCII. The next time we
150 see a valid designation for this register, we turn off the flag
151 and do the designation normally, but pretend the sequence was
152 invalid. The effect of all this is that (most of the time) the
153 escape sequences for both the switch to the unknown charset, and
154 the switch back to the known charset, get inserted literally into
155 the buffer and saved out as such. The hope is that we can
156 preserve the escape sequences so that the resulting written out
157 file makes sense. If we don't do any of this, the designation
158 to the invalid charset will be preserved but that switch back
159 to the known charset will probably get eaten because it was
160 the same charset that was already present in the register. */
161 unsigned char invalid_designated[4];
163 /* We try to do similar things as above for direction-switching
164 sequences. If we encountered a direction switch while an
165 invalid designation was present, or an invalid designation
166 just after a direction switch (i.e. no valid designation
167 encountered yet), we insert the direction-switch escape
168 sequence literally into the output stream, and later on
169 insert the corresponding direction-restoring escape sequence
171 unsigned int switched_dir_and_no_valid_charset_yet :1;
172 unsigned int invalid_switch_dir :1;
174 /* Tells the decoder to output the escape sequence literally
175 even though it was valid. Used in the games we play to
176 avoid lossage when we encounter invalid designations. */
177 unsigned int output_literally :1;
178 /* We encountered a direction switch followed by an invalid
179 designation. We didn't output the direction switch
180 literally because we didn't know about the invalid designation;
181 but we have to do so now. */
182 unsigned int output_direction_sequence :1;
185 EXFUN (Fcopy_coding_system, 2);
187 struct detection_state;
190 text_encode_generic (Lstream *encoding, const Bufbyte *src,
191 unsigned_char_dynarr *dst, size_t n);
193 static int detect_coding_sjis (struct detection_state *st,
194 const Extbyte *src, size_t n);
195 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
196 unsigned_char_dynarr *dst, size_t n);
197 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
198 unsigned_char_dynarr *dst, unsigned int *flags);
199 void char_finish_shift_jis (struct encoding_stream *str,
200 unsigned_char_dynarr *dst, unsigned int *flags);
202 static int detect_coding_big5 (struct detection_state *st,
203 const Extbyte *src, size_t n);
204 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
205 unsigned_char_dynarr *dst, size_t n);
206 void char_encode_big5 (struct encoding_stream *str, Emchar c,
207 unsigned_char_dynarr *dst, unsigned int *flags);
208 void char_finish_big5 (struct encoding_stream *str,
209 unsigned_char_dynarr *dst, unsigned int *flags);
211 static int detect_coding_ucs4 (struct detection_state *st,
212 const Extbyte *src, size_t n);
213 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
214 unsigned_char_dynarr *dst, size_t n);
215 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
216 unsigned_char_dynarr *dst, unsigned int *flags);
217 void char_finish_ucs4 (struct encoding_stream *str,
218 unsigned_char_dynarr *dst, unsigned int *flags);
220 static int detect_coding_utf8 (struct detection_state *st,
221 const Extbyte *src, size_t n);
222 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
223 unsigned_char_dynarr *dst, size_t n);
224 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
225 unsigned_char_dynarr *dst, unsigned int *flags);
226 void char_finish_utf8 (struct encoding_stream *str,
227 unsigned_char_dynarr *dst, unsigned int *flags);
229 static int postprocess_iso2022_mask (int mask);
230 static void reset_iso2022 (Lisp_Object coding_system,
231 struct iso2022_decoder *iso);
232 static int detect_coding_iso2022 (struct detection_state *st,
233 const Extbyte *src, size_t n);
234 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
235 unsigned_char_dynarr *dst, size_t n);
236 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
237 unsigned_char_dynarr *dst, unsigned int *flags);
238 void char_finish_iso2022 (struct encoding_stream *str,
239 unsigned_char_dynarr *dst, unsigned int *flags);
241 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
242 unsigned_char_dynarr *dst, size_t n);
243 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
244 unsigned_char_dynarr *dst, size_t n);
245 static void mule_decode (Lstream *decoding, const Extbyte *src,
246 unsigned_char_dynarr *dst, size_t n);
247 static void mule_encode (Lstream *encoding, const Bufbyte *src,
248 unsigned_char_dynarr *dst, size_t n);
250 typedef struct codesys_prop codesys_prop;
259 Dynarr_declare (codesys_prop);
260 } codesys_prop_dynarr;
262 static const struct lrecord_description codesys_prop_description_1[] = {
263 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
267 static const struct struct_description codesys_prop_description = {
268 sizeof (codesys_prop),
269 codesys_prop_description_1
272 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
273 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
277 static const struct struct_description codesys_prop_dynarr_description = {
278 sizeof (codesys_prop_dynarr),
279 codesys_prop_dynarr_description_1
282 codesys_prop_dynarr *the_codesys_prop_dynarr;
284 enum codesys_prop_enum
287 CODESYS_PROP_ISO2022,
292 /************************************************************************/
293 /* Coding system functions */
294 /************************************************************************/
296 static Lisp_Object mark_coding_system (Lisp_Object);
297 static void print_coding_system (Lisp_Object, Lisp_Object, int);
298 static void finalize_coding_system (void *header, int for_disksave);
301 static const struct lrecord_description ccs_description_1[] = {
302 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
303 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
307 static const struct struct_description ccs_description = {
308 sizeof (charset_conversion_spec),
312 static const struct lrecord_description ccsd_description_1[] = {
313 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
317 static const struct struct_description ccsd_description = {
318 sizeof (charset_conversion_spec_dynarr),
323 static const struct lrecord_description coding_system_description[] = {
324 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
325 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
326 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
327 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
330 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
331 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
333 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
334 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
335 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
345 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
346 mark_coding_system, print_coding_system,
347 finalize_coding_system,
348 0, 0, coding_system_description,
352 mark_coding_system (Lisp_Object obj)
354 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
356 mark_object (CODING_SYSTEM_NAME (codesys));
357 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
358 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
359 mark_object (CODING_SYSTEM_EOL_LF (codesys));
360 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
361 mark_object (CODING_SYSTEM_EOL_CR (codesys));
363 switch (CODING_SYSTEM_TYPE (codesys))
367 case CODESYS_ISO2022:
368 for (i = 0; i < 4; i++)
369 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
370 if (codesys->iso2022.input_conv)
372 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
374 struct charset_conversion_spec *ccs =
375 Dynarr_atp (codesys->iso2022.input_conv, i);
376 mark_object (ccs->from_charset);
377 mark_object (ccs->to_charset);
380 if (codesys->iso2022.output_conv)
382 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
384 struct charset_conversion_spec *ccs =
385 Dynarr_atp (codesys->iso2022.output_conv, i);
386 mark_object (ccs->from_charset);
387 mark_object (ccs->to_charset);
394 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
395 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
400 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
401 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
408 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
410 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
412 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
416 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
419 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
421 error ("printing unreadable object #<coding_system 0x%x>",
424 write_c_string ("#<coding_system ", printcharfun);
425 print_internal (c->name, printcharfun, 1);
426 write_c_string (">", printcharfun);
430 finalize_coding_system (void *header, int for_disksave)
432 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
433 /* Since coding systems never go away, this function is not
434 necessary. But it would be necessary if we changed things
435 so that coding systems could go away. */
436 if (!for_disksave) /* see comment in lstream.c */
438 switch (CODING_SYSTEM_TYPE (c))
441 case CODESYS_ISO2022:
442 if (c->iso2022.input_conv)
444 Dynarr_free (c->iso2022.input_conv);
445 c->iso2022.input_conv = 0;
447 if (c->iso2022.output_conv)
449 Dynarr_free (c->iso2022.output_conv);
450 c->iso2022.output_conv = 0;
461 symbol_to_eol_type (Lisp_Object symbol)
463 CHECK_SYMBOL (symbol);
464 if (NILP (symbol)) return EOL_AUTODETECT;
465 if (EQ (symbol, Qlf)) return EOL_LF;
466 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
467 if (EQ (symbol, Qcr)) return EOL_CR;
469 signal_simple_error ("Unrecognized eol type", symbol);
470 return EOL_AUTODETECT; /* not reached */
474 eol_type_to_symbol (eol_type_t type)
479 case EOL_LF: return Qlf;
480 case EOL_CRLF: return Qcrlf;
481 case EOL_CR: return Qcr;
482 case EOL_AUTODETECT: return Qnil;
487 setup_eol_coding_systems (Lisp_Coding_System *codesys)
489 Lisp_Object codesys_obj;
490 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
491 char *codesys_name = (char *) alloca (len + 7);
493 char *codesys_mnemonic=0;
495 Lisp_Object codesys_name_sym, sub_codesys_obj;
499 XSETCODING_SYSTEM (codesys_obj, codesys);
501 memcpy (codesys_name,
502 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
504 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
506 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
507 codesys_mnemonic = (char *) alloca (mlen + 7);
508 memcpy (codesys_mnemonic,
509 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
512 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
513 strcpy (codesys_name + len, "-" op_sys); \
515 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
516 codesys_name_sym = intern (codesys_name); \
517 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
518 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
520 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
521 build_string (codesys_mnemonic); \
522 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
525 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
526 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
527 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
530 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
531 Return t if OBJECT is a coding system.
532 A coding system is an object that defines how text containing multiple
533 character sets is encoded into a stream of (typically 8-bit) bytes.
534 The coding system is used to decode the stream into a series of
535 characters (which may be from multiple charsets) when the text is read
536 from a file or process, and is used to encode the text back into the
537 same format when it is written out to a file or process.
539 For example, many ISO2022-compliant coding systems (such as Compound
540 Text, which is used for inter-client data under the X Window System)
541 use escape sequences to switch between different charsets -- Japanese
542 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
543 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
544 `make-coding-system' for more information.
546 Coding systems are normally identified using a symbol, and the
547 symbol is accepted in place of the actual coding system object whenever
548 a coding system is called for. (This is similar to how faces work.)
552 return CODING_SYSTEMP (object) ? Qt : Qnil;
555 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
556 Retrieve the coding system of the given name.
558 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
559 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
560 If there is no such coding system, nil is returned. Otherwise the
561 associated coding system object is returned.
563 (coding_system_or_name))
565 if (NILP (coding_system_or_name))
566 coding_system_or_name = Qbinary;
567 else if (CODING_SYSTEMP (coding_system_or_name))
568 return coding_system_or_name;
570 CHECK_SYMBOL (coding_system_or_name);
574 coding_system_or_name =
575 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
577 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
578 return coding_system_or_name;
582 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
583 Retrieve the coding system of the given name.
584 Same as `find-coding-system' except that if there is no such
585 coding system, an error is signaled instead of returning nil.
589 Lisp_Object coding_system = Ffind_coding_system (name);
591 if (NILP (coding_system))
592 signal_simple_error ("No such coding system", name);
593 return coding_system;
596 /* We store the coding systems in hash tables with the names as the key and the
597 actual coding system object as the value. Occasionally we need to use them
598 in a list format. These routines provide us with that. */
599 struct coding_system_list_closure
601 Lisp_Object *coding_system_list;
605 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
606 void *coding_system_list_closure)
608 /* This function can GC */
609 struct coding_system_list_closure *cscl =
610 (struct coding_system_list_closure *) coding_system_list_closure;
611 Lisp_Object *coding_system_list = cscl->coding_system_list;
613 *coding_system_list = Fcons (key, *coding_system_list);
617 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
618 Return a list of the names of all defined coding systems.
622 Lisp_Object coding_system_list = Qnil;
624 struct coding_system_list_closure coding_system_list_closure;
626 GCPRO1 (coding_system_list);
627 coding_system_list_closure.coding_system_list = &coding_system_list;
628 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
629 &coding_system_list_closure);
632 return coding_system_list;
635 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
636 Return the name of the given coding system.
640 coding_system = Fget_coding_system (coding_system);
641 return XCODING_SYSTEM_NAME (coding_system);
644 static Lisp_Coding_System *
645 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
647 Lisp_Coding_System *codesys =
648 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
650 zero_lcrecord (codesys);
651 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
652 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
653 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
654 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
655 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
656 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
657 CODING_SYSTEM_TYPE (codesys) = type;
658 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
661 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
663 if (type == CODESYS_ISO2022)
666 for (i = 0; i < 4; i++)
667 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
670 if (type == CODESYS_UTF8)
672 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
674 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
676 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
678 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
681 else if (type == CODESYS_BIG5)
683 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
685 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
686 = Vcharset_chinese_big5;
687 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
689 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
693 else if (type == CODESYS_CCL)
695 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
696 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
699 CODING_SYSTEM_NAME (codesys) = name;
705 /* Given a list of charset conversion specs as specified in a Lisp
706 program, parse it into STORE_HERE. */
709 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
710 Lisp_Object spec_list)
714 EXTERNAL_LIST_LOOP (rest, spec_list)
716 Lisp_Object car = XCAR (rest);
717 Lisp_Object from, to;
718 struct charset_conversion_spec spec;
720 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
721 signal_simple_error ("Invalid charset conversion spec", car);
722 from = Fget_charset (XCAR (car));
723 to = Fget_charset (XCAR (XCDR (car)));
724 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
725 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
726 signal_simple_error_2
727 ("Attempted conversion between different charset types",
729 spec.from_charset = from;
730 spec.to_charset = to;
732 Dynarr_add (store_here, spec);
736 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
737 specs, return the equivalent as the Lisp programmer would see it.
739 If LOAD_HERE is 0, return Qnil. */
742 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
749 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
751 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
752 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
755 return Fnreverse (result);
760 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
761 Register symbol NAME as a coding system.
763 TYPE describes the conversion method used and should be one of
766 Automatic conversion. XEmacs attempts to detect the coding system
769 No conversion. Use this for binary files and such. On output,
770 graphic characters that are not in ASCII or Latin-1 will be
771 replaced by a ?. (For a no-conversion-encoded buffer, these
772 characters will only be present if you explicitly insert them.)
774 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
776 ISO 10646 UCS-4 encoding.
778 ISO 10646 UTF-8 encoding.
780 Any ISO2022-compliant encoding. Among other things, this includes
781 JIS (the Japanese encoding commonly used for e-mail), EUC (the
782 standard Unix encoding for Japanese and other languages), and
783 Compound Text (the encoding used in X11). You can specify more
784 specific information about the conversion with the PROPS argument.
786 Big5 (the encoding commonly used for Taiwanese).
788 The conversion is performed using a user-written pseudo-code
789 program. CCL (Code Conversion Language) is the name of this
792 Write out or read in the raw contents of the memory representing
793 the buffer's text. This is primarily useful for debugging
794 purposes, and is only enabled when XEmacs has been compiled with
795 DEBUG_XEMACS defined (via the --debug configure option).
796 WARNING: Reading in a file using 'internal conversion can result
797 in an internal inconsistency in the memory representing a
798 buffer's text, which will produce unpredictable results and may
799 cause XEmacs to crash. Under normal circumstances you should
800 never use 'internal conversion.
802 DOC-STRING is a string describing the coding system.
804 PROPS is a property list, describing the specific nature of the
805 character set. Recognized properties are:
808 String to be displayed in the modeline when this coding system is
812 End-of-line conversion to be used. It should be one of
815 Automatically detect the end-of-line type (LF, CRLF,
816 or CR). Also generate subsidiary coding systems named
817 `NAME-unix', `NAME-dos', and `NAME-mac', that are
818 identical to this coding system but have an EOL-TYPE
819 value of 'lf, 'crlf, and 'cr, respectively.
821 The end of a line is marked externally using ASCII LF.
822 Since this is also the way that XEmacs represents an
823 end-of-line internally, specifying this option results
824 in no end-of-line conversion. This is the standard
825 format for Unix text files.
827 The end of a line is marked externally using ASCII
828 CRLF. This is the standard format for MS-DOS text
831 The end of a line is marked externally using ASCII CR.
832 This is the standard format for Macintosh text files.
834 Automatically detect the end-of-line type but do not
835 generate subsidiary coding systems. (This value is
836 converted to nil when stored internally, and
837 `coding-system-property' will return nil.)
840 If non-nil, composition/decomposition for combining characters
843 'use-entity-reference
844 If non-nil, SGML style entity-reference is used for non-system-characters.
846 'post-read-conversion
847 Function called after a file has been read in, to perform the
848 decoding. Called with two arguments, START and END, denoting
849 a region of the current buffer to be decoded.
851 'pre-write-conversion
852 Function called before a file is written out, to perform the
853 encoding. Called with two arguments, START and END, denoting
854 a region of the current buffer to be encoded.
857 The following additional properties are recognized if TYPE is 'iso2022:
863 The character set initially designated to the G0 - G3 registers.
864 The value should be one of
866 -- A charset object (designate that character set)
867 -- nil (do not ever use this register)
868 -- t (no character set is initially designated to
869 the register, but may be later on; this automatically
870 sets the corresponding `force-g*-on-output' property)
876 If non-nil, send an explicit designation sequence on output before
877 using the specified register.
880 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
881 "ESC $ B" on output in place of the full designation sequences
882 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
885 If non-nil, don't designate ASCII to G0 at each end of line on output.
886 Setting this to non-nil also suppresses other state-resetting that
887 normally happens at the end of a line.
890 If non-nil, don't designate ASCII to G0 before control chars on output.
893 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
897 If non-nil, use locking-shift (SO/SI) instead of single-shift
898 or designation by escape sequence.
901 If non-nil, don't use ISO6429's direction specification.
904 If non-nil, literal control characters that are the same as
905 the beginning of a recognized ISO2022 or ISO6429 escape sequence
906 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
907 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
908 so that they can be properly distinguished from an escape sequence.
909 (Note that doing this results in a non-portable encoding.) This
910 encoding flag is used for byte-compiled files. Note that ESC
911 is a good choice for a quoting character because there are no
912 escape sequences whose second byte is a character from the Control-0
913 or Control-1 character sets; this is explicitly disallowed by the
916 'input-charset-conversion
917 A list of conversion specifications, specifying conversion of
918 characters in one charset to another when decoding is performed.
919 Each specification is a list of two elements: the source charset,
920 and the destination charset.
922 'output-charset-conversion
923 A list of conversion specifications, specifying conversion of
924 characters in one charset to another when encoding is performed.
925 The form of each specification is the same as for
926 'input-charset-conversion.
929 The following additional properties are recognized (and required)
933 CCL program used for decoding (converting to internal format).
936 CCL program used for encoding (converting to external format).
938 (name, type, doc_string, props))
940 Lisp_Coding_System *codesys;
941 enum coding_system_type ty;
942 int need_to_setup_eol_systems = 1;
944 /* Convert type to constant */
945 if (NILP (type) || EQ (type, Qundecided))
946 { ty = CODESYS_AUTODETECT; }
948 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
949 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
950 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
951 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
952 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
953 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
955 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
957 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
960 signal_simple_error ("Invalid coding system type", type);
964 codesys = allocate_coding_system (ty, name);
966 if (NILP (doc_string))
967 doc_string = build_string ("");
969 CHECK_STRING (doc_string);
970 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
973 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
975 if (EQ (key, Qmnemonic))
978 CHECK_STRING (value);
979 CODING_SYSTEM_MNEMONIC (codesys) = value;
982 else if (EQ (key, Qeol_type))
984 need_to_setup_eol_systems = NILP (value);
987 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
990 else if (EQ (key, Qpost_read_conversion))
991 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
992 else if (EQ (key, Qpre_write_conversion))
993 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
995 else if (EQ (key, Qdisable_composition))
996 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
997 else if (EQ (key, Quse_entity_reference))
998 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1001 else if (ty == CODESYS_ISO2022)
1003 #define FROB_INITIAL_CHARSET(charset_num) \
1004 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1005 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1007 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1008 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1009 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1010 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1012 #define FROB_FORCE_CHARSET(charset_num) \
1013 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1015 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1016 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1017 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1018 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1020 #define FROB_BOOLEAN_PROPERTY(prop) \
1021 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1023 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1024 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1025 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1026 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1027 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1028 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1029 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1031 else if (EQ (key, Qinput_charset_conversion))
1033 codesys->iso2022.input_conv =
1034 Dynarr_new (charset_conversion_spec);
1035 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1038 else if (EQ (key, Qoutput_charset_conversion))
1040 codesys->iso2022.output_conv =
1041 Dynarr_new (charset_conversion_spec);
1042 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1046 signal_simple_error ("Unrecognized property", key);
1049 else if (ty == CODESYS_UTF8)
1051 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1052 else if (EQ (key, Qcharset_g1))
1053 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1054 else if (EQ (key, Qcharset_g2))
1055 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1057 signal_simple_error ("Unrecognized property", key);
1059 else if (ty == CODESYS_BIG5)
1061 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1062 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1064 signal_simple_error ("Unrecognized property", key);
1067 else if (EQ (type, Qccl))
1070 struct ccl_program test_ccl;
1073 /* Check key first. */
1074 if (EQ (key, Qdecode))
1075 suffix = "-ccl-decode";
1076 else if (EQ (key, Qencode))
1077 suffix = "-ccl-encode";
1079 signal_simple_error ("Unrecognized property", key);
1081 /* If value is vector, register it as a ccl program
1082 associated with an newly created symbol for
1083 backward compatibility. */
1084 if (VECTORP (value))
1086 sym = Fintern (concat2 (Fsymbol_name (name),
1087 build_string (suffix)),
1089 Fregister_ccl_program (sym, value);
1093 CHECK_SYMBOL (value);
1096 /* check if the given ccl programs are valid. */
1097 if (setup_ccl_program (&test_ccl, sym) < 0)
1098 signal_simple_error ("Invalid CCL program", value);
1100 if (EQ (key, Qdecode))
1101 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1102 else if (EQ (key, Qencode))
1103 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1108 signal_simple_error ("Unrecognized property", key);
1112 if (need_to_setup_eol_systems)
1113 setup_eol_coding_systems (codesys);
1116 Lisp_Object codesys_obj;
1117 XSETCODING_SYSTEM (codesys_obj, codesys);
1118 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1123 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1124 Copy OLD-CODING-SYSTEM to NEW-NAME.
1125 If NEW-NAME does not name an existing coding system, a new one will
1128 (old_coding_system, new_name))
1130 Lisp_Object new_coding_system;
1131 old_coding_system = Fget_coding_system (old_coding_system);
1132 new_coding_system = Ffind_coding_system (new_name);
1133 if (NILP (new_coding_system))
1135 XSETCODING_SYSTEM (new_coding_system,
1136 allocate_coding_system
1137 (XCODING_SYSTEM_TYPE (old_coding_system),
1139 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1143 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1144 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1145 memcpy (((char *) to ) + sizeof (to->header),
1146 ((char *) from) + sizeof (from->header),
1147 sizeof (*from) - sizeof (from->header));
1148 to->name = new_name;
1150 return new_coding_system;
1153 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1154 Return t if OBJECT names a coding system, and is not a coding system alias.
1158 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1162 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1163 Return t if OBJECT is a coding system alias.
1164 All coding system aliases are created by `define-coding-system-alias'.
1168 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1172 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1173 Return the coding-system symbol for which symbol ALIAS is an alias.
1177 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1178 if (SYMBOLP (aliasee))
1181 signal_simple_error ("Symbol is not a coding system alias", alias);
1182 return Qnil; /* To keep the compiler happy */
1186 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1188 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1192 /* A maphash function, for removing dangling coding system aliases. */
1194 dangling_coding_system_alias_p (Lisp_Object alias,
1195 Lisp_Object aliasee,
1196 void *dangling_aliases)
1198 if (SYMBOLP (aliasee)
1199 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1201 (*(int *) dangling_aliases)++;
1208 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1209 Define symbol ALIAS as an alias for coding system ALIASEE.
1211 You can use this function to redefine an alias that has already been defined,
1212 but you cannot redefine a name which is the canonical name for a coding system.
1213 \(a canonical name of a coding system is what is returned when you call
1214 `coding-system-name' on a coding system).
1216 ALIASEE itself can be an alias, which allows you to define nested aliases.
1218 You are forbidden, however, from creating alias loops or `dangling' aliases.
1219 These will be detected, and an error will be signaled if you attempt to do so.
1221 If ALIASEE is nil, then ALIAS will simply be undefined.
1223 See also `coding-system-alias-p', `coding-system-aliasee',
1224 and `coding-system-canonical-name-p'.
1228 Lisp_Object real_coding_system, probe;
1230 CHECK_SYMBOL (alias);
1232 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1234 ("Symbol is the canonical name of a coding system and cannot be redefined",
1239 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1240 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1241 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1243 Fremhash (alias, Vcoding_system_hash_table);
1245 /* Undefine subsidiary aliases,
1246 presumably created by a previous call to this function */
1247 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1248 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1249 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1251 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1252 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1253 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1256 /* Undefine dangling coding system aliases. */
1258 int dangling_aliases;
1261 dangling_aliases = 0;
1262 elisp_map_remhash (dangling_coding_system_alias_p,
1263 Vcoding_system_hash_table,
1265 } while (dangling_aliases > 0);
1271 if (CODING_SYSTEMP (aliasee))
1272 aliasee = XCODING_SYSTEM_NAME (aliasee);
1274 /* Checks that aliasee names a coding-system */
1275 real_coding_system = Fget_coding_system (aliasee);
1277 /* Check for coding system alias loops */
1278 if (EQ (alias, aliasee))
1279 alias_loop: signal_simple_error_2
1280 ("Attempt to create a coding system alias loop", alias, aliasee);
1282 for (probe = aliasee;
1284 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1286 if (EQ (probe, alias))
1290 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1292 /* Set up aliases for subsidiaries.
1293 #### There must be a better way to handle subsidiary coding systems. */
1295 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1297 for (i = 0; i < countof (suffixes); i++)
1299 Lisp_Object alias_subsidiary =
1300 append_suffix_to_symbol (alias, suffixes[i]);
1301 Lisp_Object aliasee_subsidiary =
1302 append_suffix_to_symbol (aliasee, suffixes[i]);
1304 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1305 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1308 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1309 but it doesn't look intentional, so I'd rather return something
1310 meaningful or nothing at all. */
1315 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1317 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1318 Lisp_Object new_coding_system;
1320 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1321 return coding_system;
1325 case EOL_AUTODETECT: return coding_system;
1326 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1327 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1328 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1329 default: abort (); return Qnil;
1332 return NILP (new_coding_system) ? coding_system : new_coding_system;
1335 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1336 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1338 (coding_system, eol_type))
1340 coding_system = Fget_coding_system (coding_system);
1342 return subsidiary_coding_system (coding_system,
1343 symbol_to_eol_type (eol_type));
1347 /************************************************************************/
1348 /* Coding system accessors */
1349 /************************************************************************/
1351 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1352 Return the doc string for CODING-SYSTEM.
1356 coding_system = Fget_coding_system (coding_system);
1357 return XCODING_SYSTEM_DOC_STRING (coding_system);
1360 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1361 Return the type of CODING-SYSTEM.
1365 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1368 case CODESYS_AUTODETECT: return Qundecided;
1370 case CODESYS_SHIFT_JIS: return Qshift_jis;
1371 case CODESYS_ISO2022: return Qiso2022;
1372 case CODESYS_BIG5: return Qbig5;
1373 case CODESYS_UCS4: return Qucs4;
1374 case CODESYS_UTF8: return Qutf8;
1375 case CODESYS_CCL: return Qccl;
1377 case CODESYS_NO_CONVERSION: return Qno_conversion;
1379 case CODESYS_INTERNAL: return Qinternal;
1386 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1389 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1391 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1394 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1395 Return initial charset of CODING-SYSTEM designated to GNUM.
1398 (coding_system, gnum))
1400 coding_system = Fget_coding_system (coding_system);
1403 return coding_system_charset (coding_system, XINT (gnum));
1407 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1408 Return the PROP property of CODING-SYSTEM.
1410 (coding_system, prop))
1413 enum coding_system_type type;
1415 coding_system = Fget_coding_system (coding_system);
1416 CHECK_SYMBOL (prop);
1417 type = XCODING_SYSTEM_TYPE (coding_system);
1419 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1420 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1423 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1425 case CODESYS_PROP_ALL_OK:
1428 case CODESYS_PROP_ISO2022:
1429 if (type != CODESYS_ISO2022)
1431 ("Property only valid in ISO2022 coding systems",
1435 case CODESYS_PROP_CCL:
1436 if (type != CODESYS_CCL)
1438 ("Property only valid in CCL coding systems",
1448 signal_simple_error ("Unrecognized property", prop);
1450 if (EQ (prop, Qname))
1451 return XCODING_SYSTEM_NAME (coding_system);
1452 else if (EQ (prop, Qtype))
1453 return Fcoding_system_type (coding_system);
1454 else if (EQ (prop, Qdoc_string))
1455 return XCODING_SYSTEM_DOC_STRING (coding_system);
1456 else if (EQ (prop, Qmnemonic))
1457 return XCODING_SYSTEM_MNEMONIC (coding_system);
1458 else if (EQ (prop, Qeol_type))
1459 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1460 else if (EQ (prop, Qeol_lf))
1461 return XCODING_SYSTEM_EOL_LF (coding_system);
1462 else if (EQ (prop, Qeol_crlf))
1463 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1464 else if (EQ (prop, Qeol_cr))
1465 return XCODING_SYSTEM_EOL_CR (coding_system);
1466 else if (EQ (prop, Qpost_read_conversion))
1467 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1468 else if (EQ (prop, Qpre_write_conversion))
1469 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1472 else if (EQ (prop, Qdisable_composition))
1473 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1474 else if (EQ (prop, Quse_entity_reference))
1475 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1477 else if (type == CODESYS_ISO2022)
1479 if (EQ (prop, Qcharset_g0))
1480 return coding_system_charset (coding_system, 0);
1481 else if (EQ (prop, Qcharset_g1))
1482 return coding_system_charset (coding_system, 1);
1483 else if (EQ (prop, Qcharset_g2))
1484 return coding_system_charset (coding_system, 2);
1485 else if (EQ (prop, Qcharset_g3))
1486 return coding_system_charset (coding_system, 3);
1488 #define FORCE_CHARSET(charset_num) \
1489 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1490 (coding_system, charset_num) ? Qt : Qnil)
1492 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1493 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1494 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1495 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1497 #define LISP_BOOLEAN(prop) \
1498 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1500 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1501 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1502 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1503 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1504 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1505 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1506 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1508 else if (EQ (prop, Qinput_charset_conversion))
1510 unparse_charset_conversion_specs
1511 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1512 else if (EQ (prop, Qoutput_charset_conversion))
1514 unparse_charset_conversion_specs
1515 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1519 else if (type == CODESYS_CCL)
1521 if (EQ (prop, Qdecode))
1522 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1523 else if (EQ (prop, Qencode))
1524 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1532 return Qnil; /* not reached */
1536 /************************************************************************/
1537 /* Coding category functions */
1538 /************************************************************************/
1541 decode_coding_category (Lisp_Object symbol)
1545 CHECK_SYMBOL (symbol);
1546 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1547 if (EQ (coding_category_symbol[i], symbol))
1550 signal_simple_error ("Unrecognized coding category", symbol);
1551 return 0; /* not reached */
1554 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1555 Return a list of all recognized coding categories.
1560 Lisp_Object list = Qnil;
1562 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1563 list = Fcons (coding_category_symbol[i], list);
1567 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1568 Change the priority order of the coding categories.
1569 LIST should be list of coding categories, in descending order of
1570 priority. Unspecified coding categories will be lower in priority
1571 than all specified ones, in the same relative order they were in
1576 int category_to_priority[CODING_CATEGORY_LAST];
1580 /* First generate a list that maps coding categories to priorities. */
1582 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1583 category_to_priority[i] = -1;
1585 /* Highest priority comes from the specified list. */
1587 EXTERNAL_LIST_LOOP (rest, list)
1589 int cat = decode_coding_category (XCAR (rest));
1591 if (category_to_priority[cat] >= 0)
1592 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1593 category_to_priority[cat] = i++;
1596 /* Now go through the existing categories by priority to retrieve
1597 the categories not yet specified and preserve their priority
1599 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1601 int cat = fcd->coding_category_by_priority[j];
1602 if (category_to_priority[cat] < 0)
1603 category_to_priority[cat] = i++;
1606 /* Now we need to construct the inverse of the mapping we just
1609 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1610 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1612 /* Phew! That was confusing. */
1616 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1617 Return a list of coding categories in descending order of priority.
1622 Lisp_Object list = Qnil;
1624 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1625 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1630 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1631 Change the coding system associated with a coding category.
1633 (coding_category, coding_system))
1635 int cat = decode_coding_category (coding_category);
1637 coding_system = Fget_coding_system (coding_system);
1638 fcd->coding_category_system[cat] = coding_system;
1642 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1643 Return the coding system associated with a coding category.
1647 int cat = decode_coding_category (coding_category);
1648 Lisp_Object sys = fcd->coding_category_system[cat];
1651 return XCODING_SYSTEM_NAME (sys);
1656 /************************************************************************/
1657 /* Detecting the encoding of data */
1658 /************************************************************************/
1660 struct detection_state
1662 eol_type_t eol_type;
1698 struct iso2022_decoder iso;
1700 int high_byte_count;
1701 unsigned int saw_single_shift:1;
1714 acceptable_control_char_p (int c)
1718 /* Allow and ignore control characters that you might
1719 reasonably see in a text file */
1724 case 8: /* backspace */
1725 case 11: /* vertical tab */
1726 case 12: /* form feed */
1727 case 26: /* MS-DOS C-z junk */
1728 case 31: /* '^_' -- for info */
1736 mask_has_at_most_one_bit_p (int mask)
1738 /* Perhaps the only thing useful you learn from intensive Microsoft
1739 technical interviews */
1740 return (mask & (mask - 1)) == 0;
1744 detect_eol_type (struct detection_state *st, const Extbyte *src,
1749 unsigned char c = *(unsigned char *)src++;
1752 if (st->eol.just_saw_cr)
1754 else if (st->eol.seen_anything)
1757 else if (st->eol.just_saw_cr)
1760 st->eol.just_saw_cr = 1;
1762 st->eol.just_saw_cr = 0;
1763 st->eol.seen_anything = 1;
1766 return EOL_AUTODETECT;
1769 /* Attempt to determine the encoding and EOL type of the given text.
1770 Before calling this function for the first type, you must initialize
1771 st->eol_type as appropriate and initialize st->mask to ~0.
1773 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1776 st->mask holds the determined coding category mask, or ~0 if only
1777 ASCII has been seen so far.
1781 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1782 is present in st->mask
1783 1 == definitive answers are here for both st->eol_type and st->mask
1787 detect_coding_type (struct detection_state *st, const Extbyte *src,
1788 size_t n, int just_do_eol)
1790 if (st->eol_type == EOL_AUTODETECT)
1791 st->eol_type = detect_eol_type (st, src, n);
1794 return st->eol_type != EOL_AUTODETECT;
1796 if (!st->seen_non_ascii)
1798 for (; n; n--, src++)
1800 unsigned char c = *(unsigned char *) src;
1801 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1803 st->seen_non_ascii = 1;
1805 st->shift_jis.mask = ~0;
1809 st->iso2022.mask = ~0;
1819 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1820 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1821 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1822 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1823 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1824 st->big5.mask = detect_coding_big5 (st, src, n);
1825 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1826 st->utf8.mask = detect_coding_utf8 (st, src, n);
1827 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1828 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1831 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1832 | st->utf8.mask | st->ucs4.mask;
1835 int retval = mask_has_at_most_one_bit_p (st->mask);
1836 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1837 return retval && st->eol_type != EOL_AUTODETECT;
1842 coding_system_from_mask (int mask)
1846 /* If the file was entirely or basically ASCII, use the
1847 default value of `buffer-file-coding-system'. */
1848 Lisp_Object retval =
1849 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1852 retval = Ffind_coding_system (retval);
1856 (Qbad_variable, Qwarning,
1857 "Invalid `default-buffer-file-coding-system', set to nil");
1858 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1862 retval = Fget_coding_system (Qraw_text);
1870 mask = postprocess_iso2022_mask (mask);
1872 /* Look through the coding categories by priority and find
1873 the first one that is allowed. */
1874 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1876 cat = fcd->coding_category_by_priority[i];
1877 if ((mask & (1 << cat)) &&
1878 !NILP (fcd->coding_category_system[cat]))
1882 return fcd->coding_category_system[cat];
1884 return Fget_coding_system (Qraw_text);
1888 /* Given a seekable read stream and potential coding system and EOL type
1889 as specified, do any autodetection that is called for. If the
1890 coding system and/or EOL type are not `autodetect', they will be left
1891 alone; but this function will never return an autodetect coding system
1894 This function does not automatically fetch subsidiary coding systems;
1895 that should be unnecessary with the explicit eol-type argument. */
1897 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1900 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1901 eol_type_t *eol_type_in_out)
1903 struct detection_state decst;
1905 if (*eol_type_in_out == EOL_AUTODETECT)
1906 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1909 decst.eol_type = *eol_type_in_out;
1912 /* If autodetection is called for, do it now. */
1913 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1914 || *eol_type_in_out == EOL_AUTODETECT)
1917 Lisp_Object coding_system = Qnil;
1919 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1922 /* Look for initial "-*-"; mode line prefix */
1924 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1929 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1931 Extbyte *local_vars_beg = p + 3;
1932 /* Look for final "-*-"; mode line suffix */
1933 for (p = local_vars_beg,
1934 scan_end = buf + nread - LENGTH ("-*-");
1939 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1941 Extbyte *suffix = p;
1942 /* Look for "coding:" */
1943 for (p = local_vars_beg,
1944 scan_end = suffix - LENGTH ("coding:?");
1947 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1948 && (p == local_vars_beg
1949 || (*(p-1) == ' ' ||
1955 p += LENGTH ("coding:");
1956 while (*p == ' ' || *p == '\t') p++;
1958 /* Get coding system name */
1959 save = *suffix; *suffix = '\0';
1960 /* Characters valid in a MIME charset name (rfc 1521),
1961 and in a Lisp symbol name. */
1962 n = strspn ( (char *) p,
1963 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1964 "abcdefghijklmnopqrstuvwxyz"
1970 save = p[n]; p[n] = '\0';
1972 Ffind_coding_system (intern ((char *) p));
1982 if (NILP (coding_system))
1985 if (detect_coding_type (&decst, buf, nread,
1986 XCODING_SYSTEM_TYPE (*codesys_in_out)
1987 != CODESYS_AUTODETECT))
1989 nread = Lstream_read (stream, buf, sizeof (buf));
1995 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1996 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1999 if (detect_coding_type (&decst, buf, nread, 1))
2001 nread = Lstream_read (stream, buf, sizeof (buf));
2007 *eol_type_in_out = decst.eol_type;
2008 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2010 if (NILP (coding_system))
2011 *codesys_in_out = coding_system_from_mask (decst.mask);
2013 *codesys_in_out = coding_system;
2017 /* If we absolutely can't determine the EOL type, just assume LF. */
2018 if (*eol_type_in_out == EOL_AUTODETECT)
2019 *eol_type_in_out = EOL_LF;
2021 Lstream_rewind (stream);
2024 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2025 Detect coding system of the text in the region between START and END.
2026 Return a list of possible coding systems ordered by priority.
2027 If only ASCII characters are found, return 'undecided or one of
2028 its subsidiary coding systems according to a detected end-of-line
2029 type. Optional arg BUFFER defaults to the current buffer.
2031 (start, end, buffer))
2033 Lisp_Object val = Qnil;
2034 struct buffer *buf = decode_buffer (buffer, 0);
2036 Lisp_Object instream, lb_instream;
2037 Lstream *istr, *lb_istr;
2038 struct detection_state decst;
2039 struct gcpro gcpro1, gcpro2;
2041 get_buffer_range_char (buf, start, end, &b, &e, 0);
2042 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2043 lb_istr = XLSTREAM (lb_instream);
2044 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2045 istr = XLSTREAM (instream);
2046 GCPRO2 (instream, lb_instream);
2048 decst.eol_type = EOL_AUTODETECT;
2052 Extbyte random_buffer[4096];
2053 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2057 if (detect_coding_type (&decst, random_buffer, nread, 0))
2061 if (decst.mask == ~0)
2062 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2070 decst.mask = postprocess_iso2022_mask (decst.mask);
2072 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2074 int sys = fcd->coding_category_by_priority[i];
2075 if (decst.mask & (1 << sys))
2077 Lisp_Object codesys = fcd->coding_category_system[sys];
2078 if (!NILP (codesys))
2079 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2080 val = Fcons (codesys, val);
2084 Lstream_close (istr);
2086 Lstream_delete (istr);
2087 Lstream_delete (lb_istr);
2092 /************************************************************************/
2093 /* Converting to internal Mule format ("decoding") */
2094 /************************************************************************/
2096 /* A decoding stream is a stream used for decoding text (i.e.
2097 converting from some external format to internal format).
2098 The decoding-stream object keeps track of the actual coding
2099 stream, the stream that is at the other end, and data that
2100 needs to be persistent across the lifetime of the stream. */
2102 /* Handle the EOL stuff related to just-read-in character C.
2103 EOL_TYPE is the EOL type of the coding stream.
2104 FLAGS is the current value of FLAGS in the coding stream, and may
2105 be modified by this macro. (The macro only looks at the
2106 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2107 bytes are to be written. You need to also define a local goto
2108 label "label_continue_loop" that is at the end of the main
2109 character-reading loop.
2111 If C is a CR character, then this macro handles it entirely and
2112 jumps to label_continue_loop. Otherwise, this macro does not add
2113 anything to DST, and continues normally. You should continue
2114 processing C normally after this macro. */
2116 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2120 if (eol_type == EOL_CR) \
2121 Dynarr_add (dst, '\n'); \
2122 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2123 Dynarr_add (dst, c); \
2125 flags |= CODING_STATE_CR; \
2126 goto label_continue_loop; \
2128 else if (flags & CODING_STATE_CR) \
2129 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2131 Dynarr_add (dst, '\r'); \
2132 flags &= ~CODING_STATE_CR; \
2136 /* C should be a binary character in the range 0 - 255; convert
2137 to internal format and add to Dynarr DST. */
2140 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2142 if (BYTE_ASCII_P (c)) \
2143 Dynarr_add (dst, c); \
2146 Dynarr_add (dst, (c >> 6) | 0xc0); \
2147 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2151 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2153 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2157 Dynarr_add (dst, c);
2159 else if ( c <= 0x7ff )
2161 Dynarr_add (dst, (c >> 6) | 0xc0);
2162 Dynarr_add (dst, (c & 0x3f) | 0x80);
2164 else if ( c <= 0xffff )
2166 Dynarr_add (dst, (c >> 12) | 0xe0);
2167 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2168 Dynarr_add (dst, (c & 0x3f) | 0x80);
2170 else if ( c <= 0x1fffff )
2172 Dynarr_add (dst, (c >> 18) | 0xf0);
2173 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2174 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2175 Dynarr_add (dst, (c & 0x3f) | 0x80);
2177 else if ( c <= 0x3ffffff )
2179 Dynarr_add (dst, (c >> 24) | 0xf8);
2180 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2181 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2182 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2183 Dynarr_add (dst, (c & 0x3f) | 0x80);
2187 Dynarr_add (dst, (c >> 30) | 0xfc);
2188 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2189 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2190 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2191 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2192 Dynarr_add (dst, (c & 0x3f) | 0x80);
2196 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2198 if (BYTE_ASCII_P (c)) \
2199 Dynarr_add (dst, c); \
2200 else if (BYTE_C1_P (c)) \
2202 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2203 Dynarr_add (dst, c + 0x20); \
2207 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2208 Dynarr_add (dst, c); \
2213 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2217 DECODE_ADD_BINARY_CHAR (ch, dst); \
2222 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2224 if (flags & CODING_STATE_END) \
2226 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2227 if (flags & CODING_STATE_CR) \
2228 Dynarr_add (dst, '\r'); \
2232 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2234 struct decoding_stream
2236 /* Coding system that governs the conversion. */
2237 Lisp_Coding_System *codesys;
2239 /* Stream that we read the encoded data from or
2240 write the decoded data to. */
2243 /* If we are reading, then we can return only a fixed amount of
2244 data, so if the conversion resulted in too much data, we store it
2245 here for retrieval the next time around. */
2246 unsigned_char_dynarr *runoff;
2248 /* FLAGS holds flags indicating the current state of the decoding.
2249 Some of these flags are dependent on the coding system. */
2252 /* CPOS holds a partially built-up code-point of character. */
2255 /* EOL_TYPE specifies the type of end-of-line conversion that
2256 currently applies. We need to keep this separate from the
2257 EOL type stored in CODESYS because the latter might indicate
2258 automatic EOL-type detection while the former will always
2259 indicate a particular EOL type. */
2260 eol_type_t eol_type;
2262 /* Additional ISO2022 information. We define the structure above
2263 because it's also needed by the detection routines. */
2264 struct iso2022_decoder iso2022;
2266 /* Additional information (the state of the running CCL program)
2267 used by the CCL decoder. */
2268 struct ccl_program ccl;
2270 /* counter for UTF-8 or UCS-4 */
2271 unsigned char counter;
2274 unsigned char er_counter;
2275 unsigned char er_buf[16];
2277 unsigned combined_char_count;
2278 Emchar combined_chars[16];
2279 Lisp_Object combining_table;
2281 struct detection_state decst;
2285 extern Lisp_Object Qcomposition;
2288 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
2290 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
2292 if ( str->er_counter > 0)
2294 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2295 str->er_counter = 0;
2299 void decode_add_er_char (struct decoding_stream *str, Emchar character,
2300 unsigned_char_dynarr* dst);
2302 decode_add_er_char (struct decoding_stream *str, Emchar c,
2303 unsigned_char_dynarr* dst)
2305 if (str->er_counter == 0)
2307 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
2310 str->er_buf[0] = '&';
2314 DECODE_ADD_UCS_CHAR (c, dst);
2318 Lisp_Object string = make_string (str->er_buf,
2320 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
2327 while (!NILP (rest))
2331 if (NILP (ccs = Ffind_charset (ccs)))
2346 pat = concat3 (build_string ("^&"),
2347 pat, build_string ("\\([0-9]+\\)$"));
2350 else if (EQ (ret, Qx))
2352 pat = concat3 (build_string ("^&"),
2353 pat, build_string ("\\([0-9a-f]+\\)$"));
2356 else if (EQ (ret, QX))
2358 pat = concat3 (build_string ("^&"),
2359 pat, build_string ("\\([0-9A-F]+\\)$"));
2365 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
2368 = XINT (Fstring_to_number
2369 (Fsubstring (string,
2370 Fmatch_beginning (make_int (1)),
2371 Fmatch_end (make_int (1))),
2374 DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst);
2379 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
2380 string, Qnil, Qnil)))
2383 = XINT (Fstring_to_number
2384 (Fsubstring (string,
2385 Fmatch_beginning (make_int (1)),
2386 Fmatch_end (make_int (1))),
2389 DECODE_ADD_UCS_CHAR (code, dst);
2393 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2394 Dynarr_add (dst, ';');
2397 str->er_counter = 0;
2399 else if ( (str->er_counter >= 16) || (c >= 0x7F) )
2401 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2402 str->er_counter = 0;
2403 DECODE_ADD_UCS_CHAR (c, dst);
2406 str->er_buf[str->er_counter++] = c;
2410 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2412 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2416 for (i = 0; i < str->combined_char_count; i++)
2417 decode_add_er_char (str, str->combined_chars[i], dst);
2418 str->combined_char_count = 0;
2419 str->combining_table = Qnil;
2422 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
2423 unsigned_char_dynarr* dst);
2425 COMPOSE_ADD_CHAR (struct decoding_stream *str,
2426 Emchar character, unsigned_char_dynarr* dst)
2428 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2429 decode_add_er_char (str, character, dst);
2430 else if (!CONSP (str->combining_table))
2433 = Fget_char_attribute (make_char (character), Qcomposition, Qnil);
2436 decode_add_er_char (str, character, dst);
2439 str->combined_chars[0] = character;
2440 str->combined_char_count = 1;
2441 str->combining_table = ret;
2446 Lisp_Object ret = Fcdr (Fassq (make_char (character), str->combining_table));
2450 Emchar char2 = XCHARVAL (ret);
2451 ret = Fget_char_attribute (make_char (character), Qcomposition, Qnil);
2454 decode_add_er_char (str, character, dst);
2455 str->combined_char_count = 0;
2456 str->combining_table = Qnil;
2460 str->combined_chars[0] = char2;
2461 str->combined_char_count = 1;
2462 str->combining_table = ret;
2467 COMPOSE_FLUSH_CHARS (str, dst);
2468 decode_add_er_char (str, character, dst);
2472 #else /* not UTF2000 */
2473 #define COMPOSE_FLUSH_CHARS(str, dst)
2474 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2475 #endif /* UTF2000 */
2477 static ssize_t decoding_reader (Lstream *stream,
2478 unsigned char *data, size_t size);
2479 static ssize_t decoding_writer (Lstream *stream,
2480 const unsigned char *data, size_t size);
2481 static int decoding_rewinder (Lstream *stream);
2482 static int decoding_seekable_p (Lstream *stream);
2483 static int decoding_flusher (Lstream *stream);
2484 static int decoding_closer (Lstream *stream);
2486 static Lisp_Object decoding_marker (Lisp_Object stream);
2488 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2489 sizeof (struct decoding_stream));
2492 decoding_marker (Lisp_Object stream)
2494 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2495 Lisp_Object str_obj;
2497 /* We do not need to mark the coding systems or charsets stored
2498 within the stream because they are stored in a global list
2499 and automatically marked. */
2501 XSETLSTREAM (str_obj, str);
2502 mark_object (str_obj);
2503 if (str->imp->marker)
2504 return (str->imp->marker) (str_obj);
2509 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2510 so we read data from the other end, decode it, and store it into DATA. */
2513 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2515 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2516 unsigned char *orig_data = data;
2518 int error_occurred = 0;
2520 /* We need to interface to mule_decode(), which expects to take some
2521 amount of data and store the result into a Dynarr. We have
2522 mule_decode() store into str->runoff, and take data from there
2525 /* We loop until we have enough data, reading chunks from the other
2526 end and decoding it. */
2529 /* Take data from the runoff if we can. Make sure to take at
2530 most SIZE bytes, and delete the data from the runoff. */
2531 if (Dynarr_length (str->runoff) > 0)
2533 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2534 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2535 Dynarr_delete_many (str->runoff, 0, chunk);
2541 break; /* No more room for data */
2543 if (str->flags & CODING_STATE_END)
2544 /* This means that on the previous iteration, we hit the EOF on
2545 the other end. We loop once more so that mule_decode() can
2546 output any final stuff it may be holding, or any "go back
2547 to a sane state" escape sequences. (This latter makes sense
2548 during encoding.) */
2551 /* Exhausted the runoff, so get some more. DATA has at least
2552 SIZE bytes left of storage in it, so it's OK to read directly
2553 into it. (We'll be overwriting above, after we've decoded it
2554 into the runoff.) */
2555 read_size = Lstream_read (str->other_end, data, size);
2562 /* There might be some more end data produced in the translation.
2563 See the comment above. */
2564 str->flags |= CODING_STATE_END;
2565 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2568 if (data - orig_data == 0)
2569 return error_occurred ? -1 : 0;
2571 return data - orig_data;
2575 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2577 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2580 /* Decode all our data into the runoff, and then attempt to write
2581 it all out to the other end. Remove whatever chunk we succeeded
2583 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2584 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2585 Dynarr_length (str->runoff));
2587 Dynarr_delete_many (str->runoff, 0, retval);
2588 /* Do NOT return retval. The return value indicates how much
2589 of the incoming data was written, not how many bytes were
2595 reset_decoding_stream (struct decoding_stream *str)
2598 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2600 Lisp_Object coding_system;
2601 XSETCODING_SYSTEM (coding_system, str->codesys);
2602 reset_iso2022 (coding_system, &str->iso2022);
2604 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2606 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2611 str->er_counter = 0;
2612 str->combined_char_count = 0;
2613 str->combining_table = Qnil;
2615 str->flags = str->cpos = 0;
2619 decoding_rewinder (Lstream *stream)
2621 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2622 reset_decoding_stream (str);
2623 Dynarr_reset (str->runoff);
2624 return Lstream_rewind (str->other_end);
2628 decoding_seekable_p (Lstream *stream)
2630 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2631 return Lstream_seekable_p (str->other_end);
2635 decoding_flusher (Lstream *stream)
2637 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2638 return Lstream_flush (str->other_end);
2642 decoding_closer (Lstream *stream)
2644 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2645 if (stream->flags & LSTREAM_FL_WRITE)
2647 str->flags |= CODING_STATE_END;
2648 decoding_writer (stream, 0, 0);
2650 Dynarr_free (str->runoff);
2652 #ifdef ENABLE_COMPOSITE_CHARS
2653 if (str->iso2022.composite_chars)
2654 Dynarr_free (str->iso2022.composite_chars);
2657 return Lstream_close (str->other_end);
2661 decoding_stream_coding_system (Lstream *stream)
2663 Lisp_Object coding_system;
2664 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2666 XSETCODING_SYSTEM (coding_system, str->codesys);
2667 return subsidiary_coding_system (coding_system, str->eol_type);
2671 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2673 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2674 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2676 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2677 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2678 reset_decoding_stream (str);
2681 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2682 stream for writing, no automatic code detection will be performed.
2683 The reason for this is that automatic code detection requires a
2684 seekable input. Things will also fail if you open a decoding
2685 stream for reading using a non-fully-specified coding system and
2686 a non-seekable input stream. */
2689 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2692 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2693 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2697 str->other_end = stream;
2698 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2699 str->eol_type = EOL_AUTODETECT;
2700 if (!strcmp (mode, "r")
2701 && Lstream_seekable_p (stream))
2702 /* We can determine the coding system now. */
2703 determine_real_coding_system (stream, &codesys, &str->eol_type);
2704 set_decoding_stream_coding_system (lstr, codesys);
2705 str->decst.eol_type = str->eol_type;
2706 str->decst.mask = ~0;
2707 XSETLSTREAM (obj, lstr);
2712 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2714 return make_decoding_stream_1 (stream, codesys, "r");
2718 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2720 return make_decoding_stream_1 (stream, codesys, "w");
2723 /* Note: the decode_coding_* functions all take the same
2724 arguments as mule_decode(), which is to say some SRC data of
2725 size N, which is to be stored into dynamic array DST.
2726 DECODING is the stream within which the decoding is
2727 taking place, but no data is actually read from or
2728 written to that stream; that is handled in decoding_reader()
2729 or decoding_writer(). This allows the same functions to
2730 be used for both reading and writing. */
2733 mule_decode (Lstream *decoding, const Extbyte *src,
2734 unsigned_char_dynarr *dst, size_t n)
2736 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2738 /* If necessary, do encoding-detection now. We do this when
2739 we're a writing stream or a non-seekable reading stream,
2740 meaning that we can't just process the whole input,
2741 rewind, and start over. */
2743 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2744 str->eol_type == EOL_AUTODETECT)
2746 Lisp_Object codesys;
2748 XSETCODING_SYSTEM (codesys, str->codesys);
2749 detect_coding_type (&str->decst, src, n,
2750 CODING_SYSTEM_TYPE (str->codesys) !=
2751 CODESYS_AUTODETECT);
2752 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2753 str->decst.mask != ~0)
2754 /* #### This is cheesy. What we really ought to do is
2755 buffer up a certain amount of data so as to get a
2756 less random result. */
2757 codesys = coding_system_from_mask (str->decst.mask);
2758 str->eol_type = str->decst.eol_type;
2759 if (XCODING_SYSTEM (codesys) != str->codesys)
2761 /* Preserve the CODING_STATE_END flag in case it was set.
2762 If we erase it, bad things might happen. */
2763 int was_end = str->flags & CODING_STATE_END;
2764 set_decoding_stream_coding_system (decoding, codesys);
2766 str->flags |= CODING_STATE_END;
2770 switch (CODING_SYSTEM_TYPE (str->codesys))
2773 case CODESYS_INTERNAL:
2774 Dynarr_add_many (dst, src, n);
2777 case CODESYS_AUTODETECT:
2778 /* If we got this far and still haven't decided on the coding
2779 system, then do no conversion. */
2780 case CODESYS_NO_CONVERSION:
2781 decode_coding_no_conversion (decoding, src, dst, n);
2784 case CODESYS_SHIFT_JIS:
2785 decode_coding_sjis (decoding, src, dst, n);
2788 decode_coding_big5 (decoding, src, dst, n);
2791 decode_coding_ucs4 (decoding, src, dst, n);
2794 decode_coding_utf8 (decoding, src, dst, n);
2797 str->ccl.last_block = str->flags & CODING_STATE_END;
2798 /* When applying ccl program to stream, MUST NOT set NULL
2800 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2801 dst, n, 0, CCL_MODE_DECODING);
2803 case CODESYS_ISO2022:
2804 decode_coding_iso2022 (decoding, src, dst, n);
2812 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2813 Decode the text between START and END which is encoded in CODING-SYSTEM.
2814 This is useful if you've read in encoded text from a file without decoding
2815 it (e.g. you read in a JIS-formatted file but used the `binary' or
2816 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2817 Return length of decoded text.
2818 BUFFER defaults to the current buffer if unspecified.
2820 (start, end, coding_system, buffer))
2823 struct buffer *buf = decode_buffer (buffer, 0);
2824 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2825 Lstream *istr, *ostr;
2826 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2828 get_buffer_range_char (buf, start, end, &b, &e, 0);
2830 barf_if_buffer_read_only (buf, b, e);
2832 coding_system = Fget_coding_system (coding_system);
2833 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2834 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2835 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2837 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2838 Fget_coding_system (Qbinary));
2839 istr = XLSTREAM (instream);
2840 ostr = XLSTREAM (outstream);
2841 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2843 /* The chain of streams looks like this:
2845 [BUFFER] <----- send through
2846 ------> [ENCODE AS BINARY]
2847 ------> [DECODE AS SPECIFIED]
2853 char tempbuf[1024]; /* some random amount */
2854 Bufpos newpos, even_newer_pos;
2855 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2856 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2860 newpos = lisp_buffer_stream_startpos (istr);
2861 Lstream_write (ostr, tempbuf, size_in_bytes);
2862 even_newer_pos = lisp_buffer_stream_startpos (istr);
2863 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2866 Lstream_close (istr);
2867 Lstream_close (ostr);
2869 Lstream_delete (istr);
2870 Lstream_delete (ostr);
2871 Lstream_delete (XLSTREAM (de_outstream));
2872 Lstream_delete (XLSTREAM (lb_outstream));
2877 /************************************************************************/
2878 /* Converting to an external encoding ("encoding") */
2879 /************************************************************************/
2881 /* An encoding stream is an output stream. When you create the
2882 stream, you specify the coding system that governs the encoding
2883 and another stream that the resulting encoded data is to be
2884 sent to, and then start sending data to it. */
2886 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2888 struct encoding_stream
2890 /* Coding system that governs the conversion. */
2891 Lisp_Coding_System *codesys;
2893 /* Stream that we read the encoded data from or
2894 write the decoded data to. */
2897 /* If we are reading, then we can return only a fixed amount of
2898 data, so if the conversion resulted in too much data, we store it
2899 here for retrieval the next time around. */
2900 unsigned_char_dynarr *runoff;
2902 /* FLAGS holds flags indicating the current state of the encoding.
2903 Some of these flags are dependent on the coding system. */
2906 /* CH holds a partially built-up character. Since we only deal
2907 with one- and two-byte characters at the moment, we only use
2908 this to store the first byte of a two-byte character. */
2911 /* Additional information used by the ISO2022 encoder. */
2914 /* CHARSET holds the character sets currently assigned to the G0
2915 through G3 registers. It is initialized from the array
2916 INITIAL_CHARSET in CODESYS. */
2917 Lisp_Object charset[4];
2919 /* Which registers are currently invoked into the left (GL) and
2920 right (GR) halves of the 8-bit encoding space? */
2921 int register_left, register_right;
2923 /* Whether we need to explicitly designate the charset in the
2924 G? register before using it. It is initialized from the
2925 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2926 unsigned char force_charset_on_output[4];
2928 /* Other state variables that need to be preserved across
2930 Lisp_Object current_charset;
2932 int current_char_boundary;
2935 void (*encode_char) (struct encoding_stream *str, Emchar c,
2936 unsigned_char_dynarr *dst, unsigned int *flags);
2937 void (*finish) (struct encoding_stream *str,
2938 unsigned_char_dynarr *dst, unsigned int *flags);
2940 /* Additional information (the state of the running CCL program)
2941 used by the CCL encoder. */
2942 struct ccl_program ccl;
2946 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2947 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2949 static int encoding_rewinder (Lstream *stream);
2950 static int encoding_seekable_p (Lstream *stream);
2951 static int encoding_flusher (Lstream *stream);
2952 static int encoding_closer (Lstream *stream);
2954 static Lisp_Object encoding_marker (Lisp_Object stream);
2956 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2957 sizeof (struct encoding_stream));
2960 encoding_marker (Lisp_Object stream)
2962 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2963 Lisp_Object str_obj;
2965 /* We do not need to mark the coding systems or charsets stored
2966 within the stream because they are stored in a global list
2967 and automatically marked. */
2969 XSETLSTREAM (str_obj, str);
2970 mark_object (str_obj);
2971 if (str->imp->marker)
2972 return (str->imp->marker) (str_obj);
2977 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2978 so we read data from the other end, encode it, and store it into DATA. */
2981 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2983 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2984 unsigned char *orig_data = data;
2986 int error_occurred = 0;
2988 /* We need to interface to mule_encode(), which expects to take some
2989 amount of data and store the result into a Dynarr. We have
2990 mule_encode() store into str->runoff, and take data from there
2993 /* We loop until we have enough data, reading chunks from the other
2994 end and encoding it. */
2997 /* Take data from the runoff if we can. Make sure to take at
2998 most SIZE bytes, and delete the data from the runoff. */
2999 if (Dynarr_length (str->runoff) > 0)
3001 int chunk = min ((int) size, Dynarr_length (str->runoff));
3002 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
3003 Dynarr_delete_many (str->runoff, 0, chunk);
3009 break; /* No more room for data */
3011 if (str->flags & CODING_STATE_END)
3012 /* This means that on the previous iteration, we hit the EOF on
3013 the other end. We loop once more so that mule_encode() can
3014 output any final stuff it may be holding, or any "go back
3015 to a sane state" escape sequences. (This latter makes sense
3016 during encoding.) */
3019 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
3020 left of storage in it, so it's OK to read directly into it.
3021 (We'll be overwriting above, after we've encoded it into the
3023 read_size = Lstream_read (str->other_end, data, size);
3030 /* There might be some more end data produced in the translation.
3031 See the comment above. */
3032 str->flags |= CODING_STATE_END;
3033 mule_encode (stream, data, str->runoff, read_size);
3036 if (data == orig_data)
3037 return error_occurred ? -1 : 0;
3039 return data - orig_data;
3043 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
3045 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3048 /* Encode all our data into the runoff, and then attempt to write
3049 it all out to the other end. Remove whatever chunk we succeeded
3051 mule_encode (stream, data, str->runoff, size);
3052 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
3053 Dynarr_length (str->runoff));
3055 Dynarr_delete_many (str->runoff, 0, retval);
3056 /* Do NOT return retval. The return value indicates how much
3057 of the incoming data was written, not how many bytes were
3063 reset_encoding_stream (struct encoding_stream *str)
3066 switch (CODING_SYSTEM_TYPE (str->codesys))
3068 case CODESYS_ISO2022:
3072 str->encode_char = &char_encode_iso2022;
3073 str->finish = &char_finish_iso2022;
3074 for (i = 0; i < 4; i++)
3076 str->iso2022.charset[i] =
3077 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
3078 str->iso2022.force_charset_on_output[i] =
3079 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
3081 str->iso2022.register_left = 0;
3082 str->iso2022.register_right = 1;
3083 str->iso2022.current_charset = Qnil;
3084 str->iso2022.current_half = 0;
3088 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
3091 str->encode_char = &char_encode_utf8;
3092 str->finish = &char_finish_utf8;
3095 str->encode_char = &char_encode_ucs4;
3096 str->finish = &char_finish_ucs4;
3098 case CODESYS_SHIFT_JIS:
3099 str->encode_char = &char_encode_shift_jis;
3100 str->finish = &char_finish_shift_jis;
3103 str->encode_char = &char_encode_big5;
3104 str->finish = &char_finish_big5;
3110 str->iso2022.current_char_boundary = 0;
3111 str->flags = str->ch = 0;
3115 encoding_rewinder (Lstream *stream)
3117 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3118 reset_encoding_stream (str);
3119 Dynarr_reset (str->runoff);
3120 return Lstream_rewind (str->other_end);
3124 encoding_seekable_p (Lstream *stream)
3126 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3127 return Lstream_seekable_p (str->other_end);
3131 encoding_flusher (Lstream *stream)
3133 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3134 return Lstream_flush (str->other_end);
3138 encoding_closer (Lstream *stream)
3140 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3141 if (stream->flags & LSTREAM_FL_WRITE)
3143 str->flags |= CODING_STATE_END;
3144 encoding_writer (stream, 0, 0);
3146 Dynarr_free (str->runoff);
3147 return Lstream_close (str->other_end);
3151 encoding_stream_coding_system (Lstream *stream)
3153 Lisp_Object coding_system;
3154 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3156 XSETCODING_SYSTEM (coding_system, str->codesys);
3157 return coding_system;
3161 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3163 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3164 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3166 reset_encoding_stream (str);
3170 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3173 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3174 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3178 str->runoff = Dynarr_new (unsigned_char);
3179 str->other_end = stream;
3180 set_encoding_stream_coding_system (lstr, codesys);
3181 XSETLSTREAM (obj, lstr);
3186 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3188 return make_encoding_stream_1 (stream, codesys, "r");
3192 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3194 return make_encoding_stream_1 (stream, codesys, "w");
3197 /* Convert N bytes of internally-formatted data stored in SRC to an
3198 external format, according to the encoding stream ENCODING.
3199 Store the encoded data into DST. */
3202 mule_encode (Lstream *encoding, const Bufbyte *src,
3203 unsigned_char_dynarr *dst, size_t n)
3205 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3207 switch (CODING_SYSTEM_TYPE (str->codesys))
3210 case CODESYS_INTERNAL:
3211 Dynarr_add_many (dst, src, n);
3214 case CODESYS_AUTODETECT:
3215 /* If we got this far and still haven't decided on the coding
3216 system, then do no conversion. */
3217 case CODESYS_NO_CONVERSION:
3218 encode_coding_no_conversion (encoding, src, dst, n);
3222 str->ccl.last_block = str->flags & CODING_STATE_END;
3223 /* When applying ccl program to stream, MUST NOT set NULL
3225 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3226 dst, n, 0, CCL_MODE_ENCODING);
3230 text_encode_generic (encoding, src, dst, n);
3234 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3235 Encode the text between START and END using CODING-SYSTEM.
3236 This will, for example, convert Japanese characters into stuff such as
3237 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3238 text. BUFFER defaults to the current buffer if unspecified.
3240 (start, end, coding_system, buffer))
3243 struct buffer *buf = decode_buffer (buffer, 0);
3244 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3245 Lstream *istr, *ostr;
3246 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3248 get_buffer_range_char (buf, start, end, &b, &e, 0);
3250 barf_if_buffer_read_only (buf, b, e);
3252 coding_system = Fget_coding_system (coding_system);
3253 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3254 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3255 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3256 Fget_coding_system (Qbinary));
3257 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3259 istr = XLSTREAM (instream);
3260 ostr = XLSTREAM (outstream);
3261 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3262 /* The chain of streams looks like this:
3264 [BUFFER] <----- send through
3265 ------> [ENCODE AS SPECIFIED]
3266 ------> [DECODE AS BINARY]
3271 char tempbuf[1024]; /* some random amount */
3272 Bufpos newpos, even_newer_pos;
3273 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3274 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3278 newpos = lisp_buffer_stream_startpos (istr);
3279 Lstream_write (ostr, tempbuf, size_in_bytes);
3280 even_newer_pos = lisp_buffer_stream_startpos (istr);
3281 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3287 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3288 Lstream_close (istr);
3289 Lstream_close (ostr);
3291 Lstream_delete (istr);
3292 Lstream_delete (ostr);
3293 Lstream_delete (XLSTREAM (de_outstream));
3294 Lstream_delete (XLSTREAM (lb_outstream));
3295 return make_int (retlen);
3302 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3303 unsigned_char_dynarr *dst, size_t n)
3306 unsigned char char_boundary;
3307 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3308 unsigned int flags = str->flags;
3309 Emchar ch = str->ch;
3311 char_boundary = str->iso2022.current_char_boundary;
3317 if (char_boundary == 0)
3345 (*str->encode_char) (str, c, dst, &flags);
3347 else if (char_boundary == 1)
3349 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3355 ch = (ch << 6) | (c & 0x3f);
3360 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3362 (*str->finish) (str, dst, &flags);
3367 str->iso2022.current_char_boundary = char_boundary;
3371 /************************************************************************/
3372 /* Shift-JIS methods */
3373 /************************************************************************/
3375 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3376 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3377 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3378 encoded by "position-code + 0x80". A character of JISX0208
3379 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3380 position-codes are divided and shifted so that it fit in the range
3383 --- CODE RANGE of Shift-JIS ---
3384 (character set) (range)
3386 JISX0201-Kana 0xA0 .. 0xDF
3387 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3388 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3389 -------------------------------
3393 /* Is this the first byte of a Shift-JIS two-byte char? */
3395 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3396 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3398 /* Is this the second byte of a Shift-JIS two-byte char? */
3400 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3401 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3403 #define BYTE_SJIS_KATAKANA_P(c) \
3404 ((c) >= 0xA1 && (c) <= 0xDF)
3407 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3411 unsigned char c = *(unsigned char *)src++;
3412 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3414 if (st->shift_jis.in_second_byte)
3416 st->shift_jis.in_second_byte = 0;
3420 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3421 st->shift_jis.in_second_byte = 1;
3423 return CODING_CATEGORY_SHIFT_JIS_MASK;
3426 /* Convert Shift-JIS data to internal format. */
3429 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3430 unsigned_char_dynarr *dst, size_t n)
3432 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3433 unsigned int flags = str->flags;
3434 unsigned int cpos = str->cpos;
3435 eol_type_t eol_type = str->eol_type;
3439 unsigned char c = *(unsigned char *)src++;
3443 /* Previous character was first byte of Shift-JIS Kanji char. */
3444 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3446 unsigned char e1, e2;
3448 DECODE_SJIS (cpos, c, e1, e2);
3450 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3454 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3455 Dynarr_add (dst, e1);
3456 Dynarr_add (dst, e2);
3461 DECODE_ADD_BINARY_CHAR (cpos, dst);
3462 DECODE_ADD_BINARY_CHAR (c, dst);
3468 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3469 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3471 else if (BYTE_SJIS_KATAKANA_P (c))
3474 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3477 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3478 Dynarr_add (dst, c);
3483 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3487 DECODE_ADD_BINARY_CHAR (c, dst);
3489 label_continue_loop:;
3492 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3498 /* Convert internal character representation to Shift_JIS. */
3501 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3502 unsigned_char_dynarr *dst, unsigned int *flags)
3504 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3508 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3509 Dynarr_add (dst, '\r');
3510 if (eol_type != EOL_CR)
3511 Dynarr_add (dst, ch);
3515 unsigned int s1, s2;
3517 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3519 if (code_point >= 0)
3520 Dynarr_add (dst, code_point);
3521 else if ((code_point
3522 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3525 ENCODE_SJIS ((code_point >> 8) | 0x80,
3526 (code_point & 0xFF) | 0x80, s1, s2);
3527 Dynarr_add (dst, s1);
3528 Dynarr_add (dst, s2);
3530 else if ((code_point
3531 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3533 Dynarr_add (dst, code_point | 0x80);
3534 else if ((code_point
3535 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3538 ENCODE_SJIS ((code_point >> 8) | 0x80,
3539 (code_point & 0xFF) | 0x80, s1, s2);
3540 Dynarr_add (dst, s1);
3541 Dynarr_add (dst, s2);
3543 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3545 Dynarr_add (dst, code_point);
3547 Dynarr_add (dst, '?');
3549 Lisp_Object charset;
3550 unsigned int c1, c2;
3552 BREAKUP_CHAR (ch, charset, c1, c2);
3554 if (EQ(charset, Vcharset_katakana_jisx0201))
3556 Dynarr_add (dst, c1 | 0x80);
3560 Dynarr_add (dst, c1);
3562 else if (EQ(charset, Vcharset_japanese_jisx0208))
3564 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3565 Dynarr_add (dst, s1);
3566 Dynarr_add (dst, s2);
3569 Dynarr_add (dst, '?');
3575 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3576 unsigned int *flags)
3580 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3581 Decode a JISX0208 character of Shift-JIS coding-system.
3582 CODE is the character code in Shift-JIS as a cons of type bytes.
3583 Return the corresponding character.
3587 unsigned char c1, c2, s1, s2;
3590 CHECK_INT (XCAR (code));
3591 CHECK_INT (XCDR (code));
3592 s1 = XINT (XCAR (code));
3593 s2 = XINT (XCDR (code));
3594 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3595 BYTE_SJIS_TWO_BYTE_2_P (s2))
3597 DECODE_SJIS (s1, s2, c1, c2);
3598 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3599 c1 & 0x7F, c2 & 0x7F));
3605 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3606 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3607 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3611 Lisp_Object charset;
3614 CHECK_CHAR_COERCE_INT (character);
3615 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3616 if (EQ (charset, Vcharset_japanese_jisx0208))
3618 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3619 return Fcons (make_int (s1), make_int (s2));
3626 /************************************************************************/
3628 /************************************************************************/
3630 /* BIG5 is a coding system encoding two character sets: ASCII and
3631 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3632 character set and is encoded in two-byte.
3634 --- CODE RANGE of BIG5 ---
3635 (character set) (range)
3637 Big5 (1st byte) 0xA1 .. 0xFE
3638 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3639 --------------------------
3641 Since the number of characters in Big5 is larger than maximum
3642 characters in Emacs' charset (96x96), it can't be handled as one
3643 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3644 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3645 contains frequently used characters and the latter contains less
3646 frequently used characters. */
3649 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3650 ((c) >= 0x81 && (c) <= 0xFE)
3652 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3653 ((c) >= 0xA1 && (c) <= 0xFE)
3656 /* Is this the second byte of a Shift-JIS two-byte char? */
3658 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3659 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3661 /* Number of Big5 characters which have the same code in 1st byte. */
3663 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3665 /* Code conversion macros. These are macros because they are used in
3666 inner loops during code conversion.
3668 Note that temporary variables in macros introduce the classic
3669 dynamic-scoping problems with variable names. We use capital-
3670 lettered variables in the assumption that XEmacs does not use
3671 capital letters in variables except in a very formalized way
3674 /* Convert Big5 code (b1, b2) into its internal string representation
3677 /* There is a much simpler way to split the Big5 charset into two.
3678 For the moment I'm going to leave the algorithm as-is because it
3679 claims to separate out the most-used characters into a single
3680 charset, which perhaps will lead to optimizations in various
3683 The way the algorithm works is something like this:
3685 Big5 can be viewed as a 94x157 charset, where the row is
3686 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3687 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3688 the split between low and high column numbers is apparently
3689 meaningless; ascending rows produce less and less frequent chars.
3690 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3691 the first charset, and the upper half (0xC9 .. 0xFE) to the
3692 second. To do the conversion, we convert the character into
3693 a single number where 0 .. 156 is the first row, 157 .. 313
3694 is the second, etc. That way, the characters are ordered by
3695 decreasing frequency. Then we just chop the space in two
3696 and coerce the result into a 94x94 space.
3699 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3701 int B1 = b1, B2 = b2; \
3703 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3707 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3711 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3712 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3714 c1 = I / (0xFF - 0xA1) + 0xA1; \
3715 c2 = I % (0xFF - 0xA1) + 0xA1; \
3718 /* Convert the internal string representation of a Big5 character
3719 (lb, c1, c2) into Big5 code (b1, b2). */
3721 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3723 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3725 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3727 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3729 b1 = I / BIG5_SAME_ROW + 0xA1; \
3730 b2 = I % BIG5_SAME_ROW; \
3731 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3735 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3739 unsigned char c = *(unsigned char *)src++;
3740 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3742 || (c >= 0x80 && c <= 0xA0)
3746 if (st->big5.in_second_byte)
3748 st->big5.in_second_byte = 0;
3749 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3759 st->big5.in_second_byte = 1;
3761 return CODING_CATEGORY_BIG5_MASK;
3764 /* Convert Big5 data to internal format. */
3767 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3768 unsigned_char_dynarr *dst, size_t n)
3770 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3771 unsigned int flags = str->flags;
3772 unsigned int cpos = str->cpos;
3773 eol_type_t eol_type = str->eol_type;
3776 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3777 (decoding)->codesys, 1);
3782 unsigned char c = *(unsigned char *)src++;
3785 /* Previous character was first byte of Big5 char. */
3786 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3789 int code_point = (cpos << 8) | c;
3790 Emchar char_id = decode_defined_char (ccs, code_point);
3793 char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
3794 DECODE_ADD_UCS_CHAR (char_id, dst);
3796 unsigned char b1, b2, b3;
3797 DECODE_BIG5 (cpos, c, b1, b2, b3);
3798 Dynarr_add (dst, b1);
3799 Dynarr_add (dst, b2);
3800 Dynarr_add (dst, b3);
3805 DECODE_ADD_BINARY_CHAR (cpos, dst);
3806 DECODE_ADD_BINARY_CHAR (c, dst);
3812 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3813 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3815 decode_flush_er_chars (str, dst);
3818 else if ( c >= ' ' )
3820 /* DECODE_ADD_BINARY_CHAR (c, dst); */
3821 decode_add_er_char (str, c, dst);
3825 decode_flush_er_chars (str, dst);
3826 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3827 DECODE_ADD_BINARY_CHAR (c, dst);
3830 label_continue_loop:;
3833 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
3834 if (flags & CODING_STATE_END)
3836 decode_flush_er_chars (str, dst);
3837 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3838 if (flags & CODING_STATE_CR)
3839 Dynarr_add (dst, '\r');
3846 /* Convert internally-formatted data to Big5. */
3849 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3850 unsigned_char_dynarr *dst, unsigned int *flags)
3852 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3856 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3857 Dynarr_add (dst, '\r');
3858 if (eol_type != EOL_CR)
3859 Dynarr_add (dst, ch);
3866 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
3868 if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3869 Dynarr_add (dst, code_point);
3870 else if ((code_point = charset_code_point (ccs, ch)) >= 0)
3872 Dynarr_add (dst, code_point >> 8);
3873 Dynarr_add (dst, code_point & 0xFF);
3875 else if ((code_point
3876 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3878 Dynarr_add (dst, code_point >> 8);
3879 Dynarr_add (dst, code_point & 0xFF);
3881 else if ((code_point
3882 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3885 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3886 + ((code_point & 0xFF) - 33);
3887 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3888 unsigned char b2 = I % BIG5_SAME_ROW;
3890 b2 += b2 < 0x3F ? 0x40 : 0x62;
3891 Dynarr_add (dst, b1);
3892 Dynarr_add (dst, b2);
3894 else if ((code_point
3895 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3898 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3899 + ((code_point & 0xFF) - 33);
3900 unsigned char b1, b2;
3902 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3903 b1 = I / BIG5_SAME_ROW + 0xA1;
3904 b2 = I % BIG5_SAME_ROW;
3905 b2 += b2 < 0x3F ? 0x40 : 0x62;
3906 Dynarr_add (dst, b1);
3907 Dynarr_add (dst, b2);
3910 Dynarr_add (dst, '?');
3917 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3918 unsigned int *flags)
3923 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3924 Decode a Big5 character CODE of BIG5 coding-system.
3925 CODE is the character code in BIG5, a cons of two integers.
3926 Return the corresponding character.
3930 unsigned char c1, c2, b1, b2;
3933 CHECK_INT (XCAR (code));
3934 CHECK_INT (XCDR (code));
3935 b1 = XINT (XCAR (code));
3936 b2 = XINT (XCDR (code));
3937 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3938 BYTE_BIG5_TWO_BYTE_2_P (b2))
3940 Charset_ID leading_byte;
3941 Lisp_Object charset;
3942 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3943 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3944 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3950 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3951 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3952 Return the corresponding character code in Big5.
3956 Lisp_Object charset;
3959 CHECK_CHAR_COERCE_INT (character);
3960 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3961 if (EQ (charset, Vcharset_chinese_big5_1) ||
3962 EQ (charset, Vcharset_chinese_big5_2))
3964 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3966 return Fcons (make_int (b1), make_int (b2));
3973 /************************************************************************/
3975 /************************************************************************/
3978 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3982 unsigned char c = *(unsigned char *)src++;
3983 switch (st->ucs4.in_byte)
3992 st->ucs4.in_byte = 0;
3998 return CODING_CATEGORY_UCS4_MASK;
4002 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4003 unsigned_char_dynarr *dst, size_t n)
4005 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4006 unsigned int flags = str->flags;
4007 unsigned int cpos = str->cpos;
4008 unsigned char counter = str->counter;
4012 unsigned char c = *(unsigned char *)src++;
4020 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4025 cpos = ( cpos << 8 ) | c;
4029 if (counter & CODING_STATE_END)
4030 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4034 str->counter = counter;
4038 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4039 unsigned_char_dynarr *dst, unsigned int *flags)
4041 Dynarr_add (dst, ch >> 24);
4042 Dynarr_add (dst, ch >> 16);
4043 Dynarr_add (dst, ch >> 8);
4044 Dynarr_add (dst, ch );
4048 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4049 unsigned int *flags)
4054 /************************************************************************/
4056 /************************************************************************/
4059 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
4063 unsigned char c = *(unsigned char *)src++;
4064 switch (st->utf8.in_byte)
4067 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4070 st->utf8.in_byte = 5;
4072 st->utf8.in_byte = 4;
4074 st->utf8.in_byte = 3;
4076 st->utf8.in_byte = 2;
4078 st->utf8.in_byte = 1;
4083 if ((c & 0xc0) != 0x80)
4089 return CODING_CATEGORY_UTF8_MASK;
4093 decode_output_utf8_partial_char (unsigned char counter,
4095 unsigned_char_dynarr *dst)
4098 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4099 else if (counter == 4)
4101 if (cpos < (1 << 6))
4102 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4105 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4106 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4109 else if (counter == 3)
4111 if (cpos < (1 << 6))
4112 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4113 else if (cpos < (1 << 12))
4115 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4116 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4120 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4121 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4122 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4125 else if (counter == 2)
4127 if (cpos < (1 << 6))
4128 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4129 else if (cpos < (1 << 12))
4131 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4132 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4134 else if (cpos < (1 << 18))
4136 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4137 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4138 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4142 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4143 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4144 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4145 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4150 if (cpos < (1 << 6))
4151 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4152 else if (cpos < (1 << 12))
4154 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4155 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4157 else if (cpos < (1 << 18))
4159 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4160 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4161 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4163 else if (cpos < (1 << 24))
4165 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4166 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4167 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4168 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4172 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4173 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4174 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4175 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4176 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4182 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4183 unsigned_char_dynarr *dst, size_t n)
4185 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4186 unsigned int flags = str->flags;
4187 unsigned int cpos = str->cpos;
4188 eol_type_t eol_type = str->eol_type;
4189 unsigned char counter = str->counter;
4192 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4193 (decoding)->codesys, 0);
4198 unsigned char c = *(unsigned char *)src++;
4203 COMPOSE_FLUSH_CHARS (str, dst);
4204 decode_flush_er_chars (str, dst);
4205 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4206 DECODE_ADD_UCS_CHAR (c, dst);
4208 else if ( c < 0xC0 )
4209 /* decode_add_er_char (str, c, dst); */
4210 COMPOSE_ADD_CHAR (str, c, dst);
4213 /* decode_flush_er_chars (str, dst); */
4219 else if ( c < 0xF0 )
4224 else if ( c < 0xF8 )
4229 else if ( c < 0xFC )
4241 else if ( (c & 0xC0) == 0x80 )
4243 cpos = ( cpos << 6 ) | ( c & 0x3f );
4246 Emchar char_id = decode_defined_char (ccs, cpos);
4250 COMPOSE_ADD_CHAR (str, char_id, dst);
4259 COMPOSE_FLUSH_CHARS (str, dst);
4260 decode_flush_er_chars (str, dst);
4261 decode_output_utf8_partial_char (counter, cpos, dst);
4262 DECODE_ADD_BINARY_CHAR (c, dst);
4266 label_continue_loop:;
4269 if (flags & CODING_STATE_END)
4271 COMPOSE_FLUSH_CHARS (str, dst);
4272 decode_flush_er_chars (str, dst);
4275 decode_output_utf8_partial_char (counter, cpos, dst);
4282 str->counter = counter;
4286 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4287 unsigned_char_dynarr *dst, unsigned int *flags)
4289 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4293 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4294 Dynarr_add (dst, '\r');
4295 if (eol_type != EOL_CR)
4296 Dynarr_add (dst, ch);
4298 else if (ch <= 0x7f)
4300 Dynarr_add (dst, ch);
4305 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4306 int code_point = charset_code_point (ucs_ccs, ch);
4308 if ( (code_point < 0) || (code_point > 0x10FFFF) )
4311 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4315 && INTP (ret = Fget_char_attribute (make_char (ch),
4317 code_point = XINT (ret);
4318 else if ( !NILP (map =
4319 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4321 && INTP (ret = Fget_char_attribute (make_char (ch),
4323 code_point = XINT (ret);
4324 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4326 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
4329 int format_columns, idx;
4330 char buf[16], format[16];
4332 while (!NILP (rest))
4336 if (!NILP (ccs = Ffind_charset (ccs)))
4339 = charset_code_point (ccs, ch)) >= 0 )
4344 && ((idx =XSTRING_LENGTH (ret)) <= 6))
4346 strncpy (format, XSTRING_DATA (ret), idx);
4355 format [idx++] = '%';
4356 format_columns = XINT (ret);
4357 if ( (2 <= format_columns)
4358 && (format_columns <= 8) )
4360 format [idx++] = '0';
4361 format [idx++] = '0' + format_columns;
4368 format [idx++] = 'd';
4369 else if (EQ (ret, Qx))
4370 format [idx++] = 'x';
4371 else if (EQ (ret, QX))
4372 format [idx++] = 'X';
4377 sprintf (buf, format, code_point);
4378 Dynarr_add (dst, '&');
4379 Dynarr_add_many (dst, buf, strlen (buf));
4380 Dynarr_add (dst, ';');
4386 sprintf (buf, "&MCS-%08X;", ch);
4387 Dynarr_add_many (dst, buf, strlen (buf));
4393 if (code_point <= 0x7ff)
4395 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4396 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4398 else if (code_point <= 0xffff)
4400 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4401 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4402 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4404 else if (code_point <= 0x1fffff)
4406 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4407 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4408 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4409 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4411 else if (code_point <= 0x3ffffff)
4413 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4414 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4415 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4416 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4417 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4421 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4422 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4423 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4424 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4425 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4426 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4432 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4433 unsigned int *flags)
4438 /************************************************************************/
4439 /* ISO2022 methods */
4440 /************************************************************************/
4442 /* The following note describes the coding system ISO2022 briefly.
4443 Since the intention of this note is to help understand the
4444 functions in this file, some parts are NOT ACCURATE or OVERLY
4445 SIMPLIFIED. For thorough understanding, please refer to the
4446 original document of ISO2022.
4448 ISO2022 provides many mechanisms to encode several character sets
4449 in 7-bit and 8-bit environments. For 7-bit environments, all text
4450 is encoded using bytes less than 128. This may make the encoded
4451 text a little bit longer, but the text passes more easily through
4452 several gateways, some of which strip off MSB (Most Signigant Bit).
4454 There are two kinds of character sets: control character set and
4455 graphic character set. The former contains control characters such
4456 as `newline' and `escape' to provide control functions (control
4457 functions are also provided by escape sequences). The latter
4458 contains graphic characters such as 'A' and '-'. Emacs recognizes
4459 two control character sets and many graphic character sets.
4461 Graphic character sets are classified into one of the following
4462 four classes, according to the number of bytes (DIMENSION) and
4463 number of characters in one dimension (CHARS) of the set:
4464 - DIMENSION1_CHARS94
4465 - DIMENSION1_CHARS96
4466 - DIMENSION2_CHARS94
4467 - DIMENSION2_CHARS96
4469 In addition, each character set is assigned an identification tag,
4470 unique for each set, called "final character" (denoted as <F>
4471 hereafter). The <F> of each character set is decided by ECMA(*)
4472 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4473 (0x30..0x3F are for private use only).
4475 Note (*): ECMA = European Computer Manufacturers Association
4477 Here are examples of graphic character set [NAME(<F>)]:
4478 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4479 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4480 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4481 o DIMENSION2_CHARS96 -- none for the moment
4483 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4484 C0 [0x00..0x1F] -- control character plane 0
4485 GL [0x20..0x7F] -- graphic character plane 0
4486 C1 [0x80..0x9F] -- control character plane 1
4487 GR [0xA0..0xFF] -- graphic character plane 1
4489 A control character set is directly designated and invoked to C0 or
4490 C1 by an escape sequence. The most common case is that:
4491 - ISO646's control character set is designated/invoked to C0, and
4492 - ISO6429's control character set is designated/invoked to C1,
4493 and usually these designations/invocations are omitted in encoded
4494 text. In a 7-bit environment, only C0 can be used, and a control
4495 character for C1 is encoded by an appropriate escape sequence to
4496 fit into the environment. All control characters for C1 are
4497 defined to have corresponding escape sequences.
4499 A graphic character set is at first designated to one of four
4500 graphic registers (G0 through G3), then these graphic registers are
4501 invoked to GL or GR. These designations and invocations can be
4502 done independently. The most common case is that G0 is invoked to
4503 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4504 these invocations and designations are omitted in encoded text.
4505 In a 7-bit environment, only GL can be used.
4507 When a graphic character set of CHARS94 is invoked to GL, codes
4508 0x20 and 0x7F of the GL area work as control characters SPACE and
4509 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4512 There are two ways of invocation: locking-shift and single-shift.
4513 With locking-shift, the invocation lasts until the next different
4514 invocation, whereas with single-shift, the invocation affects the
4515 following character only and doesn't affect the locking-shift
4516 state. Invocations are done by the following control characters or
4519 ----------------------------------------------------------------------
4520 abbrev function cntrl escape seq description
4521 ----------------------------------------------------------------------
4522 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4523 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4524 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4525 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4526 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4527 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4528 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4529 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4530 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4531 ----------------------------------------------------------------------
4532 (*) These are not used by any known coding system.
4534 Control characters for these functions are defined by macros
4535 ISO_CODE_XXX in `coding.h'.
4537 Designations are done by the following escape sequences:
4538 ----------------------------------------------------------------------
4539 escape sequence description
4540 ----------------------------------------------------------------------
4541 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4542 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4543 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4544 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4545 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4546 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4547 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4548 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4549 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4550 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4551 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4552 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4553 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4554 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4555 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4556 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4557 ----------------------------------------------------------------------
4559 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4560 of dimension 1, chars 94, and final character <F>, etc...
4562 Note (*): Although these designations are not allowed in ISO2022,
4563 Emacs accepts them on decoding, and produces them on encoding
4564 CHARS96 character sets in a coding system which is characterized as
4565 7-bit environment, non-locking-shift, and non-single-shift.
4567 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4568 '(' can be omitted. We refer to this as "short-form" hereafter.
4570 Now you may notice that there are a lot of ways for encoding the
4571 same multilingual text in ISO2022. Actually, there exist many
4572 coding systems such as Compound Text (used in X11's inter client
4573 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4574 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4575 localized platforms), and all of these are variants of ISO2022.
4577 In addition to the above, Emacs handles two more kinds of escape
4578 sequences: ISO6429's direction specification and Emacs' private
4579 sequence for specifying character composition.
4581 ISO6429's direction specification takes the following form:
4582 o CSI ']' -- end of the current direction
4583 o CSI '0' ']' -- end of the current direction
4584 o CSI '1' ']' -- start of left-to-right text
4585 o CSI '2' ']' -- start of right-to-left text
4586 The control character CSI (0x9B: control sequence introducer) is
4587 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4589 Character composition specification takes the following form:
4590 o ESC '0' -- start character composition
4591 o ESC '1' -- end character composition
4592 Since these are not standard escape sequences of any ISO standard,
4593 their use with these meanings is restricted to Emacs only. */
4596 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4600 for (i = 0; i < 4; i++)
4602 if (!NILP (coding_system))
4604 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4606 iso->charset[i] = Qt;
4607 iso->invalid_designated[i] = 0;
4609 iso->esc = ISO_ESC_NOTHING;
4610 iso->esc_bytes_index = 0;
4611 iso->register_left = 0;
4612 iso->register_right = 1;
4613 iso->switched_dir_and_no_valid_charset_yet = 0;
4614 iso->invalid_switch_dir = 0;
4615 iso->output_direction_sequence = 0;
4616 iso->output_literally = 0;
4617 #ifdef ENABLE_COMPOSITE_CHARS
4618 if (iso->composite_chars)
4619 Dynarr_reset (iso->composite_chars);
4624 fit_to_be_escape_quoted (unsigned char c)
4641 /* Parse one byte of an ISO2022 escape sequence.
4642 If the result is an invalid escape sequence, return 0 and
4643 do not change anything in STR. Otherwise, if the result is
4644 an incomplete escape sequence, update ISO2022.ESC and
4645 ISO2022.ESC_BYTES and return -1. Otherwise, update
4646 all the state variables (but not ISO2022.ESC_BYTES) and
4649 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4650 or invocation of an invalid character set and treat that as
4651 an unrecognized escape sequence. */
4654 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4655 unsigned char c, unsigned int *flags,
4656 int check_invalid_charsets)
4658 /* (1) If we're at the end of a designation sequence, CS is the
4659 charset being designated and REG is the register to designate
4662 (2) If we're at the end of a locking-shift sequence, REG is
4663 the register to invoke and HALF (0 == left, 1 == right) is
4664 the half to invoke it into.
4666 (3) If we're at the end of a single-shift sequence, REG is
4667 the register to invoke. */
4668 Lisp_Object cs = Qnil;
4671 /* NOTE: This code does goto's all over the fucking place.
4672 The reason for this is that we're basically implementing
4673 a state machine here, and hierarchical languages like C
4674 don't really provide a clean way of doing this. */
4676 if (! (*flags & CODING_STATE_ESCAPE))
4677 /* At beginning of escape sequence; we need to reset our
4678 escape-state variables. */
4679 iso->esc = ISO_ESC_NOTHING;
4681 iso->output_literally = 0;
4682 iso->output_direction_sequence = 0;
4686 case ISO_ESC_NOTHING:
4687 iso->esc_bytes_index = 0;
4690 case ISO_CODE_ESC: /* Start escape sequence */
4691 *flags |= CODING_STATE_ESCAPE;
4695 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4696 *flags |= CODING_STATE_ESCAPE;
4697 iso->esc = ISO_ESC_5_11;
4700 case ISO_CODE_SO: /* locking shift 1 */
4703 case ISO_CODE_SI: /* locking shift 0 */
4707 case ISO_CODE_SS2: /* single shift */
4710 case ISO_CODE_SS3: /* single shift */
4714 default: /* Other control characters */
4721 /**** single shift ****/
4723 case 'N': /* single shift 2 */
4726 case 'O': /* single shift 3 */
4730 /**** locking shift ****/
4732 case '~': /* locking shift 1 right */
4735 case 'n': /* locking shift 2 */
4738 case '}': /* locking shift 2 right */
4741 case 'o': /* locking shift 3 */
4744 case '|': /* locking shift 3 right */
4748 #ifdef ENABLE_COMPOSITE_CHARS
4749 /**** composite ****/
4752 iso->esc = ISO_ESC_START_COMPOSITE;
4753 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4754 CODING_STATE_COMPOSITE;
4758 iso->esc = ISO_ESC_END_COMPOSITE;
4759 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4760 ~CODING_STATE_COMPOSITE;
4762 #endif /* ENABLE_COMPOSITE_CHARS */
4764 /**** directionality ****/
4767 iso->esc = ISO_ESC_5_11;
4770 /**** designation ****/
4772 case '$': /* multibyte charset prefix */
4773 iso->esc = ISO_ESC_2_4;
4777 if (0x28 <= c && c <= 0x2F)
4779 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4783 /* This function is called with CODESYS equal to nil when
4784 doing coding-system detection. */
4786 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4787 && fit_to_be_escape_quoted (c))
4789 iso->esc = ISO_ESC_LITERAL;
4790 *flags &= CODING_STATE_ISO2022_LOCK;
4800 /**** directionality ****/
4802 case ISO_ESC_5_11: /* ISO6429 direction control */
4805 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4806 goto directionality;
4808 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4809 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4810 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4814 case ISO_ESC_5_11_0:
4817 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4818 goto directionality;
4822 case ISO_ESC_5_11_1:
4825 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4826 goto directionality;
4830 case ISO_ESC_5_11_2:
4833 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4834 goto directionality;
4839 iso->esc = ISO_ESC_DIRECTIONALITY;
4840 /* Various junk here to attempt to preserve the direction sequences
4841 literally in the text if they would otherwise be swallowed due
4842 to invalid designations that don't show up as actual charset
4843 changes in the text. */
4844 if (iso->invalid_switch_dir)
4846 /* We already inserted a direction switch literally into the
4847 text. We assume (#### this may not be right) that the
4848 next direction switch is the one going the other way,
4849 and we need to output that literally as well. */
4850 iso->output_literally = 1;
4851 iso->invalid_switch_dir = 0;
4857 /* If we are in the thrall of an invalid designation,
4858 then stick the directionality sequence literally into the
4859 output stream so it ends up in the original text again. */
4860 for (jj = 0; jj < 4; jj++)
4861 if (iso->invalid_designated[jj])
4865 iso->output_literally = 1;
4866 iso->invalid_switch_dir = 1;
4869 /* Indicate that we haven't yet seen a valid designation,
4870 so that if a switch-dir is directly followed by an
4871 invalid designation, both get inserted literally. */
4872 iso->switched_dir_and_no_valid_charset_yet = 1;
4877 /**** designation ****/
4880 if (0x28 <= c && c <= 0x2F)
4882 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4885 if (0x40 <= c && c <= 0x42)
4888 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4889 *flags & CODING_STATE_R2L ?
4890 CHARSET_RIGHT_TO_LEFT :
4891 CHARSET_LEFT_TO_RIGHT);
4902 if (c < '0' || c > '~')
4903 return 0; /* bad final byte */
4905 if (iso->esc >= ISO_ESC_2_8 &&
4906 iso->esc <= ISO_ESC_2_15)
4908 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4909 single = 1; /* single-byte */
4910 reg = (iso->esc - ISO_ESC_2_8) & 3;
4912 else if (iso->esc >= ISO_ESC_2_4_8 &&
4913 iso->esc <= ISO_ESC_2_4_15)
4915 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4916 single = -1; /* multi-byte */
4917 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4921 /* Can this ever be reached? -slb */
4925 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4926 *flags & CODING_STATE_R2L ?
4927 CHARSET_RIGHT_TO_LEFT :
4928 CHARSET_LEFT_TO_RIGHT);
4934 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4938 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4939 /* can't invoke something that ain't there. */
4941 iso->esc = ISO_ESC_SINGLE_SHIFT;
4942 *flags &= CODING_STATE_ISO2022_LOCK;
4944 *flags |= CODING_STATE_SS2;
4946 *flags |= CODING_STATE_SS3;
4950 if (check_invalid_charsets &&
4951 !CHARSETP (iso->charset[reg]))
4952 /* can't invoke something that ain't there. */
4955 iso->register_right = reg;
4957 iso->register_left = reg;
4958 *flags &= CODING_STATE_ISO2022_LOCK;
4959 iso->esc = ISO_ESC_LOCKING_SHIFT;
4963 if (NILP (cs) && check_invalid_charsets)
4965 iso->invalid_designated[reg] = 1;
4966 iso->charset[reg] = Vcharset_ascii;
4967 iso->esc = ISO_ESC_DESIGNATE;
4968 *flags &= CODING_STATE_ISO2022_LOCK;
4969 iso->output_literally = 1;
4970 if (iso->switched_dir_and_no_valid_charset_yet)
4972 /* We encountered a switch-direction followed by an
4973 invalid designation. Ensure that the switch-direction
4974 gets outputted; otherwise it will probably get eaten
4975 when the text is written out again. */
4976 iso->switched_dir_and_no_valid_charset_yet = 0;
4977 iso->output_direction_sequence = 1;
4978 /* And make sure that the switch-dir going the other
4979 way gets outputted, as well. */
4980 iso->invalid_switch_dir = 1;
4984 /* This function is called with CODESYS equal to nil when
4985 doing coding-system detection. */
4986 if (!NILP (codesys))
4988 charset_conversion_spec_dynarr *dyn =
4989 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4995 for (i = 0; i < Dynarr_length (dyn); i++)
4997 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4998 if (EQ (cs, spec->from_charset))
4999 cs = spec->to_charset;
5004 iso->charset[reg] = cs;
5005 iso->esc = ISO_ESC_DESIGNATE;
5006 *flags &= CODING_STATE_ISO2022_LOCK;
5007 if (iso->invalid_designated[reg])
5009 iso->invalid_designated[reg] = 0;
5010 iso->output_literally = 1;
5012 if (iso->switched_dir_and_no_valid_charset_yet)
5013 iso->switched_dir_and_no_valid_charset_yet = 0;
5018 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
5022 /* #### There are serious deficiencies in the recognition mechanism
5023 here. This needs to be much smarter if it's going to cut it.
5024 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5025 it should be detected as Latin-1.
5026 All the ISO2022 stuff in this file should be synced up with the
5027 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5028 Perhaps we should wait till R2L works in FSF Emacs? */
5030 if (!st->iso2022.initted)
5032 reset_iso2022 (Qnil, &st->iso2022.iso);
5033 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5034 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5035 CODING_CATEGORY_ISO_8_1_MASK |
5036 CODING_CATEGORY_ISO_8_2_MASK |
5037 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5038 st->iso2022.flags = 0;
5039 st->iso2022.high_byte_count = 0;
5040 st->iso2022.saw_single_shift = 0;
5041 st->iso2022.initted = 1;
5044 mask = st->iso2022.mask;
5048 unsigned char c = *(unsigned char *)src++;
5051 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5052 st->iso2022.high_byte_count++;
5056 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5058 if (st->iso2022.high_byte_count & 1)
5059 /* odd number of high bytes; assume not iso-8-2 */
5060 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5062 st->iso2022.high_byte_count = 0;
5063 st->iso2022.saw_single_shift = 0;
5065 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5067 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5068 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5069 { /* control chars */
5072 /* Allow and ignore control characters that you might
5073 reasonably see in a text file */
5078 case 8: /* backspace */
5079 case 11: /* vertical tab */
5080 case 12: /* form feed */
5081 case 26: /* MS-DOS C-z junk */
5082 case 31: /* '^_' -- for info */
5083 goto label_continue_loop;
5090 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5093 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5094 &st->iso2022.flags, 0))
5096 switch (st->iso2022.iso.esc)
5098 case ISO_ESC_DESIGNATE:
5099 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5100 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5102 case ISO_ESC_LOCKING_SHIFT:
5103 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5104 goto ran_out_of_chars;
5105 case ISO_ESC_SINGLE_SHIFT:
5106 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5107 st->iso2022.saw_single_shift = 1;
5116 goto ran_out_of_chars;
5119 label_continue_loop:;
5128 postprocess_iso2022_mask (int mask)
5130 /* #### kind of cheesy */
5131 /* If seven-bit ISO is allowed, then assume that the encoding is
5132 entirely seven-bit and turn off the eight-bit ones. */
5133 if (mask & CODING_CATEGORY_ISO_7_MASK)
5134 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5135 CODING_CATEGORY_ISO_8_1_MASK |
5136 CODING_CATEGORY_ISO_8_2_MASK);
5140 /* If FLAGS is a null pointer or specifies right-to-left motion,
5141 output a switch-dir-to-left-to-right sequence to DST.
5142 Also update FLAGS if it is not a null pointer.
5143 If INTERNAL_P is set, we are outputting in internal format and
5144 need to handle the CSI differently. */
5147 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5148 unsigned_char_dynarr *dst,
5149 unsigned int *flags,
5152 if (!flags || (*flags & CODING_STATE_R2L))
5154 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5156 Dynarr_add (dst, ISO_CODE_ESC);
5157 Dynarr_add (dst, '[');
5159 else if (internal_p)
5160 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5162 Dynarr_add (dst, ISO_CODE_CSI);
5163 Dynarr_add (dst, '0');
5164 Dynarr_add (dst, ']');
5166 *flags &= ~CODING_STATE_R2L;
5170 /* If FLAGS is a null pointer or specifies a direction different from
5171 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5172 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5173 sequence to DST. Also update FLAGS if it is not a null pointer.
5174 If INTERNAL_P is set, we are outputting in internal format and
5175 need to handle the CSI differently. */
5178 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5179 unsigned_char_dynarr *dst, unsigned int *flags,
5182 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5183 direction == CHARSET_LEFT_TO_RIGHT)
5184 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5185 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5186 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5187 direction == CHARSET_RIGHT_TO_LEFT)
5189 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5191 Dynarr_add (dst, ISO_CODE_ESC);
5192 Dynarr_add (dst, '[');
5194 else if (internal_p)
5195 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5197 Dynarr_add (dst, ISO_CODE_CSI);
5198 Dynarr_add (dst, '2');
5199 Dynarr_add (dst, ']');
5201 *flags |= CODING_STATE_R2L;
5205 /* Convert ISO2022-format data to internal format. */
5208 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5209 unsigned_char_dynarr *dst, size_t n)
5211 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5212 unsigned int flags = str->flags;
5213 unsigned int cpos = str->cpos;
5214 unsigned char counter = str->counter;
5215 eol_type_t eol_type = str->eol_type;
5216 #ifdef ENABLE_COMPOSITE_CHARS
5217 unsigned_char_dynarr *real_dst = dst;
5219 Lisp_Object coding_system;
5221 XSETCODING_SYSTEM (coding_system, str->codesys);
5223 #ifdef ENABLE_COMPOSITE_CHARS
5224 if (flags & CODING_STATE_COMPOSITE)
5225 dst = str->iso2022.composite_chars;
5226 #endif /* ENABLE_COMPOSITE_CHARS */
5230 unsigned char c = *(unsigned char *)src++;
5231 if (flags & CODING_STATE_ESCAPE)
5232 { /* Within ESC sequence */
5233 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5238 switch (str->iso2022.esc)
5240 #ifdef ENABLE_COMPOSITE_CHARS
5241 case ISO_ESC_START_COMPOSITE:
5242 if (str->iso2022.composite_chars)
5243 Dynarr_reset (str->iso2022.composite_chars);
5245 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5246 dst = str->iso2022.composite_chars;
5248 case ISO_ESC_END_COMPOSITE:
5250 Bufbyte comstr[MAX_EMCHAR_LEN];
5252 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5253 Dynarr_length (dst));
5255 len = set_charptr_emchar (comstr, emch);
5256 Dynarr_add_many (dst, comstr, len);
5259 #endif /* ENABLE_COMPOSITE_CHARS */
5261 case ISO_ESC_LITERAL:
5262 COMPOSE_FLUSH_CHARS (str, dst);
5263 decode_flush_er_chars (str, dst);
5264 DECODE_ADD_BINARY_CHAR (c, dst);
5268 /* Everything else handled already */
5273 /* Attempted error recovery. */
5274 if (str->iso2022.output_direction_sequence)
5275 ensure_correct_direction (flags & CODING_STATE_R2L ?
5276 CHARSET_RIGHT_TO_LEFT :
5277 CHARSET_LEFT_TO_RIGHT,
5278 str->codesys, dst, 0, 1);
5279 /* More error recovery. */
5280 if (!retval || str->iso2022.output_literally)
5282 /* Output the (possibly invalid) sequence */
5284 COMPOSE_FLUSH_CHARS (str, dst);
5285 decode_flush_er_chars (str, dst);
5286 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5287 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5288 flags &= CODING_STATE_ISO2022_LOCK;
5290 n++, src--;/* Repeat the loop with the same character. */
5293 /* No sense in reprocessing the final byte of the
5294 escape sequence; it could mess things up anyway.
5296 COMPOSE_FLUSH_CHARS (str, dst);
5297 decode_flush_er_chars (str, dst);
5298 DECODE_ADD_BINARY_CHAR (c, dst);
5304 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5305 { /* Control characters */
5307 /***** Error-handling *****/
5309 /* If we were in the middle of a character, dump out the
5310 partial character. */
5313 COMPOSE_FLUSH_CHARS (str, dst);
5314 decode_flush_er_chars (str, dst);
5318 DECODE_ADD_BINARY_CHAR
5319 ((unsigned char)(cpos >> (counter * 8)), dst);
5324 /* If we just saw a single-shift character, dump it out.
5325 This may dump out the wrong sort of single-shift character,
5326 but least it will give an indication that something went
5328 if (flags & CODING_STATE_SS2)
5330 COMPOSE_FLUSH_CHARS (str, dst);
5331 decode_flush_er_chars (str, dst);
5332 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5333 flags &= ~CODING_STATE_SS2;
5335 if (flags & CODING_STATE_SS3)
5337 COMPOSE_FLUSH_CHARS (str, dst);
5338 decode_flush_er_chars (str, dst);
5339 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5340 flags &= ~CODING_STATE_SS3;
5343 /***** Now handle the control characters. *****/
5349 COMPOSE_FLUSH_CHARS (str, dst);
5350 decode_flush_er_chars (str, dst);
5351 if (eol_type == EOL_CR)
5352 Dynarr_add (dst, '\n');
5353 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5354 Dynarr_add (dst, c);
5356 flags |= CODING_STATE_CR;
5357 goto label_continue_loop;
5359 else if (flags & CODING_STATE_CR)
5360 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5362 Dynarr_add (dst, '\r');
5363 flags &= ~CODING_STATE_CR;
5366 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5369 flags &= CODING_STATE_ISO2022_LOCK;
5371 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5373 COMPOSE_FLUSH_CHARS (str, dst);
5374 decode_flush_er_chars (str, dst);
5375 DECODE_ADD_BINARY_CHAR (c, dst);
5379 { /* Graphic characters */
5380 Lisp_Object charset;
5389 COMPOSE_FLUSH_CHARS (str, dst);
5390 decode_flush_er_chars (str, dst);
5391 if (eol_type == EOL_CR)
5392 Dynarr_add (dst, '\n');
5393 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5394 Dynarr_add (dst, c);
5396 flags |= CODING_STATE_CR;
5397 goto label_continue_loop;
5399 else if (flags & CODING_STATE_CR)
5400 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5402 Dynarr_add (dst, '\r');
5403 flags &= ~CODING_STATE_CR;
5406 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5409 /* Now determine the charset. */
5410 reg = ((flags & CODING_STATE_SS2) ? 2
5411 : (flags & CODING_STATE_SS3) ? 3
5412 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5413 : str->iso2022.register_left);
5414 charset = str->iso2022.charset[reg];
5416 /* Error checking: */
5417 if (! CHARSETP (charset)
5418 || str->iso2022.invalid_designated[reg]
5419 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5420 && XCHARSET_CHARS (charset) == 94))
5421 /* Mrmph. We are trying to invoke a register that has no
5422 or an invalid charset in it, or trying to add a character
5423 outside the range of the charset. Insert that char literally
5424 to preserve it for the output. */
5426 COMPOSE_FLUSH_CHARS (str, dst);
5427 decode_flush_er_chars (str, dst);
5431 DECODE_ADD_BINARY_CHAR
5432 ((unsigned char)(cpos >> (counter * 8)), dst);
5435 DECODE_ADD_BINARY_CHAR (c, dst);
5440 /* Things are probably hunky-dorey. */
5442 /* Fetch reverse charset, maybe. */
5443 if (((flags & CODING_STATE_R2L) &&
5444 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5446 (!(flags & CODING_STATE_R2L) &&
5447 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5449 Lisp_Object new_charset =
5450 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5451 if (!NILP (new_charset))
5452 charset = new_charset;
5457 if (XCHARSET_DIMENSION (charset) == counter)
5459 COMPOSE_ADD_CHAR (str,
5460 DECODE_CHAR (charset,
5461 ((cpos & 0x7F7F7F) << 8)
5468 cpos = (cpos << 8) | c;
5470 lb = XCHARSET_LEADING_BYTE (charset);
5471 switch (XCHARSET_REP_BYTES (charset))
5474 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5475 Dynarr_add (dst, c & 0x7F);
5478 case 2: /* one-byte official */
5479 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5480 Dynarr_add (dst, lb);
5481 Dynarr_add (dst, c | 0x80);
5484 case 3: /* one-byte private or two-byte official */
5485 if (XCHARSET_PRIVATE_P (charset))
5487 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5488 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5489 Dynarr_add (dst, lb);
5490 Dynarr_add (dst, c | 0x80);
5496 Dynarr_add (dst, lb);
5497 Dynarr_add (dst, ch | 0x80);
5498 Dynarr_add (dst, c | 0x80);
5506 default: /* two-byte private */
5509 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5510 Dynarr_add (dst, lb);
5511 Dynarr_add (dst, ch | 0x80);
5512 Dynarr_add (dst, c | 0x80);
5522 flags &= CODING_STATE_ISO2022_LOCK;
5525 label_continue_loop:;
5528 if (flags & CODING_STATE_END)
5530 COMPOSE_FLUSH_CHARS (str, dst);
5531 decode_flush_er_chars (str, dst);
5532 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5536 str->counter = counter;
5540 /***** ISO2022 encoder *****/
5542 /* Designate CHARSET into register REG. */
5545 iso2022_designate (Lisp_Object charset, unsigned char reg,
5546 struct encoding_stream *str, unsigned_char_dynarr *dst)
5548 static const char inter94[] = "()*+";
5549 static const char inter96[] = ",-./";
5550 unsigned short chars;
5551 unsigned char dimension;
5552 unsigned char final;
5553 Lisp_Object old_charset = str->iso2022.charset[reg];
5555 str->iso2022.charset[reg] = charset;
5556 if (!CHARSETP (charset))
5557 /* charset might be an initial nil or t. */
5559 chars = XCHARSET_CHARS (charset);
5560 dimension = XCHARSET_DIMENSION (charset);
5561 final = XCHARSET_FINAL (charset);
5562 if (!str->iso2022.force_charset_on_output[reg] &&
5563 CHARSETP (old_charset) &&
5564 XCHARSET_CHARS (old_charset) == chars &&
5565 XCHARSET_DIMENSION (old_charset) == dimension &&
5566 XCHARSET_FINAL (old_charset) == final)
5569 str->iso2022.force_charset_on_output[reg] = 0;
5572 charset_conversion_spec_dynarr *dyn =
5573 str->codesys->iso2022.output_conv;
5579 for (i = 0; i < Dynarr_length (dyn); i++)
5581 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5582 if (EQ (charset, spec->from_charset))
5583 charset = spec->to_charset;
5588 Dynarr_add (dst, ISO_CODE_ESC);
5593 Dynarr_add (dst, inter94[reg]);
5596 Dynarr_add (dst, '$');
5598 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5601 Dynarr_add (dst, inter94[reg]);
5606 Dynarr_add (dst, inter96[reg]);
5609 Dynarr_add (dst, '$');
5610 Dynarr_add (dst, inter96[reg]);
5614 Dynarr_add (dst, final);
5618 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5620 if (str->iso2022.register_left != 0)
5622 Dynarr_add (dst, ISO_CODE_SI);
5623 str->iso2022.register_left = 0;
5628 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5630 if (str->iso2022.register_left != 1)
5632 Dynarr_add (dst, ISO_CODE_SO);
5633 str->iso2022.register_left = 1;
5638 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5639 unsigned_char_dynarr *dst, unsigned int *flags)
5641 unsigned char charmask;
5642 Lisp_Coding_System* codesys = str->codesys;
5643 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5645 Lisp_Object charset = str->iso2022.current_charset;
5646 int half = str->iso2022.current_half;
5647 int code_point = -1;
5651 restore_left_to_right_direction (codesys, dst, flags, 0);
5653 /* Make sure G0 contains ASCII */
5654 if ((ch > ' ' && ch < ISO_CODE_DEL)
5655 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5657 ensure_normal_shift (str, dst);
5658 iso2022_designate (Vcharset_ascii, 0, str, dst);
5661 /* If necessary, restore everything to the default state
5663 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5665 restore_left_to_right_direction (codesys, dst, flags, 0);
5667 ensure_normal_shift (str, dst);
5669 for (i = 0; i < 4; i++)
5671 Lisp_Object initial_charset =
5672 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5673 iso2022_designate (initial_charset, i, str, dst);
5678 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5679 Dynarr_add (dst, '\r');
5680 if (eol_type != EOL_CR)
5681 Dynarr_add (dst, ch);
5685 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5686 && fit_to_be_escape_quoted (ch))
5687 Dynarr_add (dst, ISO_CODE_ESC);
5688 Dynarr_add (dst, ch);
5691 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5693 charmask = (half == 0 ? 0x00 : 0x80);
5695 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5696 && fit_to_be_escape_quoted (ch))
5697 Dynarr_add (dst, ISO_CODE_ESC);
5698 /* you asked for it ... */
5699 Dynarr_add (dst, ch);
5705 /* Now determine which register to use. */
5707 for (i = 0; i < 4; i++)
5709 if ((CHARSETP (charset = str->iso2022.charset[i])
5710 && ((code_point = charset_code_point (charset, ch)) >= 0))
5714 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5715 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5723 Lisp_Object original_default_coded_charset_priority_list
5724 = Vdefault_coded_charset_priority_list;
5726 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5728 code_point = ENCODE_CHAR (ch, charset);
5729 if (XCHARSET_FINAL (charset))
5731 Vdefault_coded_charset_priority_list
5732 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5733 Vdefault_coded_charset_priority_list));
5735 code_point = ENCODE_CHAR (ch, charset);
5736 if (!XCHARSET_FINAL (charset))
5738 charset = Vcharset_ascii;
5742 Vdefault_coded_charset_priority_list
5743 = original_default_coded_charset_priority_list;
5745 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5746 codesys, dst, flags, 0);
5750 if (XCHARSET_GRAPHIC (charset) != 0)
5752 if (!NILP (str->iso2022.charset[1]) &&
5753 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5754 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5756 else if (!NILP (str->iso2022.charset[2]))
5758 else if (!NILP (str->iso2022.charset[3]))
5767 iso2022_designate (charset, reg, str, dst);
5769 /* Now invoke that register. */
5773 ensure_normal_shift (str, dst);
5777 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5779 ensure_shift_out (str, dst);
5786 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5788 Dynarr_add (dst, ISO_CODE_ESC);
5789 Dynarr_add (dst, 'N');
5794 Dynarr_add (dst, ISO_CODE_SS2);
5799 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5801 Dynarr_add (dst, ISO_CODE_ESC);
5802 Dynarr_add (dst, 'O');
5807 Dynarr_add (dst, ISO_CODE_SS3);
5815 charmask = (half == 0 ? 0x00 : 0x80);
5817 switch (XCHARSET_DIMENSION (charset))
5820 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5823 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5824 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5827 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5828 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5829 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5832 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5833 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5834 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5835 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5841 str->iso2022.current_charset = charset;
5842 str->iso2022.current_half = half;
5846 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5847 unsigned int *flags)
5849 Lisp_Coding_System* codesys = str->codesys;
5852 restore_left_to_right_direction (codesys, dst, flags, 0);
5853 ensure_normal_shift (str, dst);
5854 for (i = 0; i < 4; i++)
5856 Lisp_Object initial_charset
5857 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5858 iso2022_designate (initial_charset, i, str, dst);
5863 /************************************************************************/
5864 /* No-conversion methods */
5865 /************************************************************************/
5867 /* This is used when reading in "binary" files -- i.e. files that may
5868 contain all 256 possible byte values and that are not to be
5869 interpreted as being in any particular decoding. */
5871 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5872 unsigned_char_dynarr *dst, size_t n)
5874 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5875 unsigned int flags = str->flags;
5876 unsigned int cpos = str->cpos;
5877 eol_type_t eol_type = str->eol_type;
5881 unsigned char c = *(unsigned char *)src++;
5883 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5884 DECODE_ADD_BINARY_CHAR (c, dst);
5885 label_continue_loop:;
5888 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5895 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5896 unsigned_char_dynarr *dst, size_t n)
5899 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5900 unsigned int flags = str->flags;
5901 unsigned int ch = str->ch;
5902 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5904 unsigned char char_boundary = str->iso2022.current_char_boundary;
5911 if (char_boundary == 0)
5917 else if ( c >= 0xf8 )
5922 else if ( c >= 0xf0 )
5927 else if ( c >= 0xe0 )
5932 else if ( c >= 0xc0 )
5942 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5943 Dynarr_add (dst, '\r');
5944 if (eol_type != EOL_CR)
5945 Dynarr_add (dst, c);
5948 Dynarr_add (dst, c);
5951 else if (char_boundary == 1)
5953 ch = ( ch << 6 ) | ( c & 0x3f );
5954 Dynarr_add (dst, ch & 0xff);
5959 ch = ( ch << 6 ) | ( c & 0x3f );
5962 #else /* not UTF2000 */
5965 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5966 Dynarr_add (dst, '\r');
5967 if (eol_type != EOL_CR)
5968 Dynarr_add (dst, '\n');
5971 else if (BYTE_ASCII_P (c))
5974 Dynarr_add (dst, c);
5976 else if (BUFBYTE_LEADING_BYTE_P (c))
5979 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5980 c == LEADING_BYTE_CONTROL_1)
5983 Dynarr_add (dst, '~'); /* untranslatable character */
5987 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5988 Dynarr_add (dst, c);
5989 else if (ch == LEADING_BYTE_CONTROL_1)
5992 Dynarr_add (dst, c - 0x20);
5994 /* else it should be the second or third byte of an
5995 untranslatable character, so ignore it */
5998 #endif /* not UTF2000 */
6004 str->iso2022.current_char_boundary = char_boundary;
6010 /************************************************************************/
6011 /* Initialization */
6012 /************************************************************************/
6015 syms_of_file_coding (void)
6017 INIT_LRECORD_IMPLEMENTATION (coding_system);
6019 deferror (&Qcoding_system_error, "coding-system-error",
6020 "Coding-system error", Qio_error);
6022 DEFSUBR (Fcoding_system_p);
6023 DEFSUBR (Ffind_coding_system);
6024 DEFSUBR (Fget_coding_system);
6025 DEFSUBR (Fcoding_system_list);
6026 DEFSUBR (Fcoding_system_name);
6027 DEFSUBR (Fmake_coding_system);
6028 DEFSUBR (Fcopy_coding_system);
6029 DEFSUBR (Fcoding_system_canonical_name_p);
6030 DEFSUBR (Fcoding_system_alias_p);
6031 DEFSUBR (Fcoding_system_aliasee);
6032 DEFSUBR (Fdefine_coding_system_alias);
6033 DEFSUBR (Fsubsidiary_coding_system);
6035 DEFSUBR (Fcoding_system_type);
6036 DEFSUBR (Fcoding_system_doc_string);
6038 DEFSUBR (Fcoding_system_charset);
6040 DEFSUBR (Fcoding_system_property);
6042 DEFSUBR (Fcoding_category_list);
6043 DEFSUBR (Fset_coding_priority_list);
6044 DEFSUBR (Fcoding_priority_list);
6045 DEFSUBR (Fset_coding_category_system);
6046 DEFSUBR (Fcoding_category_system);
6048 DEFSUBR (Fdetect_coding_region);
6049 DEFSUBR (Fdecode_coding_region);
6050 DEFSUBR (Fencode_coding_region);
6052 DEFSUBR (Fdecode_shift_jis_char);
6053 DEFSUBR (Fencode_shift_jis_char);
6054 DEFSUBR (Fdecode_big5_char);
6055 DEFSUBR (Fencode_big5_char);
6057 defsymbol (&Qcoding_systemp, "coding-system-p");
6058 defsymbol (&Qno_conversion, "no-conversion");
6059 defsymbol (&Qraw_text, "raw-text");
6061 defsymbol (&Qbig5, "big5");
6062 defsymbol (&Qshift_jis, "shift-jis");
6063 defsymbol (&Qucs4, "ucs-4");
6064 defsymbol (&Qutf8, "utf-8");
6065 defsymbol (&Qccl, "ccl");
6066 defsymbol (&Qiso2022, "iso2022");
6068 defsymbol (&Qmnemonic, "mnemonic");
6069 defsymbol (&Qeol_type, "eol-type");
6070 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6071 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6073 defsymbol (&Qcr, "cr");
6074 defsymbol (&Qlf, "lf");
6075 defsymbol (&Qcrlf, "crlf");
6076 defsymbol (&Qeol_cr, "eol-cr");
6077 defsymbol (&Qeol_lf, "eol-lf");
6078 defsymbol (&Qeol_crlf, "eol-crlf");
6080 defsymbol (&Qcharset_g0, "charset-g0");
6081 defsymbol (&Qcharset_g1, "charset-g1");
6082 defsymbol (&Qcharset_g2, "charset-g2");
6083 defsymbol (&Qcharset_g3, "charset-g3");
6084 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6085 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6086 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6087 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6088 defsymbol (&Qno_iso6429, "no-iso6429");
6089 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6090 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6092 defsymbol (&Qshort, "short");
6093 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6094 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6095 defsymbol (&Qseven, "seven");
6096 defsymbol (&Qlock_shift, "lock-shift");
6097 defsymbol (&Qescape_quoted, "escape-quoted");
6100 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6101 defsymbol (&Qdisable_composition, "disable-composition");
6102 defsymbol (&Quse_entity_reference, "use-entity-reference");
6103 defsymbol (&Qd, "d");
6104 defsymbol (&Qx, "x");
6105 defsymbol (&QX, "X");
6107 defsymbol (&Qencode, "encode");
6108 defsymbol (&Qdecode, "decode");
6111 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6113 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6115 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6117 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6119 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6121 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6123 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6125 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6127 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6130 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6135 lstream_type_create_file_coding (void)
6137 LSTREAM_HAS_METHOD (decoding, reader);
6138 LSTREAM_HAS_METHOD (decoding, writer);
6139 LSTREAM_HAS_METHOD (decoding, rewinder);
6140 LSTREAM_HAS_METHOD (decoding, seekable_p);
6141 LSTREAM_HAS_METHOD (decoding, flusher);
6142 LSTREAM_HAS_METHOD (decoding, closer);
6143 LSTREAM_HAS_METHOD (decoding, marker);
6145 LSTREAM_HAS_METHOD (encoding, reader);
6146 LSTREAM_HAS_METHOD (encoding, writer);
6147 LSTREAM_HAS_METHOD (encoding, rewinder);
6148 LSTREAM_HAS_METHOD (encoding, seekable_p);
6149 LSTREAM_HAS_METHOD (encoding, flusher);
6150 LSTREAM_HAS_METHOD (encoding, closer);
6151 LSTREAM_HAS_METHOD (encoding, marker);
6155 vars_of_file_coding (void)
6159 fcd = xnew (struct file_coding_dump);
6160 dump_add_root_struct_ptr (&fcd, &fcd_description);
6162 /* Initialize to something reasonable ... */
6163 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6165 fcd->coding_category_system[i] = Qnil;
6166 fcd->coding_category_by_priority[i] = i;
6169 Fprovide (intern ("file-coding"));
6171 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6172 Coding system used for TTY keyboard input.
6173 Not used under a windowing system.
6175 Vkeyboard_coding_system = Qnil;
6177 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6178 Coding system used for TTY display output.
6179 Not used under a windowing system.
6181 Vterminal_coding_system = Qnil;
6183 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6184 Overriding coding system used when reading from a file or process.
6185 You should bind this variable with `let', but do not set it globally.
6186 If this is non-nil, it specifies the coding system that will be used
6187 to decode input on read operations, such as from a file or process.
6188 It overrides `buffer-file-coding-system-for-read',
6189 `insert-file-contents-pre-hook', etc. Use those variables instead of
6190 this one for permanent changes to the environment. */ );
6191 Vcoding_system_for_read = Qnil;
6193 DEFVAR_LISP ("coding-system-for-write",
6194 &Vcoding_system_for_write /*
6195 Overriding coding system used when writing to a file or process.
6196 You should bind this variable with `let', but do not set it globally.
6197 If this is non-nil, it specifies the coding system that will be used
6198 to encode output for write operations, such as to a file or process.
6199 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6200 Use those variables instead of this one for permanent changes to the
6202 Vcoding_system_for_write = Qnil;
6204 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6205 Coding system used to convert pathnames when accessing files.
6207 Vfile_name_coding_system = Qnil;
6209 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6210 &Vcoded_charset_entity_reference_alist /*
6211 Alist of coded-charset vs corresponding entity-reference.
6212 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6213 CCS is coded-charset.
6214 CODE-COLUMNS is columns of code-point of entity-reference.
6215 CODE-TYPE is format type of code-point of entity-reference.
6216 `d' means decimal value and `x' means hexadecimal value.
6218 Vcoded_charset_entity_reference_alist = Qnil;
6220 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6221 Non-nil means the buffer contents are regarded as multi-byte form
6222 of characters, not a binary code. This affects the display, file I/O,
6223 and behaviors of various editing commands.
6225 Setting this to nil does not do anything.
6227 enable_multibyte_characters = 1;
6231 complex_vars_of_file_coding (void)
6233 staticpro (&Vcoding_system_hash_table);
6234 Vcoding_system_hash_table =
6235 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6237 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6238 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6240 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6242 struct codesys_prop csp; \
6244 csp.prop_type = (Prop_Type); \
6245 Dynarr_add (the_codesys_prop_dynarr, csp); \
6248 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6249 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6250 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6251 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6252 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6253 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6254 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6256 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6257 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6258 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6259 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6260 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6261 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6262 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6263 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6264 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6265 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6266 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6267 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6268 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6269 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6270 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6271 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6272 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6274 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6275 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6277 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6278 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6281 /* Need to create this here or we're really screwed. */
6283 (Qraw_text, Qno_conversion,
6284 build_string ("Raw text, which means it converts only line-break-codes."),
6285 list2 (Qmnemonic, build_string ("Raw")));
6288 (Qbinary, Qno_conversion,
6289 build_string ("Binary, which means it does not convert anything."),
6290 list4 (Qeol_type, Qlf,
6291 Qmnemonic, build_string ("Binary")));
6297 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6298 list2 (Qmnemonic, build_string ("MTF8")));
6301 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6303 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6305 Fdefine_coding_system_alias (Qterminal, Qbinary);
6306 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6308 /* Need this for bootstrapping */
6309 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6310 Fget_coding_system (Qraw_text);
6313 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6314 = Fget_coding_system (Qutf_8_mcs);
6317 #if defined(MULE) && !defined(UTF2000)
6321 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6322 fcd->ucs_to_mule_table[i] = Qnil;
6324 staticpro (&mule_to_ucs_table);
6325 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6326 #endif /* defined(MULE) && !defined(UTF2000) */