1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs UTF-2000. */
40 #include "file-coding.h"
/* File-scope Lisp_Object globals of this module.
   NOTE(review): by the usual XEmacs naming convention the Q-prefixed
   object is a symbol and the V-prefixed ones are Lisp-visible
   variables; their initialization is not in this part of the file,
   so confirm there. */
42 Lisp_Object Qcoding_system_error;
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
/* Per-category coding state gathered into one struct.  The
   Lisp_Object arrays in it are listed, by offset, in
   fcd_description_1.
   NOTE(review): the closing brace, the #endif matching the #if below,
   and the declaration of the `fcd' object that fcd_description_1
   dereferences are on lines not visible in this listing. */
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
65 #if defined(MULE) && !defined(UTF2000)
/* 65536 entries; only present in MULE builds without UTF2000.
   NOTE(review): presumably indexed by a 16-bit UCS code -- confirm
   against the code that fills it. */
66 Lisp_Object ucs_to_mule_table[65536];
/* Description table for struct file_coding_dump.  It lists the
   Lisp_Object arrays inside the struct by offset and length:
   coding_category_system (CODING_CATEGORY_LAST entries) and, in MULE
   builds without UTF2000, ucs_to_mule_table.  The int array
   coding_category_by_priority holds no Lisp objects and is not listed.
   NOTE(review): the #endif, the terminating entry and the closing
   brace are on lines not visible in this listing. */
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
/* struct_description for struct file_coding_dump; the first member is
   the size of the struct.
   NOTE(review): the rest of the initializer (presumably a pointer to
   fcd_description_1) and the closing brace are not visible here. */
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
/* More file-scope globals.  Several of the Q-prefixed objects are the
   keys that make-coding-system compares TYPE and the PROPS keys
   against (for example Qiso2022, Qccl, Qmnemonic, Qeol_type,
   Qcharset_g0 .. Qcharset_g3, Qencode, Qdecode).
   NOTE(review): any preprocessor conditionals that surround groups of
   these declarations are on lines not visible in this listing. */
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
90 Lisp_Object Qmnemonic, Qeol_type;
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
97 Lisp_Object Qucs4, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
108 Lisp_Object Qutf_8_mcs;
109 Lisp_Object Qdisable_composition;
110 Lisp_Object Quse_entity_reference;
111 Lisp_Object Qd, Qx, QX;
113 Lisp_Object Qencode, Qdecode;
/* Hash table keyed by coding-system name with the coding-system object
   as value; find-coding-system looks names up in it with Fgethash. */
115 Lisp_Object Vcoding_system_hash_table;
117 int enable_multibyte_characters;
/* NOTE(review): this listing of struct iso2022_decoder is incomplete.
   The braces, the member that follows the "Index for next byte"
   comment, the #endif for ENABLE_COMPOSITE_CHARS and the last line of
   the comment on direction-switching sequences are not visible, so
   that comment appears unterminated here.  Consult the full file
   before editing any member. */
120 /* Additional information used by the ISO2022 decoder and detector. */
121 struct iso2022_decoder
123 /* CHARSET holds the character sets currently assigned to the G0
124 through G3 variables. It is initialized from the array
125 INITIAL_CHARSET in CODESYS. */
126 Lisp_Object charset[4];
128 /* Which registers are currently invoked into the left (GL) and
129 right (GR) halves of the 8-bit encoding space? */
130 int register_left, register_right;
132 /* ISO_ESC holds a value indicating part of an escape sequence
133 that has already been seen. */
134 enum iso_esc_flag esc;
136 /* This records the bytes we've seen so far in an escape sequence,
137 in case the sequence is invalid (we spit out the bytes unchanged). */
138 unsigned char esc_bytes[8];
140 /* Index for next byte to store in ISO escape sequence. */
143 #ifdef ENABLE_COMPOSITE_CHARS
144 /* Stuff seen so far when composing a string. */
145 unsigned_char_dynarr *composite_chars;
148 /* If we saw an invalid designation sequence for a particular
149 register, we flag it here and switch to ASCII. The next time we
150 see a valid designation for this register, we turn off the flag
151 and do the designation normally, but pretend the sequence was
152 invalid. The effect of all this is that (most of the time) the
153 escape sequences for both the switch to the unknown charset, and
154 the switch back to the known charset, get inserted literally into
155 the buffer and saved out as such. The hope is that we can
156 preserve the escape sequences so that the resulting written out
157 file makes sense. If we don't do any of this, the designation
158 to the invalid charset will be preserved but that switch back
159 to the known charset will probably get eaten because it was
160 the same charset that was already present in the register. */
161 unsigned char invalid_designated[4];
163 /* We try to do similar things as above for direction-switching
164 sequences. If we encountered a direction switch while an
165 invalid designation was present, or an invalid designation
166 just after a direction switch (i.e. no valid designation
167 encountered yet), we insert the direction-switch escape
168 sequence literally into the output stream, and later on
169 insert the corresponding direction-restoring escape sequence
171 unsigned int switched_dir_and_no_valid_charset_yet :1;
172 unsigned int invalid_switch_dir :1;
174 /* Tells the decoder to output the escape sequence literally
175 even though it was valid. Used in the games we play to
176 avoid lossage when we encounter invalid designations. */
177 unsigned int output_literally :1;
178 /* We encountered a direction switch followed by an invalid
179 designation. We didn't output the direction switch
180 literally because we didn't know about the invalid designation;
181 but we have to do so now. */
182 unsigned int output_direction_sequence :1;
/* Forward declarations.  For each of Shift-JIS, Big5, UCS-4, UTF-8 and
   ISO2022 there is a static detect_coding_* routine, a static
   decode_coding_* routine and a non-static char_encode_* /
   char_finish_* pair.  The no-conversion decoder and encoder and the
   generic mule_decode / mule_encode entry points follow.
   NOTE(review): the return-type line in front of text_encode_generic
   and any #ifdef guards around these groups are on lines not visible
   in this listing. */
185 EXFUN (Fcopy_coding_system, 2);
187 struct detection_state;
190 text_encode_generic (Lstream *encoding, const Bufbyte *src,
191 unsigned_char_dynarr *dst, size_t n);
193 static int detect_coding_sjis (struct detection_state *st,
194 const Extbyte *src, size_t n);
195 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
196 unsigned_char_dynarr *dst, size_t n);
197 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
198 unsigned_char_dynarr *dst, unsigned int *flags);
199 void char_finish_shift_jis (struct encoding_stream *str,
200 unsigned_char_dynarr *dst, unsigned int *flags);
202 static int detect_coding_big5 (struct detection_state *st,
203 const Extbyte *src, size_t n);
204 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
205 unsigned_char_dynarr *dst, size_t n);
206 void char_encode_big5 (struct encoding_stream *str, Emchar c,
207 unsigned_char_dynarr *dst, unsigned int *flags);
208 void char_finish_big5 (struct encoding_stream *str,
209 unsigned_char_dynarr *dst, unsigned int *flags);
211 static int detect_coding_ucs4 (struct detection_state *st,
212 const Extbyte *src, size_t n);
213 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
214 unsigned_char_dynarr *dst, size_t n);
215 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
216 unsigned_char_dynarr *dst, unsigned int *flags);
217 void char_finish_ucs4 (struct encoding_stream *str,
218 unsigned_char_dynarr *dst, unsigned int *flags);
220 static int detect_coding_utf8 (struct detection_state *st,
221 const Extbyte *src, size_t n);
222 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
223 unsigned_char_dynarr *dst, size_t n);
224 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
225 unsigned_char_dynarr *dst, unsigned int *flags);
226 void char_finish_utf8 (struct encoding_stream *str,
227 unsigned_char_dynarr *dst, unsigned int *flags);
229 static int postprocess_iso2022_mask (int mask);
230 static void reset_iso2022 (Lisp_Object coding_system,
231 struct iso2022_decoder *iso);
232 static int detect_coding_iso2022 (struct detection_state *st,
233 const Extbyte *src, size_t n);
234 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
235 unsigned_char_dynarr *dst, size_t n);
236 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
237 unsigned_char_dynarr *dst, unsigned int *flags);
238 void char_finish_iso2022 (struct encoding_stream *str,
239 unsigned_char_dynarr *dst, unsigned int *flags);
241 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
242 unsigned_char_dynarr *dst, size_t n);
243 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
244 unsigned_char_dynarr *dst, size_t n);
245 static void mule_decode (Lstream *decoding, const Extbyte *src,
246 unsigned_char_dynarr *dst, size_t n);
247 static void mule_encode (Lstream *encoding, const Bufbyte *src,
248 unsigned_char_dynarr *dst, size_t n);
/* codesys_prop, and codesys_prop_dynarr, the dynamic-array type whose
   elements are codesys_prop (Dynarr_declare).
   NOTE(review): the body of struct codesys_prop and the opening of
   the dynarr typedef are on lines not visible in this listing.  The
   struct has at least a Lisp_Object member `sym', which
   codesys_prop_description_1 refers to by offset. */
250 typedef struct codesys_prop codesys_prop;
259 Dynarr_declare (codesys_prop);
260 } codesys_prop_dynarr;
/* Description tables for codesys_prop and for the dynarr that holds
   them.  The element description marks the `sym' member as a Lisp
   object.  The dynarr description is built with XD_DYNARR_DESC from
   the dynarr type and the element description.
   NOTE(review): terminating entries and closing braces of all four
   initializers are on lines not visible in this listing. */
262 static const struct lrecord_description codesys_prop_description_1[] = {
263 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
267 static const struct struct_description codesys_prop_description = {
268 sizeof (codesys_prop),
269 codesys_prop_description_1
272 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
273 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
277 static const struct struct_description codesys_prop_dynarr_description = {
278 sizeof (codesys_prop_dynarr),
279 codesys_prop_dynarr_description_1
/* The module's single codesys_prop dynarr, followed by the enum used
   with it.
   NOTE(review): only the CODESYS_PROP_ISO2022 enumerator is visible;
   the braces and the other enumerators are on lines not shown in this
   listing. */
282 codesys_prop_dynarr *the_codesys_prop_dynarr;
284 enum codesys_prop_enum
287 CODESYS_PROP_ISO2022,
292 /************************************************************************/
293 /* Coding system functions */
294 /************************************************************************/
/* Forward declarations of the mark, print and finalize methods that
   are handed to DEFINE_LRECORD_IMPLEMENTATION for the "coding-system"
   type. */
296 static Lisp_Object mark_coding_system (Lisp_Object);
297 static void print_coding_system (Lisp_Object, Lisp_Object, int);
298 static void finalize_coding_system (void *header, int for_disksave);
/* Description tables for charset_conversion_spec (two Lisp_Object
   members, from_charset and to_charset) and for the dynarr of such
   specs.  coding_system_description refers to ccsd_description for
   the iso2022.input_conv and iso2022.output_conv pointers.
   NOTE(review): the second member of each struct_description
   initializer, the terminating entries and the closing braces are on
   lines not visible in this listing. */
301 static const struct lrecord_description ccs_description_1[] = {
302 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
303 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
307 static const struct struct_description ccs_description = {
308 sizeof (charset_conversion_spec),
312 static const struct lrecord_description ccsd_description_1[] = {
313 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
317 static const struct struct_description ccsd_description = {
318 sizeof (charset_conversion_spec_dynarr),
/* Description of Lisp_Coding_System.  It lists, by offset, the
   Lisp_Object members (name, doc string, mnemonic, the two conversion
   hooks, the three eol variants, the CCL decode/encode programs and
   the ccs priority list), the four-element iso2022.initial_charset
   array, and the two charset-conversion dynarr pointers.
   NOTE(review): preprocessor conditionals around the iso2022, ccl and
   ccs_priority_list entries, the terminating entry and the closing
   brace are on lines not visible in this listing. */
323 static const struct lrecord_description coding_system_description[] = {
324 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
325 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
326 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
327 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
330 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
331 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
333 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
334 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
335 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
/* Defines the "coding-system" lrecord type: mark_coding_system,
   print_coding_system and finalize_coding_system are its methods and
   coding_system_description is its description table.
   NOTE(review): the meaning of the two 0 arguments must be checked
   against the macro definition, and the final argument line of this
   invocation is not visible in this listing. */
345 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
346 mark_coding_system, print_coding_system,
347 finalize_coding_system,
348 0, 0, coding_system_description,
/* Mark method for coding systems.  Marks every Lisp object reachable
   from OBJ: name, doc string, mnemonic, the three eol variants, the
   type-specific members, the pre-write conversion and the ccs priority
   list.  Returns the post-read conversion object instead of marking
   it.
   NOTE(review): the return type, braces, the declaration of `i', most
   case labels and any #ifdef guards are on lines not visible in this
   listing. */
352 mark_coding_system (Lisp_Object obj)
354 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
/* Members common to every coding-system type. */
356 mark_object (CODING_SYSTEM_NAME (codesys));
357 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
358 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
359 mark_object (CODING_SYSTEM_EOL_LF (codesys));
360 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
361 mark_object (CODING_SYSTEM_EOL_CR (codesys));
/* Type-specific members.  For ISO2022 these are the four initial
   charsets plus both charsets of every input and output conversion
   spec; a conversion dynarr that was never created (null pointer) is
   skipped. */
363 switch (CODING_SYSTEM_TYPE (codesys))
367 case CODESYS_ISO2022:
368 for (i = 0; i < 4; i++)
369 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
370 if (codesys->iso2022.input_conv)
372 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
374 struct charset_conversion_spec *ccs =
375 Dynarr_atp (codesys->iso2022.input_conv, i);
376 mark_object (ccs->from_charset);
377 mark_object (ccs->to_charset);
380 if (codesys->iso2022.output_conv)
382 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
384 struct charset_conversion_spec *ccs =
385 Dynarr_atp (codesys->iso2022.output_conv, i);
386 mark_object (ccs->from_charset);
387 mark_object (ccs->to_charset);
/* NOTE(review): the case labels that select the next two groups
   (initial charsets 0 and 1, then the CCL programs) are on lines not
   visible in this listing. */
394 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
395 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
400 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
401 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
408 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
410 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
/* Returned rather than marked here.
   NOTE(review): presumably the caller marks the object a mark method
   returns -- confirm against the lrecord mark-method contract. */
412 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding systems.  Writes "#<coding_system ", the
   coding system's name (via print_internal) and ">" to PRINTCHARFUN.
   The error () call reports that the object has no readable printed
   form.
   NOTE(review): the return type, the third parameter, the condition
   guarding the error () call and that call's argument are on lines
   not visible in this listing. */
416 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
419 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
421 error ("printing unreadable object #<coding_system 0x%x>",
424 write_c_string ("#<coding_system ", printcharfun);
425 print_internal (c->name, printcharfun, 1);
426 write_c_string (">", printcharfun);
/* Finalize method for coding systems.  HEADER is the
   Lisp_Coding_System being finalized.  Nothing is done when
   FOR_DISKSAVE is non-zero.  Otherwise, for an ISO2022 coding system,
   the input and output charset-conversion dynarrs are freed if
   present, and each pointer is reset to 0 so that it is not freed
   again.
   NOTE(review): the return type, braces, other case labels and any
   #ifdef guards are on lines not visible in this listing. */
430 finalize_coding_system (void *header, int for_disksave)
432 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
433 /* Since coding systems never go away, this function is not
434 necessary. But it would be necessary if we changed things
435 so that coding systems could go away. */
436 if (!for_disksave) /* see comment in lstream.c */
438 switch (CODING_SYSTEM_TYPE (c))
441 case CODESYS_ISO2022:
442 if (c->iso2022.input_conv)
444 Dynarr_free (c->iso2022.input_conv);
445 c->iso2022.input_conv = 0;
447 if (c->iso2022.output_conv)
449 Dynarr_free (c->iso2022.output_conv);
450 c->iso2022.output_conv = 0;
/* Translate the Lisp value SYMBOL into an eol type:
     nil   -> EOL_AUTODETECT
     Qlf   -> EOL_LF
     Qcrlf -> EOL_CRLF
     Qcr   -> EOL_CR
   SYMBOL is first checked with CHECK_SYMBOL.  Any other symbol signals
   "Unrecognized eol type" with SYMBOL as the error datum.
   NOTE(review): the return-type line and braces are not visible in
   this listing. */
461 symbol_to_eol_type (Lisp_Object symbol)
463 CHECK_SYMBOL (symbol);
464 if (NILP (symbol)) return EOL_AUTODETECT;
465 if (EQ (symbol, Qlf)) return EOL_LF;
466 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
467 if (EQ (symbol, Qcr)) return EOL_CR;
469 signal_simple_error ("Unrecognized eol type", symbol);
470 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: EOL_LF, EOL_CRLF and EOL_CR map to
   Qlf, Qcrlf and Qcr, and EOL_AUTODETECT maps to Qnil.
   NOTE(review): the return type, the switch statement itself and
   whatever handles values outside these four cases are on lines not
   visible in this listing. */
474 eol_type_to_symbol (eol_type_t type)
479 case EOL_LF: return Qlf;
480 case EOL_CRLF: return Qcrlf;
481 case EOL_CR: return Qcr;
482 case EOL_AUTODETECT: return Qnil;
/* Create the three end-of-line variants of CODESYS.  For each of the
   suffixes "-unix", "-dos" and "-mac" the coding system is copied
   under the name NAME<suffix> (Fcopy_coding_system), the copy's eol
   type is set to EOL_LF, EOL_CRLF or EOL_CR respectively, and the
   copy is stored in the matching eol slot of CODESYS.  Where CODESYS
   has a string mnemonic, the copy's mnemonic is the original followed
   by "", ":T" or ":t".
   NOTE(review): the return type, braces, the declaration of `mlen'
   and original lines 514 and 519 inside the macro are not visible in
   this listing. */
487 setup_eol_coding_systems (Lisp_Coding_System *codesys)
489 Lisp_Object codesys_obj;
490 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* 7 spare bytes: the longest suffix appended below is "-unix"
   (5 characters) plus the terminating NUL. */
491 char *codesys_name = (char *) alloca (len + 7);
493 char *codesys_mnemonic=0;
495 Lisp_Object codesys_name_sym, sub_codesys_obj;
499 XSETCODING_SYSTEM (codesys_obj, codesys);
501 memcpy (codesys_name,
502 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
/* The mnemonic buffer is only allocated when the mnemonic is a
   string; otherwise codesys_mnemonic stays 0. */
504 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
506 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
507 codesys_mnemonic = (char *) alloca (mlen + 7);
508 memcpy (codesys_mnemonic,
509 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
/* Builds one variant.  strcpy overwrites the tail of the shared name
   and mnemonic buffers at offsets len and mlen, so the buffers are
   reused for all three invocations.
   NOTE(review): the two mnemonic steps dereference codesys_mnemonic,
   which is 0 when there is no string mnemonic; the lines not visible
   here (514 and 519) presumably guard them -- confirm. */
512 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
513 strcpy (codesys_name + len, "-" op_sys); \
515 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
516 codesys_name_sym = intern (codesys_name); \
517 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
518 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
520 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
521 build_string (codesys_mnemonic); \
522 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
525 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
526 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
527 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp primitive `coding-system-p' (exactly one argument).  Returns Qt
   when OBJECT satisfies CODING_SYSTEMP and Qnil otherwise; a symbol
   that merely names a coding system therefore yields Qnil.
   NOTE(review): the end of the docstring, the argument list and the
   braces are on lines not visible in this listing. */
530 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
531 Return t if OBJECT is a coding system.
532 A coding system is an object that defines how text containing multiple
533 character sets is encoded into a stream of (typically 8-bit) bytes.
534 The coding system is used to decode the stream into a series of
535 characters (which may be from multiple charsets) when the text is read
536 from a file or process, and is used to encode the text back into the
537 same format when it is written out to a file or process.
539 For example, many ISO2022-compliant coding systems (such as Compound
540 Text, which is used for inter-client data under the X Window System)
541 use escape sequences to switch between different charsets -- Japanese
542 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
543 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
544 `make-coding-system' for more information.
546 Coding systems are normally identified using a symbol, and the
547 symbol is accepted in place of the actual coding system object whenever
548 a coding system is called for. (This is similar to how faces work.)
552 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* Lisp primitive `find-coding-system' (exactly one argument).
   In the visible code:
     - nil is replaced by Qbinary before the lookup (the docstring
       does not mention this);
     - a coding-system object is returned unchanged;
     - anything else must pass CHECK_SYMBOL and is looked up in
       Vcoding_system_hash_table with Fgethash, defaulting to Qnil.
   The lookup result is returned when it is a coding system or nil.
   NOTE(review): what happens when the table yields some other value
   is on lines not visible in this listing (presumably the lookup is
   repeated to follow aliases -- confirm in the full file).  The
   docstring terminator and braces are not visible either. */
555 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
556 Retrieve the coding system of the given name.
558 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
559 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
560 If there is no such coding system, nil is returned. Otherwise the
561 associated coding system object is returned.
563 (coding_system_or_name))
565 if (NILP (coding_system_or_name))
566 coding_system_or_name = Qbinary;
567 else if (CODING_SYSTEMP (coding_system_or_name))
568 return coding_system_or_name;
570 CHECK_SYMBOL (coding_system_or_name);
574 coding_system_or_name =
575 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
577 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
578 return coding_system_or_name;
/* Lisp primitive `get-coding-system' (exactly one argument).  Calls
   Ffind_coding_system on NAME and signals "No such coding system"
   with NAME as the error datum when that returns nil; otherwise it
   returns the coding system found.
   NOTE(review): the docstring terminator, the argument list and the
   braces are on lines not visible in this listing. */
582 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
583 Retrieve the coding system of the given name.
584 Same as `find-coding-system' except that if there is no such
585 coding system, an error is signaled instead of returning nil.
589 Lisp_Object coding_system = Ffind_coding_system (name);
591 if (NILP (coding_system))
592 signal_simple_error ("No such coding system", name);
593 return coding_system;
596 /* We store the coding systems in hash tables with the names as the key and the
597 actual coding system object as the value. Occasionally we need to use them
598 in a list format. These routines provide us with that. */
/* Closure passed through elisp_maphash to
   add_coding_system_to_list_mapper.  It holds a pointer to the list
   variable that the mapper conses onto.
   NOTE(review): the braces are not visible in this listing. */
599 struct coding_system_list_closure
601 Lisp_Object *coding_system_list;
/* elisp_maphash callback used by Fcoding_system_list.  Conses KEY onto
   the front of the list that the closure points to.  VALUE is not used
   in the visible lines.
   NOTE(review): the return type, braces and return statement are on
   lines not visible in this listing. */
605 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
606 void *coding_system_list_closure)
608 /* This function can GC */
609 struct coding_system_list_closure *cscl =
610 (struct coding_system_list_closure *) coding_system_list_closure;
611 Lisp_Object *coding_system_list = cscl->coding_system_list;
613 *coding_system_list = Fcons (key, *coding_system_list);
/* Lisp primitive `coding-system-list' (no arguments).  Maps
   add_coding_system_to_list_mapper over Vcoding_system_hash_table,
   which collects every key of the table into a fresh list, and
   returns that list.  The list variable is protected with GCPRO1
   because the mapper allocates conses.
   NOTE(review): the docstring terminator, the gcpro declaration, the
   matching unprotect and the braces are on lines not visible in this
   listing. */
617 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
618 Return a list of the names of all defined coding systems.
622 Lisp_Object coding_system_list = Qnil;
624 struct coding_system_list_closure coding_system_list_closure;
626 GCPRO1 (coding_system_list);
627 coding_system_list_closure.coding_system_list = &coding_system_list;
628 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
629 &coding_system_list_closure);
632 return coding_system_list;
/* Lisp primitive `coding-system-name' (exactly one argument).
   Resolves the argument with Fget_coding_system, which signals an
   error for an unknown coding system, and returns the name slot of
   the result.
   NOTE(review): the docstring terminator, the argument list and the
   braces are on lines not visible in this listing. */
635 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
636 Return the name of the given coding system.
640 coding_system = Fget_coding_system (coding_system);
641 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a Lisp_Coding_System record of type TYPE named NAME.
   The record is zeroed, the conversion hooks, the eol variants, the
   mnemonic and the ccs priority list are set to Qnil, and the eol
   type is set to EOL_AUTODETECT.  Type-specific members are then
   initialized for ISO2022, UTF8, BIG5 and CCL.
   NOTE(review): braces, the declaration of `i', any #ifdef guards,
   the right-hand sides of most UTF8/BIG5 assignments and the return
   statement are on lines not visible in this listing. */
644 static Lisp_Coding_System *
645 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
647 Lisp_Coding_System *codesys =
648 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
/* Zero first; the Lisp_Object slots are then set to Qnil explicitly
   rather than being left as zero bits. */
650 zero_lcrecord (codesys);
651 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
652 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
653 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
654 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
655 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
656 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
657 CODING_SYSTEM_TYPE (codesys) = type;
658 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
661 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
/* ISO2022: no charset is designated to G0..G3 initially. */
663 if (type == CODESYS_ISO2022)
666 for (i = 0; i < 4; i++)
667 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
/* UTF8 and BIG5 reuse the iso2022 initial-charset slots.
   NOTE(review): of the values assigned below only
   Vcharset_chinese_big5 (BIG5, slot 1) is visible; the other
   right-hand sides are on lines not shown in this listing. */
670 if (type == CODESYS_UTF8)
672 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
674 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
676 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
678 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
681 else if (type == CODESYS_BIG5)
683 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
685 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
686 = Vcharset_chinese_big5;
687 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
689 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
693 else if (type == CODESYS_CCL)
695 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
696 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
699 CODING_SYSTEM_NAME (codesys) = name;
/* SPEC_LIST is walked with EXTERNAL_LIST_LOOP.  Every element must be
   a two-element list (FROM TO); both entries are resolved with
   Fget_charset, and one charset_conversion_spec is appended to
   STORE_HERE per element, in list order.  Signals "Invalid charset
   conversion spec" for a malformed element and "Attempted conversion
   between different charset types" when the two charsets differ in
   XCHARSET_CHARS or XCHARSET_DIMENSION.
   NOTE(review): the return type, braces, the declaration of `rest'
   and the arguments of signal_simple_error_2 are on lines not visible
   in this listing. */
705 /* Given a list of charset conversion specs as specified in a Lisp
706 program, parse it into STORE_HERE. */
709 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
710 Lisp_Object spec_list)
714 EXTERNAL_LIST_LOOP (rest, spec_list)
716 Lisp_Object car = XCAR (rest);
717 Lisp_Object from, to;
718 struct charset_conversion_spec spec;
/* Accept exactly (FROM TO): a cons whose cdr is a cons whose cdr is
   nil. */
720 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
721 signal_simple_error ("Invalid charset conversion spec", car);
722 from = Fget_charset (XCAR (car));
723 to = Fget_charset (XCAR (XCDR (car)));
/* Both charsets must agree in XCHARSET_CHARS and in
   XCHARSET_DIMENSION. */
724 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
725 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
726 signal_simple_error_2
727 ("Attempted conversion between different charset types",
729 spec.from_charset = from;
730 spec.to_charset = to;
732 Dynarr_add (store_here, spec);
/* Each dynarr element becomes a two-element list
   (FROM-CHARSET TO-CHARSET).  The elements are consed onto the front
   of the result, so the accumulated list is in reverse order and
   Fnreverse restores the order of the dynarr.
   NOTE(review): the return type, braces, the declarations of `i' and
   `result' and the null check on LOAD_HERE that the existing comment
   describes are on lines not visible in this listing. */
736 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
737 specs, return the equivalent as the Lisp programmer would see it.
739 If LOAD_HERE is 0, return Qnil. */
742 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
749 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
751 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
752 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
755 return Fnreverse (result);
760 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
761 Register symbol NAME as a coding system.
763 TYPE describes the conversion method used and should be one of
766 Automatic conversion. XEmacs attempts to detect the coding system
769 No conversion. Use this for binary files and such. On output,
770 graphic characters that are not in ASCII or Latin-1 will be
771 replaced by a ?. (For a no-conversion-encoded buffer, these
772 characters will only be present if you explicitly insert them.)
774 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
776 ISO 10646 UCS-4 encoding.
778 ISO 10646 UTF-8 encoding.
780 Any ISO2022-compliant encoding. Among other things, this includes
781 JIS (the Japanese encoding commonly used for e-mail), EUC (the
782 standard Unix encoding for Japanese and other languages), and
783 Compound Text (the encoding used in X11). You can specify more
784 specific information about the conversion with the PROPS argument.
786 Big5 (the encoding commonly used for Taiwanese).
788 The conversion is performed using a user-written pseudo-code
789 program. CCL (Code Conversion Language) is the name of this
792 Write out or read in the raw contents of the memory representing
793 the buffer's text. This is primarily useful for debugging
794 purposes, and is only enabled when XEmacs has been compiled with
795 DEBUG_XEMACS defined (via the --debug configure option).
796 WARNING: Reading in a file using 'internal conversion can result
797 in an internal inconsistency in the memory representing a
798 buffer's text, which will produce unpredictable results and may
799 cause XEmacs to crash. Under normal circumstances you should
800 never use 'internal conversion.
802 DOC-STRING is a string describing the coding system.
804 PROPS is a property list, describing the specific nature of the
805 character set. Recognized properties are:
808 String to be displayed in the modeline when this coding system is
812 End-of-line conversion to be used. It should be one of
815 Automatically detect the end-of-line type (LF, CRLF,
816 or CR). Also generate subsidiary coding systems named
817 `NAME-unix', `NAME-dos', and `NAME-mac', that are
818 identical to this coding system but have an EOL-TYPE
819 value of 'lf, 'crlf, and 'cr, respectively.
821 The end of a line is marked externally using ASCII LF.
822 Since this is also the way that XEmacs represents an
823 end-of-line internally, specifying this option results
824 in no end-of-line conversion. This is the standard
825 format for Unix text files.
827 The end of a line is marked externally using ASCII
828 CRLF. This is the standard format for MS-DOS text
831 The end of a line is marked externally using ASCII CR.
832 This is the standard format for Macintosh text files.
834 Automatically detect the end-of-line type but do not
835 generate subsidiary coding systems. (This value is
836 converted to nil when stored internally, and
837 `coding-system-property' will return nil.)
840 If non-nil, composition/decomposition for combining characters
843 'use-entity-reference
844 If non-nil, SGML style entity-reference is used for non-system-characters.
846 'post-read-conversion
847 Function called after a file has been read in, to perform the
848 decoding. Called with two arguments, START and END, denoting
849 a region of the current buffer to be decoded.
851 'pre-write-conversion
852 Function called before a file is written out, to perform the
853 encoding. Called with two arguments, START and END, denoting
854 a region of the current buffer to be encoded.
857 The following additional properties are recognized if TYPE is 'iso2022:
863 The character set initially designated to the G0 - G3 registers.
864 The value should be one of
866 -- A charset object (designate that character set)
867 -- nil (do not ever use this register)
868 -- t (no character set is initially designated to
869 the register, but may be later on; this automatically
870 sets the corresponding `force-g*-on-output' property)
876 If non-nil, send an explicit designation sequence on output before
877 using the specified register.
880 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
881 "ESC $ B" on output in place of the full designation sequences
882 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
885 If non-nil, don't designate ASCII to G0 at each end of line on output.
886 Setting this to non-nil also suppresses other state-resetting that
887 normally happens at the end of a line.
890 If non-nil, don't designate ASCII to G0 before control chars on output.
893 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
897 If non-nil, use locking-shift (SO/SI) instead of single-shift
898 or designation by escape sequence.
901 If non-nil, don't use ISO6429's direction specification.
904 If non-nil, literal control characters that are the same as
905 the beginning of a recognized ISO2022 or ISO6429 escape sequence
906 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
907 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
908 so that they can be properly distinguished from an escape sequence.
909 (Note that doing this results in a non-portable encoding.) This
910 encoding flag is used for byte-compiled files. Note that ESC
911 is a good choice for a quoting character because there are no
912 escape sequences whose second byte is a character from the Control-0
913 or Control-1 character sets; this is explicitly disallowed by the
916 'input-charset-conversion
917 A list of conversion specifications, specifying conversion of
918 characters in one charset to another when decoding is performed.
919 Each specification is a list of two elements: the source charset,
920 and the destination charset.
922 'output-charset-conversion
923 A list of conversion specifications, specifying conversion of
924 characters in one charset to another when encoding is performed.
925 The form of each specification is the same as for
926 'input-charset-conversion.
929 The following additional properties are recognized (and required)
933 CCL program used for decoding (converting to internal format).
936 CCL program used for encoding (converting to external format).
938 (name, type, doc_string, props))
940 Lisp_Coding_System *codesys;
941 enum coding_system_type ty;
942 int need_to_setup_eol_systems = 1;
944 /* Convert type to constant */
945 if (NILP (type) || EQ (type, Qundecided))
946 { ty = CODESYS_AUTODETECT; }
948 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
949 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
950 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
951 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
952 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
953 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
955 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
957 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
960 signal_simple_error ("Invalid coding system type", type);
964 codesys = allocate_coding_system (ty, name);
966 if (NILP (doc_string))
967 doc_string = build_string ("");
969 CHECK_STRING (doc_string);
970 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
973 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
975 if (EQ (key, Qmnemonic))
978 CHECK_STRING (value);
979 CODING_SYSTEM_MNEMONIC (codesys) = value;
982 else if (EQ (key, Qeol_type))
984 need_to_setup_eol_systems = NILP (value);
987 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
990 else if (EQ (key, Qpost_read_conversion))
991 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
992 else if (EQ (key, Qpre_write_conversion))
993 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
995 else if (EQ (key, Qdisable_composition))
996 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
997 else if (EQ (key, Quse_entity_reference))
998 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1001 else if (ty == CODESYS_ISO2022)
1003 #define FROB_INITIAL_CHARSET(charset_num) \
1004 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1005 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1007 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1008 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1009 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1010 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1012 #define FROB_FORCE_CHARSET(charset_num) \
1013 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1015 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1016 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1017 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1018 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1020 #define FROB_BOOLEAN_PROPERTY(prop) \
1021 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1023 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1024 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1025 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1026 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1027 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1028 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1029 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1031 else if (EQ (key, Qinput_charset_conversion))
1033 codesys->iso2022.input_conv =
1034 Dynarr_new (charset_conversion_spec);
1035 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1038 else if (EQ (key, Qoutput_charset_conversion))
1040 codesys->iso2022.output_conv =
1041 Dynarr_new (charset_conversion_spec);
1042 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1046 signal_simple_error ("Unrecognized property", key);
1049 else if (ty == CODESYS_UTF8)
1051 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1052 else if (EQ (key, Qcharset_g1))
1053 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1054 else if (EQ (key, Qcharset_g2))
1055 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1057 signal_simple_error ("Unrecognized property", key);
1059 else if (ty == CODESYS_BIG5)
1061 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1062 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1064 signal_simple_error ("Unrecognized property", key);
1067 else if (EQ (type, Qccl))
1070 struct ccl_program test_ccl;
1073 /* Check key first. */
1074 if (EQ (key, Qdecode))
1075 suffix = "-ccl-decode";
1076 else if (EQ (key, Qencode))
1077 suffix = "-ccl-encode";
1079 signal_simple_error ("Unrecognized property", key);
1081 /* If value is vector, register it as a ccl program
1082 associated with a newly created symbol for
1083 backward compatibility. */
1084 if (VECTORP (value))
1086 sym = Fintern (concat2 (Fsymbol_name (name),
1087 build_string (suffix)),
1089 Fregister_ccl_program (sym, value);
1093 CHECK_SYMBOL (value);
1096 /* check if the given ccl programs are valid. */
1097 if (setup_ccl_program (&test_ccl, sym) < 0)
1098 signal_simple_error ("Invalid CCL program", value);
1100 if (EQ (key, Qdecode))
1101 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1102 else if (EQ (key, Qencode))
1103 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1108 signal_simple_error ("Unrecognized property", key);
1112 if (need_to_setup_eol_systems)
1113 setup_eol_coding_systems (codesys);
1116 Lisp_Object codesys_obj;
1117 XSETCODING_SYSTEM (codesys_obj, codesys);
1118 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* Lisp primitive `copy-coding-system'.
   OLD-CODING-SYSTEM is resolved with Fget_coding_system and NEW-NAME is
   looked up with Ffind_coding_system.  When that lookup yields nil, a
   coding system of the same type as the old one is allocated and
   registered under NEW-NAME in Vcoding_system_hash_table.  Everything
   after the `header' member is then memcpy'd from the old object into
   the new one, the `name' member is reset to NEW-NAME, and the new
   coding system object is returned. */
1123 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1124 Copy OLD-CODING-SYSTEM to NEW-NAME.
1125 If NEW-NAME does not name an existing coding system, a new one will
1128 (old_coding_system, new_name))
1130 Lisp_Object new_coding_system;
1131 old_coding_system = Fget_coding_system (old_coding_system);
1132 new_coding_system = Ffind_coding_system (new_name);
1133 if (NILP (new_coding_system))
1135 XSETCODING_SYSTEM (new_coding_system,
1136 allocate_coding_system
1137 (XCODING_SYSTEM_TYPE (old_coding_system),
1139 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1143 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1144 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1145 memcpy (((char *) to ) + sizeof (to->header),
1146 ((char *) from) + sizeof (from->header),
1147 sizeof (*from) - sizeof (from->header));
1148 to->name = new_name;
1150 return new_coding_system;
/* Lisp primitive `coding-system-canonical-name-p'.
   Looks OBJECT up in Vcoding_system_hash_table (not-found default Qnil)
   and tests whether the stored value is itself a coding system object
   (CODING_SYSTEMP), as opposed to the symbol stored for an alias. */
1153 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1154 Return t if OBJECT names a coding system, and is not a coding system alias.
1158 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
/* Lisp primitive `coding-system-alias-p'.
   Looks OBJECT up in Vcoding_system_hash_table and tests whether the
   stored value is a symbol (SYMBOLP).  The not-found default is Qzero
   rather than Qnil -- presumably because nil is itself a symbol and
   would make a missing entry look like an alias; confirm. */
1162 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1163 Return t if OBJECT is a coding system alias.
1164 All coding system aliases are created by `define-coding-system-alias'.
1168 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
/* Lisp primitive `coding-system-aliasee'.
   Fetches the value stored for ALIAS in Vcoding_system_hash_table and
   checks that it is a symbol; otherwise the error "Symbol is not a
   coding system alias" is signalled with ALIAS as its datum.  The
   trailing `return Qnil' only exists to silence the compiler. */
1172 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1173 Return the coding-system symbol for which symbol ALIAS is an alias.
1177 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1178 if (SYMBOLP (aliasee))
1181 signal_simple_error ("Symbol is not a coding system alias", alias);
1182 return Qnil; /* To keep the compiler happy */
/* Intern and return the symbol whose name is the name of SYMBOL
   followed by ASCII_STRING (e.g. "-unix", "-dos", "-mac" as used by
   the alias code in this file). */
1186 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1188 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1192 /* A maphash function, for removing dangling coding system aliases. */
/* An entry is "dangling" when its value ALIASEE is a symbol that has no
   entry of its own in Vcoding_system_hash_table.  For each such entry
   the int pointed to by DANGLING_ALIASES is incremented, so the caller
   can tell whether another pass over the table is needed. */
1194 dangling_coding_system_alias_p (Lisp_Object alias,
1195 Lisp_Object aliasee,
1196 void *dangling_aliases)
1198 if (SYMBOLP (aliasee)
1199 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1201 (*(int *) dangling_aliases)++;
/* Lisp primitive `define-coding-system-alias'.

   Overview of the implementation:
   - ALIAS must be a symbol and must not be the canonical name of a
     coding system; otherwise an error is signalled.
   - Undefining (per the docstring, when ALIASEE is nil): ALIAS is
     removed from Vcoding_system_hash_table; if ALIAS-unix, ALIAS-dos
     and ALIAS-mac are all aliases they are undefined recursively; then
     elisp_map_remhash is run with dangling_coding_system_alias_p
     repeatedly until a pass finds no dangling alias.
   - Defining: a coding system object given as ALIASEE is replaced by
     its name; Fget_coding_system validates that ALIASEE names a coding
     system; the chain of existing aliases starting at ALIASEE is
     followed through the hash table to make sure it never reaches
     ALIAS (which would create a loop); the mapping ALIAS -> ALIASEE is
     stored; and for each of the suffixes "-unix", "-dos", "-mac" a
     subsidiary alias is defined recursively when the corresponding
     ALIASEE subsidiary exists. */
1208 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1209 Define symbol ALIAS as an alias for coding system ALIASEE.
1211 You can use this function to redefine an alias that has already been defined,
1212 but you cannot redefine a name which is the canonical name for a coding system.
1213 \(a canonical name of a coding system is what is returned when you call
1214 `coding-system-name' on a coding system).
1216 ALIASEE itself can be an alias, which allows you to define nested aliases.
1218 You are forbidden, however, from creating alias loops or `dangling' aliases.
1219 These will be detected, and an error will be signaled if you attempt to do so.
1221 If ALIASEE is nil, then ALIAS will simply be undefined.
1223 See also `coding-system-alias-p', `coding-system-aliasee',
1224 and `coding-system-canonical-name-p'.
1228 Lisp_Object real_coding_system, probe;
1230 CHECK_SYMBOL (alias);
1232 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1234 ("Symbol is the canonical name of a coding system and cannot be redefined",
1239 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1240 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1241 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1243 Fremhash (alias, Vcoding_system_hash_table);
1245 /* Undefine subsidiary aliases,
1246 presumably created by a previous call to this function */
1247 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1248 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1249 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1251 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1252 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1253 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1256 /* Undefine dangling coding system aliases. */
1258 int dangling_aliases;
1261 dangling_aliases = 0;
1262 elisp_map_remhash (dangling_coding_system_alias_p,
1263 Vcoding_system_hash_table,
1265 } while (dangling_aliases > 0);
1271 if (CODING_SYSTEMP (aliasee))
1272 aliasee = XCODING_SYSTEM_NAME (aliasee);
1274 /* Checks that aliasee names a coding-system */
1275 real_coding_system = Fget_coding_system (aliasee);
1277 /* Check for coding system alias loops */
1278 if (EQ (alias, aliasee))
1279 alias_loop: signal_simple_error_2
1280 ("Attempt to create a coding system alias loop", alias, aliasee);
1282 for (probe = aliasee;
1284 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1286 if (EQ (probe, alias))
1290 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1292 /* Set up aliases for subsidiaries.
1293 #### There must be a better way to handle subsidiary coding systems. */
1295 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1297 for (i = 0; i < countof (suffixes); i++)
1299 Lisp_Object alias_subsidiary =
1300 append_suffix_to_symbol (alias, suffixes[i]);
1301 Lisp_Object aliasee_subsidiary =
1302 append_suffix_to_symbol (aliasee, suffixes[i]);
1304 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1305 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1308 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1309 but it doesn't look intentional, so I'd rather return something
1310 meaningful or nothing at all. */
/* Return the variant of CODING_SYSTEM that uses end-of-line convention
   TYPE.  CODING_SYSTEM must be a coding system object (it is unwrapped
   with XCODING_SYSTEM).  The per-EOL variants are read from the
   CODING_SYSTEM_EOL_LF / _CR / _CRLF slots; when the selected slot is
   nil, CODING_SYSTEM itself is returned.  An unknown TYPE aborts. */
1315 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1317 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1318 Lisp_Object new_coding_system;
/* A coding system whose own EOL type is already fixed is returned
   unchanged, whatever TYPE was requested. */
1320 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1321 return coding_system;
1325 case EOL_AUTODETECT: return coding_system;
1326 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1327 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1328 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1329 default: abort (); return Qnil;
1332 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* Lisp primitive `subsidiary-coding-system'.
   Resolves CODING-SYSTEM with Fget_coding_system, converts the EOL-TYPE
   symbol with symbol_to_eol_type, and delegates to
   subsidiary_coding_system. */
1335 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1336 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1338 (coding_system, eol_type))
1340 coding_system = Fget_coding_system (coding_system);
1342 return subsidiary_coding_system (coding_system,
1343 symbol_to_eol_type (eol_type));
1347 /************************************************************************/
1348 /* Coding system accessors */
1349 /************************************************************************/
/* Lisp primitive `coding-system-doc-string'.
   Resolves CODING-SYSTEM with Fget_coding_system and returns the value
   of its doc-string slot. */
1351 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1352 Return the doc string for CODING-SYSTEM.
1356 coding_system = Fget_coding_system (coding_system);
1357 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* Lisp primitive `coding-system-type'.
   Resolves CODING-SYSTEM with Fget_coding_system and maps its internal
   CODESYS_* type code to the corresponding Lisp symbol
   (e.g. CODESYS_AUTODETECT -> `undecided', CODESYS_UTF8 -> `utf8'). */
1360 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1361 Return the type of CODING-SYSTEM.
1365 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1368 case CODESYS_AUTODETECT: return Qundecided;
1370 case CODESYS_SHIFT_JIS: return Qshift_jis;
1371 case CODESYS_ISO2022: return Qiso2022;
1372 case CODESYS_BIG5: return Qbig5;
1373 case CODESYS_UCS4: return Qucs4;
1374 case CODESYS_UTF8: return Qutf8;
1375 case CODESYS_CCL: return Qccl;
1377 case CODESYS_NO_CONVERSION: return Qno_conversion;
1379 case CODESYS_INTERNAL: return Qinternal;
/* Return the name of the initial charset stored for register GNUM of
   CODING_SYSTEM, or Qnil when the stored value is not a charset object.
   GNUM is passed straight to XCODING_SYSTEM_ISO2022_INITIAL_CHARSET and
   is not range-checked here. */
1386 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1389 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1391 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp primitive `coding-system-charset'.
   Resolves CODING-SYSTEM with Fget_coding_system and returns
   coding_system_charset for the integer value of GNUM (XINT). */
1394 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1395 Return initial charset of CODING-SYSTEM designated to GNUM.
1398 (coding_system, gnum))
1400 coding_system = Fget_coding_system (coding_system);
1403 return coding_system_charset (coding_system, XINT (gnum));
/* Lisp primitive `coding-system-property'.

   Two stages:
   1. Validation.  CODING-SYSTEM is resolved with Fget_coding_system and
      PROP must be a symbol.  PROP is searched for in
      the_codesys_prop_dynarr; an entry tagged CODESYS_PROP_ISO2022 or
      CODESYS_PROP_CCL is rejected with an error unless the coding
      system has the matching type.  "Unrecognized property" is
      signalled from this stage (presumably when no table entry
      matched -- confirm).
   2. Lookup.  Properties common to all coding systems (name, type,
      doc-string, mnemonic, eol-type, the eol-* subsidiaries, the
      post-read / pre-write conversions, disable-composition,
      use-entity-reference) are tried first, then the ISO2022-only
      properties, then the CCL `decode' / `encode' programs.  Boolean
      slots are returned as Qt / Qnil. */
1407 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1408 Return the PROP property of CODING-SYSTEM.
1410 (coding_system, prop))
1413 enum coding_system_type type;
1415 coding_system = Fget_coding_system (coding_system);
1416 CHECK_SYMBOL (prop);
1417 type = XCODING_SYSTEM_TYPE (coding_system);
1419 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1420 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1423 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1425 case CODESYS_PROP_ALL_OK:
1428 case CODESYS_PROP_ISO2022:
1429 if (type != CODESYS_ISO2022)
1431 ("Property only valid in ISO2022 coding systems",
1435 case CODESYS_PROP_CCL:
1436 if (type != CODESYS_CCL)
1438 ("Property only valid in CCL coding systems",
1448 signal_simple_error ("Unrecognized property", prop);
1450 if (EQ (prop, Qname))
1451 return XCODING_SYSTEM_NAME (coding_system);
1452 else if (EQ (prop, Qtype))
1453 return Fcoding_system_type (coding_system);
1454 else if (EQ (prop, Qdoc_string))
1455 return XCODING_SYSTEM_DOC_STRING (coding_system);
1456 else if (EQ (prop, Qmnemonic))
1457 return XCODING_SYSTEM_MNEMONIC (coding_system);
1458 else if (EQ (prop, Qeol_type))
1459 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1460 else if (EQ (prop, Qeol_lf))
1461 return XCODING_SYSTEM_EOL_LF (coding_system);
1462 else if (EQ (prop, Qeol_crlf))
1463 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1464 else if (EQ (prop, Qeol_cr))
1465 return XCODING_SYSTEM_EOL_CR (coding_system);
1466 else if (EQ (prop, Qpost_read_conversion))
1467 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1468 else if (EQ (prop, Qpre_write_conversion))
1469 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1472 else if (EQ (prop, Qdisable_composition))
1473 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1474 else if (EQ (prop, Quse_entity_reference))
1475 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1477 else if (type == CODESYS_ISO2022)
1479 if (EQ (prop, Qcharset_g0))
1480 return coding_system_charset (coding_system, 0);
1481 else if (EQ (prop, Qcharset_g1))
1482 return coding_system_charset (coding_system, 1);
1483 else if (EQ (prop, Qcharset_g2))
1484 return coding_system_charset (coding_system, 2);
1485 else if (EQ (prop, Qcharset_g3))
1486 return coding_system_charset (coding_system, 3);
1488 #define FORCE_CHARSET(charset_num) \
1489 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1490 (coding_system, charset_num) ? Qt : Qnil)
1492 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1493 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1494 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1495 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1497 #define LISP_BOOLEAN(prop) \
1498 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1500 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1501 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1502 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1503 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1504 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1505 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1506 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1508 else if (EQ (prop, Qinput_charset_conversion))
1510 unparse_charset_conversion_specs
1511 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1512 else if (EQ (prop, Qoutput_charset_conversion))
1514 unparse_charset_conversion_specs
1515 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1519 else if (type == CODESYS_CCL)
1521 if (EQ (prop, Qdecode))
1522 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1523 else if (EQ (prop, Qencode))
1524 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1532 return Qnil; /* not reached */
1536 /************************************************************************/
1537 /* Coding category functions */
1538 /************************************************************************/
/* Translate the Lisp symbol SYMBOL into a coding-category index by
   linear search of coding_category_symbol[].  SYMBOL must be a symbol;
   if it matches no category, "Unrecognized coding category" is
   signalled, so the final `return 0' is never executed. */
1541 decode_coding_category (Lisp_Object symbol)
1545 CHECK_SYMBOL (symbol);
1546 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1547 if (EQ (coding_category_symbol[i], symbol))
1550 signal_simple_error ("Unrecognized coding category", symbol);
1551 return 0; /* not reached */
/* Lisp primitive `coding-category-list'.
   Builds a fresh list of the symbols in coding_category_symbol[].  The
   array is walked from the last index down to 0 and each symbol is
   consed onto the front, so the list ends up in array order. */
1554 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1555 Return a list of all recognized coding categories.
1560 Lisp_Object list = Qnil;
1562 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1563 list = Fcons (coding_category_symbol[i], list);
/* Lisp primitive `set-coding-priority-list'.

   Works in three passes over a local category_to_priority[] table
   (indexed by category, value = priority rank, -1 = not yet assigned):
   1. Each category named in LIST receives the next rank in list order;
      a category that appears twice signals "Duplicate coding category
      in list".
   2. The current fcd->coding_category_by_priority[] order is walked and
      every category still unassigned receives the next rank, which
      preserves the previous relative order of unspecified categories.
   3. The table is inverted back into
      fcd->coding_category_by_priority[] (index = rank, value =
      category). */
1567 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1568 Change the priority order of the coding categories.
1569 LIST should be list of coding categories, in descending order of
1570 priority. Unspecified coding categories will be lower in priority
1571 than all specified ones, in the same relative order they were in
1576 int category_to_priority[CODING_CATEGORY_LAST];
1580 /* First generate a list that maps coding categories to priorities. */
1582 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1583 category_to_priority[i] = -1;
1585 /* Highest priority comes from the specified list. */
1587 EXTERNAL_LIST_LOOP (rest, list)
1589 int cat = decode_coding_category (XCAR (rest));
1591 if (category_to_priority[cat] >= 0)
1592 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1593 category_to_priority[cat] = i++;
1596 /* Now go through the existing categories by priority to retrieve
1597 the categories not yet specified and preserve their priority
1599 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1601 int cat = fcd->coding_category_by_priority[j];
1602 if (category_to_priority[cat] < 0)
1603 category_to_priority[cat] = i++;
1606 /* Now we need to construct the inverse of the mapping we just
1609 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1610 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1612 /* Phew! That was confusing. */
/* Lisp primitive `coding-priority-list'.
   Builds a fresh list of category symbols by walking
   fcd->coding_category_by_priority[] from the last (lowest-priority)
   slot to slot 0 and consing onto the front, so the highest-priority
   category comes first in the result. */
1616 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1617 Return a list of coding categories in descending order of priority.
1622 Lisp_Object list = Qnil;
1624 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1625 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
/* Lisp primitive `set-coding-category-system'.
   Decodes CODING-CATEGORY to its index (signalling an error for an
   unknown category), resolves CODING-SYSTEM with Fget_coding_system,
   and stores the resulting coding system object in
   fcd->coding_category_system[]. */
1630 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1631 Change the coding system associated with a coding category.
1633 (coding_category, coding_system))
1635 int cat = decode_coding_category (coding_category);
1637 coding_system = Fget_coding_system (coding_system);
1638 fcd->coding_category_system[cat] = coding_system;
/* Lisp primitive `coding-category-system'.
   Decodes CODING-CATEGORY to its index and reads the coding system
   object stored for it in fcd->coding_category_system[]; the value
   returned on the visible path is that coding system's name. */
1642 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1643 Return the coding system associated with a coding category.
1647 int cat = decode_coding_category (coding_category);
1648 Lisp_Object sys = fcd->coding_category_system[cat];
1651 return XCODING_SYSTEM_NAME (sys);
1656 /************************************************************************/
1657 /* Detecting the encoding of data */
1658 /************************************************************************/
/* State carried across successive calls to detect_coding_type () and
   detect_eol_type () while a piece of text is being examined.  Holds
   the EOL type determined so far plus per-detector sub-state (the
   detectors in this file read members such as st->iso2022.mask,
   st->shift_jis.mask, st->big5.mask, st->utf8.mask, st->ucs4.mask and
   st->eol.just_saw_cr). */
1660 struct detection_state
1662 eol_type_t eol_type;
1698 struct iso2022_decoder iso;
1700 int high_byte_count;
1701 unsigned int saw_single_shift:1;
/* Predicate on a control character C: selects the control codes that
   may plausibly occur in ordinary text (see the case labels below).
   detect_coding_type () treats any byte below 0x20 that fails this
   test as evidence of non-ASCII data. */
1714 acceptable_control_char_p (int c)
1718 /* Allow and ignore control characters that you might
1719 reasonably see in a text file */
1724 case 8: /* backspace */
1725 case 11: /* vertical tab */
1726 case 12: /* form feed */
1727 case 26: /* MS-DOS C-z junk */
1728 case 31: /* '^_' -- for info */
/* Return nonzero when MASK has zero or exactly one bit set.
   `mask & (mask - 1)' clears the lowest set bit of MASK, so the result
   is 0 precisely when there was at most one bit to clear. */
1736 mask_has_at_most_one_bit_p (int mask)
1738 /* Perhaps the only thing useful you learn from intensive Microsoft
1739 technical interviews */
1740 return (mask & (mask - 1)) == 0;
/* Examine the bytes at SRC for evidence of the end-of-line convention.
   Progress is remembered in ST between calls: st->eol.just_saw_cr
   records whether the previous byte was a CR, and st->eol.seen_anything
   records that at least one ordinary byte has been seen.  When the data
   runs out without a decision, EOL_AUTODETECT is returned. */
1744 detect_eol_type (struct detection_state *st, const Extbyte *src,
1749 unsigned char c = *(unsigned char *)src++;
1752 if (st->eol.just_saw_cr)
1754 else if (st->eol.seen_anything)
1757 else if (st->eol.just_saw_cr)
1760 st->eol.just_saw_cr = 1;
1762 st->eol.just_saw_cr = 0;
1763 st->eol.seen_anything = 1;
1766 return EOL_AUTODETECT;
1769 /* Attempt to determine the encoding and EOL type of the given text.
1770 Before calling this function for the first time, you must initialize
1771 st->eol_type as appropriate and initialize st->mask to ~0.
1773 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1776 st->mask holds the determined coding category mask, or ~0 if only
1777 ASCII has been seen so far.
1781 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1782 is present in st->mask
1783 1 == definitive answers are here for both st->eol_type and st->mask
/* Implementation outline:
   - If st->eol_type is still EOL_AUTODETECT, detect_eol_type () is run
     on the N bytes at SRC.  The early return of
     `st->eol_type != EOL_AUTODETECT' is presumably taken when
     JUST_DO_EOL is set -- confirm.
   - Until a non-ASCII byte has been seen, SRC is scanned for the first
     byte that is >= 0x80 or an unacceptable control character; finding
     one sets st->seen_non_ascii and opens the per-detector masks
     (to ~0).
   - Every detector whose mask still has more than one candidate bit is
     run on the remaining data, and the detector masks are OR'ed
     together.
   - The result is 1 only when the combined mask has at most one bit set
     and the EOL type is known; CODING_CATEGORY_NO_CONVERSION_MASK is
     always added to st->mask before returning. */
1787 detect_coding_type (struct detection_state *st, const Extbyte *src,
1788 size_t n, int just_do_eol)
1790 if (st->eol_type == EOL_AUTODETECT)
1791 st->eol_type = detect_eol_type (st, src, n);
1794 return st->eol_type != EOL_AUTODETECT;
1796 if (!st->seen_non_ascii)
1798 for (; n; n--, src++)
1800 unsigned char c = *(unsigned char *) src;
1801 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1803 st->seen_non_ascii = 1;
1805 st->shift_jis.mask = ~0;
1809 st->iso2022.mask = ~0;
1819 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1820 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1821 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1822 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1823 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1824 st->big5.mask = detect_coding_big5 (st, src, n);
1825 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1826 st->utf8.mask = detect_coding_utf8 (st, src, n);
1827 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1828 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1831 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1832 | st->utf8.mask | st->ucs4.mask;
1835 int retval = mask_has_at_most_one_bit_p (st->mask);
1836 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1837 return retval && st->eol_type != EOL_AUTODETECT;
/* Choose a concrete coding system for a detection result MASK (one bit
   per coding category).
   - For text that was entirely or basically ASCII, the default value of
     `buffer-file-coding-system' is looked up with Ffind_coding_system;
     an invalid default produces a warning and is reset to nil, and
     `raw-text' serves as the fallback.
   - Otherwise MASK is first adjusted by postprocess_iso2022_mask (),
     then the categories are tried in priority order and the coding
     system of the first category that is both present in MASK and has a
     non-nil coding system assigned is returned; if none qualifies,
     `raw-text' is returned. */
1842 coding_system_from_mask (int mask)
1846 /* If the file was entirely or basically ASCII, use the
1847 default value of `buffer-file-coding-system'. */
1848 Lisp_Object retval =
1849 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1852 retval = Ffind_coding_system (retval);
1856 (Qbad_variable, Qwarning,
1857 "Invalid `default-buffer-file-coding-system', set to nil");
1858 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1862 retval = Fget_coding_system (Qraw_text);
1870 mask = postprocess_iso2022_mask (mask);
1872 /* Look through the coding categories by priority and find
1873 the first one that is allowed. */
1874 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1876 cat = fcd->coding_category_by_priority[i];
1877 if ((mask & (1 << cat)) &&
1878 !NILP (fcd->coding_category_system[cat]))
1882 return fcd->coding_category_system[cat];
1884 return Fget_coding_system (Qraw_text);
1888 /* Given a seekable read stream and potential coding system and EOL type
1889 as specified, do any autodetection that is called for. If the
1890 coding system and/or EOL type are not `autodetect', they will be left
1891 alone; but this function will never return an autodetect coding system
1894 This function does not automatically fetch subsidiary coding systems;
1895 that should be unnecessary with the explicit eol-type argument. */
/* Number of characters in a string literal, not counting the
   terminating NUL.  Only meaningful for an actual literal or array
   (it relies on sizeof), not for a pointer. */
1897 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
/* Implementation outline:
   - An EOL type of EOL_AUTODETECT passed in is first replaced by the
     EOL type of *CODESYS_IN_OUT; the detection state is seeded with it.
   - If either the coding system type or the EOL type still calls for
     autodetection, a buffer is read from STREAM and searched for a
     "-*- ... coding: NAME ... -*-" mode-line cookie.  NAME is delimited
     with strspn over the characters valid in a MIME charset name,
     temporarily NUL-terminated in place, and looked up with
     Ffind_coding_system.
   - With no usable cookie, detect_coding_type () is fed successive
     buffers read from STREAM; with a cookie naming a coding system
     whose EOL type is autodetect, only EOL detection is run.
   - The results are written back through the in/out pointers, an
     undeterminable EOL type falls back to EOL_LF, and STREAM is
     rewound with Lstream_rewind. */
1900 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1901 eol_type_t *eol_type_in_out)
1903 struct detection_state decst;
1905 if (*eol_type_in_out == EOL_AUTODETECT)
1906 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1909 decst.eol_type = *eol_type_in_out;
1912 /* If autodetection is called for, do it now. */
1913 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1914 || *eol_type_in_out == EOL_AUTODETECT)
1917 Lisp_Object coding_system = Qnil;
1919 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1922 /* Look for initial "-*-"; mode line prefix */
1924 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1929 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1931 Extbyte *local_vars_beg = p + 3;
1932 /* Look for final "-*-"; mode line suffix */
1933 for (p = local_vars_beg,
1934 scan_end = buf + nread - LENGTH ("-*-");
1939 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1941 Extbyte *suffix = p;
1942 /* Look for "coding:" */
1943 for (p = local_vars_beg,
1944 scan_end = suffix - LENGTH ("coding:?");
1947 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1948 && (p == local_vars_beg
1949 || (*(p-1) == ' ' ||
1955 p += LENGTH ("coding:");
1956 while (*p == ' ' || *p == '\t') p++;
1958 /* Get coding system name */
1959 save = *suffix; *suffix = '\0';
1960 /* Characters valid in a MIME charset name (rfc 1521),
1961 and in a Lisp symbol name. */
1962 n = strspn ( (char *) p,
1963 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1964 "abcdefghijklmnopqrstuvwxyz"
1970 save = p[n]; p[n] = '\0';
1972 Ffind_coding_system (intern ((char *) p));
1982 if (NILP (coding_system))
1985 if (detect_coding_type (&decst, buf, nread,
1986 XCODING_SYSTEM_TYPE (*codesys_in_out)
1987 != CODESYS_AUTODETECT))
1989 nread = Lstream_read (stream, buf, sizeof (buf));
1995 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1996 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1999 if (detect_coding_type (&decst, buf, nread, 1))
2001 nread = Lstream_read (stream, buf, sizeof (buf));
2007 *eol_type_in_out = decst.eol_type;
2008 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2010 if (NILP (coding_system))
2011 *codesys_in_out = coding_system_from_mask (decst.mask);
2013 *codesys_in_out = coding_system;
2017 /* If we absolutely can't determine the EOL type, just assume LF. */
2018 if (*eol_type_in_out == EOL_AUTODETECT)
2019 *eol_type_in_out = EOL_LF;
2021 Lstream_rewind (stream);
/* Lisp primitive `detect-coding-region'.

   The region is exposed as a Lisp buffer input stream, wrapped in an
   encoding stream that uses the `binary' coding system; both stream
   objects are GC-protected.  Chunks of up to 4096 bytes are read and
   passed to detect_coding_type () with EOL detection enabled
   (just_do_eol == 0).

   If the mask is still ~0 (only ASCII seen), the result is the
   subsidiary of `undecided' for the detected EOL type.  Otherwise the
   mask is adjusted by postprocess_iso2022_mask () and the categories
   are walked from lowest to highest priority; the coding system of
   each category present in the mask is consed onto the front of the
   result (as its EOL subsidiary when non-nil), giving a list in
   descending priority order.  Both streams are then closed/deleted. */
2024 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2025 Detect coding system of the text in the region between START and END.
2026 Return a list of possible coding systems ordered by priority.
2027 If only ASCII characters are found, return 'undecided or one of
2028 its subsidiary coding systems according to a detected end-of-line
2029 type. Optional arg BUFFER defaults to the current buffer.
2031 (start, end, buffer))
2033 Lisp_Object val = Qnil;
2034 struct buffer *buf = decode_buffer (buffer, 0);
2036 Lisp_Object instream, lb_instream;
2037 Lstream *istr, *lb_istr;
2038 struct detection_state decst;
2039 struct gcpro gcpro1, gcpro2;
2041 get_buffer_range_char (buf, start, end, &b, &e, 0);
2042 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2043 lb_istr = XLSTREAM (lb_instream);
2044 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2045 istr = XLSTREAM (instream);
2046 GCPRO2 (instream, lb_instream);
2048 decst.eol_type = EOL_AUTODETECT;
2052 Extbyte random_buffer[4096];
2053 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2057 if (detect_coding_type (&decst, random_buffer, nread, 0))
2061 if (decst.mask == ~0)
2062 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2070 decst.mask = postprocess_iso2022_mask (decst.mask);
2072 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2074 int sys = fcd->coding_category_by_priority[i];
2075 if (decst.mask & (1 << sys))
2077 Lisp_Object codesys = fcd->coding_category_system[sys];
2078 if (!NILP (codesys))
2079 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2080 val = Fcons (codesys, val);
2084 Lstream_close (istr);
2086 Lstream_delete (istr);
2087 Lstream_delete (lb_istr);
2092 /************************************************************************/
2093 /* Converting to internal Mule format ("decoding") */
2094 /************************************************************************/
2096 /* A decoding stream is a stream used for decoding text (i.e.
2097 converting from some external format to internal format).
2098 The decoding-stream object keeps track of the actual coding
2099 stream, the stream that is at the other end, and data that
2100 needs to be persistent across the lifetime of the stream. */
2102 /* Handle the EOL stuff related to just-read-in character C.
2103 EOL_TYPE is the EOL type of the coding stream.
2104 FLAGS is the current value of FLAGS in the coding stream, and may
2105 be modified by this macro. (The macro only looks at the
2106 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2107 bytes are to be written. You need to also define a local goto
2108 label "label_continue_loop" that is at the end of the main
2109 character-reading loop.
2111 If C is a CR character, then this macro handles it entirely and
2112 jumps to label_continue_loop. Otherwise, this macro does not add
2113 anything to DST, and continues normally. You should continue
2114 processing C normally after this macro. */
/* Summary of the visible branches of the macro below:
   - EOL_CR: '\n' is emitted in place of C.
   - Not EOL_CRLF, or a CR is already pending: C is emitted as is.
   - Otherwise the CR is remembered by setting CODING_STATE_CR.
   In each of these cases control jumps to label_continue_loop.
   When C is not handled there and a CR is pending, a literal '\r' is
   emitted and the CODING_STATE_CR flag is cleared.
   Note that EOL_TYPE, C and FLAGS are evaluated more than once. */
2116 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2120 if (eol_type == EOL_CR) \
2121 Dynarr_add (dst, '\n'); \
2122 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2123 Dynarr_add (dst, c); \
2125 flags |= CODING_STATE_CR; \
2126 goto label_continue_loop; \
2128 else if (flags & CODING_STATE_CR) \
2129 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2131 Dynarr_add (dst, '\r'); \
2132 flags &= ~CODING_STATE_CR; \
2136 /* C should be a binary character in the range 0 - 255; convert
2137 to internal format and add to Dynarr DST. */
/* In this definition an ASCII byte is stored unchanged and any other
   byte is stored as the two-byte sequence
   (c >> 6) | 0xc0, (c & 0x3f) | 0x80, i.e. the UTF-8 form of a code
   point in 0x80 - 0xff.  A second definition of the same macro appears
   later; which one is active is presumably selected by a preprocessor
   conditional (UTF2000?) -- confirm. */
2140 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2142 if (BYTE_ASCII_P (c)) \
2143 Dynarr_add (dst, c); \
2146 Dynarr_add (dst, (c >> 6) | 0xc0); \
2147 Dynarr_add (dst, (c & 0x3f) | 0x80); \
/* Append the code point C to DST as a UTF-8 style multi-byte sequence,
   using the original extended layout of up to six bytes:
     <= 0x7ff      2 bytes, lead byte 0xc0
     <= 0xffff     3 bytes, lead byte 0xe0
     <= 0x1fffff   4 bytes, lead byte 0xf0
     <= 0x3ffffff  5 bytes, lead byte 0xf8
     above that    6 bytes, lead byte 0xfc
   Each continuation byte carries 6 payload bits OR'ed with 0x80.  The
   first branch emits C as a single byte. */
2151 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2153 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2157 Dynarr_add (dst, c);
2159 else if ( c <= 0x7ff )
2161 Dynarr_add (dst, (c >> 6) | 0xc0);
2162 Dynarr_add (dst, (c & 0x3f) | 0x80);
2164 else if ( c <= 0xffff )
2166 Dynarr_add (dst, (c >> 12) | 0xe0);
2167 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2168 Dynarr_add (dst, (c & 0x3f) | 0x80);
2170 else if ( c <= 0x1fffff )
2172 Dynarr_add (dst, (c >> 18) | 0xf0);
2173 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2174 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2175 Dynarr_add (dst, (c & 0x3f) | 0x80);
2177 else if ( c <= 0x3ffffff )
2179 Dynarr_add (dst, (c >> 24) | 0xf8);
2180 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2181 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2182 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2183 Dynarr_add (dst, (c & 0x3f) | 0x80);
2187 Dynarr_add (dst, (c >> 30) | 0xfc);
2188 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2189 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2190 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2191 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2192 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Alternative definition of DECODE_ADD_BINARY_CHAR: an ASCII byte is
   stored unchanged; a byte satisfying BYTE_C1_P is stored as
   LEADING_BYTE_CONTROL_1 followed by c + 0x20; any other byte is stored
   as LEADING_BYTE_LATIN_ISO8859_1 followed by the byte itself.
   Presumably this is the variant used when the leading-byte internal
   representation is in effect rather than UTF-8 -- confirm. */
2196 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2198 if (BYTE_ASCII_P (c)) \
2199 Dynarr_add (dst, c); \
2200 else if (BYTE_C1_P (c)) \
2202 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2203 Dynarr_add (dst, c + 0x20); \
2207 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2208 Dynarr_add (dst, c); \
/* Emit a pending, incompletely decoded character CH through
   DECODE_ADD_BINARY_CHAR.  The macro writes to a variable named `dst'
   that must exist in the expanding scope; it is not a parameter. */
2213 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2217 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* Called at the end of a decoding step.  When FLAGS contains
   CODING_STATE_END, any partially built character CH is output and, if
   a CR is still pending (CODING_STATE_CR), a literal '\r' is appended
   to DST so that it is not lost. */
2222 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2224 if (flags & CODING_STATE_END) \
2226 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2227 if (flags & CODING_STATE_CR) \
2228 Dynarr_add (dst, '\r'); \
/* Fetch the struct decoding_stream attached to lstream STREAM
   (the `decoding' lstream type's private data). */
2232 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Private data of a `decoding' lstream: everything the decoder has to
   remember between calls while converting external text to internal
   format. */
2234 struct decoding_stream
2236 /* Coding system that governs the conversion. */
2237 Lisp_Coding_System *codesys;
2239 /* Stream that we read the encoded data from or
2240 write the decoded data to. */
2243 /* If we are reading, then we can return only a fixed amount of
2244 data, so if the conversion resulted in too much data, we store it
2245 here for retrieval the next time around. */
2246 unsigned_char_dynarr *runoff;
2248 /* FLAGS holds flags indicating the current state of the decoding.
2249 Some of these flags are dependent on the coding system. */
2252 /* CPOS holds a partially built-up code-point of character. */
2255 /* EOL_TYPE specifies the type of end-of-line conversion that
2256 currently applies. We need to keep this separate from the
2257 EOL type stored in CODESYS because the latter might indicate
2258 automatic EOL-type detection while the former will always
2259 indicate a particular EOL type. */
2260 eol_type_t eol_type;
2262 /* Additional ISO2022 information. We define the structure above
2263 because it's also needed by the detection routines. */
2264 struct iso2022_decoder iso2022;
2266 /* Additional information (the state of the running CCL program)
2267 used by the CCL decoder. */
2268 struct ccl_program ccl;
2270 /* counter for UTF-8 or UCS-4 */
2271 unsigned char counter;
/* NOTE(review): er_counter / er_buf presumably buffer a partially read
   entity reference (cf. the `use-entity-reference' property) --
   confirm against the decoder. */
2274 unsigned char er_counter;
2275 unsigned char er_buf[16];
/* NOTE(review): presumably pending characters awaiting composition via
   combining_table -- confirm against the decoder. */
2277 unsigned combined_char_count;
2278 Emchar combined_chars[16];
2279 Lisp_Object combining_table;
/* Detection state (same structure as used by detect_coding_type). */
2281 struct detection_state decst;
/* Methods of the "decoding" lstream type; the definitions follow. */
2284 static ssize_t decoding_reader (Lstream *stream,
2285 unsigned char *data, size_t size);
2286 static ssize_t decoding_writer (Lstream *stream,
2287 const unsigned char *data, size_t size);
2288 static int decoding_rewinder (Lstream *stream);
2289 static int decoding_seekable_p (Lstream *stream);
2290 static int decoding_flusher (Lstream *stream);
2291 static int decoding_closer (Lstream *stream);
2293 static Lisp_Object decoding_marker (Lisp_Object stream);
/* Define the lstream implementation named "decoding"; its per-stream
   data has the size of struct decoding_stream. */
2295 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2296 sizeof (struct decoding_stream));
/* Marker method of a decoding stream: mark the stream at the other
   end and, when that stream's implementation supplies its own marker
   method, return that method's result. */
2299 decoding_marker (Lisp_Object stream)
2301 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2302 Lisp_Object str_obj;
2304 /* We do not need to mark the coding systems or charsets stored
2305 within the stream because they are stored in a global list
2306 and automatically marked. */
2308 XSETLSTREAM (str_obj, str);
2309 mark_object (str_obj);
/* Chain to the other end's own marker method, if it has one. */
2310 if (str->imp->marker)
2311 return (str->imp->marker) (str_obj);
2316 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2317 so we read data from the other end, decode it, and store it into DATA. */
/* Returns the number of bytes stored into DATA; if none were stored,
   returns -1 when an error was noted in ERROR_OCCURRED and 0
   otherwise. */
2320 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2322 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
/* Remember where DATA started so the amount stored can be computed. */
2323 unsigned char *orig_data = data;
2325 int error_occurred = 0;
2327 /* We need to interface to mule_decode(), which expects to take some
2328 amount of data and store the result into a Dynarr. We have
2329 mule_decode() store into str->runoff, and take data from there
2332 /* We loop until we have enough data, reading chunks from the other
2333 end and decoding it. */
2336 /* Take data from the runoff if we can. Make sure to take at
2337 most SIZE bytes, and delete the data from the runoff. */
2338 if (Dynarr_length (str->runoff) > 0)
2340 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2341 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2342 Dynarr_delete_many (str->runoff, 0, chunk);
2348 break; /* No more room for data */
2350 if (str->flags & CODING_STATE_END)
2351 /* This means that on the previous iteration, we hit the EOF on
2352 the other end. We loop once more so that mule_decode() can
2353 output any final stuff it may be holding, or any "go back
2354 to a sane state" escape sequences. (This latter makes sense
2355 during encoding.) */
2358 /* Exhausted the runoff, so get some more. DATA has at least
2359 SIZE bytes left of storage in it, so it's OK to read directly
2360 into it. (We'll be overwriting above, after we've decoded it
2361 into the runoff.) */
2362 read_size = Lstream_read (str->other_end, data, size);
2369 /* There might be some more end data produced in the translation.
2370 See the comment above. */
2371 str->flags |= CODING_STATE_END;
/* Decode the raw bytes just read (they sit temporarily in DATA) into
   the runoff. */
2372 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
/* Nothing was stored: report an error if one was noted, else 0. */
2375 if (data - orig_data == 0)
2376 return error_occurred ? -1 : 0;
2378 return data - orig_data;
/* Writer method of a decoding stream: decode the SIZE bytes at DATA
   and pass the decoded result on to the other end.  Decoded bytes the
   other end did not accept remain in the runoff. */
2382 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2384 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2387 /* Decode all our data into the runoff, and then attempt to write
2388 it all out to the other end. Remove whatever chunk we succeeded
2390 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2391 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2392 Dynarr_length (str->runoff));
/* Drop from the runoff the RETVAL bytes reported by Lstream_write(). */
2394 Dynarr_delete_many (str->runoff, 0, retval);
2395 /* Do NOT return retval. The return value indicates how much
2396 of the incoming data was written, not how many bytes were
/* Put the decoder state of STR back to its initial condition for the
   coding system currently stored in STR->codesys. */
2402 reset_decoding_stream (struct decoding_stream *str)
/* ISO2022 decoders get their ISO2022 state reset for this coding
   system. */
2405 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2407 Lisp_Object coding_system;
2408 XSETCODING_SYSTEM (coding_system, str->codesys);
2409 reset_iso2022 (coding_system, &str->iso2022);
/* CCL decoders get the coding system's decode program set up. */
2411 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2413 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
/* Forget any partially collected entity reference and any composition
   in progress. */
2418 str->er_counter = 0;
2419 str->combined_char_count = 0;
2420 str->combining_table = Qnil;
/* Clear the state flags and the partially built code point. */
2422 str->flags = str->cpos = 0;
/* Rewinder method: reset the decoder state, discard pending decoded
   data in the runoff, and return the result of rewinding the other
   end. */
2426 decoding_rewinder (Lstream *stream)
2428 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2429 reset_decoding_stream (str);
2430 Dynarr_reset (str->runoff);
2431 return Lstream_rewind (str->other_end);
/* Seekable-p method: a decoding stream is reported seekable exactly
   when the stream at its other end is. */
2435 decoding_seekable_p (Lstream *stream)
2437 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2438 return Lstream_seekable_p (str->other_end);
/* Flusher method: flush the stream at the other end and return its
   result. */
2442 decoding_flusher (Lstream *stream)
2444 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2445 return Lstream_flush (str->other_end);
/* Closer method: let a writing stream emit its final output, release
   the buffers owned by the decoding stream, and return the result of
   closing the other end. */
2449 decoding_closer (Lstream *stream)
2451 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2452 if (stream->flags & LSTREAM_FL_WRITE)
/* Mark end of data and run the writer once more with no input so
   that the decoder can output whatever it is still holding. */
2454 str->flags |= CODING_STATE_END;
2455 decoding_writer (stream, 0, 0);
2457 Dynarr_free (str->runoff);
2459 #ifdef ENABLE_COMPOSITE_CHARS
2460 if (str->iso2022.composite_chars)
2461 Dynarr_free (str->iso2022.composite_chars);
2464 return Lstream_close (str->other_end);
/* Return the coding system in use by the decoding stream STREAM, as
   the subsidiary coding system selected by the stream's current EOL
   type. */
2468 decoding_stream_coding_system (Lstream *stream)
2470 Lisp_Object coding_system;
2471 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2473 XSETCODING_SYSTEM (coding_system, str->codesys);
2474 return subsidiary_coding_system (coding_system, str->eol_type);
/* Called with a new coding system CODESYS for the decoding stream
   LSTR.  The stream's EOL type is taken from CODESYS unless CODESYS
   asks for EOL autodetection, in which case the stream's current EOL
   type is kept.  Finishes by resetting the decoder state. */
2478 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2480 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2481 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2483 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2484 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2485 reset_decoding_stream (str);
2488 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2489 stream for writing, no automatic code detection will be performed.
2490 The reason for this is that automatic code detection requires a
2491 seekable input. Things will also fail if you open a decoding
2492 stream for reading using a non-fully-specified coding system and
2493 a non-seekable input stream. */
/* Common constructor behind make_decoding_input_stream() ("r") and
   make_decoding_output_stream() ("w"): create a decoding lstream that
   has STREAM as its other end and uses CODESYS. */
2496 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2499 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2500 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2504 str->other_end = stream;
2505 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
/* Start with an undetermined EOL type; it is refined below and by
   set_decoding_stream_coding_system(). */
2506 str->eol_type = EOL_AUTODETECT;
2507 if (!strcmp (mode, "r")
2508 && Lstream_seekable_p (stream))
2509 /* We can determine the coding system now. */
2510 determine_real_coding_system (stream, &codesys, &str->eol_type);
2511 set_decoding_stream_coding_system (lstr, codesys);
/* Seed the detection state used later by mule_decode(): the EOL type
   known so far and a mask with all bits set. */
2512 str->decst.eol_type = str->eol_type;
2513 str->decst.mask = ~0;
2514 XSETLSTREAM (obj, lstr);
/* Create a decoding stream in read mode ("r") on top of STREAM, using
   CODESYS. */
2519 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2521 return make_decoding_stream_1 (stream, codesys, "r");
/* Create a decoding stream in write mode ("w") on top of STREAM, using
   CODESYS. */
2525 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2527 return make_decoding_stream_1 (stream, codesys, "w");
2530 /* Note: the decode_coding_* functions all take the same
2531 arguments as mule_decode(), which is to say some SRC data of
2532 size N, which is to be stored into dynamic array DST.
2533 DECODING is the stream within which the decoding is
2534 taking place, but no data is actually read from or
2535 written to that stream; that is handled in decoding_reader()
2536 or decoding_writer(). This allows the same functions to
2537 be used for both reading and writing. */
/* Decode the N bytes at SRC into DST according to the state of the
   decoding stream DECODING: first run coding-system/EOL detection if
   either is still undetermined, then dispatch to the decoder for the
   stream's coding system type. */
2540 mule_decode (Lstream *decoding, const Extbyte *src,
2541 unsigned_char_dynarr *dst, size_t n)
2543 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2545 /* If necessary, do encoding-detection now. We do this when
2546 we're a writing stream or a non-seekable reading stream,
2547 meaning that we can't just process the whole input,
2548 rewind, and start over. */
2550 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2551 str->eol_type == EOL_AUTODETECT)
2553 Lisp_Object codesys;
2555 XSETCODING_SYSTEM (codesys, str->codesys);
/* The last argument is nonzero when the coding system itself is
   already known, i.e. only the EOL type remains to be detected. */
2556 detect_coding_type (&str->decst, src, n,
2557 CODING_SYSTEM_TYPE (str->codesys) !=
2558 CODESYS_AUTODETECT);
2559 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2560 str->decst.mask != ~0)
2561 /* #### This is cheesy. What we really ought to do is
2562 buffer up a certain amount of data so as to get a
2563 less random result. */
2564 codesys = coding_system_from_mask (str->decst.mask);
2565 str->eol_type = str->decst.eol_type;
/* Switch the stream over if detection picked a different coding
   system. */
2566 if (XCODING_SYSTEM (codesys) != str->codesys)
2568 /* Preserve the CODING_STATE_END flag in case it was set.
2569 If we erase it, bad things might happen. */
2570 int was_end = str->flags & CODING_STATE_END;
2571 set_decoding_stream_coding_system (decoding, codesys);
2573 str->flags |= CODING_STATE_END;
/* Hand the data to the decoder for the coding system type now in
   effect. */
2577 switch (CODING_SYSTEM_TYPE (str->codesys))
2580 case CODESYS_INTERNAL:
2581 Dynarr_add_many (dst, src, n);
2584 case CODESYS_AUTODETECT:
2585 /* If we got this far and still haven't decided on the coding
2586 system, then do no conversion. */
2587 case CODESYS_NO_CONVERSION:
2588 decode_coding_no_conversion (decoding, src, dst, n);
2591 case CODESYS_SHIFT_JIS:
2592 decode_coding_sjis (decoding, src, dst, n);
2595 decode_coding_big5 (decoding, src, dst, n);
2598 decode_coding_ucs4 (decoding, src, dst, n);
2601 decode_coding_utf8 (decoding, src, dst, n);
/* Tell the CCL program whether this is the final block of input. */
2604 str->ccl.last_block = str->flags & CODING_STATE_END;
2605 /* When applying ccl program to stream, MUST NOT set NULL
2607 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2608 dst, n, 0, CCL_MODE_DECODING);
2610 case CODESYS_ISO2022:
2611 decode_coding_iso2022 (decoding, src, dst, n);
2619 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2620 Decode the text between START and END which is encoded in CODING-SYSTEM.
2621 This is useful if you've read in encoded text from a file without decoding
2622 it (e.g. you read in a JIS-formatted file but used the `binary' or
2623 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2624 Return length of decoded text.
2625 BUFFER defaults to the current buffer if unspecified.
2627 (start, end, coding_system, buffer))
2630 struct buffer *buf = decode_buffer (buffer, 0);
2631 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2632 Lstream *istr, *ostr;
2633 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
/* Resolve START/END to buffer positions B and E and refuse to touch
   read-only text. */
2635 get_buffer_range_char (buf, start, end, &b, &e, 0);
2637 barf_if_buffer_read_only (buf, b, e);
2639 coding_system = Fget_coding_system (coding_system);
/* Build the stream chain described in the diagram below: text is read
   from the region, and what is written to OUTSTREAM ends up inserted
   into BUF at B. */
2640 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2641 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2642 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2644 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2645 Fget_coding_system (Qbinary));
2646 istr = XLSTREAM (instream);
2647 ostr = XLSTREAM (outstream);
2648 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2650 /* The chain of streams looks like this:
2652 [BUFFER] <----- send through
2653 ------> [ENCODE AS BINARY]
2654 ------> [DECODE AS SPECIFIED]
2660 char tempbuf[1024]; /* some random amount */
2661 Bufpos newpos, even_newer_pos;
2662 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2663 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
/* The input stream's start position is sampled before and after the
   write; presumably the write moves it by inserting converted text
   into BUF, which is why the range deleted below is computed from
   EVEN_NEWER_POS. */
2667 newpos = lisp_buffer_stream_startpos (istr);
2668 Lstream_write (ostr, tempbuf, size_in_bytes);
2669 even_newer_pos = lisp_buffer_stream_startpos (istr);
2670 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
/* Close both ends of the chain, then delete all four lstreams. */
2673 Lstream_close (istr);
2674 Lstream_close (ostr);
2676 Lstream_delete (istr);
2677 Lstream_delete (ostr);
2678 Lstream_delete (XLSTREAM (de_outstream));
2679 Lstream_delete (XLSTREAM (lb_outstream));
2684 /************************************************************************/
2685 /* Converting to an external encoding ("encoding") */
2686 /************************************************************************/
2688 /* An encoding stream is an output stream. When you create the
2689 stream, you specify the coding system that governs the encoding
2690 and another stream that the resulting encoded data is to be
2691 sent to, and then start sending data to it. */
/* Yield the `encoding' type data of the Lstream STREAM; the functions
   below use it to obtain the stream's struct encoding_stream. */
2693 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
/* Per-stream state of an "encoding" lstream; obtained from an Lstream
   with ENCODING_STREAM_DATA(). */
2695 struct encoding_stream
2697 /* Coding system that governs the conversion. */
2698 Lisp_Coding_System *codesys;
2700 /* Stream that we read the not-yet-encoded data from or
2701 write the encoded data to. */
2704 /* If we are reading, then we can return only a fixed amount of
2705 data, so if the conversion resulted in too much data, we store it
2706 here for retrieval the next time around. */
2707 unsigned_char_dynarr *runoff;
2709 /* FLAGS holds flags indicating the current state of the encoding.
2710 Some of these flags are dependent on the coding system. */
2713 /* CH holds a partially built-up character. Since we only deal
2714 with one- and two-byte characters at the moment, we only use
2715 this to store the first byte of a two-byte character. */
2718 /* Additional information used by the ISO2022 encoder. */
2721 /* CHARSET holds the character sets currently assigned to the G0
2722 through G3 registers. It is initialized from the array
2723 INITIAL_CHARSET in CODESYS. */
2724 Lisp_Object charset[4];
2726 /* Which registers are currently invoked into the left (GL) and
2727 right (GR) halves of the 8-bit encoding space? */
2728 int register_left, register_right;
2730 /* Whether we need to explicitly designate the charset in the
2731 G? register before using it. It is initialized from the
2732 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2733 unsigned char force_charset_on_output[4];
2735 /* Other state variables that need to be preserved across
2737 Lisp_Object current_charset;
/* Progress within a multi-byte character, carried between calls by
   text_encode_generic(); 0 means no character is partially read. */
2739 int current_char_boundary;
/* Per-coding-system callbacks installed by reset_encoding_stream().
   text_encode_generic() calls ENCODE_CHAR for each complete character
   and FINISH once CODING_STATE_END is set and no character is
   pending. */
2742 void (*encode_char) (struct encoding_stream *str, Emchar c,
2743 unsigned_char_dynarr *dst, unsigned int *flags);
2744 void (*finish) (struct encoding_stream *str,
2745 unsigned_char_dynarr *dst, unsigned int *flags);
2747 /* Additional information (the state of the running CCL program)
2748 used by the CCL encoder. */
2749 struct ccl_program ccl;
/* Methods of the "encoding" lstream type; the definitions follow. */
2753 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2754 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2756 static int encoding_rewinder (Lstream *stream);
2757 static int encoding_seekable_p (Lstream *stream);
2758 static int encoding_flusher (Lstream *stream);
2759 static int encoding_closer (Lstream *stream);
2761 static Lisp_Object encoding_marker (Lisp_Object stream);
/* Define the lstream implementation named "encoding"; its per-stream
   data has the size of struct encoding_stream. */
2763 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2764 sizeof (struct encoding_stream));
/* Marker method of an encoding stream: mark the stream at the other
   end and, when that stream's implementation supplies its own marker
   method, return that method's result. */
2767 encoding_marker (Lisp_Object stream)
2769 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2770 Lisp_Object str_obj;
2772 /* We do not need to mark the coding systems or charsets stored
2773 within the stream because they are stored in a global list
2774 and automatically marked. */
2776 XSETLSTREAM (str_obj, str);
2777 mark_object (str_obj);
/* Chain to the other end's own marker method, if it has one. */
2778 if (str->imp->marker)
2779 return (str->imp->marker) (str_obj);
2784 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2785 so we read data from the other end, encode it, and store it into DATA. */
/* Returns the number of bytes stored into DATA; if none were stored,
   returns -1 when an error was noted in ERROR_OCCURRED and 0
   otherwise. */
2788 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2790 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
/* Remember where DATA started so the amount stored can be computed. */
2791 unsigned char *orig_data = data;
2793 int error_occurred = 0;
2795 /* We need to interface to mule_encode(), which expects to take some
2796 amount of data and store the result into a Dynarr. We have
2797 mule_encode() store into str->runoff, and take data from there
2800 /* We loop until we have enough data, reading chunks from the other
2801 end and encoding it. */
2804 /* Take data from the runoff if we can. Make sure to take at
2805 most SIZE bytes, and delete the data from the runoff. */
2806 if (Dynarr_length (str->runoff) > 0)
2808 int chunk = min ((int) size, Dynarr_length (str->runoff));
2809 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2810 Dynarr_delete_many (str->runoff, 0, chunk);
2816 break; /* No more room for data */
2818 if (str->flags & CODING_STATE_END)
2819 /* This means that on the previous iteration, we hit the EOF on
2820 the other end. We loop once more so that mule_encode() can
2821 output any final stuff it may be holding, or any "go back
2822 to a sane state" escape sequences. (This latter makes sense
2823 during encoding.) */
2826 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2827 left of storage in it, so it's OK to read directly into it.
2828 (We'll be overwriting above, after we've encoded it into the
2830 read_size = Lstream_read (str->other_end, data, size);
2837 /* There might be some more end data produced in the translation.
2838 See the comment above. */
2839 str->flags |= CODING_STATE_END;
/* Encode the bytes just read (they sit temporarily in DATA) into the
   runoff. */
2840 mule_encode (stream, data, str->runoff, read_size);
/* Nothing was stored: report an error if one was noted, else 0. */
2843 if (data == orig_data)
2844 return error_occurred ? -1 : 0;
2846 return data - orig_data;
/* Writer method of an encoding stream: encode the SIZE bytes at DATA
   and pass the encoded result on to the other end.  Encoded bytes the
   other end did not accept remain in the runoff. */
2850 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2852 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2855 /* Encode all our data into the runoff, and then attempt to write
2856 it all out to the other end. Remove whatever chunk we succeeded
2858 mule_encode (stream, data, str->runoff, size);
2859 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2860 Dynarr_length (str->runoff));
/* Drop from the runoff the RETVAL bytes reported by Lstream_write(). */
2862 Dynarr_delete_many (str->runoff, 0, retval);
2863 /* Do NOT return retval. The return value indicates how much
2864 of the incoming data was written, not how many bytes were
/* Put the encoder state of STR back to its initial condition for the
   coding system stored in STR->codesys.  For the coding system types
   handled by text_encode_generic() this installs the matching
   encode_char/finish callbacks. */
2870 reset_encoding_stream (struct encoding_stream *str)
2873 switch (CODING_SYSTEM_TYPE (str->codesys))
2875 case CODESYS_ISO2022:
2879 str->encode_char = &char_encode_iso2022;
2880 str->finish = &char_finish_iso2022;
/* Load the initial G0..G3 designations and the per-register
   force-designation settings from the coding system. */
2881 for (i = 0; i < 4; i++)
2883 str->iso2022.charset[i] =
2884 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2885 str->iso2022.force_charset_on_output[i] =
2886 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
/* Register 0 starts out invoked on the left, register 1 on the
   right. */
2888 str->iso2022.register_left = 0;
2889 str->iso2022.register_right = 1;
2890 str->iso2022.current_charset = Qnil;
2891 str->iso2022.current_half = 0;
2895 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2898 str->encode_char = &char_encode_utf8;
2899 str->finish = &char_finish_utf8;
2902 str->encode_char = &char_encode_ucs4;
2903 str->finish = &char_finish_ucs4;
2905 case CODESYS_SHIFT_JIS:
2906 str->encode_char = &char_encode_shift_jis;
2907 str->finish = &char_finish_shift_jis;
2910 str->encode_char = &char_encode_big5;
2911 str->finish = &char_finish_big5;
/* Forget any partially read character and clear the state flags. */
2917 str->iso2022.current_char_boundary = 0;
2918 str->flags = str->ch = 0;
/* Rewinder method: reset the encoder state, discard pending encoded
   data in the runoff, and return the result of rewinding the other
   end. */
2922 encoding_rewinder (Lstream *stream)
2924 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2925 reset_encoding_stream (str);
2926 Dynarr_reset (str->runoff);
2927 return Lstream_rewind (str->other_end);
/* Seekable-p method: an encoding stream is reported seekable exactly
   when the stream at its other end is. */
2931 encoding_seekable_p (Lstream *stream)
2933 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2934 return Lstream_seekable_p (str->other_end);
/* Flusher method: flush the stream at the other end and return its
   result. */
2938 encoding_flusher (Lstream *stream)
2940 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2941 return Lstream_flush (str->other_end);
/* Closer method: let a writing stream emit its final output, free the
   runoff, and return the result of closing the other end. */
2945 encoding_closer (Lstream *stream)
2947 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2948 if (stream->flags & LSTREAM_FL_WRITE)
/* Mark end of data and run the writer once more with no input so
   that the encoder can output whatever it is still holding. */
2950 str->flags |= CODING_STATE_END;
2951 encoding_writer (stream, 0, 0);
2953 Dynarr_free (str->runoff);
2954 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system stored in the encoding
   stream STREAM. */
2958 encoding_stream_coding_system (Lstream *stream)
2960 Lisp_Object coding_system;
2961 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2963 XSETCODING_SYSTEM (coding_system, str->codesys);
2964 return coding_system;
/* Called with a new coding system CODESYS for the encoding stream
   LSTR; finishes by resetting the encoder state with
   reset_encoding_stream(). */
2968 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2970 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2971 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2973 reset_encoding_stream (str);
/* Common constructor behind make_encoding_input_stream() ("r") and
   make_encoding_output_stream() ("w"): create an encoding lstream
   that has STREAM as its other end and uses CODESYS. */
2977 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2980 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2981 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
/* Allocate an empty runoff buffer and remember the other end before
   the coding system is installed. */
2985 str->runoff = Dynarr_new (unsigned_char);
2986 str->other_end = stream;
2987 set_encoding_stream_coding_system (lstr, codesys);
2988 XSETLSTREAM (obj, lstr);
/* Create an encoding stream in read mode ("r") on top of STREAM, using
   CODESYS. */
2993 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2995 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream in write mode ("w") on top of STREAM,
   using CODESYS. */
2999 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3001 return make_encoding_stream_1 (stream, codesys, "w");
3004 /* Convert N bytes of internally-formatted data stored in SRC to an
3005 external format, according to the encoding stream ENCODING.
3006 Store the encoded data into DST. */
3009 mule_encode (Lstream *encoding, const Bufbyte *src,
3010 unsigned_char_dynarr *dst, size_t n)
3012 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
/* Dispatch on the coding system type: internal data is copied as is,
   no-conversion and CCL have dedicated encoders, and the last visible
   branch goes through text_encode_generic(). */
3014 switch (CODING_SYSTEM_TYPE (str->codesys))
3017 case CODESYS_INTERNAL:
3018 Dynarr_add_many (dst, src, n);
3021 case CODESYS_AUTODETECT:
3022 /* If we got this far and still haven't decided on the coding
3023 system, then do no conversion. */
3024 case CODESYS_NO_CONVERSION:
3025 encode_coding_no_conversion (encoding, src, dst, n);
/* Tell the CCL program whether this is the final block of input. */
3029 str->ccl.last_block = str->flags & CODING_STATE_END;
3030 /* When applying ccl program to stream, MUST NOT set NULL
3032 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3033 dst, n, 0, CCL_MODE_ENCODING);
3037 text_encode_generic (encoding, src, dst, n);
3041 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3042 Encode the text between START and END using CODING-SYSTEM.
3043 This will, for example, convert Japanese characters into stuff such as
3044 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3045 text. BUFFER defaults to the current buffer if unspecified.
3047 (start, end, coding_system, buffer))
3050 struct buffer *buf = decode_buffer (buffer, 0);
3051 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3052 Lstream *istr, *ostr;
3053 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
/* Resolve START/END to buffer positions B and E and refuse to touch
   read-only text. */
3055 get_buffer_range_char (buf, start, end, &b, &e, 0);
3057 barf_if_buffer_read_only (buf, b, e);
3059 coding_system = Fget_coding_system (coding_system);
/* Build the stream chain described in the diagram below: text is read
   from the region, and what is written to OUTSTREAM ends up inserted
   into BUF at B. */
3060 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3061 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3062 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3063 Fget_coding_system (Qbinary));
3064 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3066 istr = XLSTREAM (instream);
3067 ostr = XLSTREAM (outstream);
3068 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3069 /* The chain of streams looks like this:
3071 [BUFFER] <----- send through
3072 ------> [ENCODE AS SPECIFIED]
3073 ------> [DECODE AS BINARY]
3078 char tempbuf[1024]; /* some random amount */
3079 Bufpos newpos, even_newer_pos;
3080 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3081 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
/* The input stream's start position is sampled before and after the
   write; presumably the write moves it by inserting converted text
   into BUF, which is why the range deleted below is computed from
   EVEN_NEWER_POS. */
3085 newpos = lisp_buffer_stream_startpos (istr);
3086 Lstream_write (ostr, tempbuf, size_in_bytes);
3087 even_newer_pos = lisp_buffer_stream_startpos (istr);
3088 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3094 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
/* Close both ends of the chain, then delete all four lstreams. */
3095 Lstream_close (istr);
3096 Lstream_close (ostr);
3098 Lstream_delete (istr);
3099 Lstream_delete (ostr);
3100 Lstream_delete (XLSTREAM (de_outstream));
3101 Lstream_delete (XLSTREAM (lb_outstream));
3102 return make_int (retlen);
/* Generic text encoder: take N bytes of internal text from SRC,
   reassemble them into characters and hand each complete character to
   the stream's encode_char callback, which appends to DST.  When
   CODING_STATE_END is set and no character is partially read, the
   stream's finish callback is invoked.  The progress within a
   multi-byte character is loaded from the stream on entry and stored
   back at the end, so a character may straddle two calls. */
3109 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3110 unsigned_char_dynarr *dst, size_t n)
3113 unsigned char char_boundary;
3114 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3115 unsigned int flags = str->flags;
3116 Emchar ch = str->ch;
3118 char_boundary = str->iso2022.current_char_boundary;
/* char_boundary == 0: no character is in progress. */
3124 if (char_boundary == 0)
3152 (*str->encode_char) (str, c, dst, &flags);
/* char_boundary == 1: C is the last outstanding byte; merge its low
   six bits into CH and encode the completed character. */
3154 else if (char_boundary == 1)
3156 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
/* Otherwise accumulate six more bits and keep going. */
3162 ch = (ch << 6) | (c & 0x3f);
/* At end of data, with nothing half-read, let the encoder finish. */
3167 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3169 (*str->finish) (str, dst, &flags);
3174 str->iso2022.current_char_boundary = char_boundary;
3179 /************************************************************************/
3180 /* entity reference */
3181 /************************************************************************/
/* Output to DST, unchanged, whatever bytes of a possible entity
   reference are buffered in STR->er_buf, and empty that buffer.  Does
   nothing when the buffer is already empty. */
3184 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3186 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3188 if ( str->er_counter > 0)
3190 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3191 str->er_counter = 0;
/* Feed the decoded character C through the entity-reference
   recognizer of STR, appending the resulting output to DST.

   While nothing is buffered, a character either starts a candidate
   reference (STR->er_buf[0] is set to '&' when the coding system uses
   entity references) or is output directly.  Once a candidate is
   being collected, further characters are appended to STR->er_buf.
   A completed candidate is converted to a Lisp string and matched
   against patterns built from
   Vcoded_charset_entity_reference_alist, and then against the generic
   "&MCS-<hex>" form; on a match the denoted character is output, and
   otherwise the buffered text followed by ';' is output literally.
   A candidate is abandoned, and output literally together with C,
   when the buffer is full or C is 0x7F or above. */
3195 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3196 unsigned_char_dynarr* dst);
3198 decode_add_er_char (struct decoding_stream *str, Emchar c,
3199 unsigned_char_dynarr* dst)
/* Nothing buffered yet. */
3201 if (str->er_counter == 0)
3203 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3206 str->er_buf[0] = '&';
3210 DECODE_ADD_UCS_CHAR (c, dst);
/* Candidate text collected so far, as a Lisp string for matching. */
3214 Lisp_Object string = make_string (str->er_buf,
3216 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3221 Lisp_Object char_type;
/* Try each entry of the alist in turn. */
3224 while (!NILP (rest))
3230 char_type = XCDR (ccs);
3235 if (NILP (ccs = Ffind_charset (ccs)))
/* Build the pattern for this entry; the digit class depends on the
   entry's number format (decimal, lower-case hex, upper-case hex). */
3250 pat = concat3 (build_string ("^&"),
3251 pat, build_string ("\\([0-9]+\\)$"));
3254 else if (EQ (ret, Qx))
3256 pat = concat3 (build_string ("^&"),
3257 pat, build_string ("\\([0-9a-f]+\\)$"));
3260 else if (EQ (ret, QX))
3262 pat = concat3 (build_string ("^&"),
3263 pat, build_string ("\\([0-9A-F]+\\)$"));
3269 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3272 = XINT (Fstring_to_number
3273 (Fsubstring (string,
3274 Fmatch_beginning (make_int (1)),
3275 Fmatch_end (make_int (1))),
3279 ? DECODE_CHAR (ccs, code)
3280 : decode_builtin_char (ccs, code);
3282 DECODE_ADD_UCS_CHAR (chr, dst);
/* No alist entry matched: try the generic "&MCS-<hex>" form. */
3287 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3288 string, Qnil, Qnil)))
3291 = XINT (Fstring_to_number
3292 (Fsubstring (string,
3293 Fmatch_beginning (make_int (1)),
3294 Fmatch_end (make_int (1))),
3297 DECODE_ADD_UCS_CHAR (code, dst);
/* Not a recognized reference: output the collected text and the ';'
   literally. */
3301 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3302 Dynarr_add (dst, ';');
3305 str->er_counter = 0;
/* Buffer full, or C is 0x7F or above: give up on the candidate and
   output everything literally. */
3307 else if ( (str->er_counter >= 16) || (c >= 0x7F) )
3309 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3310 str->er_counter = 0;
3311 DECODE_ADD_UCS_CHAR (c, dst);
/* Still collecting: append C to the candidate. */
3314 str->er_buf[str->er_counter++] = c;
/* Write into BUF a NUL-terminated entity reference for the character
   CH.  The entries of Vcoded_charset_entity_reference_alist are tried
   in order; for the first usable entry a printf format is assembled
   from the entry's prefix string, column count and number format, and
   the code point of CH in that charset is printed with it.  If no
   entry applies, the generic form "&MCS-%08X;" is used.

   No buffer length is passed and the output is produced with
   sprintf(), so the caller must supply a BUF large enough for the
   longest reference. */
3317 void char_encode_as_entity_reference (Emchar ch, char* buf);
3319 char_encode_as_entity_reference (Emchar ch, char* buf)
3321 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3324 Lisp_Object char_type;
3325 int format_columns, idx;
3328 while (!NILP (rest))
3334 char_type = XCDR (ccs);
3339 if (!NILP (ccs = Ffind_charset (ccs)))
3341 int code_point = charset_code_point (ccs, ch);
/* CH has a code point in this charset; the second half of the test
   additionally compares DECODE_CHAR of that code point with CH when
   CHAR_TYPE is non-nil. */
3343 if ( (code_point >= 0)
3344 && (NILP (char_type)
3345 || DECODE_CHAR (ccs, code_point) != ch) )
/* Only prefix strings of at most 6 bytes are accepted. */
3351 if (STRINGP (ret) && ((idx = XSTRING_LENGTH (ret)) <= 6))
3354 strncpy (&format[1], XSTRING_DATA (ret), idx);
/* Append the conversion specification: '%', an optional zero-padded
   width of 2..8 columns, then the conversion letter. */
3364 format[idx++] = '%';
3365 format_columns = XINT (ret);
3366 if ( (2 <= format_columns) && (format_columns <= 8) )
3368 format [idx++] = '0';
3369 format [idx++] = '0' + format_columns;
3376 format [idx++] = 'd';
3377 else if (EQ (ret, Qx))
3378 format [idx++] = 'x';
3379 else if (EQ (ret, QX))
3380 format [idx++] = 'X';
3383 format [idx++] = ';';
3386 sprintf (buf, format, code_point);
/* Fallback: generic reference carrying the character code itself. */
3392 sprintf (buf, "&MCS-%08X;", ch);
3396 /************************************************************************/
3397 /* character composition */
3398 /************************************************************************/
3399 extern Lisp_Object Qcomposition;
/* Abandon the composition in progress: pass every character held in
   STR->combined_chars on to decode_add_er_char() in order, then clear
   the composition state. */
3402 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3404 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3408 for (i = 0; i < str->combined_char_count; i++)
3409 decode_add_er_char (str, str->combined_chars[i], dst);
3410 str->combined_char_count = 0;
3411 str->combining_table = Qnil;
/* Feed the decoded character CHARACTER through the composition state
   of STR, sending output on via decode_add_er_char().

   With composition disabled for the coding system, CHARACTER is
   passed straight through.  When no composition is in progress
   (STR->combining_table is not a cons), the `composition' attribute of
   CHARACTER is looked up; CHARACTER is then either passed on or held
   back as the start of a composition with that attribute as the
   table.  When a composition is in progress, CHARACTER is looked up in
   the current table: a hit yields a composed character, which is
   either output or becomes the new held-back character depending on
   its own `composition' attribute; a miss flushes the held-back
   characters and passes CHARACTER on. */
3414 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3415 unsigned_char_dynarr* dst);
3417 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3418 Emchar character, unsigned_char_dynarr* dst)
3420 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3421 decode_add_er_char (str, character, dst);
/* No composition in progress. */
3422 else if (!CONSP (str->combining_table))
3425 = Fget_char_attribute (make_char (character), Qcomposition, Qnil);
3428 decode_add_er_char (str, character, dst);
/* CHARACTER may begin a composition: hold it back. */
3431 str->combined_chars[0] = character;
3432 str->combined_char_count = 1;
3433 str->combining_table = ret;
3439 = Fcdr (Fassq (make_char (character), str->combining_table));
3443 Emchar char2 = XCHARVAL (ret);
3444 ret = Fget_char_attribute (make_char (character), Qcomposition,
3448 decode_add_er_char (str, character, dst);
3449 str->combined_char_count = 0;
3450 str->combining_table = Qnil;
/* The composed character can itself be extended: hold it back. */
3454 str->combined_chars[0] = char2;
3455 str->combined_char_count = 1;
3456 str->combining_table = ret;
/* CHARACTER does not continue the composition. */
3461 COMPOSE_FLUSH_CHARS (str, dst);
3462 decode_add_er_char (str, character, dst);
3466 #else /* not UTF2000 */
/* Non-UTF2000 fallbacks: there is no composition state, so flushing
   expands to nothing and adding a character outputs it directly with
   DECODE_ADD_UCS_CHAR. */
3467 #define COMPOSE_FLUSH_CHARS(str, dst)
3468 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3469 #endif /* UTF2000 */
3472 /************************************************************************/
3473 /* Shift-JIS methods */
3474 /************************************************************************/
3476 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3477 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3478 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3479 encoded by "position-code + 0x80". A character of JISX0208
3480 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
3481 position-codes are divided and shifted so that they fit in the range
3484 --- CODE RANGE of Shift-JIS ---
3485 (character set) (range)
3487 JISX0201-Kana 0xA1 .. 0xDF
3488 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3489 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3490 -------------------------------
3494 /* Is this the first byte of a Shift-JIS two-byte char?
   True for 0x81..0x9F and 0xE0..0xEF.  C is evaluated up to four
   times. */
3496 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3497 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3499 /* Is this the second byte of a Shift-JIS two-byte char?
   True for 0x40..0x7E and 0x80..0xFC.  C is evaluated up to four
   times. */
3501 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3502 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a single-byte JISX0201 katakana code in Shift-JIS?  True for
   0xA1..0xDF.  C is evaluated up to twice. */
3504 #define BYTE_SJIS_KATAKANA_P(c) \
3505 ((c) >= 0xA1 && (c) <= 0xDF)
/* Detection routine for Shift-JIS.  Examines the bytes at SRC one at a
   time, using ST->shift_jis.in_second_byte to remember across bytes
   (and across calls) that the next byte is the trail byte of a
   two-byte character.  ESC, SI and SO are tested for specially, as is
   a byte arriving while a trail byte is expected.  If nothing rules
   the encoding out, CODING_CATEGORY_SHIFT_JIS_MASK is returned. */
3508 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3512 unsigned char c = *(unsigned char *)src++;
3513 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3515 if (st->shift_jis.in_second_byte)
3517 st->shift_jis.in_second_byte = 0;
/* A byte in 0x80..0x9F or 0xE0 and above is taken as a lead byte. */
3521 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3522 st->shift_jis.in_second_byte = 1;
3524 return CODING_CATEGORY_SHIFT_JIS_MASK;
3527 /* Convert Shift-JIS data to internal format. */
/* Takes the same arguments as mule_decode(): N bytes at SRC are
   decoded into DST using the state of the decoding stream DECODING.
   The lead byte of a two-byte character is kept in CPOS between
   bytes. */
3530 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3531 unsigned_char_dynarr *dst, size_t n)
3533 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
/* Work on local copies of the stream state. */
3534 unsigned int flags = str->flags;
3535 unsigned int cpos = str->cpos;
3536 eol_type_t eol_type = str->eol_type;
3540 unsigned char c = *(unsigned char *)src++;
3544 /* Previous character was first byte of Shift-JIS Kanji char. */
3545 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3547 unsigned char e1, e2;
/* Convert the Shift-JIS byte pair (CPOS, C) to the JISX0208 position
   bytes E1, E2. */
3549 DECODE_SJIS (cpos, c, e1, e2);
3551 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3555 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3556 Dynarr_add (dst, e1);
3557 Dynarr_add (dst, e2);
/* C is not a valid trail byte: output both bytes as binary. */
3562 DECODE_ADD_BINARY_CHAR (cpos, dst);
3563 DECODE_ADD_BINARY_CHAR (c, dst);
3569 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3570 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3572 else if (BYTE_SJIS_KATAKANA_P (c))
3575 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3578 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3579 Dynarr_add (dst, c);
3584 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3588 DECODE_ADD_BINARY_CHAR (c, dst);
3590 label_continue_loop:;
/* At end of data, flush a dangling lead byte and a pending CR. */
3593 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3599 /* Convert internal character representation to Shift_JIS. */
/* Encodes the single character CH and appends the bytes to DST.

   End-of-line handling: '\r' is added unless the coding system's EOL
   type is EOL_LF or EOL_AUTODETECT, and CH itself is added unless the
   EOL type is EOL_CR.

   Two alternative encoders follow.  The first asks charset_code_point
   for CH's code in, in this order, latin-jisx0201,
   japanese-jisx0208-1990, katakana-jisx0201, japanese-jisx0208 and
   ascii, and uses the first one that has it.  The second splits CH
   with BREAKUP_CHAR and dispatches on the resulting charset.  In both,
   katakana codes are emitted with bit 0x80 set, JISX0208 codes go
   through ENCODE_SJIS with 0x80 or'ed into each position byte, and a
   character that matches nothing is written as '?'. */
3602 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3603 unsigned_char_dynarr *dst, unsigned int *flags)
3605 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3609 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3610 Dynarr_add (dst, '\r');
3611 if (eol_type != EOL_CR)
3612 Dynarr_add (dst, ch);
3616 unsigned int s1, s2;
3618 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3620 if (code_point >= 0)
3621 Dynarr_add (dst, code_point);
3622 else if ((code_point
3623 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3626 ENCODE_SJIS ((code_point >> 8) | 0x80,
3627 (code_point & 0xFF) | 0x80, s1, s2);
3628 Dynarr_add (dst, s1);
3629 Dynarr_add (dst, s2);
3631 else if ((code_point
3632 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3634 Dynarr_add (dst, code_point | 0x80);
3635 else if ((code_point
3636 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3639 ENCODE_SJIS ((code_point >> 8) | 0x80,
3640 (code_point & 0xFF) | 0x80, s1, s2);
3641 Dynarr_add (dst, s1);
3642 Dynarr_add (dst, s2);
3644 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3646 Dynarr_add (dst, code_point);
3648 Dynarr_add (dst, '?');
3650 Lisp_Object charset;
3651 unsigned int c1, c2;
3653 BREAKUP_CHAR (ch, charset, c1, c2);
3655 if (EQ(charset, Vcharset_katakana_jisx0201))
3657 Dynarr_add (dst, c1 | 0x80);
3661 Dynarr_add (dst, c1);
3663 else if (EQ(charset, Vcharset_japanese_jisx0208))
3665 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3666 Dynarr_add (dst, s1);
3667 Dynarr_add (dst, s2);
3670 Dynarr_add (dst, '?');
/* Finish hook of the Shift-JIS encoder.  It takes the same STR, DST
   and FLAGS arguments as the char_finish_big5, char_finish_ucs4 and
   char_finish_utf8 hooks in this file. */
3676 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3677 unsigned int *flags)
/* Lisp primitives converting between a JISX0208 character and its
   Shift-JIS byte pair.

   decode-shift-jis-char checks both halves of CODE with CHECK_INT and
   requires them to satisfy BYTE_SJIS_TWO_BYTE_1_P and
   BYTE_SJIS_TWO_BYTE_2_P respectively; the character is then built
   from the DECODE_SJIS result with each byte masked by 0x7F.

   encode-shift-jis-char accepts only a character whose charset is
   Vcharset_japanese_jisx0208 and returns the cons (S1 . S2) computed
   by ENCODE_SJIS. */
3681 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3682 Decode a JISX0208 character of Shift-JIS coding-system.
3683 CODE is the character code in Shift-JIS as a cons of two bytes.
3684 Return the corresponding character.
3688 unsigned char c1, c2, s1, s2;
3691 CHECK_INT (XCAR (code));
3692 CHECK_INT (XCDR (code));
3693 s1 = XINT (XCAR (code));
3694 s2 = XINT (XCDR (code));
3695 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3696 BYTE_SJIS_TWO_BYTE_2_P (s2))
3698 DECODE_SJIS (s1, s2, c1, c2);
3699 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3700 c1 & 0x7F, c2 & 0x7F));
3706 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3707 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3708 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3712 Lisp_Object charset;
3715 CHECK_CHAR_COERCE_INT (character);
3716 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3717 if (EQ (charset, Vcharset_japanese_jisx0208))
3719 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3720 return Fcons (make_int (s1), make_int (s2));
3727 /************************************************************************/
3729 /************************************************************************/
3731 /* BIG5 is a coding system encoding two character sets: ASCII and
3732 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3733 character set and is encoded in two bytes.
3735 --- CODE RANGE of BIG5 ---
3736 (character set) (range)
3738 Big5 (1st byte) 0xA1 .. 0xFE
3739 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3740 --------------------------
3742 Since the number of characters in Big5 is larger than maximum
3743 characters in Emacs' charset (96x96), it can't be handled as one
3744 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3745 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3746 contains frequently used characters and the latter contains less
3747 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char?  Two alternative
   definitions are given: the first accepts 0x81 .. 0xFE, the second
   only 0xA1 .. 0xFE.
   NOTE(review): which of the two is in effect presumably depends on
   the build configuration -- confirm. */
3750 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3751 ((c) >= 0x81 && (c) <= 0xFE)
3753 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3754 ((c) >= 0xA1 && (c) <= 0xFE)
3757 /* Is this the second byte of a Big5 two-byte char? */
3759 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3760 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3762 /* Number of Big5 characters which have the same code in 1st byte. */
3764 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3766 /* Code conversion macros. These are macros because they are used in
3767 inner loops during code conversion.
3769 Note that temporary variables in macros introduce the classic
3770 dynamic-scoping problems with variable names. We use capital-
3771 lettered variables in the assumption that XEmacs does not use
3772 capital letters in variables except in a very formalized way
3775 /* Convert Big5 code (b1, b2) into its internal string representation
3778 /* There is a much simpler way to split the Big5 charset into two.
3779 For the moment I'm going to leave the algorithm as-is because it
3780 claims to separate out the most-used characters into a single
3781 charset, which perhaps will lead to optimizations in various
3784 The way the algorithm works is something like this:
3786 Big5 can be viewed as a 94x157 charset, where the row is
3787 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3788 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3789 the split between low and high column numbers is apparently
3790 meaningless; ascending rows produce less and less frequent chars.
3791 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3792 the first charset, and the upper half (0xC9 .. 0xFE) to the
3793 second. To do the conversion, we convert the character into
3794 a single number where 0 .. 156 is the first row, 157 .. 313
3795 is the second, etc. That way, the characters are ordered by
3796 decreasing frequency. Then we just chop the space in two
3797 and coerce the result into a 94x94 space.
3800 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3802 int B1 = b1, B2 = b2; \
3804 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3808 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3812 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3813 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3815 c1 = I / (0xFF - 0xA1) + 0xA1; \
3816 c2 = I % (0xFF - 0xA1) + 0xA1; \
3819 /* Convert the internal string representation of a Big5 character
3820 (lb, c1, c2) into Big5 code (b1, b2).

   I is the linear index of (c1, c2) in a grid that is (0xFF - 0xA1)
   columns wide.  For LEADING_BYTE_CHINESE_BIG5_2 it is advanced by
   BIG5_SAME_ROW * (0xC9 - 0xA1), the same amount that DECODE_BIG5
   subtracts for that charset.  b1 is I / BIG5_SAME_ROW + 0xA1.  b2 is
   I % BIG5_SAME_ROW, moved up by 0x40 when it is below 0x3F and by
   0x62 otherwise, so that the result lands in 0x40 .. 0x7E or starts
   again at 0xA1, skipping the bytes in between. */
3822 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3824 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3826 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3828 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3830 b1 = I / BIG5_SAME_ROW + 0xA1; \
3831 b2 = I % BIG5_SAME_ROW; \
3832 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Detection pass for Big5 over the N bytes at SRC.
   ST->big5.in_second_byte records that the previous byte opened a
   two-byte character.  The first test singles out ESC, SI and SO,
   together with bytes in 0x80 .. 0xA0; while a second byte is
   expected, the state is cleared and the byte is tested against
   "< 0x40 or in 0x80 .. 0xA0".  The final return reports
   CODING_CATEGORY_BIG5_MASK.
   NOTE(review): presumably the bytes matched by those two tests rule
   Big5 out -- confirm against the statements they guard. */
3836 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3840 unsigned char c = *(unsigned char *)src++;
3841 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3843 || (c >= 0x80 && c <= 0xA0)
3847 if (st->big5.in_second_byte)
3849 st->big5.in_second_byte = 0;
3850 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3860 st->big5.in_second_byte = 1;
3862 return CODING_CATEGORY_BIG5_MASK;
3865 /* Convert Big5 data to internal format. */
/* Reads N bytes from SRC and appends the decoded text to DST, using
   the flags, pending byte (CPOS) and EOL type of the decoding stream
   attached to DECODING.

   When a first byte is pending and the current byte passes
   BYTE_BIG5_TWO_BYTE_2_P, the pair is decoded in one of two ways: the
   16-bit code (cpos << 8) | c is looked up with decode_defined_char in
   the coding system's initial charset 1 and with DECODE_CHAR in
   Vcharset_chinese_big5, or DECODE_BIG5 produces three bytes that are
   appended directly.  If the second byte is not valid, both bytes are
   emitted with DECODE_ADD_BINARY_CHAR.

   With no byte pending, a byte passing BYTE_BIG5_TWO_BYTE_1_P starts
   a two-byte character; other bytes are emitted after end-of-line
   handling or handed to decode_add_er_char.

   At CODING_STATE_END the pending entity-reference characters are
   flushed, any partial character is output, and a '\r' is added when
   CODING_STATE_CR is set.
   NOTE(review): the DECODE_CHAR lookup is presumably a fallback for a
   failed decode_defined_char -- confirm. */
3868 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3869 unsigned_char_dynarr *dst, size_t n)
3871 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3872 unsigned int flags = str->flags;
3873 unsigned int cpos = str->cpos;
3874 eol_type_t eol_type = str->eol_type;
3877 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3878 (decoding)->codesys, 1);
3883 unsigned char c = *(unsigned char *)src++;
3886 /* Previous character was first byte of Big5 char. */
3887 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3890 int code_point = (cpos << 8) | c;
3891 Emchar char_id = decode_defined_char (ccs, code_point);
3894 char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
3895 DECODE_ADD_UCS_CHAR (char_id, dst);
3897 unsigned char b1, b2, b3;
3898 DECODE_BIG5 (cpos, c, b1, b2, b3);
3899 Dynarr_add (dst, b1);
3900 Dynarr_add (dst, b2);
3901 Dynarr_add (dst, b3);
3906 DECODE_ADD_BINARY_CHAR (cpos, dst);
3907 DECODE_ADD_BINARY_CHAR (c, dst);
3913 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3914 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3916 decode_flush_er_chars (str, dst);
3921 decode_flush_er_chars (str, dst);
3922 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3923 DECODE_ADD_BINARY_CHAR (c, dst);
3927 /* DECODE_ADD_BINARY_CHAR (c, dst); */
3928 decode_add_er_char (str, c, dst);
3931 label_continue_loop:;
3934 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
3935 if (flags & CODING_STATE_END)
3937 decode_flush_er_chars (str, dst);
3938 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3939 if (flags & CODING_STATE_CR)
3940 Dynarr_add (dst, '\r');
3947 /* Convert internally-formatted data to Big5. */
/* Encodes the single character CH and appends the bytes to DST.

   End-of-line handling: '\r' is added unless the coding system's EOL
   type is EOL_LF or EOL_AUTODETECT, and CH itself is added unless the
   EOL type is EOL_CR.

   Otherwise charset_code_point is tried, in this order, against
   ascii (one byte), the coding system's initial charset 1 and
   chinese-big5 (high byte, then low byte of the code point), and
   chinese-big5-1 and chinese-big5-2.  For the last two the code point
   is turned into a linear index (each byte minus 33, rows of
   0xFF - 0xA1 columns), big5-2 is offset by
   BIG5_SAME_ROW * (0xC9 - 0xA1), and the index is split back into a
   Big5 byte pair the same way ENCODE_BIG5 does.

   If no charset has CH, it is written as an entity reference when
   the coding system enables that, and as '?' otherwise. */
3950 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3951 unsigned_char_dynarr *dst, unsigned int *flags)
3953 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3957 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3958 Dynarr_add (dst, '\r');
3959 if (eol_type != EOL_CR)
3960 Dynarr_add (dst, ch);
3967 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
3969 if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3970 Dynarr_add (dst, code_point);
3971 else if ((code_point = charset_code_point (ccs, ch)) >= 0)
3973 Dynarr_add (dst, code_point >> 8);
3974 Dynarr_add (dst, code_point & 0xFF);
3976 else if ((code_point
3977 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3979 Dynarr_add (dst, code_point >> 8);
3980 Dynarr_add (dst, code_point & 0xFF);
3982 else if ((code_point
3983 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3986 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3987 + ((code_point & 0xFF) - 33);
3988 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3989 unsigned char b2 = I % BIG5_SAME_ROW;
3991 b2 += b2 < 0x3F ? 0x40 : 0x62;
3992 Dynarr_add (dst, b1);
3993 Dynarr_add (dst, b2);
3995 else if ((code_point
3996 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3999 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4000 + ((code_point & 0xFF) - 33);
4001 unsigned char b1, b2;
4003 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4004 b1 = I / BIG5_SAME_ROW + 0xA1;
4005 b2 = I % BIG5_SAME_ROW;
4006 b2 += b2 < 0x3F ? 0x40 : 0x62;
4007 Dynarr_add (dst, b1);
4008 Dynarr_add (dst, b2);
4010 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4014 char_encode_as_entity_reference (ch, buf);
4015 Dynarr_add_many (dst, buf, strlen (buf));
4018 Dynarr_add (dst, '?');
/* Finish hook of the Big5 encoder.  It takes the same STR, DST and
   FLAGS arguments as the other char_finish_ hooks in this file. */
4025 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4026 unsigned int *flags)
/* Lisp primitives converting between a Big5 character and its byte
   pair.

   decode-big5-char checks both halves of CODE with CHECK_INT and
   requires them to satisfy BYTE_BIG5_TWO_BYTE_1_P and
   BYTE_BIG5_TWO_BYTE_2_P respectively.  DECODE_BIG5 then yields a
   leading byte plus two position bytes; the charset is looked up with
   CHARSET_BY_LEADING_BYTE and the character is built from the
   position bytes masked by 0x7F.

   encode-big5-char accepts only a character whose charset is
   Vcharset_chinese_big5_1 or Vcharset_chinese_big5_2, and returns the
   cons (B1 . B2) computed by ENCODE_BIG5 from the charset's leading
   byte and the two position bytes with 0x80 or'ed in. */
4031 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4032 Decode a Big5 character CODE of BIG5 coding-system.
4033 CODE is the character code in BIG5, a cons of two integers.
4034 Return the corresponding character.
4038 unsigned char c1, c2, b1, b2;
4041 CHECK_INT (XCAR (code));
4042 CHECK_INT (XCDR (code));
4043 b1 = XINT (XCAR (code));
4044 b2 = XINT (XCDR (code));
4045 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4046 BYTE_BIG5_TWO_BYTE_2_P (b2))
4048 Charset_ID leading_byte;
4049 Lisp_Object charset;
4050 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4051 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4052 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4058 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4059 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4060 Return the corresponding character code in Big5.
4064 Lisp_Object charset;
4067 CHECK_CHAR_COERCE_INT (character);
4068 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4069 if (EQ (charset, Vcharset_chinese_big5_1) ||
4070 EQ (charset, Vcharset_chinese_big5_2))
4072 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4074 return Fcons (make_int (b1), make_int (b2));
4081 /************************************************************************/
4083 /************************************************************************/
/* Detection pass for UCS-4 over the N bytes at SRC.  The position
   within the current character is kept in ST->ucs4.in_byte, which the
   switch dispatches on and which is reset to 0 in one of its cases.
   The final return reports CODING_CATEGORY_UCS4_MASK. */
4086 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
4090 unsigned char c = *(unsigned char *)src++;
4091 switch (st->ucs4.in_byte)
4100 st->ucs4.in_byte = 0;
4106 return CODING_CATEGORY_UCS4_MASK;
/* Convert UCS-4 data to internal format.

   Reads N bytes from SRC and appends decoded characters to DST.  The
   bytes of a character are accumulated in CPOS, eight bits at a time
   (cpos = (cpos << 8) | c); once the last byte arrives the character
   is emitted with DECODE_ADD_UCS_CHAR.  COUNTER tracks the position
   within the current character and is written back to the stream at
   the end so that a character may span calls.

   At end of input any partially accumulated character is output.
   The end-of-input condition is the CODING_STATE_END bit of FLAGS,
   as in the Big5 and UTF-8 decoders; COUNTER is a byte countdown and
   never carries that bit. */
4110 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4111 unsigned_char_dynarr *dst, size_t n)
4113 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4114 unsigned int flags = str->flags;
4115 unsigned int cpos = str->cpos;
4116 unsigned char counter = str->counter;
4120 unsigned char c = *(unsigned char *)src++;
4128 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4133 cpos = ( cpos << 8 ) | c;
4137 if (flags & CODING_STATE_END)
4138 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4142 str->counter = counter;
/* Encode the character CH as UCS-4: four bytes are appended to DST,
   most significant first (CH >> 24, CH >> 16, CH >> 8, CH). */
4146 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4147 unsigned_char_dynarr *dst, unsigned int *flags)
4149 Dynarr_add (dst, ch >> 24);
4150 Dynarr_add (dst, ch >> 16);
4151 Dynarr_add (dst, ch >> 8);
4152 Dynarr_add (dst, ch );
/* Finish hook of the UCS-4 encoder.  It takes the same STR, DST and
   FLAGS arguments as the other char_finish_ hooks in this file. */
4156 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4157 unsigned int *flags)
4162 /************************************************************************/
4164 /************************************************************************/
/* Detection pass for UTF-8 over the N bytes at SRC.
   ST->utf8.in_byte is the state the switch dispatches on; it is set
   to a value from 5 down to 1 in the branches that follow the
   ESC / SI / SO test.  In the remaining case the byte is tested for
   the continuation pattern, (c & 0xc0) == 0x80.  The final return
   reports CODING_CATEGORY_UTF8_MASK.
   NOTE(review): in_byte is presumably the number of continuation
   bytes still expected after a lead byte -- confirm. */
4167 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
4171 unsigned char c = *(unsigned char *)src++;
4172 switch (st->utf8.in_byte)
4175 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4178 st->utf8.in_byte = 5;
4180 st->utf8.in_byte = 4;
4182 st->utf8.in_byte = 3;
4184 st->utf8.in_byte = 2;
4186 st->utf8.in_byte = 1;
4191 if ((c & 0xc0) != 0x80)
4197 return CODING_CATEGORY_UTF8_MASK;
/* Write the bytes of an unfinished UTF-8 sequence to DST as binary
   characters.  CPOS holds the bits accumulated so far.  For each
   value of COUNTER the number of bytes written depends on how large
   CPOS is (compared against 1 << 6, 1 << 12, 1 << 18, 1 << 24): the
   first byte is the top bits of CPOS or'ed with a lead-byte mark
   (0xC0, 0xE0, 0xF0, 0xF8 or 0xFC), and each following byte is six
   bits of CPOS or'ed with 0x80.
   NOTE(review): COUNTER is presumably the number of continuation
   bytes that were still missing when the sequence was cut short --
   confirm against the callers. */
4201 decode_output_utf8_partial_char (unsigned char counter,
4203 unsigned_char_dynarr *dst)
4206 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4207 else if (counter == 4)
4209 if (cpos < (1 << 6))
4210 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4213 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4214 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4217 else if (counter == 3)
4219 if (cpos < (1 << 6))
4220 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4221 else if (cpos < (1 << 12))
4223 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4224 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4228 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4229 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4230 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4233 else if (counter == 2)
4235 if (cpos < (1 << 6))
4236 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4237 else if (cpos < (1 << 12))
4239 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4240 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4242 else if (cpos < (1 << 18))
4244 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4245 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4246 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4250 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4251 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4252 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4253 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4258 if (cpos < (1 << 6))
4259 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4260 else if (cpos < (1 << 12))
4262 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4263 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4265 else if (cpos < (1 << 18))
4267 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4268 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4269 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4271 else if (cpos < (1 << 24))
4273 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4274 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4275 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4276 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4280 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4281 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4282 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4283 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4284 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
/* Convert UTF-8 data to internal format.

   Reads N bytes from SRC and appends the decoded text to DST, using
   the flags, accumulator (CPOS), EOL type and COUNTER of the decoding
   stream attached to DECODING.

   Outside a multi-byte sequence, the first branch flushes pending
   composition and entity-reference characters, applies end-of-line
   handling and emits the byte as a UCS character; a byte below 0xC0
   is passed to COMPOSE_ADD_CHAR; larger bytes are classified by the
   thresholds 0xF0, 0xF8 and 0xFC.

   Inside a sequence, a continuation byte ((c & 0xC0) == 0x80) adds
   six bits to CPOS.  The finished code is looked up with
   decode_defined_char in the coding system's initial charset 0 and
   passed to COMPOSE_ADD_CHAR.  Any other byte aborts the sequence:
   pending characters are flushed, the partial sequence is written
   out by decode_output_utf8_partial_char, and the byte itself is
   emitted as a binary character.

   At CODING_STATE_END pending characters are flushed and a partial
   sequence is written out.  COUNTER is stored back into the stream. */
4290 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4291 unsigned_char_dynarr *dst, size_t n)
4293 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4294 unsigned int flags = str->flags;
4295 unsigned int cpos = str->cpos;
4296 eol_type_t eol_type = str->eol_type;
4297 unsigned char counter = str->counter;
4300 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4301 (decoding)->codesys, 0);
4306 unsigned char c = *(unsigned char *)src++;
4311 COMPOSE_FLUSH_CHARS (str, dst);
4312 decode_flush_er_chars (str, dst);
4313 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4314 DECODE_ADD_UCS_CHAR (c, dst);
4316 else if ( c < 0xC0 )
4317 /* decode_add_er_char (str, c, dst); */
4318 COMPOSE_ADD_CHAR (str, c, dst);
4321 /* decode_flush_er_chars (str, dst); */
4327 else if ( c < 0xF0 )
4332 else if ( c < 0xF8 )
4337 else if ( c < 0xFC )
4349 else if ( (c & 0xC0) == 0x80 )
4351 cpos = ( cpos << 6 ) | ( c & 0x3f );
4358 char_id = decode_defined_char (ccs, cpos);
4365 COMPOSE_ADD_CHAR (str, char_id, dst);
4374 COMPOSE_FLUSH_CHARS (str, dst);
4375 decode_flush_er_chars (str, dst);
4376 decode_output_utf8_partial_char (counter, cpos, dst);
4377 DECODE_ADD_BINARY_CHAR (c, dst);
4381 label_continue_loop:;
4384 if (flags & CODING_STATE_END)
4386 COMPOSE_FLUSH_CHARS (str, dst);
4387 decode_flush_er_chars (str, dst);
4390 decode_output_utf8_partial_char (counter, cpos, dst);
4397 str->counter = counter;
/* Encode the character CH as UTF-8 and append the bytes to DST.

   End-of-line handling: '\r' is added unless the coding system's EOL
   type is EOL_LF or EOL_AUTODETECT, and CH itself is added unless the
   EOL type is EOL_CR.  A character up to 0x7f is written as one byte.

   Otherwise the code point is taken from charset_code_point in the
   coding system's initial charset 0.  If that is negative or above
   0x10FFFF, integer values returned by Fget_char_attribute are tried
   instead, and failing those the character is written as an entity
   reference when the coding system enables that.

   The code point is then written in the shortest form that holds
   it: two bytes up to 0x7ff, three up to 0xffff, four up to 0x1fffff,
   five up to 0x3ffffff, and six beyond that.  The first byte carries
   the mark 0xc0, 0xe0, 0xf0, 0xf8 or 0xfc; every following byte is
   six bits of the code point or'ed with 0x80. */
4401 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4402 unsigned_char_dynarr *dst, unsigned int *flags)
4404 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4408 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4409 Dynarr_add (dst, '\r');
4410 if (eol_type != EOL_CR)
4411 Dynarr_add (dst, ch);
4413 else if (ch <= 0x7f)
4415 Dynarr_add (dst, ch);
4420 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4421 int code_point = charset_code_point (ucs_ccs, ch);
4423 if ( (code_point < 0) || (code_point > 0x10FFFF) )
4426 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4430 && INTP (ret = Fget_char_attribute (make_char (ch),
4432 code_point = XINT (ret);
4433 else if ( !NILP (map =
4434 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4436 && INTP (ret = Fget_char_attribute (make_char (ch),
4438 code_point = XINT (ret);
4439 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4443 char_encode_as_entity_reference (ch, buf);
4444 Dynarr_add_many (dst, buf, strlen (buf));
4450 if (code_point <= 0x7ff)
4452 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4453 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4455 else if (code_point <= 0xffff)
4457 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4458 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4459 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4461 else if (code_point <= 0x1fffff)
4463 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4464 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4465 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4466 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4468 else if (code_point <= 0x3ffffff)
4470 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4471 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4472 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4473 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4474 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4478 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4479 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4480 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4481 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4482 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4483 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
/* Finish hook of the UTF-8 encoder.  It takes the same STR, DST and
   FLAGS arguments as the other char_finish_ hooks in this file. */
4489 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4490 unsigned int *flags)
4495 /************************************************************************/
4496 /* ISO2022 methods */
4497 /************************************************************************/
4499 /* The following note describes the coding system ISO2022 briefly.
4500 Since the intention of this note is to help understand the
4501 functions in this file, some parts are NOT ACCURATE or OVERLY
4502 SIMPLIFIED. For thorough understanding, please refer to the
4503 original document of ISO2022.
4505 ISO2022 provides many mechanisms to encode several character sets
4506 in 7-bit and 8-bit environments. For 7-bit environments, all text
4507 is encoded using bytes less than 128. This may make the encoded
4508 text a little bit longer, but the text passes more easily through
4509 several gateways, some of which strip off MSB (Most Significant Bit).
4511 There are two kinds of character sets: control character set and
4512 graphic character set. The former contains control characters such
4513 as `newline' and `escape' to provide control functions (control
4514 functions are also provided by escape sequences). The latter
4515 contains graphic characters such as 'A' and '-'. Emacs recognizes
4516 two control character sets and many graphic character sets.
4518 Graphic character sets are classified into one of the following
4519 four classes, according to the number of bytes (DIMENSION) and
4520 number of characters in one dimension (CHARS) of the set:
4521 - DIMENSION1_CHARS94
4522 - DIMENSION1_CHARS96
4523 - DIMENSION2_CHARS94
4524 - DIMENSION2_CHARS96
4526 In addition, each character set is assigned an identification tag,
4527 unique for each set, called "final character" (denoted as <F>
4528 hereafter). The <F> of each character set is decided by ECMA(*)
4529 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4530 (0x30..0x3F are for private use only).
4532 Note (*): ECMA = European Computer Manufacturers Association
4534 Here are examples of graphic character set [NAME(<F>)]:
4535 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4536 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4537 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4538 o DIMENSION2_CHARS96 -- none for the moment
4540 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4541 C0 [0x00..0x1F] -- control character plane 0
4542 GL [0x20..0x7F] -- graphic character plane 0
4543 C1 [0x80..0x9F] -- control character plane 1
4544 GR [0xA0..0xFF] -- graphic character plane 1
4546 A control character set is directly designated and invoked to C0 or
4547 C1 by an escape sequence. The most common case is that:
4548 - ISO646's control character set is designated/invoked to C0, and
4549 - ISO6429's control character set is designated/invoked to C1,
4550 and usually these designations/invocations are omitted in encoded
4551 text. In a 7-bit environment, only C0 can be used, and a control
4552 character for C1 is encoded by an appropriate escape sequence to
4553 fit into the environment. All control characters for C1 are
4554 defined to have corresponding escape sequences.
4556 A graphic character set is at first designated to one of four
4557 graphic registers (G0 through G3), then these graphic registers are
4558 invoked to GL or GR. These designations and invocations can be
4559 done independently. The most common case is that G0 is invoked to
4560 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4561 these invocations and designations are omitted in encoded text.
4562 In a 7-bit environment, only GL can be used.
4564 When a graphic character set of CHARS94 is invoked to GL, codes
4565 0x20 and 0x7F of the GL area work as control characters SPACE and
4566 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4569 There are two ways of invocation: locking-shift and single-shift.
4570 With locking-shift, the invocation lasts until the next different
4571 invocation, whereas with single-shift, the invocation affects the
4572 following character only and doesn't affect the locking-shift
4573 state. Invocations are done by the following control characters or
4576 ----------------------------------------------------------------------
4577 abbrev function cntrl escape seq description
4578 ----------------------------------------------------------------------
4579 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4580 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4581 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4582 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4583 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4584 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4585 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4586 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4587 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4588 ----------------------------------------------------------------------
4589 (*) These are not used by any known coding system.
4591 Control characters for these functions are defined by macros
4592 ISO_CODE_XXX in `coding.h'.
4594 Designations are done by the following escape sequences:
4595 ----------------------------------------------------------------------
4596 escape sequence description
4597 ----------------------------------------------------------------------
4598 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4599 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4600 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4601 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4602 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4603 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4604 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4605 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4606 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4607 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4608 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4609 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4610 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4611 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4612 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4613 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4614 ----------------------------------------------------------------------
4616 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4617 of dimension 1, chars 94, and final character <F>, etc...
4619 Note (*): Although these designations are not allowed in ISO2022,
4620 Emacs accepts them on decoding, and produces them on encoding
4621 CHARS96 character sets in a coding system which is characterized as
4622 7-bit environment, non-locking-shift, and non-single-shift.
4624 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4625 '(' can be omitted. We refer to this as "short-form" hereafter.
4627 Now you may notice that there are a lot of ways for encoding the
4628 same multilingual text in ISO2022. Actually, there exist many
4629 coding systems such as Compound Text (used in X11's inter client
4630 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4631 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4632 localized platforms), and all of these are variants of ISO2022.
4634 In addition to the above, Emacs handles two more kinds of escape
4635 sequences: ISO6429's direction specification and Emacs' private
4636 sequence for specifying character composition.
4638 ISO6429's direction specification takes the following form:
4639 o CSI ']' -- end of the current direction
4640 o CSI '0' ']' -- end of the current direction
4641 o CSI '1' ']' -- start of left-to-right text
4642 o CSI '2' ']' -- start of right-to-left text
4643 The control character CSI (0x9B: control sequence introducer) is
4644 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4646 Character composition specification takes the following form:
4647 o ESC '0' -- start character composition
4648 o ESC '1' -- end character composition
4649 Since these are not standard escape sequences of any ISO standard,
4650 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back to its initial values.

   For each of the four graphic registers the invalid_designated mark
   is cleared and the charset is reset; Qt is stored as the charset
   in one branch, and the coding system's initial charset for that
   register is consulted when CODING_SYSTEM is non-nil.  The escape
   state becomes ISO_ESC_NOTHING with an empty escape-byte buffer,
   register 0 is invoked on the left and register 1 on the right, and
   the direction-switch and literal-output indicators are cleared.
   When ENABLE_COMPOSITE_CHARS is defined, an existing
   composite_chars dynarr is emptied as well.
   NOTE(review): presumably Qt is used only when CODING_SYSTEM is
   nil -- confirm. */
4653 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4657 for (i = 0; i < 4; i++)
4659 if (!NILP (coding_system))
4661 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4663 iso->charset[i] = Qt;
4664 iso->invalid_designated[i] = 0;
4666 iso->esc = ISO_ESC_NOTHING;
4667 iso->esc_bytes_index = 0;
4668 iso->register_left = 0;
4669 iso->register_right = 1;
4670 iso->switched_dir_and_no_valid_charset_yet = 0;
4671 iso->invalid_switch_dir = 0;
4672 iso->output_direction_sequence = 0;
4673 iso->output_literally = 0;
4674 #ifdef ENABLE_COMPOSITE_CHARS
4675 if (iso->composite_chars)
4676 Dynarr_reset (iso->composite_chars);
/* Predicate on the byte C.  parse_iso2022_esc consults it, together
   with XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED, before treating an
   escape as ISO_ESC_LITERAL. */
4681 fit_to_be_escape_quoted (unsigned char c)
4698 /* Parse one byte of an ISO2022 escape sequence.
4699 If the result is an invalid escape sequence, return 0 and
4700 do not change anything in STR. Otherwise, if the result is
4701 an incomplete escape sequence, update ISO2022.ESC and
4702 ISO2022.ESC_BYTES and return -1. Otherwise, update
4703 all the state variables (but not ISO2022.ESC_BYTES) and
4706 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4707 or invocation of an invalid character set and treat that as
4708 an unrecognized escape sequence. */
/* Advance the ISO 2022 escape-sequence state machine held in ISO by
   one byte C.

   When CODING_STATE_ESCAPE is not set in *FLAGS on entry, the escape
   state is first reset to ISO_ESC_NOTHING.  ISO->output_literally and
   ISO->output_direction_sequence are cleared on every call.  Bytes of
   the sequence are accumulated in ISO->esc_bytes.  The kind of
   sequence recognized is left in ISO->esc (ISO_ESC_DESIGNATE,
   ISO_ESC_LOCKING_SHIFT, ISO_ESC_SINGLE_SHIFT, ISO_ESC_DIRECTIONALITY,
   ISO_ESC_LITERAL, ...) and *FLAGS is updated to match.

   CODESYS is nil when called for coding-system detection.  A return
   value of 0 marks a sequence that was not recognized (for example a
   bad final byte). */
4711 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4712 unsigned char c, unsigned int *flags,
4713 int check_invalid_charsets)
4715 /* (1) If we're at the end of a designation sequence, CS is the
4716 charset being designated and REG is the register to designate
4719 (2) If we're at the end of a locking-shift sequence, REG is
4720 the register to invoke and HALF (0 == left, 1 == right) is
4721 the half to invoke it into.
4723 (3) If we're at the end of a single-shift sequence, REG is
4724 the register to invoke. */
4725 Lisp_Object cs = Qnil;
4728 /* NOTE: This code does goto's all over the place.
4729 The reason for this is that we're basically implementing
4730 a state machine here, and hierarchical languages like C
4731 don't really provide a clean way of doing this. */
4733 if (! (*flags & CODING_STATE_ESCAPE))
4734 /* At beginning of escape sequence; we need to reset our
4735 escape-state variables. */
4736 iso->esc = ISO_ESC_NOTHING;
4738 iso->output_literally = 0;
4739 iso->output_direction_sequence = 0;
4743 case ISO_ESC_NOTHING:
4744 iso->esc_bytes_index = 0;
4747 case ISO_CODE_ESC: /* Start escape sequence */
4748 *flags |= CODING_STATE_ESCAPE;
4752 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4753 *flags |= CODING_STATE_ESCAPE;
4754 iso->esc = ISO_ESC_5_11;
4757 case ISO_CODE_SO: /* locking shift 1 */
4760 case ISO_CODE_SI: /* locking shift 0 */
4764 case ISO_CODE_SS2: /* single shift */
4767 case ISO_CODE_SS3: /* single shift */
4771 default: /* Other control characters */
4778 /**** single shift ****/
4780 case 'N': /* single shift 2 */
4783 case 'O': /* single shift 3 */
4787 /**** locking shift ****/
4789 case '~': /* locking shift 1 right */
4792 case 'n': /* locking shift 2 */
4795 case '}': /* locking shift 2 right */
4798 case 'o': /* locking shift 3 */
4801 case '|': /* locking shift 3 right */
4805 #ifdef ENABLE_COMPOSITE_CHARS
4806 /**** composite ****/
4809 iso->esc = ISO_ESC_START_COMPOSITE;
4810 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4811 CODING_STATE_COMPOSITE;
4815 iso->esc = ISO_ESC_END_COMPOSITE;
4816 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4817 ~CODING_STATE_COMPOSITE;
4819 #endif /* ENABLE_COMPOSITE_CHARS */
4821 /**** directionality ****/
4824 iso->esc = ISO_ESC_5_11;
4827 /**** designation ****/
4829 case '$': /* multibyte charset prefix */
4830 iso->esc = ISO_ESC_2_4;
4834 if (0x28 <= c && c <= 0x2F)
4836 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4840 /* This function is called with CODESYS equal to nil when
4841 doing coding-system detection. */
4843 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4844 && fit_to_be_escape_quoted (c))
4846 iso->esc = ISO_ESC_LITERAL;
4847 *flags &= CODING_STATE_ISO2022_LOCK;
4857 /**** directionality ****/
4859 case ISO_ESC_5_11: /* ISO6429 direction control */
4862 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4863 goto directionality;
4865 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4866 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4867 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4871 case ISO_ESC_5_11_0:
4874 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4875 goto directionality;
4879 case ISO_ESC_5_11_1:
/* Clear R2L exactly as the ISO_ESC_5_11_0 case does.  This has to
   mask (&=) the existing flags: a plain assignment would switch on
   every CODING_STATE_ISO2022_LOCK bit other than R2L. */
4882 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4883 goto directionality;
4887 case ISO_ESC_5_11_2:
4890 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4891 goto directionality;
4896 iso->esc = ISO_ESC_DIRECTIONALITY;
4897 /* Various junk here to attempt to preserve the direction sequences
4898 literally in the text if they would otherwise be swallowed due
4899 to invalid designations that don't show up as actual charset
4900 changes in the text. */
4901 if (iso->invalid_switch_dir)
4903 /* We already inserted a direction switch literally into the
4904 text. We assume (#### this may not be right) that the
4905 next direction switch is the one going the other way,
4906 and we need to output that literally as well. */
4907 iso->output_literally = 1;
4908 iso->invalid_switch_dir = 0;
4914 /* If we are in the thrall of an invalid designation,
4915 then stick the directionality sequence literally into the
4916 output stream so it ends up in the original text again. */
4917 for (jj = 0; jj < 4; jj++)
4918 if (iso->invalid_designated[jj])
4922 iso->output_literally = 1;
4923 iso->invalid_switch_dir = 1;
4926 /* Indicate that we haven't yet seen a valid designation,
4927 so that if a switch-dir is directly followed by an
4928 invalid designation, both get inserted literally. */
4929 iso->switched_dir_and_no_valid_charset_yet = 1;
4934 /**** designation ****/
4937 if (0x28 <= c && c <= 0x2F)
4939 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4942 if (0x40 <= c && c <= 0x42)
4945 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4946 *flags & CODING_STATE_R2L ?
4947 CHARSET_RIGHT_TO_LEFT :
4948 CHARSET_LEFT_TO_RIGHT);
4959 if (c < '0' || c > '~')
4960 return 0; /* bad final byte */
4962 if (iso->esc >= ISO_ESC_2_8 &&
4963 iso->esc <= ISO_ESC_2_15)
4965 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4966 single = 1; /* single-byte */
4967 reg = (iso->esc - ISO_ESC_2_8) & 3;
4969 else if (iso->esc >= ISO_ESC_2_4_8 &&
4970 iso->esc <= ISO_ESC_2_4_15)
4972 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4973 single = -1; /* multi-byte */
4974 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4978 /* Can this ever be reached? -slb */
4982 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4983 *flags & CODING_STATE_R2L ?
4984 CHARSET_RIGHT_TO_LEFT :
4985 CHARSET_LEFT_TO_RIGHT);
4991 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4995 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4996 /* can't invoke something that ain't there. */
4998 iso->esc = ISO_ESC_SINGLE_SHIFT;
4999 *flags &= CODING_STATE_ISO2022_LOCK;
5001 *flags |= CODING_STATE_SS2;
5003 *flags |= CODING_STATE_SS3;
5007 if (check_invalid_charsets &&
5008 !CHARSETP (iso->charset[reg]))
5009 /* can't invoke something that ain't there. */
5012 iso->register_right = reg;
5014 iso->register_left = reg;
5015 *flags &= CODING_STATE_ISO2022_LOCK;
5016 iso->esc = ISO_ESC_LOCKING_SHIFT;
5020 if (NILP (cs) && check_invalid_charsets)
5022 iso->invalid_designated[reg] = 1;
5023 iso->charset[reg] = Vcharset_ascii;
5024 iso->esc = ISO_ESC_DESIGNATE;
5025 *flags &= CODING_STATE_ISO2022_LOCK;
5026 iso->output_literally = 1;
5027 if (iso->switched_dir_and_no_valid_charset_yet)
5029 /* We encountered a switch-direction followed by an
5030 invalid designation. Ensure that the switch-direction
5031 gets outputted; otherwise it will probably get eaten
5032 when the text is written out again. */
5033 iso->switched_dir_and_no_valid_charset_yet = 0;
5034 iso->output_direction_sequence = 1;
5035 /* And make sure that the switch-dir going the other
5036 way gets outputted, as well. */
5037 iso->invalid_switch_dir = 1;
5041 /* This function is called with CODESYS equal to nil when
5042 doing coding-system detection. */
5043 if (!NILP (codesys))
5045 charset_conversion_spec_dynarr *dyn =
5046 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5052 for (i = 0; i < Dynarr_length (dyn); i++)
5054 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5055 if (EQ (cs, spec->from_charset))
5056 cs = spec->to_charset;
5061 iso->charset[reg] = cs;
5062 iso->esc = ISO_ESC_DESIGNATE;
5063 *flags &= CODING_STATE_ISO2022_LOCK;
5064 if (iso->invalid_designated[reg])
5066 iso->invalid_designated[reg] = 0;
5067 iso->output_literally = 1;
5069 if (iso->switched_dir_and_no_valid_charset_yet)
5070 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Coding-system detection for the ISO 2022 family.

   Scans the bytes at SRC (N is presumably their count -- confirm
   against the caller) and narrows the set of still-possible ISO
   categories kept in ST->iso2022.mask.  On first use the detection
   state is reset and the mask starts out with all five ISO categories
   enabled.  Control and escape bytes are fed to parse_iso2022_esc with
   a nil coding system and invalid-charset checking turned off; the
   kind of sequence it reports decides which categories are dropped. */
5075 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
5079 /* #### There are serious deficiencies in the recognition mechanism
5080 here. This needs to be much smarter if it's going to cut it.
5081 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5082 it should be detected as Latin-1.
5083 All the ISO2022 stuff in this file should be synced up with the
5084 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5085 Perhaps we should wait till R2L works in FSF Emacs? */
5087 if (!st->iso2022.initted)
5089 reset_iso2022 (Qnil, &st->iso2022.iso);
5090 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5091 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5092 CODING_CATEGORY_ISO_8_1_MASK |
5093 CODING_CATEGORY_ISO_8_2_MASK |
5094 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5095 st->iso2022.flags = 0;
5096 st->iso2022.high_byte_count = 0;
5097 st->iso2022.saw_single_shift = 0;
5098 st->iso2022.initted = 1;
5101 mask = st->iso2022.mask;
5105 unsigned char c = *(unsigned char *)src++;
5108 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5109 st->iso2022.high_byte_count++;
5113 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5115 if (st->iso2022.high_byte_count & 1)
5116 /* odd number of high bytes; assume not iso-8-2 */
5117 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5119 st->iso2022.high_byte_count = 0;
5120 st->iso2022.saw_single_shift = 0;
5122 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5124 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5125 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5126 { /* control chars */
5129 /* Allow and ignore control characters that you might
5130 reasonably see in a text file */
5135 case 8: /* backspace */
5136 case 11: /* vertical tab */
5137 case 12: /* form feed */
5138 case 26: /* MS-DOS C-z junk */
5139 case 31: /* '^_' -- for info */
5140 goto label_continue_loop;
5147 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5150 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5151 &st->iso2022.flags, 0))
5153 switch (st->iso2022.iso.esc)
5155 case ISO_ESC_DESIGNATE:
5156 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5157 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5159 case ISO_ESC_LOCKING_SHIFT:
5160 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5161 goto ran_out_of_chars;
5162 case ISO_ESC_SINGLE_SHIFT:
5163 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5164 st->iso2022.saw_single_shift = 1;
5173 goto ran_out_of_chars;
5176 label_continue_loop:;
/* Final adjustment of an ISO 2022 detection MASK: if the seven-bit
   category is still possible, the three eight-bit categories
   (8-designate, 8-1, 8-2) are removed from it. */
5185 postprocess_iso2022_mask (int mask)
5187 /* #### kind of cheesy */
5188 /* If seven-bit ISO is allowed, then assume that the encoding is
5189 entirely seven-bit and turn off the eight-bit ones. */
5190 if (mask & CODING_CATEGORY_ISO_7_MASK)
5191 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5192 CODING_CATEGORY_ISO_8_1_MASK |
5193 CODING_CATEGORY_ISO_8_2_MASK);
5197 /* If FLAGS is a null pointer or specifies right-to-left motion,
5198 output a switch-dir-to-left-to-right sequence to DST.
5199 Also update FLAGS if it is not a null pointer.
5200 If INTERNAL_P is set, we are outputting in internal format and
5201 need to handle the CSI differently. */
/* The sequence written is CSI '0' ']'.  The CSI itself is spelled
   ESC '[' for seven-bit coding systems, goes through
   DECODE_ADD_BINARY_CHAR when INTERNAL_P is set, and is the raw
   ISO_CODE_CSI byte otherwise.  CODING_STATE_R2L is cleared in FLAGS
   afterwards. */
5204 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5205 unsigned_char_dynarr *dst,
5206 unsigned int *flags,
5209 if (!flags || (*flags & CODING_STATE_R2L))
5211 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5213 Dynarr_add (dst, ISO_CODE_ESC);
5214 Dynarr_add (dst, '[');
5216 else if (internal_p)
5217 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5219 Dynarr_add (dst, ISO_CODE_CSI);
5220 Dynarr_add (dst, '0');
5221 Dynarr_add (dst, ']');
5223 *flags &= ~CODING_STATE_R2L;
5227 /* If FLAGS is a null pointer or specifies a direction different from
5228 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5229 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5230 sequence to DST. Also update FLAGS if it is not a null pointer.
5231 If INTERNAL_P is set, we are outputting in internal format and
5232 need to handle the CSI differently. */
/* A switch to left-to-right is delegated to
   restore_left_to_right_direction.  A switch to right-to-left writes
   CSI '2' ']' (with the CSI spelled ESC '[' for seven-bit coding
   systems) and sets CODING_STATE_R2L in FLAGS; it is not attempted
   when CODING_SYSTEM_ISO2022_NO_ISO6429 holds for CODESYS. */
5235 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5236 unsigned_char_dynarr *dst, unsigned int *flags,
5239 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5240 direction == CHARSET_LEFT_TO_RIGHT)
5241 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5242 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5243 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5244 direction == CHARSET_RIGHT_TO_LEFT)
5246 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5248 Dynarr_add (dst, ISO_CODE_ESC);
5249 Dynarr_add (dst, '[');
5251 else if (internal_p)
5252 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5254 Dynarr_add (dst, ISO_CODE_CSI);
5255 Dynarr_add (dst, '2');
5256 Dynarr_add (dst, ']');
5258 *flags |= CODING_STATE_R2L;
5262 /* Convert ISO2022-format data to internal format. */
/* The decoder state (flags, cpos, counter, eol_type) is loaded from
   the decoding stream at entry.  Each input byte is then handled as
   one of three things: a byte inside an escape sequence (passed to
   parse_iso2022_esc), a C0/C1 control character, or a graphic
   character decoded through the charset of the register currently
   invoked (or single-shifted).  Bytes and sequences that cannot be
   interpreted are added to DST with DECODE_ADD_BINARY_CHAR so that
   they are preserved for the output. */
5265 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5266 unsigned_char_dynarr *dst, size_t n)
5268 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5269 unsigned int flags = str->flags;
5270 unsigned int cpos = str->cpos;
5271 unsigned char counter = str->counter;
5272 eol_type_t eol_type = str->eol_type;
5273 #ifdef ENABLE_COMPOSITE_CHARS
5274 unsigned_char_dynarr *real_dst = dst;
5276 Lisp_Object coding_system;
5278 XSETCODING_SYSTEM (coding_system, str->codesys);
5280 #ifdef ENABLE_COMPOSITE_CHARS
5281 if (flags & CODING_STATE_COMPOSITE)
5282 dst = str->iso2022.composite_chars;
5283 #endif /* ENABLE_COMPOSITE_CHARS */
5287 unsigned char c = *(unsigned char *)src++;
5288 if (flags & CODING_STATE_ESCAPE)
5289 { /* Within ESC sequence */
5290 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5295 switch (str->iso2022.esc)
5297 #ifdef ENABLE_COMPOSITE_CHARS
5298 case ISO_ESC_START_COMPOSITE:
5299 if (str->iso2022.composite_chars)
5300 Dynarr_reset (str->iso2022.composite_chars);
5302 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5303 dst = str->iso2022.composite_chars;
5305 case ISO_ESC_END_COMPOSITE:
5307 Bufbyte comstr[MAX_EMCHAR_LEN];
5309 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5310 Dynarr_length (dst));
5312 len = set_charptr_emchar (comstr, emch);
5313 Dynarr_add_many (dst, comstr, len);
5316 #endif /* ENABLE_COMPOSITE_CHARS */
5318 case ISO_ESC_LITERAL:
5319 COMPOSE_FLUSH_CHARS (str, dst);
5320 decode_flush_er_chars (str, dst);
5321 DECODE_ADD_BINARY_CHAR (c, dst);
5325 /* Everything else handled already */
5330 /* Attempted error recovery. */
5331 if (str->iso2022.output_direction_sequence)
5332 ensure_correct_direction (flags & CODING_STATE_R2L ?
5333 CHARSET_RIGHT_TO_LEFT :
5334 CHARSET_LEFT_TO_RIGHT,
5335 str->codesys, dst, 0, 1);
5336 /* More error recovery. */
5337 if (!retval || str->iso2022.output_literally)
5339 /* Output the (possibly invalid) sequence */
5341 COMPOSE_FLUSH_CHARS (str, dst);
5342 decode_flush_er_chars (str, dst);
5343 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5344 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5345 flags &= CODING_STATE_ISO2022_LOCK;
5347 n++, src--;/* Repeat the loop with the same character. */
5350 /* No sense in reprocessing the final byte of the
5351 escape sequence; it could mess things up anyway.
5353 COMPOSE_FLUSH_CHARS (str, dst);
5354 decode_flush_er_chars (str, dst);
5355 DECODE_ADD_BINARY_CHAR (c, dst);
5361 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5362 { /* Control characters */
5364 /***** Error-handling *****/
5366 /* If we were in the middle of a character, dump out the
5367 partial character. */
5370 COMPOSE_FLUSH_CHARS (str, dst);
5371 decode_flush_er_chars (str, dst);
5375 DECODE_ADD_BINARY_CHAR
5376 ((unsigned char)(cpos >> (counter * 8)), dst);
5381 /* If we just saw a single-shift character, dump it out.
5382 This may dump out the wrong sort of single-shift character,
5383 but at least it will give an indication that something went
5385 if (flags & CODING_STATE_SS2)
5387 COMPOSE_FLUSH_CHARS (str, dst);
5388 decode_flush_er_chars (str, dst);
5389 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5390 flags &= ~CODING_STATE_SS2;
5392 if (flags & CODING_STATE_SS3)
5394 COMPOSE_FLUSH_CHARS (str, dst);
5395 decode_flush_er_chars (str, dst);
5396 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5397 flags &= ~CODING_STATE_SS3;
5400 /***** Now handle the control characters. *****/
5406 COMPOSE_FLUSH_CHARS (str, dst);
5407 decode_flush_er_chars (str, dst);
5408 if (eol_type == EOL_CR)
5409 Dynarr_add (dst, '\n');
5410 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5411 Dynarr_add (dst, c);
5413 flags |= CODING_STATE_CR;
5414 goto label_continue_loop;
5416 else if (flags & CODING_STATE_CR)
5417 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5419 Dynarr_add (dst, '\r');
5420 flags &= ~CODING_STATE_CR;
5423 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5426 flags &= CODING_STATE_ISO2022_LOCK;
5428 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5430 COMPOSE_FLUSH_CHARS (str, dst);
5431 decode_flush_er_chars (str, dst);
5432 DECODE_ADD_BINARY_CHAR (c, dst);
5436 { /* Graphic characters */
5437 Lisp_Object charset;
5446 COMPOSE_FLUSH_CHARS (str, dst);
5447 decode_flush_er_chars (str, dst);
5448 if (eol_type == EOL_CR)
5449 Dynarr_add (dst, '\n');
5450 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5451 Dynarr_add (dst, c);
5453 flags |= CODING_STATE_CR;
5454 goto label_continue_loop;
5456 else if (flags & CODING_STATE_CR)
5457 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5459 Dynarr_add (dst, '\r');
5460 flags &= ~CODING_STATE_CR;
5463 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5466 /* Now determine the charset. */
5467 reg = ((flags & CODING_STATE_SS2) ? 2
5468 : (flags & CODING_STATE_SS3) ? 3
5469 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5470 : str->iso2022.register_left);
5471 charset = str->iso2022.charset[reg];
5473 /* Error checking: */
5474 if (! CHARSETP (charset)
5475 || str->iso2022.invalid_designated[reg]
5476 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5477 && XCHARSET_CHARS (charset) == 94))
5478 /* Mrmph. We are trying to invoke a register that has no
5479 or an invalid charset in it, or trying to add a character
5480 outside the range of the charset. Insert that char literally
5481 to preserve it for the output. */
5483 COMPOSE_FLUSH_CHARS (str, dst);
5484 decode_flush_er_chars (str, dst);
5488 DECODE_ADD_BINARY_CHAR
5489 ((unsigned char)(cpos >> (counter * 8)), dst);
5492 DECODE_ADD_BINARY_CHAR (c, dst);
5497 /* Things are probably hunky-dory. */
5499 /* Fetch reverse charset, maybe. */
5500 if (((flags & CODING_STATE_R2L) &&
5501 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5503 (!(flags & CODING_STATE_R2L) &&
5504 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5506 Lisp_Object new_charset =
5507 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5508 if (!NILP (new_charset))
5509 charset = new_charset;
5514 if (XCHARSET_DIMENSION (charset) == counter)
5516 COMPOSE_ADD_CHAR (str,
5517 DECODE_CHAR (charset,
5518 ((cpos & 0x7F7F7F) << 8)
5525 cpos = (cpos << 8) | c;
5527 lb = XCHARSET_LEADING_BYTE (charset);
5528 switch (XCHARSET_REP_BYTES (charset))
5531 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5532 Dynarr_add (dst, c & 0x7F);
5535 case 2: /* one-byte official */
5536 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5537 Dynarr_add (dst, lb);
5538 Dynarr_add (dst, c | 0x80);
5541 case 3: /* one-byte private or two-byte official */
5542 if (XCHARSET_PRIVATE_P (charset))
5544 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5545 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5546 Dynarr_add (dst, lb);
5547 Dynarr_add (dst, c | 0x80);
5553 Dynarr_add (dst, lb);
5554 Dynarr_add (dst, ch | 0x80);
5555 Dynarr_add (dst, c | 0x80);
5563 default: /* two-byte private */
5566 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5567 Dynarr_add (dst, lb);
5568 Dynarr_add (dst, ch | 0x80);
5569 Dynarr_add (dst, c | 0x80);
5579 flags &= CODING_STATE_ISO2022_LOCK;
5582 label_continue_loop:;
5585 if (flags & CODING_STATE_END)
5587 COMPOSE_FLUSH_CHARS (str, dst);
5588 decode_flush_er_chars (str, dst);
5589 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5593 str->counter = counter;
5597 /***** ISO2022 encoder *****/
5599 /* Designate CHARSET into register REG. */
/* STR->iso2022.charset[REG] is always updated.  When CHARSET is a real
   charset, the designation escape sequence written to DST is ESC, an
   optional '$', an intermediate byte taken from inter94 or inter96
   indexed by REG, and the charset's final byte.  Before the sequence
   is written, CHARSET is mapped through the coding system's
   output_conv table.
   NOTE(review): the comparison of the old charset's chars, dimension
   and final byte, together with force_charset_on_output[REG],
   presumably skips a redundant re-designation -- confirm. */
5602 iso2022_designate (Lisp_Object charset, unsigned char reg,
5603 struct encoding_stream *str, unsigned_char_dynarr *dst)
5605 static const char inter94[] = "()*+";
5606 static const char inter96[] = ",-./";
5607 unsigned short chars;
5608 unsigned char dimension;
5609 unsigned char final;
5610 Lisp_Object old_charset = str->iso2022.charset[reg];
5612 str->iso2022.charset[reg] = charset;
5613 if (!CHARSETP (charset))
5614 /* charset might be an initial nil or t. */
5616 chars = XCHARSET_CHARS (charset);
5617 dimension = XCHARSET_DIMENSION (charset);
5618 final = XCHARSET_FINAL (charset);
5619 if (!str->iso2022.force_charset_on_output[reg] &&
5620 CHARSETP (old_charset) &&
5621 XCHARSET_CHARS (old_charset) == chars &&
5622 XCHARSET_DIMENSION (old_charset) == dimension &&
5623 XCHARSET_FINAL (old_charset) == final)
5626 str->iso2022.force_charset_on_output[reg] = 0;
5629 charset_conversion_spec_dynarr *dyn =
5630 str->codesys->iso2022.output_conv;
5636 for (i = 0; i < Dynarr_length (dyn); i++)
5638 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5639 if (EQ (charset, spec->from_charset))
5640 charset = spec->to_charset;
5645 Dynarr_add (dst, ISO_CODE_ESC);
5650 Dynarr_add (dst, inter94[reg]);
5653 Dynarr_add (dst, '$');
5655 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5658 Dynarr_add (dst, inter94[reg]);
5663 Dynarr_add (dst, inter96[reg]);
5666 Dynarr_add (dst, '$');
5667 Dynarr_add (dst, inter96[reg]);
5671 Dynarr_add (dst, final);
/* Make register 0 the one invoked into the left half: if some other
   register is there, write ISO_CODE_SI to DST and record the change
   in STR. */
5675 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5677 if (str->iso2022.register_left != 0)
5679 Dynarr_add (dst, ISO_CODE_SI);
5680 str->iso2022.register_left = 0;
/* Make register 1 the one invoked into the left half: if it is not
   there already, write ISO_CODE_SO to DST and record the change in
   STR. */
5685 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5687 if (str->iso2022.register_left != 1)
5689 Dynarr_add (dst, ISO_CODE_SO);
5690 str->iso2022.register_left = 1;
/* Encode the single character CH into DST in ISO 2022 form.

   Any direction switch, register designation and shift needed to
   express CH is written first.  Newlines are written according to the
   coding system's EOL type and may first restore direction, shift and
   initial designations.  For other characters a register whose
   charset (current or initial) yields a code point for CH is looked
   for; failing that, Vdefault_coded_charset_priority_list is walked
   (and restored afterwards) with ASCII as the last resort.  The code
   point is then written one byte per charset dimension, with the high
   bit set when the right half is used.

   The charset and half used are cached in
   STR->iso2022.current_charset and STR->iso2022.current_half. */
5695 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5696 unsigned_char_dynarr *dst, unsigned int *flags)
5698 unsigned char charmask;
5699 Lisp_Coding_System* codesys = str->codesys;
5700 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5702 Lisp_Object charset = str->iso2022.current_charset;
5703 int half = str->iso2022.current_half;
5704 int code_point = -1;
5708 restore_left_to_right_direction (codesys, dst, flags, 0);
5710 /* Make sure G0 contains ASCII */
5711 if ((ch > ' ' && ch < ISO_CODE_DEL)
5712 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5714 ensure_normal_shift (str, dst);
5715 iso2022_designate (Vcharset_ascii, 0, str, dst);
5718 /* If necessary, restore everything to the default state
5720 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5722 restore_left_to_right_direction (codesys, dst, flags, 0);
5724 ensure_normal_shift (str, dst);
5726 for (i = 0; i < 4; i++)
5728 Lisp_Object initial_charset =
5729 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5730 iso2022_designate (initial_charset, i, str, dst);
5735 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5736 Dynarr_add (dst, '\r');
5737 if (eol_type != EOL_CR)
5738 Dynarr_add (dst, ch);
5742 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5743 && fit_to_be_escape_quoted (ch))
5744 Dynarr_add (dst, ISO_CODE_ESC);
5745 Dynarr_add (dst, ch);
5748 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5750 charmask = (half == 0 ? 0x00 : 0x80);
5752 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5753 && fit_to_be_escape_quoted (ch))
5754 Dynarr_add (dst, ISO_CODE_ESC);
5755 /* you asked for it ... */
5756 Dynarr_add (dst, ch);
5762 /* Now determine which register to use. */
5764 for (i = 0; i < 4; i++)
5766 if ((CHARSETP (charset = str->iso2022.charset[i])
5767 && ((code_point = charset_code_point (charset, ch)) >= 0))
5771 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5772 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5780 Lisp_Object original_default_coded_charset_priority_list
5781 = Vdefault_coded_charset_priority_list;
5783 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5785 code_point = ENCODE_CHAR (ch, charset);
5786 if (XCHARSET_FINAL (charset))
5788 Vdefault_coded_charset_priority_list
5789 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5790 Vdefault_coded_charset_priority_list));
5792 code_point = ENCODE_CHAR (ch, charset);
5793 if (!XCHARSET_FINAL (charset))
5795 charset = Vcharset_ascii;
5799 Vdefault_coded_charset_priority_list
5800 = original_default_coded_charset_priority_list;
5802 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5803 codesys, dst, flags, 0);
5807 if (XCHARSET_GRAPHIC (charset) != 0)
5809 if (!NILP (str->iso2022.charset[1]) &&
5810 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5811 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5813 else if (!NILP (str->iso2022.charset[2]))
5815 else if (!NILP (str->iso2022.charset[3]))
5824 iso2022_designate (charset, reg, str, dst);
5826 /* Now invoke that register. */
5830 ensure_normal_shift (str, dst);
5834 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5836 ensure_shift_out (str, dst);
5843 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5845 Dynarr_add (dst, ISO_CODE_ESC);
5846 Dynarr_add (dst, 'N');
5851 Dynarr_add (dst, ISO_CODE_SS2);
5856 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5858 Dynarr_add (dst, ISO_CODE_ESC);
5859 Dynarr_add (dst, 'O');
5864 Dynarr_add (dst, ISO_CODE_SS3);
5872 charmask = (half == 0 ? 0x00 : 0x80);
5874 switch (XCHARSET_DIMENSION (charset))
5877 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5880 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5881 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5884 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5885 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5886 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5889 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5890 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5891 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5892 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5898 str->iso2022.current_charset = charset;
5899 str->iso2022.current_half = half;
/* Bring the ISO 2022 output back to its initial state: left-to-right
   direction, normal shift, and the coding system's initial charset
   designated into each of the four registers.  Whatever sequences
   that requires are written to DST. */
5903 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5904 unsigned int *flags)
5906 Lisp_Coding_System* codesys = str->codesys;
5909 restore_left_to_right_direction (codesys, dst, flags, 0);
5910 ensure_normal_shift (str, dst);
5911 for (i = 0; i < 4; i++)
5913 Lisp_Object initial_charset
5914 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5915 iso2022_designate (initial_charset, i, str, dst);
5920 /************************************************************************/
5921 /* No-conversion methods */
5922 /************************************************************************/
5924 /* This is used when reading in "binary" files -- i.e. files that may
5925 contain all 256 possible byte values and that are not to be
5926 interpreted as being in any particular decoding. */
/* Each input byte goes through end-of-line handling for the stream's
   EOL type and is then added to DST with DECODE_ADD_BINARY_CHAR. */
5928 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5929 unsigned_char_dynarr *dst, size_t n)
5931 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5932 unsigned int flags = str->flags;
5933 unsigned int cpos = str->cpos;
5934 eol_type_t eol_type = str->eol_type;
5938 unsigned char c = *(unsigned char *)src++;
5940 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5941 DECODE_ADD_BINARY_CHAR (c, dst);
5942 label_continue_loop:;
5945 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
/* Write internal-format text at SRC to DST as raw bytes.

   A newline is written according to the coding system's EOL type.
   Under UTF2000, lead and continuation bytes are folded into CH six
   bits at a time, with char_boundary tracking the position inside the
   multi-byte sequence, and the low eight bits of the finished value
   are emitted.  Otherwise ASCII bytes pass through, Latin-1 and
   Control-1 characters are reduced to a single byte, and the leading
   byte of any other charset produces '~'.  The char_boundary value is
   saved back into the stream at the end. */
5952 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5953 unsigned_char_dynarr *dst, size_t n)
5956 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5957 unsigned int flags = str->flags;
5958 unsigned int ch = str->ch;
5959 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5961 unsigned char char_boundary = str->iso2022.current_char_boundary;
5968 if (char_boundary == 0)
5974 else if ( c >= 0xf8 )
5979 else if ( c >= 0xf0 )
5984 else if ( c >= 0xe0 )
5989 else if ( c >= 0xc0 )
5999 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6000 Dynarr_add (dst, '\r');
6001 if (eol_type != EOL_CR)
6002 Dynarr_add (dst, c);
6005 Dynarr_add (dst, c);
6008 else if (char_boundary == 1)
6010 ch = ( ch << 6 ) | ( c & 0x3f );
6011 Dynarr_add (dst, ch & 0xff);
6016 ch = ( ch << 6 ) | ( c & 0x3f );
6019 #else /* not UTF2000 */
6022 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6023 Dynarr_add (dst, '\r');
6024 if (eol_type != EOL_CR)
6025 Dynarr_add (dst, '\n');
6028 else if (BYTE_ASCII_P (c))
6031 Dynarr_add (dst, c);
6033 else if (BUFBYTE_LEADING_BYTE_P (c))
6036 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6037 c == LEADING_BYTE_CONTROL_1)
6040 Dynarr_add (dst, '~'); /* untranslatable character */
6044 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6045 Dynarr_add (dst, c);
6046 else if (ch == LEADING_BYTE_CONTROL_1)
6049 Dynarr_add (dst, c - 0x20);
6051 /* else it should be the second or third byte of an
6052 untranslatable character, so ignore it */
6055 #endif /* not UTF2000 */
6061 str->iso2022.current_char_boundary = char_boundary;
6067 /************************************************************************/
6068 /* Initialization */
6069 /************************************************************************/
/* Symbol-table initialization for this file: sets up the
   coding_system lrecord implementation, defines the
   coding-system-error condition, registers the Lisp primitives with
   DEFSUBR, and defines the symbols used for coding-system types,
   properties, EOL types and coding categories. */
6072 syms_of_file_coding (void)
6074 INIT_LRECORD_IMPLEMENTATION (coding_system);
6076 deferror (&Qcoding_system_error, "coding-system-error",
6077 "Coding-system error", Qio_error);
6079 DEFSUBR (Fcoding_system_p);
6080 DEFSUBR (Ffind_coding_system);
6081 DEFSUBR (Fget_coding_system);
6082 DEFSUBR (Fcoding_system_list);
6083 DEFSUBR (Fcoding_system_name);
6084 DEFSUBR (Fmake_coding_system);
6085 DEFSUBR (Fcopy_coding_system);
6086 DEFSUBR (Fcoding_system_canonical_name_p);
6087 DEFSUBR (Fcoding_system_alias_p);
6088 DEFSUBR (Fcoding_system_aliasee);
6089 DEFSUBR (Fdefine_coding_system_alias);
6090 DEFSUBR (Fsubsidiary_coding_system);
6092 DEFSUBR (Fcoding_system_type);
6093 DEFSUBR (Fcoding_system_doc_string);
6095 DEFSUBR (Fcoding_system_charset);
6097 DEFSUBR (Fcoding_system_property);
6099 DEFSUBR (Fcoding_category_list);
6100 DEFSUBR (Fset_coding_priority_list);
6101 DEFSUBR (Fcoding_priority_list);
6102 DEFSUBR (Fset_coding_category_system);
6103 DEFSUBR (Fcoding_category_system);
6105 DEFSUBR (Fdetect_coding_region);
6106 DEFSUBR (Fdecode_coding_region);
6107 DEFSUBR (Fencode_coding_region);
6109 DEFSUBR (Fdecode_shift_jis_char);
6110 DEFSUBR (Fencode_shift_jis_char);
6111 DEFSUBR (Fdecode_big5_char);
6112 DEFSUBR (Fencode_big5_char);
6114 defsymbol (&Qcoding_systemp, "coding-system-p");
6115 defsymbol (&Qno_conversion, "no-conversion");
6116 defsymbol (&Qraw_text, "raw-text");
6118 defsymbol (&Qbig5, "big5");
6119 defsymbol (&Qshift_jis, "shift-jis");
6120 defsymbol (&Qucs4, "ucs-4");
6121 defsymbol (&Qutf8, "utf-8");
6122 defsymbol (&Qccl, "ccl");
6123 defsymbol (&Qiso2022, "iso2022");
6125 defsymbol (&Qmnemonic, "mnemonic");
6126 defsymbol (&Qeol_type, "eol-type");
6127 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6128 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6130 defsymbol (&Qcr, "cr");
6131 defsymbol (&Qlf, "lf");
6132 defsymbol (&Qcrlf, "crlf");
6133 defsymbol (&Qeol_cr, "eol-cr");
6134 defsymbol (&Qeol_lf, "eol-lf");
6135 defsymbol (&Qeol_crlf, "eol-crlf");
6137 defsymbol (&Qcharset_g0, "charset-g0");
6138 defsymbol (&Qcharset_g1, "charset-g1");
6139 defsymbol (&Qcharset_g2, "charset-g2");
6140 defsymbol (&Qcharset_g3, "charset-g3");
6141 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6142 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6143 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6144 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6145 defsymbol (&Qno_iso6429, "no-iso6429");
6146 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6147 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6149 defsymbol (&Qshort, "short");
6150 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6151 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6152 defsymbol (&Qseven, "seven");
6153 defsymbol (&Qlock_shift, "lock-shift");
6154 defsymbol (&Qescape_quoted, "escape-quoted");
6157 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6158 defsymbol (&Qdisable_composition, "disable-composition");
6159 defsymbol (&Quse_entity_reference, "use-entity-reference");
6160 defsymbol (&Qd, "d");
6161 defsymbol (&Qx, "x");
6162 defsymbol (&QX, "X");
6164 defsymbol (&Qencode, "encode");
6165 defsymbol (&Qdecode, "decode");
6168 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6170 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6172 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6174 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6176 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6178 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6180 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6182 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6184 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6187 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare which lstream methods the `decoding' and `encoding' stream
   types implement: reader, writer, rewinder, seekable_p, flusher,
   closer and marker for each. */
6192 lstream_type_create_file_coding (void)
6194 LSTREAM_HAS_METHOD (decoding, reader);
6195 LSTREAM_HAS_METHOD (decoding, writer);
6196 LSTREAM_HAS_METHOD (decoding, rewinder);
6197 LSTREAM_HAS_METHOD (decoding, seekable_p);
6198 LSTREAM_HAS_METHOD (decoding, flusher);
6199 LSTREAM_HAS_METHOD (decoding, closer);
6200 LSTREAM_HAS_METHOD (decoding, marker);
6202 LSTREAM_HAS_METHOD (encoding, reader);
6203 LSTREAM_HAS_METHOD (encoding, writer);
6204 LSTREAM_HAS_METHOD (encoding, rewinder);
6205 LSTREAM_HAS_METHOD (encoding, seekable_p);
6206 LSTREAM_HAS_METHOD (encoding, flusher);
6207 LSTREAM_HAS_METHOD (encoding, closer);
6208 LSTREAM_HAS_METHOD (encoding, marker);
/* Set up the dumped state and the Lisp-visible variables of the
   file-coding module.

   In order, this function:
     - allocates the file_coding_dump structure fcd and passes its address
       to dump_add_root_struct_ptr together with fcd_description;
     - resets the per-category tables in fcd (see the loop below);
     - provides the feature file-coding;
     - defines the Lisp variables keyboard-coding-system,
       terminal-coding-system, coding-system-for-read,
       coding-system-for-write, file-name-coding-system and
       coded-charset-entity-reference-alist, each initialized to Qnil;
     - defines the boolean variable enable-multibyte-characters,
       initialized to 1.  */
6212 vars_of_file_coding (void)
/* fcd holds the coding-category tables declared in struct
   file_coding_dump.  */
6216 fcd = xnew (struct file_coding_dump);
6217 dump_add_root_struct_ptr (&fcd, &fcd_description);
6219 /* Initialize to something reasonable: no coding system is associated
   with any category yet, and the priority table is the identity
   permutation (priority order equals category order).  */
6220 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6222 fcd->coding_category_system[i] = Qnil;
6223 fcd->coding_category_by_priority[i] = i;
/* Announce the file-coding feature to Lisp.  */
6226 Fprovide (intern ("file-coding"));
/* The text inside each DEFVAR comment below is the documentation string
   of the Lisp variable being defined.  */
6228 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6229 Coding system used for TTY keyboard input.
6230 Not used under a windowing system.
6232 Vkeyboard_coding_system = Qnil;
6234 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6235 Coding system used for TTY display output.
6236 Not used under a windowing system.
6238 Vterminal_coding_system = Qnil;
6240 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6241 Overriding coding system used when reading from a file or process.
6242 You should bind this variable with `let', but do not set it globally.
6243 If this is non-nil, it specifies the coding system that will be used
6244 to decode input on read operations, such as from a file or process.
6245 It overrides `buffer-file-coding-system-for-read',
6246 `insert-file-contents-pre-hook', etc. Use those variables instead of
6247 this one for permanent changes to the environment. */ );
/* nil means that no overriding coding system is in effect.  */
6248 Vcoding_system_for_read = Qnil;
/* Write-side counterpart of coding-system-for-read.  */
6250 DEFVAR_LISP ("coding-system-for-write",
6251 &Vcoding_system_for_write /*
6252 Overriding coding system used when writing to a file or process.
6253 You should bind this variable with `let', but do not set it globally.
6254 If this is non-nil, it specifies the coding system that will be used
6255 to encode output for write operations, such as to a file or process.
6256 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6257 Use those variables instead of this one for permanent changes to the
6259 Vcoding_system_for_write = Qnil;
6261 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6262 Coding system used to convert pathnames when accessing files.
6264 Vfile_name_coding_system = Qnil;
6266 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6267 &Vcoded_charset_entity_reference_alist /*
6268 Alist of coded-charset vs corresponding entity-reference.
6269 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6270 CCS is coded-charset.
6271 CODE-COLUMNS is columns of code-point of entity-reference.
6272 CODE-TYPE is format type of code-point of entity-reference.
6273 `d' means decimal value and `x' means hexadecimal value.
6275 Vcoded_charset_entity_reference_alist = Qnil;
6277 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6278 Non-nil means the buffer contents are regarded as multi-byte form
6279 of characters, not a binary code. This affects the display, file I/O,
6280 and behaviors of various editing commands.
6282 Setting this to nil does not do anything.
6284 enable_multibyte_characters = 1;
/* Create the coding-system registry and the built-in coding systems.

   In order, this function:
     - creates Vcoding_system_hash_table and staticpros it;
     - creates the_codesys_prop_dynarr, passes it to
       dump_add_root_struct_ptr, and fills it with the known property
       symbols, each tagged with a CODESYS_PROP_* type;
     - sets up the raw-text, binary and utf-8-mcs coding systems, and the
       aliases pairing no-conversion with raw-text and file-name, terminal
       and keyboard with binary;
     - stores raw-text and utf-8-mcs as the coding systems of the
       NO_CONVERSION and UTF8 categories in fcd;
     - when MULE is defined without UTF2000, clears ucs_to_mule_table and
       creates mule_to_ucs_table.  */
6288 complex_vars_of_file_coding (void)
/* Non-weak hash table compared with EQ.
   NOTE(review): 50 is presumably the initial size hint -- confirm.  */
6290 staticpro (&Vcoding_system_hash_table);
6291 Vcoding_system_hash_table =
6292 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
/* Dynamic array of codesys_prop entries, filled by the
   DEFINE_CODESYS_PROP lines below.  */
6294 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6295 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
/* DEFINE_CODESYS_PROP (Prop_Type, Sym): append one codesys_prop entry of
   type Prop_Type, for property symbol Sym, to the_codesys_prop_dynarr.
   The first group of entries uses CODESYS_PROP_ALL_OK.
   NOTE(review): ALL_OK presumably marks properties accepted by every
   coding-system type -- confirm where prop_type is consumed.  */
6297 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6299 struct codesys_prop csp; \
6301 csp.prop_type = (Prop_Type); \
6302 Dynarr_add (the_codesys_prop_dynarr, csp); \
6305 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6306 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6307 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6308 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6309 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6310 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6311 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
/* Properties tagged CODESYS_PROP_ISO2022: charset designations for
   G0-G3, forced designation on output, and the ISO 2022 option flags.  */
6313 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6314 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6315 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6316 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6317 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6318 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6319 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6320 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6321 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6322 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6323 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6324 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6325 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6326 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6327 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6328 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6329 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
/* Properties tagged CODESYS_PROP_CCL: the encode and decode entries.  */
6331 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6332 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
/* Two more CODESYS_PROP_ALL_OK properties.  */
6334 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6335 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6338 /* Need to create this here or we're really screwed. */
6340 (Qraw_text, Qno_conversion,
6341 build_string ("Raw text, which means it converts only line-break-codes."),
6342 list2 (Qmnemonic, build_string ("Raw")));
/* binary: also of type no-conversion, but with the EOL type fixed to
   LF.  */
6345 (Qbinary, Qno_conversion,
6346 build_string ("Binary, which means it does not convert anything."),
6347 list4 (Qeol_type, Qlf,
6348 Qmnemonic, build_string ("Binary")));
6354 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6355 list2 (Qmnemonic, build_string ("MTF8")));
/* Aliases.  NOTE(review): the argument order is presumably
   (ALIAS, EXISTING-CODING-SYSTEM) -- confirm against
   Fdefine_coding_system_alias.  */
6358 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6360 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6362 Fdefine_coding_system_alias (Qterminal, Qbinary);
6363 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6365 /* Need this for bootstrapping */
6366 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6367 Fget_coding_system (Qraw_text);
/* Associate the UTF8 category with the utf-8-mcs coding system.  */
6370 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6371 = Fget_coding_system (Qutf_8_mcs);
/* MULE without UTF2000 keeps explicit tables between UCS and MULE
   characters: set every ucs_to_mule_table slot to Qnil, then staticpro
   mule_to_ucs_table and create it as a generic char table.  */
6374 #if defined(MULE) && !defined(UTF2000)
6378 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6379 fcd->ucs_to_mule_table[i] = Qnil;
6381 staticpro (&mule_to_ucs_table);
6382 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6383 #endif /* defined(MULE) && !defined(UTF2000) */