1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
/* File-wide Lisp globals.  Going by the usual naming convention, Q names
   are symbols and V names are Lisp variables; nothing in this excerpt
   shows where they are initialized. */
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
/* Sized CODING_CATEGORY_LAST + 1, i.e. one slot per category value up to
   and including CODING_CATEGORY_LAST. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
/* Coding-category state gathered into a single struct.  The
   fcd_description tables that follow it describe its Lisp_Object members.
   NOTE(review): the closing lines of the struct (and whatever object it
   declares) are missing from this listing. */
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
/* 65536 Lisp_Object slots, one per 16-bit value.  Presumably indexed by
   UCS code point and holding the corresponding Mule character -- this is
   inferred from the name only; confirm against the users of the table. */
62 Lisp_Object ucs_to_mule_table[65536];
/* Layout description of struct file_coding_dump.  Each entry names a
   member that holds Lisp_Objects (XD_LISP_OBJECT), its offset, and the
   number of consecutive slots.  coding_category_by_priority is an int
   array and has no entry in the visible lines.
   NOTE(review): the terminating entry and closing braces are missing
   from this listing. */
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
69 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 },
/* Pairs the struct size with (presumably) the table above; the second
   initializer is not visible here. */
74 static const struct struct_description fcd_description = {
75 sizeof(struct file_coding_dump),
/* Presumably the reverse mapping of ucs_to_mule_table (name-based
   guess; its representation is not shown in this excerpt). */
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
/* Qno_conversion, Qccl and Qiso2022 are among the TYPE symbols tested in
   Fmake_coding_system. */
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
/* Property keys accepted for every coding system type in
   Fmake_coding_system. */
86 Lisp_Object Qmnemonic, Qeol_type;
/* Values of the eol-type property; see symbol_to_eol_type. */
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
/* More TYPE symbols tested in Fmake_coding_system. */
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
/* Property keys that Fmake_coding_system accepts only for ISO2022
   coding systems. */
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qctext, Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
104 Lisp_Object Qdisable_composition;
/* Property keys that Fmake_coding_system accepts only for CCL coding
   systems. */
106 Lisp_Object Qencode, Qdecode;
/* Maps a coding system name (symbol) to its coding system object; read
   with Fgethash in Ffind_coding_system and filled with Fputhash by the
   make, copy and alias primitives. */
108 Lisp_Object Vcoding_system_hash_table;
110 int enable_multibyte_characters;
113 /* Additional information used by the ISO2022 decoder and detector. */
/* NOTE(review): this listing has lost several lines of the struct:
   - the opening and closing braces;
   - the member announced by the comment numbered 133 (index of the next
     escape-sequence byte);
   - the matching #endif for ENABLE_COMPOSITE_CHARS;
   - the last line of the comment numbered 156-162.  Without its
     terminator that comment runs on to the end of the comment numbered
     167-169 and swallows the switched_dir_and_no_valid_charset_yet,
     invalid_switch_dir and output_literally bit-fields.
   Restore these from the pristine file before compiling. */
114 struct iso2022_decoder
116 /* CHARSET holds the character sets currently assigned to the G0
117 through G3 variables. It is initialized from the array
118 INITIAL_CHARSET in CODESYS. */
119 Lisp_Object charset[4];
121 /* Which registers are currently invoked into the left (GL) and
122 right (GR) halves of the 8-bit encoding space? */
123 int register_left, register_right;
125 /* ISO_ESC holds a value indicating part of an escape sequence
126 that has already been seen. */
127 enum iso_esc_flag esc;
129 /* This records the bytes we've seen so far in an escape sequence,
130 in case the sequence is invalid (we spit out the bytes unchanged). */
131 unsigned char esc_bytes[8];
133 /* Index for next byte to store in ISO escape sequence. */
136 #ifdef ENABLE_COMPOSITE_CHARS
137 /* Stuff seen so far when composing a string. */
138 unsigned_char_dynarr *composite_chars;
141 /* If we saw an invalid designation sequence for a particular
142 register, we flag it here and switch to ASCII. The next time we
143 see a valid designation for this register, we turn off the flag
144 and do the designation normally, but pretend the sequence was
145 invalid. The effect of all this is that (most of the time) the
146 escape sequences for both the switch to the unknown charset, and
147 the switch back to the known charset, get inserted literally into
148 the buffer and saved out as such. The hope is that we can
149 preserve the escape sequences so that the resulting written out
150 file makes sense. If we don't do any of this, the designation
151 to the invalid charset will be preserved but that switch back
152 to the known charset will probably get eaten because it was
153 the same charset that was already present in the register. */
154 unsigned char invalid_designated[4];
156 /* We try to do similar things as above for direction-switching
157 sequences. If we encountered a direction switch while an
158 invalid designation was present, or an invalid designation
159 just after a direction switch (i.e. no valid designation
160 encountered yet), we insert the direction-switch escape
161 sequence literally into the output stream, and later on
162 insert the corresponding direction-restoring escape sequence
164 unsigned int switched_dir_and_no_valid_charset_yet :1;
165 unsigned int invalid_switch_dir :1;
167 /* Tells the decoder to output the escape sequence literally
168 even though it was valid. Used in the games we play to
169 avoid lossage when we encounter invalid designations. */
170 unsigned int output_literally :1;
171 /* We encountered a direction switch followed by an invalid
172 designation. We didn't output the direction switch
173 literally because we didn't know about the invalid designation;
174 but we have to do so now. */
175 unsigned int output_direction_sequence :1;
/* Forward declarations for the detect / decode / encode routines of each
   coding system type (Shift-JIS, Big5, UCS-4, UTF-8, ISO2022,
   no-conversion) and for the generic mule_decode / mule_encode drivers.
   Their definitions are not part of this excerpt.
   NOTE(review): many prototypes are truncated in this listing.  The
   return-type line of text_encode_generic (number 182) is missing, as
   are the final parameter lines numbered 188, 192, 200, 209, 220, 234,
   246 and 250. */
/* EXFUN presumably declares the Lisp primitive whose DEFUN appears
   further down with a maximum of 2 arguments. */
178 EXFUN (Fcopy_coding_system, 2);
180 struct detection_state;
183 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
184 unsigned_char_dynarr *dst, unsigned int n);
186 static int detect_coding_sjis (struct detection_state *st,
187 CONST unsigned char *src,
189 static void decode_coding_sjis (Lstream *decoding,
190 CONST unsigned char *src,
191 unsigned_char_dynarr *dst,
/* The char_encode_* / char_finish_* pairs are the only routines here
   declared without static. */
193 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
194 unsigned_char_dynarr *dst, unsigned int *flags);
195 void char_finish_shift_jis (struct encoding_stream *str,
196 unsigned_char_dynarr *dst, unsigned int *flags);
198 static int detect_coding_big5 (struct detection_state *st,
199 CONST unsigned char *src,
201 static void decode_coding_big5 (Lstream *decoding,
202 CONST unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
204 static void encode_coding_big5 (Lstream *encoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst, unsigned int n);
207 static int detect_coding_ucs4 (struct detection_state *st,
208 CONST unsigned char *src,
210 static void decode_coding_ucs4 (Lstream *decoding,
211 CONST unsigned char *src,
212 unsigned_char_dynarr *dst, unsigned int n);
213 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
214 unsigned_char_dynarr *dst, unsigned int *flags);
215 void char_finish_ucs4 (struct encoding_stream *str,
216 unsigned_char_dynarr *dst, unsigned int *flags);
218 static int detect_coding_utf8 (struct detection_state *st,
219 CONST unsigned char *src,
221 static void decode_coding_utf8 (Lstream *decoding,
222 CONST unsigned char *src,
223 unsigned_char_dynarr *dst, unsigned int n);
224 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
225 unsigned_char_dynarr *dst, unsigned int *flags);
226 void char_finish_utf8 (struct encoding_stream *str,
227 unsigned_char_dynarr *dst, unsigned int *flags);
229 static int postprocess_iso2022_mask (int mask);
230 static void reset_iso2022 (Lisp_Object coding_system,
231 struct iso2022_decoder *iso);
232 static int detect_coding_iso2022 (struct detection_state *st,
233 CONST unsigned char *src,
235 static void decode_coding_iso2022 (Lstream *decoding,
236 CONST unsigned char *src,
237 unsigned_char_dynarr *dst, unsigned int n);
238 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
239 unsigned_char_dynarr *dst, unsigned int *flags);
240 void char_finish_iso2022 (struct encoding_stream *str,
241 unsigned_char_dynarr *dst, unsigned int *flags);
243 static void decode_coding_no_conversion (Lstream *decoding,
244 CONST unsigned char *src,
245 unsigned_char_dynarr *dst,
247 static void encode_coding_no_conversion (Lstream *encoding,
248 CONST unsigned char *src,
249 unsigned_char_dynarr *dst,
251 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
252 unsigned_char_dynarr *dst, unsigned int n);
253 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
254 unsigned_char_dynarr *dst, unsigned int n);
/* codesys_prop, a dynarr of codesys_prop, and the layout descriptions
   for both.
   NOTE(review): the body of struct codesys_prop (numbers 257-264) is
   missing from this listing.  The only member that can be confirmed here
   is sym, which the description below treats as a single Lisp_Object. */
256 typedef struct codesys_prop codesys_prop;
265 Dynarr_declare (codesys_prop);
266 } codesys_prop_dynarr;
/* One Lisp_Object at member sym. */
268 static const struct lrecord_description codesys_prop_description_1[] = {
269 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
273 static const struct struct_description codesys_prop_description = {
274 sizeof(codesys_prop),
275 codesys_prop_description_1
/* The dynarr is described as a sequence of codesys_prop elements. */
278 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
279 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
283 static const struct struct_description codesys_prop_dynarr_description = {
284 sizeof(codesys_prop_dynarr),
285 codesys_prop_dynarr_description_1
/* The single file-wide instance; where it is allocated is not shown in
   this excerpt. */
288 codesys_prop_dynarr *the_codesys_prop_dynarr;
/* NOTE(review): only one enumerator survives in this listing; the others
   and the braces are missing. */
290 enum codesys_prop_enum
293 CODESYS_PROP_ISO2022,
298 /************************************************************************/
299 /* Coding system functions */
300 /************************************************************************/
/* Prototypes of the three lrecord methods that are handed to
   DEFINE_LRECORD_IMPLEMENTATION at the end of this group. */
302 static Lisp_Object mark_coding_system (Lisp_Object);
303 static void print_coding_system (Lisp_Object, Lisp_Object, int);
304 static void finalize_coding_system (void *header, int for_disksave);
/* A charset_conversion_spec is described as 2 consecutive Lisp_Objects
   starting at from_charset (presumably from_charset and to_charset, the
   two members used elsewhere in this file). */
307 static const struct lrecord_description ccs_description_1[] = {
308 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
312 static const struct struct_description ccs_description = {
313 sizeof(charset_conversion_spec),
/* Dynarr of the above. */
317 static const struct lrecord_description ccsd_description_1[] = {
318 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
322 static const struct struct_description ccsd_description = {
323 sizeof(charset_conversion_spec_dynarr),
/* Layout of struct Lisp_Coding_System.  Each XD_LISP_OBJECT entry gives
   a starting member and a count of consecutive Lisp_Object slots, so the
   counts depend on the member order of the struct, which is declared
   elsewhere -- verify them against that declaration.  The two
   XD_STRUCT_PTR entries describe the ISO2022 conversion dynarrs.
   NOTE(review): the conditional-compilation lines and the terminating
   entry are missing from this listing. */
328 static const struct lrecord_description coding_system_description[] = {
329 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
330 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
331 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
333 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
334 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
335 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
336 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
/* Registers the coding-system lrecord type with its mark, print and
   finalize methods and the description above.  The two 0 arguments are
   further method slots left empty. */
341 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
342 mark_coding_system, print_coding_system,
343 finalize_coding_system,
344 0, 0, coding_system_description,
345 struct Lisp_Coding_System);
/* Mark method for coding-system objects (registered through
   DEFINE_LRECORD_IMPLEMENTATION).

   Marks the Lisp_Objects referenced by OBJ: name, doc string, mnemonic,
   the three EOL subsidiaries, the type-specific members, and the
   pre-write conversion function.  The post-read conversion function is
   returned instead of being marked; presumably the caller marks the
   returned object -- confirm against the lrecord marker protocol.

   NOTE(review): the return-type line, braces, the declaration of i and
   some case labels are missing from this listing. */
348 mark_coding_system (Lisp_Object obj)
350 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
352 mark_object (CODING_SYSTEM_NAME (codesys));
353 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
354 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
355 mark_object (CODING_SYSTEM_EOL_LF (codesys));
356 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
357 mark_object (CODING_SYSTEM_EOL_CR (codesys));
359 switch (CODING_SYSTEM_TYPE (codesys))
/* ISO2022: the four initial charsets, then both charsets of every spec
   in the optional input and output conversion dynarrs. */
363 case CODESYS_ISO2022:
364 for (i = 0; i < 4; i++)
365 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
/* A null dynarr pointer means no conversion specs were given. */
366 if (codesys->iso2022.input_conv)
368 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
370 struct charset_conversion_spec *ccs =
371 Dynarr_atp (codesys->iso2022.input_conv, i);
372 mark_object (ccs->from_charset);
373 mark_object (ccs->to_charset);
376 if (codesys->iso2022.output_conv)
378 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
380 struct charset_conversion_spec *ccs =
381 Dynarr_atp (codesys->iso2022.output_conv, i);
382 mark_object (ccs->from_charset);
383 mark_object (ccs->to_charset);
/* The case label for the next two marks is missing from the listing;
   presumably CODESYS_CCL. */
389 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
390 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
397 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
398 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding-system objects: writes
   #<coding_system NAME> to PRINTCHARFUN, where NAME is printed with
   print_internal.

   NOTE(review): the condition guarding the error call (number 406), the
   third parameter and the uid argument of error are missing from this
   listing.  Presumably the error is raised only when a readable
   representation is requested -- confirm in the pristine file. */
402 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
405 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
407 error ("printing unreadable object #<coding_system 0x%x>",
410 write_c_string ("#<coding_system ", printcharfun);
411 print_internal (c->name, printcharfun, 1);
412 write_c_string (">", printcharfun);
/* Finalize method for coding-system objects.

   HEADER is the object being finalized; FOR_DISKSAVE is nonzero when
   called for a disksave, in which case nothing is released.  Otherwise
   the ISO2022 input and output conversion dynarrs are freed and their
   pointers cleared, so a second call finds nothing left to free.

   NOTE(review): the return-type line, braces and the other switch labels
   are missing from this listing. */
416 finalize_coding_system (void *header, int for_disksave)
418 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
419 /* Since coding systems never go away, this function is not
420 necessary. But it would be necessary if we changed things
421 so that coding systems could go away. */
422 if (!for_disksave) /* see comment in lstream.c */
424 switch (CODING_SYSTEM_TYPE (c))
/* Only ISO2022 systems have resources to release in the visible lines. */
427 case CODESYS_ISO2022:
428 if (c->iso2022.input_conv)
430 Dynarr_free (c->iso2022.input_conv);
431 c->iso2022.input_conv = 0;
433 if (c->iso2022.output_conv)
435 Dynarr_free (c->iso2022.output_conv);
436 c->iso2022.output_conv = 0;
/* Convert an eol-type SYMBOL to the matching enum value:
     nil  -> EOL_AUTODETECT
     lf   -> EOL_LF
     crlf -> EOL_CRLF
     cr   -> EOL_CR
   SYMBOL is first checked with CHECK_SYMBOL.  Any other symbol signals
   an Unrecognized eol type error.
   NOTE(review): the return-type line and braces are missing from this
   listing. */
447 symbol_to_eol_type (Lisp_Object symbol)
449 CHECK_SYMBOL (symbol);
450 if (NILP (symbol)) return EOL_AUTODETECT;
451 if (EQ (symbol, Qlf)) return EOL_LF;
452 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
453 if (EQ (symbol, Qcr)) return EOL_CR;
455 signal_simple_error ("Unrecognized eol type", symbol);
/* Only present so that every path returns a value. */
456 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: map TYPE to Qlf, Qcrlf, Qcr, or Qnil
   for EOL_AUTODETECT.
   NOTE(review): the return-type line, the switch header and any default
   label are missing from this listing, so the behavior for a value
   outside these four cannot be stated from here. */
460 eol_type_to_symbol (enum eol_type type)
465 case EOL_LF: return Qlf;
466 case EOL_CRLF: return Qcrlf;
467 case EOL_CR: return Qcr;
468 case EOL_AUTODETECT: return Qnil;
/* Create the three end-of-line variants of CODESYS.

   For each of unix / dos / mac this:
     - builds the name NAME-unix, NAME-dos or NAME-mac in a stack buffer;
     - copies CODESYS under that name with Fcopy_coding_system;
     - sets the EOL type of the copy to EOL_LF, EOL_CRLF or EOL_CR;
     - gives the copy the parent mnemonic plus the per-variant suffix;
     - stores the copy in the matching CODING_SYSTEM_EOL_* slot of
       CODESYS.

   NOTE(review): the return-type line, braces, the declaration of mlen,
   two lines inside the macro (numbers 500 and 505) and the macro
   terminator are missing from this listing.  In particular
   codesys_mnemonic stays 0 when the mnemonic is not a string, so the
   missing lines presumably guard the mnemonic strcpy and build_string --
   confirm in the pristine file. */
473 setup_eol_coding_systems (Lisp_Coding_System *codesys)
475 Lisp_Object codesys_obj;
476 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* len + 7 leaves room for the longest suffix appended below (five
   characters for -unix) plus the terminating NUL. */
477 char *codesys_name = (char *) alloca (len + 7);
479 char *codesys_mnemonic=0;
481 Lisp_Object codesys_name_sym, sub_codesys_obj;
485 XSETCODING_SYSTEM (codesys_obj, codesys);
/* Only the len name bytes are copied, with no terminator; the strcpy in
   the macro below appends the suffix and terminates the string. */
487 memcpy (codesys_name,
488 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
490 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
492 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
/* mlen + 7 is ample for the two-character mnemonic suffixes. */
493 codesys_mnemonic = (char *) alloca (mlen + 7);
494 memcpy (codesys_mnemonic,
495 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
/* op_sys is the name suffix, op_sys_abbr the mnemonic suffix, and Type
   both the eol_type value and, pasted after CODING_SYSTEM_, the accessor
   for the slot that receives the new subsidiary. */
498 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
499 strcpy (codesys_name + len, "-" op_sys); \
501 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
502 codesys_name_sym = intern (codesys_name); \
503 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
504 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
506 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
507 build_string (codesys_mnemonic); \
508 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
511 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
512 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
513 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp primitive coding-system-p: returns Qt when OBJECT satisfies
   CODING_SYSTEMP and Qnil otherwise.  A symbol that merely names a
   coding system is not itself a coding system object, so it yields Qnil.
   NOTE(review): the docstring terminator, the argument list and the
   braces are missing from this listing. */
516 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
517 Return t if OBJECT is a coding system.
518 A coding system is an object that defines how text containing multiple
519 character sets is encoded into a stream of (typically 8-bit) bytes.
520 The coding system is used to decode the stream into a series of
521 characters (which may be from multiple charsets) when the text is read
522 from a file or process, and is used to encode the text back into the
523 same format when it is written out to a file or process.
525 For example, many ISO2022-compliant coding systems (such as Compound
526 Text, which is used for inter-client data under the X Window System)
527 use escape sequences to switch between different charsets -- Japanese
528 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
529 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
530 `make-coding-system' for more information.
532 Coding systems are normally identified using a symbol, and the
533 symbol is accepted in place of the actual coding system object whenever
534 a coding system is called for. (This is similar to how faces work.)
538 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* Lisp primitive find-coding-system.

   Resolution order in the visible code:
     1. a coding system object is returned unchanged;
     2. nil is replaced by Qbinary (the docstring does not mention this);
     3. anything else must pass CHECK_SYMBOL and is looked up in
        Vcoding_system_hash_table, with Qnil returned when absent.

   NOTE(review): the docstring terminator and braces are missing from
   this listing. */
541 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
542 Retrieve the coding system of the given name.
544 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
545 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
546 If there is no such coding system, nil is returned. Otherwise the
547 associated coding system object is returned.
549 (coding_system_or_name))
551 if (CODING_SYSTEMP (coding_system_or_name))
552 return coding_system_or_name;
554 if (NILP (coding_system_or_name))
555 coding_system_or_name = Qbinary;
557 CHECK_SYMBOL (coding_system_or_name);
559 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
/* Lisp primitive get-coding-system: calls Ffind_coding_system on NAME
   and signals No such coding system (with NAME as the error data) when
   the lookup yields nil; otherwise returns the coding system object.
   Because Ffind_coding_system maps nil to Qbinary, a nil NAME resolves
   to whatever the binary entry of the table holds.
   NOTE(review): the docstring terminator, argument list and braces are
   missing from this listing. */
562 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
563 Retrieve the coding system of the given name.
564 Same as `find-coding-system' except that if there is no such
565 coding system, an error is signaled instead of returning nil.
569 Lisp_Object coding_system = Ffind_coding_system (name);
571 if (NILP (coding_system))
572 signal_simple_error ("No such coding system", name);
573 return coding_system;
576 /* We store the coding systems in hash tables with the names as the key and the
577 actual coding system object as the value. Occasionally we need to use them
578 in a list format. These routines provide us with that. */
/* Closure handed to the hash-table mapper: a pointer to the list being
   accumulated, so that the mapper can update the variable of its caller. */
579 struct coding_system_list_closure
581 Lisp_Object *coding_system_list;
/* elisp_maphash callback.  Conses KEY (the coding system name) onto the
   front of the list that the closure points to.  VALUE is not used in
   the visible lines.
   NOTE(review): the return-type line, braces and return statement are
   missing from this listing. */
585 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
586 void *coding_system_list_closure)
588 /* This function can GC */
589 struct coding_system_list_closure *cscl =
590 (struct coding_system_list_closure *) coding_system_list_closure;
591 Lisp_Object *coding_system_list = cscl->coding_system_list;
593 *coding_system_list = Fcons (key, *coding_system_list);
/* Lisp primitive coding-system-list: walks Vcoding_system_hash_table
   with elisp_maphash and returns the list of keys (names) collected by
   add_coding_system_to_list_mapper.  The accumulator is protected with
   GCPRO1 because the mapper conses.  Since every key of the table is
   collected, names registered as aliases appear in the result as well.
   NOTE(review): the docstring terminator, argument list, braces and the
   lines numbered 610-611 (presumably the matching UNGCPRO) are missing
   from this listing. */
597 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
598 Return a list of the names of all defined coding systems.
602 Lisp_Object coding_system_list = Qnil;
604 struct coding_system_list_closure coding_system_list_closure;
606 GCPRO1 (coding_system_list);
607 coding_system_list_closure.coding_system_list = &coding_system_list;
608 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
609 &coding_system_list_closure);
612 return coding_system_list;
/* Lisp primitive coding-system-name: resolves CODING-SYSTEM with
   Fget_coding_system (which signals an error for an unknown name) and
   returns the name stored in the resulting object.  For an alias this is
   the name stored in the target object, not the alias symbol, because
   aliases share the object of their target.
   NOTE(review): the docstring terminator, argument list and braces are
   missing from this listing. */
615 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
616 Return the name of the given coding system.
620 coding_system = Fget_coding_system (coding_system);
621 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a coding system object of the given TYPE and NAME.

   The record is zeroed and every Lisp_Object member touched here is then
   set to Qnil: conversion functions, EOL subsidiaries, mnemonic, and the
   type-specific members for ISO2022 and CCL.  The EOL type starts out as
   EOL_AUTODETECT.  The doc string is not set here; Fmake_coding_system
   assigns it after calling this function.

   NOTE(review): the braces, the declaration of i and the return
   statement are missing from this listing; presumably the new object is
   returned. */
624 static Lisp_Coding_System *
625 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
627 Lisp_Coding_System *codesys =
628 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
/* Zeroing also leaves the iso2022 conversion dynarr pointers null, which
   mark_coding_system and finalize_coding_system test for. */
630 zero_lcrecord (codesys);
631 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
632 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
633 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
634 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
635 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
636 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
637 CODING_SYSTEM_TYPE (codesys) = type;
638 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
/* Type-specific Lisp_Object members. */
640 if (type == CODESYS_ISO2022)
643 for (i = 0; i < 4; i++)
644 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
646 else if (type == CODESYS_CCL)
648 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
649 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
652 CODING_SYSTEM_NAME (codesys) = name;
658 /* Given a list of charset conversion specs as specified in a Lisp
659 program, parse it into STORE_HERE. */
/* Each element of SPEC_LIST must be a proper two-element list
   (FROM TO).  Both elements are resolved with Fget_charset and must
   agree in XCHARSET_CHARS and XCHARSET_DIMENSION; otherwise an error is
   signaled.  Valid specs are appended to STORE_HERE in list order.
   Specs appended before a failing element stay in STORE_HERE.
   NOTE(review): the return-type line, braces, the declaration of rest
   and the argument line of signal_simple_error_2 (number 681) are
   missing from this listing. */
662 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
663 Lisp_Object spec_list)
667 EXTERNAL_LIST_LOOP (rest, spec_list)
669 Lisp_Object car = XCAR (rest);
670 Lisp_Object from, to;
671 struct charset_conversion_spec spec;
/* Reject anything that is not exactly a list of length two. */
673 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
674 signal_simple_error ("Invalid charset conversion spec", car);
675 from = Fget_charset (XCAR (car));
676 to = Fget_charset (XCAR (XCDR (car)));
/* Conversion is only allowed between charsets of the same shape. */
677 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
678 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
679 signal_simple_error_2
680 ("Attempted conversion between different charset types",
682 spec.from_charset = from;
683 spec.to_charset = to;
685 Dynarr_add (store_here, spec);
689 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
690 specs, return the equivalent as the Lisp programmer would see it.
692 If LOAD_HERE is 0, return Qnil. */
/* The loop pushes one (FROM TO) list per spec onto the front of result,
   which leaves them in reverse order; Fnreverse restores dynarr order.
   NOTE(review): the return-type line, braces, local declarations and the
   null check promised by the comment above are missing from this
   listing. */
695 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
702 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
704 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
705 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
708 return Fnreverse (result);
/* Lisp primitive make-coding-system.

   Flow of the visible code:
     1. translate the TYPE symbol into an enum coding_system_type,
        signaling Invalid coding system type for an unknown symbol;
     2. allocate the object with allocate_coding_system;
     3. default DOC-STRING to the empty string and require a string;
     4. walk the PROPS property list, storing common properties first and
        then the ISO2022-only or CCL-only ones, signaling Unrecognized
        property for any other key;
     5. create the -unix / -dos / -mac subsidiaries when requested;
     6. register the object under NAME in Vcoding_system_hash_table.

   NOTE(review): this listing has lost many lines of the function: the
   docstring terminator, the headings inside the docstring, some TYPE
   branches (numbers 898, 905, 907), conditional-compilation lines,
   braces, the value arguments of the two parse calls and the return
   statement.  Restore them from the pristine file before compiling. */
713 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
714 Register symbol NAME as a coding system.
716 TYPE describes the conversion method used and should be one of
719 Automatic conversion. XEmacs attempts to detect the coding system
722 No conversion. Use this for binary files and such. On output,
723 graphic characters that are not in ASCII or Latin-1 will be
724 replaced by a ?. (For a no-conversion-encoded buffer, these
725 characters will only be present if you explicitly insert them.)
727 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
729 ISO 10646 UCS-4 encoding.
731 ISO 10646 UTF-8 encoding.
733 Any ISO2022-compliant encoding. Among other things, this includes
734 JIS (the Japanese encoding commonly used for e-mail), EUC (the
735 standard Unix encoding for Japanese and other languages), and
736 Compound Text (the encoding used in X11). You can specify more
737 specific information about the conversion with the FLAGS argument.
739 Big5 (the encoding commonly used for Taiwanese).
741 The conversion is performed using a user-written pseudo-code
742 program. CCL (Code Conversion Language) is the name of this
745 Write out or read in the raw contents of the memory representing
746 the buffer's text. This is primarily useful for debugging
747 purposes, and is only enabled when XEmacs has been compiled with
748 DEBUG_XEMACS defined (via the --debug configure option).
749 WARNING: Reading in a file using 'internal conversion can result
750 in an internal inconsistency in the memory representing a
751 buffer's text, which will produce unpredictable results and may
752 cause XEmacs to crash. Under normal circumstances you should
753 never use 'internal conversion.
755 DOC-STRING is a string describing the coding system.
757 PROPS is a property list, describing the specific nature of the
758 character set. Recognized properties are:
761 String to be displayed in the modeline when this coding system is
765 End-of-line conversion to be used. It should be one of
768 Automatically detect the end-of-line type (LF, CRLF,
769 or CR). Also generate subsidiary coding systems named
770 `NAME-unix', `NAME-dos', and `NAME-mac', that are
771 identical to this coding system but have an EOL-TYPE
772 value of 'lf, 'crlf, and 'cr, respectively.
774 The end of a line is marked externally using ASCII LF.
775 Since this is also the way that XEmacs represents an
776 end-of-line internally, specifying this option results
777 in no end-of-line conversion. This is the standard
778 format for Unix text files.
780 The end of a line is marked externally using ASCII
781 CRLF. This is the standard format for MS-DOS text
784 The end of a line is marked externally using ASCII CR.
785 This is the standard format for Macintosh text files.
787 Automatically detect the end-of-line type but do not
788 generate subsidiary coding systems. (This value is
789 converted to nil when stored internally, and
790 `coding-system-property' will return nil.)
793 If non-nil, composition/decomposition for combining characters
796 'post-read-conversion
797 Function called after a file has been read in, to perform the
798 decoding. Called with two arguments, BEG and END, denoting
799 a region of the current buffer to be decoded.
801 'pre-write-conversion
802 Function called before a file is written out, to perform the
803 encoding. Called with two arguments, BEG and END, denoting
804 a region of the current buffer to be encoded.
807 The following additional properties are recognized if TYPE is 'iso2022:
813 The character set initially designated to the G0 - G3 registers.
814 The value should be one of
816 -- A charset object (designate that character set)
817 -- nil (do not ever use this register)
818 -- t (no character set is initially designated to
819 the register, but may be later on; this automatically
820 sets the corresponding `force-g*-on-output' property)
826 If non-nil, send an explicit designation sequence on output before
827 using the specified register.
830 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
831 "ESC $ B" on output in place of the full designation sequences
832 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
835 If non-nil, don't designate ASCII to G0 at each end of line on output.
836 Setting this to non-nil also suppresses other state-resetting that
837 normally happens at the end of a line.
840 If non-nil, don't designate ASCII to G0 before control chars on output.
843 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
847 If non-nil, use locking-shift (SO/SI) instead of single-shift
848 or designation by escape sequence.
851 If non-nil, don't use ISO6429's direction specification.
854 If non-nil, literal control characters that are the same as
855 the beginning of a recognized ISO2022 or ISO6429 escape sequence
856 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
857 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
858 so that they can be properly distinguished from an escape sequence.
859 (Note that doing this results in a non-portable encoding.) This
860 encoding flag is used for byte-compiled files. Note that ESC
861 is a good choice for a quoting character because there are no
862 escape sequences whose second byte is a character from the Control-0
863 or Control-1 character sets; this is explicitly disallowed by the
866 'input-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when decoding is performed.
869 Each specification is a list of two elements: the source charset,
870 and the destination charset.
872 'output-charset-conversion
873 A list of conversion specifications, specifying conversion of
874 characters in one charset to another when encoding is performed.
875 The form of each specification is the same as for
876 'input-charset-conversion.
879 The following additional properties are recognized (and required)
883 CCL program used for decoding (converting to internal format).
886 CCL program used for encoding (converting to external format).
888 (name, type, doc_string, props))
890 Lisp_Coding_System *codesys;
891 Lisp_Object rest, key, value;
892 enum coding_system_type ty;
893 int need_to_setup_eol_systems = 1;
895 /* Convert type to constant */
/* A nil TYPE is treated the same as the undecided symbol. */
896 if (NILP (type) || EQ (type, Qundecided))
897 { ty = CODESYS_AUTODETECT; }
899 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
900 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
901 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
902 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
903 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
904 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
906 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
908 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
911 signal_simple_error ("Invalid coding system type", type);
/* NOTE(review): no check of NAME is visible before it is stored; one may
   sit on the missing lines numbered 912-914. */
915 codesys = allocate_coding_system (ty, name);
917 if (NILP (doc_string))
918 doc_string = build_string ("");
920 CHECK_STRING (doc_string);
921 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
/* Properties valid for every type come first in the chain; the
   type-specific ones are only considered afterwards. */
923 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
925 if (EQ (key, Qmnemonic))
928 CHECK_STRING (value);
929 CODING_SYSTEM_MNEMONIC (codesys) = value;
932 else if (EQ (key, Qeol_type))
/* Only a nil eol-type asks for the -unix / -dos / -mac subsidiaries;
   any other value turns the flag off. */
934 need_to_setup_eol_systems = NILP (value);
937 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
/* The two conversion functions are stored as given, with no type check
   in the visible lines. */
940 else if (EQ (key, Qpost_read_conversion))
941 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
942 else if (EQ (key, Qpre_write_conversion))
943 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
945 else if (EQ (key, Qdisable_composition))
946 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
949 else if (ty == CODESYS_ISO2022)
/* t and nil are stored as they are; anything else is resolved to a
   charset with Fget_charset. */
951 #define FROB_INITIAL_CHARSET(charset_num) \
952 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
953 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
955 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
956 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
957 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
958 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
/* Any non-nil value is stored as 1, nil as 0. */
960 #define FROB_FORCE_CHARSET(charset_num) \
961 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
963 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
964 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
965 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
966 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
/* prop is pasted after CODING_SYSTEM_ISO2022_ to form the accessor. */
968 #define FROB_BOOLEAN_PROPERTY(prop) \
969 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
971 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
972 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
973 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
974 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
975 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
976 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
977 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
979 else if (EQ (key, Qinput_charset_conversion))
/* NOTE(review): in the visible lines a fresh dynarr is allocated every
   time this key is seen, so a repeated key would drop the previous
   dynarr without freeing it.  The same holds for the output key. */
981 codesys->iso2022.input_conv =
982 Dynarr_new (charset_conversion_spec);
983 parse_charset_conversion_specs (codesys->iso2022.input_conv,
986 else if (EQ (key, Qoutput_charset_conversion))
988 codesys->iso2022.output_conv =
989 Dynarr_new (charset_conversion_spec);
990 parse_charset_conversion_specs (codesys->iso2022.output_conv,
994 signal_simple_error ("Unrecognized property", key);
/* NOTE(review): this branch tests the TYPE symbol whereas the ISO2022
   branch tests ty; the two agree as long as ty was derived from type as
   above, but testing ty == CODESYS_CCL would be consistent. */
996 else if (EQ (type, Qccl))
998 if (EQ (key, Qdecode))
1000 CHECK_VECTOR (value);
1001 CODING_SYSTEM_CCL_DECODE (codesys) = value;
1003 else if (EQ (key, Qencode))
1005 CHECK_VECTOR (value);
1006 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
1009 signal_simple_error ("Unrecognized property", key);
1013 signal_simple_error ("Unrecognized property", key);
/* Still 1 when no eol-type property was given at all. */
1016 if (need_to_setup_eol_systems)
1017 setup_eol_coding_systems (codesys);
/* Registration happens last, after the subsidiaries (if any) have been
   created and registered by Fcopy_coding_system. */
1020 Lisp_Object codesys_obj;
1021 XSETCODING_SYSTEM (codesys_obj, codesys);
1022 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* Lisp primitive copy-coding-system.

   OLD-CODING-SYSTEM is resolved with Fget_coding_system (error when it
   is unknown).  When NEW-NAME does not yet name a coding system, a fresh
   object of the same type is allocated and registered under NEW-NAME.
   Everything after the record header is then copied from the old object
   into the new one with memcpy, and the name is reset to NEW-NAME.  When
   NEW-NAME already names a coding system, that existing object is
   overwritten in place.

   NOTE(review): the memcpy copies pointer-valued members as they are, so
   for an ISO2022 system the copy shares the input and output conversion
   dynarrs with the original.  finalize_coding_system frees those dynarrs
   per object, which would free them twice if both objects were ever
   finalized.  Its own comment says coding systems never go away, so this
   is latent.  An overwritten target also loses any dynarrs it owned
   without freeing them.

   NOTE(review): the end of the docstring, braces and the name argument
   of allocate_coding_system (number 1042) are missing from this listing. */
1027 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1028 Copy OLD-CODING-SYSTEM to NEW-NAME.
1029 If NEW-NAME does not name an existing coding system, a new one will
1032 (old_coding_system, new_name))
1034 Lisp_Object new_coding_system;
1035 old_coding_system = Fget_coding_system (old_coding_system);
1036 new_coding_system = Ffind_coding_system (new_name);
1037 if (NILP (new_coding_system))
1039 XSETCODING_SYSTEM (new_coding_system,
1040 allocate_coding_system
1041 (XCODING_SYSTEM_TYPE (old_coding_system),
1043 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1047 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1048 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1049 memcpy (((char *) to ) + sizeof (to->header),
1050 ((char *) from) + sizeof (from->header),
1051 sizeof (*from) - sizeof (from->header));
1052 to->name = new_name;
1054 return new_coding_system;
1057 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1058 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1060 (alias, coding_system))
1062 CHECK_SYMBOL (alias);
1063 if (!NILP (Ffind_coding_system (alias)))
1064 signal_simple_error ("Symbol already names a coding system", alias);
1065 coding_system = Fget_coding_system (coding_system);
1066 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1068 /* Set up aliases for subsidiaries. */
1069 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1072 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1073 #define FROB(type, name) \
1075 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1076 if (!NILP (subsidiary)) \
1077 Fdefine_coding_system_alias \
1078 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1081 FROB (CRLF, "-dos");
1085 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1086 but it doesn't look intentional, so I'd rather return something
1087 meaningful or nothing at all. */
/* Select the subsidiary of CODING_SYSTEM that corresponds to EOL type TYPE.
   CODING_SYSTEM itself is returned unchanged when its own EOL type is not
   EOL_AUTODETECT, when TYPE is EOL_AUTODETECT, or when the selected
   subsidiary slot is nil. */
1092 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1094 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1095 Lisp_Object new_coding_system;
/* Only a coding system whose own EOL type is EOL_AUTODETECT is mapped
   to one of its subsidiaries. */
1097 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1098 return coding_system;
1102 case EOL_AUTODETECT: return coding_system;
1103 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1104 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1105 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
/* A nil subsidiary slot falls back to CODING_SYSTEM itself. */
1109 return NILP (new_coding_system) ? coding_system : new_coding_system;
1112 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1113 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1115 (coding_system, eol_type))
1117 coding_system = Fget_coding_system (coding_system);
1119 return subsidiary_coding_system (coding_system,
1120 symbol_to_eol_type (eol_type));
1124 /************************************************************************/
1125 /* Coding system accessors */
1126 /************************************************************************/
1128 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1129 Return the doc string for CODING-SYSTEM.
1133 coding_system = Fget_coding_system (coding_system);
1134 return XCODING_SYSTEM_DOC_STRING (coding_system);
1137 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1138 Return the type of CODING-SYSTEM.
1142 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1145 case CODESYS_AUTODETECT: return Qundecided;
1147 case CODESYS_SHIFT_JIS: return Qshift_jis;
1148 case CODESYS_ISO2022: return Qiso2022;
1149 case CODESYS_BIG5: return Qbig5;
1150 case CODESYS_UCS4: return Qucs4;
1151 case CODESYS_UTF8: return Qutf8;
1152 case CODESYS_CCL: return Qccl;
1154 case CODESYS_NO_CONVERSION: return Qno_conversion;
1156 case CODESYS_INTERNAL: return Qinternal;
1163 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1166 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1168 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1171 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1172 Return initial charset of CODING-SYSTEM designated to GNUM.
1175 (coding_system, gnum))
1177 coding_system = Fget_coding_system (coding_system);
1180 return coding_system_charset (coding_system, XINT (gnum));
1184 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1185 Return the PROP property of CODING-SYSTEM.
1187 (coding_system, prop))
1190 enum coding_system_type type;
1192 coding_system = Fget_coding_system (coding_system);
1193 CHECK_SYMBOL (prop);
1194 type = XCODING_SYSTEM_TYPE (coding_system);
1196 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1197 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1200 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1202 case CODESYS_PROP_ALL_OK:
1205 case CODESYS_PROP_ISO2022:
1206 if (type != CODESYS_ISO2022)
1208 ("Property only valid in ISO2022 coding systems",
1212 case CODESYS_PROP_CCL:
1213 if (type != CODESYS_CCL)
1215 ("Property only valid in CCL coding systems",
1225 signal_simple_error ("Unrecognized property", prop);
1227 if (EQ (prop, Qname))
1228 return XCODING_SYSTEM_NAME (coding_system);
1229 else if (EQ (prop, Qtype))
1230 return Fcoding_system_type (coding_system);
1231 else if (EQ (prop, Qdoc_string))
1232 return XCODING_SYSTEM_DOC_STRING (coding_system);
1233 else if (EQ (prop, Qmnemonic))
1234 return XCODING_SYSTEM_MNEMONIC (coding_system);
1235 else if (EQ (prop, Qeol_type))
1236 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1237 else if (EQ (prop, Qeol_lf))
1238 return XCODING_SYSTEM_EOL_LF (coding_system);
1239 else if (EQ (prop, Qeol_crlf))
1240 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1241 else if (EQ (prop, Qeol_cr))
1242 return XCODING_SYSTEM_EOL_CR (coding_system);
1243 else if (EQ (prop, Qpost_read_conversion))
1244 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1245 else if (EQ (prop, Qpre_write_conversion))
1246 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1248 else if (type == CODESYS_ISO2022)
1250 if (EQ (prop, Qcharset_g0))
1251 return coding_system_charset (coding_system, 0);
1252 else if (EQ (prop, Qcharset_g1))
1253 return coding_system_charset (coding_system, 1);
1254 else if (EQ (prop, Qcharset_g2))
1255 return coding_system_charset (coding_system, 2);
1256 else if (EQ (prop, Qcharset_g3))
1257 return coding_system_charset (coding_system, 3);
1259 #define FORCE_CHARSET(charset_num) \
1260 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1261 (coding_system, charset_num) ? Qt : Qnil)
1263 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1264 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1265 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1266 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1268 #define LISP_BOOLEAN(prop) \
1269 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1271 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1272 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1273 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1274 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1275 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1276 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1277 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1279 else if (EQ (prop, Qinput_charset_conversion))
1281 unparse_charset_conversion_specs
1282 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1283 else if (EQ (prop, Qoutput_charset_conversion))
1285 unparse_charset_conversion_specs
1286 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1290 else if (type == CODESYS_CCL)
1292 if (EQ (prop, Qdecode))
1293 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1294 else if (EQ (prop, Qencode))
1295 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1303 return Qnil; /* not reached */
1307 /************************************************************************/
1308 /* Coding category functions */
1309 /************************************************************************/
/* Look SYMBOL up in coding_category_symbol[], scanning indices
   0 .. CODING_CATEGORY_LAST.  SYMBOL must be a symbol (CHECK_SYMBOL).
   Signals "Unrecognized coding category" when no entry is EQ to SYMBOL.
   NOTE(review): the statement executed on a match is not visible in this
   excerpt; callers use the result as a category index, so it presumably
   returns the matching index -- confirm. */
1312 decode_coding_category (Lisp_Object symbol)
1316 CHECK_SYMBOL (symbol);
1317 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1318 if (EQ (coding_category_symbol[i], symbol))
/* Falling out of the loop means SYMBOL names no known category. */
1321 signal_simple_error ("Unrecognized coding category", symbol);
1322 return 0; /* not reached */
1325 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1326 Return a list of all recognized coding categories.
1331 Lisp_Object list = Qnil;
1333 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1334 list = Fcons (coding_category_symbol[i], list);
1338 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1339 Change the priority order of the coding categories.
1340 LIST should be a list of coding categories, in descending order of
1341 priority. Unspecified coding categories will be lower in priority
1342 than all specified ones, in the same relative order they were in
1347 int category_to_priority[CODING_CATEGORY_LAST + 1];
1351 /* First generate a list that maps coding categories to priorities. */
1353 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1354 category_to_priority[i] = -1;
1356 /* Highest priority comes from the specified list. */
1358 EXTERNAL_LIST_LOOP (rest, list)
1360 int cat = decode_coding_category (XCAR (rest));
1362 if (category_to_priority[cat] >= 0)
1363 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1364 category_to_priority[cat] = i++;
1367 /* Now go through the existing categories by priority to retrieve
1368 the categories not yet specified and preserve their priority
1370 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1372 int cat = fcd->coding_category_by_priority[j];
1373 if (category_to_priority[cat] < 0)
1374 category_to_priority[cat] = i++;
1377 /* Now we need to construct the inverse of the mapping we just
1380 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1381 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1383 /* Phew! That was confusing. */
1387 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1388 Return a list of coding categories in descending order of priority.
1393 Lisp_Object list = Qnil;
1395 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1396 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1401 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1402 Change the coding system associated with a coding category.
1404 (coding_category, coding_system))
1406 int cat = decode_coding_category (coding_category);
1408 coding_system = Fget_coding_system (coding_system);
1409 fcd->coding_category_system[cat] = coding_system;
1413 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1414 Return the coding system associated with a coding category.
1418 int cat = decode_coding_category (coding_category);
1419 Lisp_Object sys = fcd->coding_category_system[cat];
1422 return XCODING_SYSTEM_NAME (sys);
1427 /************************************************************************/
1428 /* Detecting the encoding of data */
1429 /************************************************************************/
1431 struct detection_state
1433 enum eol_type eol_type;
1469 struct iso2022_decoder iso;
1471 int high_byte_count;
1472 unsigned int saw_single_shift:1;
1485 acceptable_control_char_p (int c)
1489 /* Allow and ignore control characters that you might
1490 reasonably see in a text file */
1495 case 8: /* backspace */
1496 case 11: /* vertical tab */
1497 case 12: /* form feed */
1498 case 26: /* MS-DOS C-z junk */
1499 case 31: /* '^_' -- for info */
/* Return nonzero if MASK has zero or one bit set.
   `mask & (mask - 1)' clears the lowest set bit of MASK, so the result
   is 0 exactly when there was at most one bit to clear.  Note that a
   MASK of 0 also satisfies the test. */
1507 mask_has_at_most_one_bit_p (int mask)
1509 /* Perhaps the only thing useful you learn from intensive Microsoft
1510 technical interviews */
1511 return (mask & (mask - 1)) == 0;
1514 static enum eol_type
1515 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1525 if (st->eol.just_saw_cr)
1527 else if (st->eol.seen_anything)
1530 else if (st->eol.just_saw_cr)
1533 st->eol.just_saw_cr = 1;
1535 st->eol.just_saw_cr = 0;
1536 st->eol.seen_anything = 1;
1539 return EOL_AUTODETECT;
1542 /* Attempt to determine the encoding and EOL type of the given text.
1543 Before calling this function for the first time, you must initialize
1544 st->eol_type as appropriate and initialize st->mask to ~0.
1546 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1549 st->mask holds the determined coding category mask, or ~0 if only
1550 ASCII has been seen so far.
1554 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1555 is present in st->mask
1556 1 == definitive answers are here for both st->eol_type and st->mask
1560 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1561 unsigned int n, int just_do_eol)
1565 if (st->eol_type == EOL_AUTODETECT)
1566 st->eol_type = detect_eol_type (st, src, n);
1569 return st->eol_type != EOL_AUTODETECT;
1571 if (!st->seen_non_ascii)
1573 for (; n; n--, src++)
1576 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1578 st->seen_non_ascii = 1;
1580 st->shift_jis.mask = ~0;
1584 st->iso2022.mask = ~0;
1594 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1595 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1596 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1597 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1598 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1599 st->big5.mask = detect_coding_big5 (st, src, n);
1600 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1601 st->utf8.mask = detect_coding_utf8 (st, src, n);
1602 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1603 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1606 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1607 | st->utf8.mask | st->ucs4.mask;
1610 int retval = mask_has_at_most_one_bit_p (st->mask);
1611 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1612 return retval && st->eol_type != EOL_AUTODETECT;
1617 coding_system_from_mask (int mask)
1621 /* If the file was entirely or basically ASCII, use the
1622 default value of `buffer-file-coding-system'. */
1623 Lisp_Object retval =
1624 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1627 retval = Ffind_coding_system (retval);
1631 (Qbad_variable, Qwarning,
1632 "Invalid `default-buffer-file-coding-system', set to nil");
1633 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1637 retval = Fget_coding_system (Qraw_text);
1645 mask = postprocess_iso2022_mask (mask);
1647 /* Look through the coding categories by priority and find
1648 the first one that is allowed. */
1649 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1651 cat = fcd->coding_category_by_priority[i];
1652 if ((mask & (1 << cat)) &&
1653 !NILP (fcd->coding_category_system[cat]))
1657 return fcd->coding_category_system[cat];
1659 return Fget_coding_system (Qraw_text);
1663 /* Given a seekable read stream and potential coding system and EOL type
1664 as specified, do any autodetection that is called for. If the
1665 coding system and/or EOL type are not `autodetect', they will be left
1666 alone; but this function will never return an autodetect coding system
1669 This function does not automatically fetch subsidiary coding systems;
1670 that should be unnecessary with the explicit eol-type argument. */
1672 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1675 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1676 enum eol_type *eol_type_in_out)
1678 struct detection_state decst;
1680 if (*eol_type_in_out == EOL_AUTODETECT)
1681 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1684 decst.eol_type = *eol_type_in_out;
1687 /* If autodetection is called for, do it now. */
1688 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1689 || *eol_type_in_out == EOL_AUTODETECT)
1692 Lisp_Object coding_system = Qnil;
1694 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1697 /* Look for initial "-*-"; mode line prefix */
1699 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1704 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1706 Extbyte *local_vars_beg = p + 3;
1707 /* Look for final "-*-"; mode line suffix */
1708 for (p = local_vars_beg,
1709 scan_end = buf + nread - LENGTH ("-*-");
1714 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1716 Extbyte *suffix = p;
1717 /* Look for "coding:" */
1718 for (p = local_vars_beg,
1719 scan_end = suffix - LENGTH ("coding:?");
1722 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1723 && (p == local_vars_beg
1724 || (*(p-1) == ' ' ||
1730 p += LENGTH ("coding:");
1731 while (*p == ' ' || *p == '\t') p++;
1733 /* Get coding system name */
1734 save = *suffix; *suffix = '\0';
1735 /* Characters valid in a MIME charset name (rfc 1521),
1736 and in a Lisp symbol name. */
1737 n = strspn ( (char *) p,
1738 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1739 "abcdefghijklmnopqrstuvwxyz"
1745 save = p[n]; p[n] = '\0';
1747 Ffind_coding_system (intern ((char *) p));
1757 if (NILP (coding_system))
1760 if (detect_coding_type (&decst, buf, nread,
1761 XCODING_SYSTEM_TYPE (*codesys_in_out)
1762 != CODESYS_AUTODETECT))
1764 nread = Lstream_read (stream, buf, sizeof (buf));
1770 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1771 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1774 if (detect_coding_type (&decst, buf, nread, 1))
1776 nread = Lstream_read (stream, buf, sizeof (buf));
1782 *eol_type_in_out = decst.eol_type;
1783 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1785 if (NILP (coding_system))
1786 *codesys_in_out = coding_system_from_mask (decst.mask);
1788 *codesys_in_out = coding_system;
1792 /* If we absolutely can't determine the EOL type, just assume LF. */
1793 if (*eol_type_in_out == EOL_AUTODETECT)
1794 *eol_type_in_out = EOL_LF;
1796 Lstream_rewind (stream);
1799 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1800 Detect coding system of the text in the region between START and END.
1801 Return a list of possible coding systems ordered by priority.
1802 If only ASCII characters are found, it returns 'undecided or one of
1803 its subsidiary coding systems according to a detected end-of-line
1804 type. Optional arg BUFFER defaults to the current buffer.
1806 (start, end, buffer))
1808 Lisp_Object val = Qnil;
1809 struct buffer *buf = decode_buffer (buffer, 0);
1811 Lisp_Object instream, lb_instream;
1812 Lstream *istr, *lb_istr;
1813 struct detection_state decst;
1814 struct gcpro gcpro1, gcpro2;
1816 get_buffer_range_char (buf, start, end, &b, &e, 0);
1817 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1818 lb_istr = XLSTREAM (lb_instream);
1819 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1820 istr = XLSTREAM (instream);
1821 GCPRO2 (instream, lb_instream);
1823 decst.eol_type = EOL_AUTODETECT;
1827 unsigned char random_buffer[4096];
1828 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1832 if (detect_coding_type (&decst, random_buffer, nread, 0))
1836 if (decst.mask == ~0)
1837 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1845 decst.mask = postprocess_iso2022_mask (decst.mask);
1847 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1849 int sys = fcd->coding_category_by_priority[i];
1850 if (decst.mask & (1 << sys))
1852 Lisp_Object codesys = fcd->coding_category_system[sys];
1853 if (!NILP (codesys))
1854 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1855 val = Fcons (codesys, val);
1859 Lstream_close (istr);
1861 Lstream_delete (istr);
1862 Lstream_delete (lb_istr);
1867 /************************************************************************/
1868 /* Converting to internal Mule format ("decoding") */
1869 /************************************************************************/
1871 /* A decoding stream is a stream used for decoding text (i.e.
1872 converting from some external format to internal format).
1873 The decoding-stream object keeps track of the actual coding
1874 stream, the stream that is at the other end, and data that
1875 needs to be persistent across the lifetime of the stream. */
1877 /* Handle the EOL stuff related to just-read-in character C.
1878 EOL_TYPE is the EOL type of the coding stream.
1879 FLAGS is the current value of FLAGS in the coding stream, and may
1880 be modified by this macro. (The macro only looks at the
1881 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1882 bytes are to be written. You need to also define a local goto
1883 label "label_continue_loop" that is at the end of the main
1884 character-reading loop.
1886 If C is a CR character, then this macro handles it entirely and
1887 jumps to label_continue_loop. Otherwise, this macro does not add
1888 anything to DST, and continues normally. You should continue
1889 processing C normally after this macro. */
1891 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1895 if (eol_type == EOL_CR) \
1896 Dynarr_add (dst, '\n'); \
1897 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1898 Dynarr_add (dst, c); \
1900 flags |= CODING_STATE_CR; \
1901 goto label_continue_loop; \
1903 else if (flags & CODING_STATE_CR) \
1904 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1906 Dynarr_add (dst, '\r'); \
1907 flags &= ~CODING_STATE_CR; \
1911 /* C should be a binary character in the range 0 - 255; convert
1912 to internal format and add to Dynarr DST. */
1915 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1917 if (BYTE_ASCII_P (c)) \
1918 Dynarr_add (dst, c); \
1921 Dynarr_add (dst, (c >> 6) | 0xc0); \
1922 Dynarr_add (dst, (c & 0x3f) | 0x80); \
/* Append character C to the byte Dynarr DST as a variable-length
   sequence of one to six bytes.  Multi-byte forms consist of a lead
   byte tagged 0xc0 / 0xe0 / 0xf0 / 0xf8 / 0xfc followed by continuation
   bytes that each carry six bits of C and are tagged 0x80 (the UTF-8
   bit layout, including its five- and six-byte forms). */
1927 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
/* One-byte form: C is stored as is.  (The condition guarding this
   branch is not visible in this excerpt.) */
1931 Dynarr_add (dst, c);
/* Two-byte form: 5 + 6 bits. */
1933 else if ( c <= 0x7ff )
1935 Dynarr_add (dst, (c >> 6) | 0xc0);
1936 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Three-byte form: 4 + 6 + 6 bits. */
1938 else if ( c <= 0xffff )
1940 Dynarr_add (dst, (c >> 12) | 0xe0);
1941 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1942 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Four-byte form: 3 + 6 + 6 + 6 bits. */
1944 else if ( c <= 0x1fffff )
1946 Dynarr_add (dst, (c >> 18) | 0xf0);
1947 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1948 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1949 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Five-byte form: 2 + 6 + 6 + 6 + 6 bits. */
1951 else if ( c <= 0x3ffffff )
1953 Dynarr_add (dst, (c >> 24) | 0xf8);
1954 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1955 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1956 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1957 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Six-byte form, presumably for all remaining (larger) values. */
1961 Dynarr_add (dst, (c >> 30) | 0xfc);
1962 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1963 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1964 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1965 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1966 Dynarr_add (dst, (c & 0x3f) | 0x80);
1970 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1972 if (BYTE_ASCII_P (c)) \
1973 Dynarr_add (dst, c); \
1974 else if (BYTE_C1_P (c)) \
1976 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1977 Dynarr_add (dst, c + 0x20); \
1981 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1982 Dynarr_add (dst, c); \
1987 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1991 DECODE_ADD_BINARY_CHAR (ch, dst); \
1996 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1998 if (flags & CODING_STATE_END) \
2000 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2001 if (flags & CODING_STATE_CR) \
2002 Dynarr_add (dst, '\r'); \
2006 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Per-stream private data of a "decoding" Lstream (retrieved with
   DECODING_STREAM_DATA). */
2008 struct decoding_stream
2010 /* Coding system that governs the conversion. */
2011 Lisp_Coding_System *codesys;
2013 /* Stream that we read the encoded data from or
2014 write the decoded data to. */
2017 /* If we are reading, then we can return only a fixed amount of
2018 data, so if the conversion resulted in too much data, we store it
2019 here for retrieval the next time around. */
2020 unsigned_char_dynarr *runoff;
2022 /* FLAGS holds flags indicating the current state of the decoding.
2023 Some of these flags are dependent on the coding system. */
2026 /* CH holds a partially built-up character. Since we only deal
2027 with one- and two-byte characters at the moment, we only use
2028 this to store the first byte of a two-byte character. */
2031 /* EOL_TYPE specifies the type of end-of-line conversion that
2032 currently applies. We need to keep this separate from the
2033 EOL type stored in CODESYS because the latter might indicate
2034 automatic EOL-type detection while the former will always
2035 indicate a particular EOL type. */
2036 enum eol_type eol_type;
2038 /* Additional ISO2022 information. We define the structure above
2039 because it's also needed by the detection routines. */
2040 struct iso2022_decoder iso2022;
2042 /* Additional information (the state of the running CCL program)
2043 used by the CCL decoder. */
2044 struct ccl_program ccl;
2046 /* counter for UTF-8 or UCS-4 */
2047 unsigned char counter;
/* Character-composition state used by COMPOSE_ADD_CHAR and
   COMPOSE_FLUSH_CHARS: COMBINED_CHARS buffers the characters of a
   composition in progress, COMBINED_CHAR_COUNT is how many of its
   slots are in use, and COMBINING_TABLE is the table the next
   character is looked up in (Qnil when nothing is buffered). */
2050 unsigned combined_char_count;
2051 Emchar combined_chars[16];
2052 Lisp_Object combining_table;
/* Coding/EOL detection state; make_decoding_stream_1 seeds its
   eol_type and mask members. */
2054 struct detection_state decst;
2058 extern Lisp_Object Vcharacter_composition_table;
/* Write every character currently buffered in STR->combined_chars to
   DST, in order, using DECODE_ADD_UCS_CHAR, and then reset the
   composition state (count to 0, combining table to Qnil). */
2061 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2065 for (i = 0; i < str->combined_char_count; i++)
2066 DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2067 str->combined_char_count = 0;
2068 str->combining_table = Qnil;
/* Feed CHARACTER through the composition machinery of STR, writing any
   resulting output to DST with DECODE_ADD_UCS_CHAR.  Depending on the
   table lookups below, CHARACTER is either emitted directly, buffered in
   STR->combined_chars as part of a composition in progress, or replaced
   by the character a lookup yields.  Several of the conditions that
   select between these cases are not visible in this excerpt. */
2072 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2073 Emchar character, unsigned_char_dynarr* dst)
/* Composition disabled for this coding system: pass CHARACTER through. */
2075 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2076 DECODE_ADD_UCS_CHAR (character, dst);
/* No composition in progress: look CHARACTER up in the global
   composition table. */
2077 else if (!CHAR_CODE_TABLE_P (str->combining_table))
2080 = get_char_code_table (character, Vcharacter_composition_table);
2083 DECODE_ADD_UCS_CHAR (character, dst);
2086 str->combined_chars[0] = character;
2087 str->combined_char_count = 1;
2088 str->combining_table = ret;
2094 = get_char_code_table (character, str->combining_table);
/* The lookup in the current combining table produced a character
   (CHAR2); check whether CHAR2 can itself start a further composition. */
2098 Emchar char2 = XCHARVAL (ret);
2099 ret = get_char_code_table (char2, Vcharacter_composition_table);
2102 DECODE_ADD_UCS_CHAR (char2, dst);
2103 str->combined_char_count = 0;
2104 str->combining_table = Qnil;
2108 str->combined_chars[0] = char2;
2109 str->combined_char_count = 1;
2110 str->combining_table = ret;
/* The lookup produced another table: buffer CHARACTER and descend.
   NOTE(review): no bound check on combined_char_count is visible here,
   while combined_chars has 16 elements -- confirm that a chain of
   tables can never be deeper than the array. */
2113 else if (CHAR_CODE_TABLE_P (ret))
2115 str->combined_chars[str->combined_char_count++] = character;
2116 str->combining_table = ret;
/* Otherwise emit what was buffered, followed by CHARACTER. */
2120 COMPOSE_FLUSH_CHARS (str, dst);
2121 DECODE_ADD_UCS_CHAR (character, dst);
2125 #else /* not UTF2000 */
2126 #define COMPOSE_FLUSH_CHARS(str, dst)
2127 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2128 #endif /* UTF2000 */
2130 static ssize_t decoding_reader (Lstream *stream,
2131 unsigned char *data, size_t size);
2132 static ssize_t decoding_writer (Lstream *stream,
2133 CONST unsigned char *data, size_t size);
2134 static int decoding_rewinder (Lstream *stream);
2135 static int decoding_seekable_p (Lstream *stream);
2136 static int decoding_flusher (Lstream *stream);
2137 static int decoding_closer (Lstream *stream);
2139 static Lisp_Object decoding_marker (Lisp_Object stream);
2141 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2142 sizeof (struct decoding_stream));
/* Marker method of the decoding stream implementation.  Marks the
   Lstream at the other end of STREAM and, when that stream's
   implementation provides a marker method of its own, calls it on the
   other-end stream and returns its result. */
2145 decoding_marker (Lisp_Object stream)
2147 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2148 Lisp_Object str_obj;
2150 /* We do not need to mark the coding systems or charsets stored
2151 within the stream because they are stored in a global list
2152 and automatically marked. */
2154 XSETLSTREAM (str_obj, str);
2155 mark_object (str_obj);
2156 if (str->imp->marker)
2157 return (str->imp->marker) (str_obj);
2162 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2163 so we read data from the other end, decode it, and store it into DATA. */
/* Returns the number of bytes stored into DATA.  When nothing was
   stored, returns -1 if the error_occurred flag is set (the place where
   it gets set is outside the visible excerpt) and 0 otherwise.
   Decoded bytes that do not fit into DATA stay in str->runoff and are
   handed out first on the next call. */
2166 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2168 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2169 unsigned char *orig_data = data;
2171 int error_occurred = 0;
2173 /* We need to interface to mule_decode(), which expects to take some
2174 amount of data and store the result into a Dynarr. We have
2175 mule_decode() store into str->runoff, and take data from there
2178 /* We loop until we have enough data, reading chunks from the other
2179 end and decoding it. */
2182 /* Take data from the runoff if we can. Make sure to take at
2183 most SIZE bytes, and delete the data from the runoff. */
2184 if (Dynarr_length (str->runoff) > 0)
2186 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2187 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2188 Dynarr_delete_many (str->runoff, 0, chunk);
2194 break; /* No more room for data */
2196 if (str->flags & CODING_STATE_END)
2197 /* This means that on the previous iteration, we hit the EOF on
2198 the other end. We loop once more so that mule_decode() can
2199 output any final stuff it may be holding, or any "go back
2200 to a sane state" escape sequences. (This latter makes sense
2201 during encoding.) */
2204 /* Exhausted the runoff, so get some more. DATA has at least
2205 SIZE bytes left of storage in it, so it's OK to read directly
2206 into it. (We'll be overwriting above, after we've decoded it
2207 into the runoff.) */
2208 read_size = Lstream_read (str->other_end, data, size);
2215 /* There might be some more end data produced in the translation.
2216 See the comment above. */
2217 str->flags |= CODING_STATE_END;
2218 mule_decode (stream, data, str->runoff, read_size);
/* Nothing was delivered to the caller: report an error only if one
   was recorded, otherwise report 0. */
2221 if (data - orig_data == 0)
2222 return error_occurred ? -1 : 0;
2224 return data - orig_data;
2228 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2230 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2233 /* Decode all our data into the runoff, and then attempt to write
2234 it all out to the other end. Remove whatever chunk we succeeded
2236 mule_decode (stream, data, str->runoff, size);
2237 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2238 Dynarr_length (str->runoff));
2240 Dynarr_delete_many (str->runoff, 0, retval);
2241 /* Do NOT return retval. The return value indicates how much
2242 of the incoming data was written, not how many bytes were
2248 reset_decoding_stream (struct decoding_stream *str)
2251 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2253 Lisp_Object coding_system;
2254 XSETCODING_SYSTEM (coding_system, str->codesys);
2255 reset_iso2022 (coding_system, &str->iso2022);
2257 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2259 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2264 str->combined_char_count = 0;
2265 str->combining_table = Qnil;
2267 str->flags = str->ch = 0;
2271 decoding_rewinder (Lstream *stream)
2273 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2274 reset_decoding_stream (str);
2275 Dynarr_reset (str->runoff);
2276 return Lstream_rewind (str->other_end);
2280 decoding_seekable_p (Lstream *stream)
2282 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2283 return Lstream_seekable_p (str->other_end);
2287 decoding_flusher (Lstream *stream)
2289 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2290 return Lstream_flush (str->other_end);
2294 decoding_closer (Lstream *stream)
2296 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2297 if (stream->flags & LSTREAM_FL_WRITE)
2299 str->flags |= CODING_STATE_END;
2300 decoding_writer (stream, 0, 0);
2302 Dynarr_free (str->runoff);
2304 #ifdef ENABLE_COMPOSITE_CHARS
2305 if (str->iso2022.composite_chars)
2306 Dynarr_free (str->iso2022.composite_chars);
2309 return Lstream_close (str->other_end);
2313 decoding_stream_coding_system (Lstream *stream)
2315 Lisp_Object coding_system;
2316 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2318 XSETCODING_SYSTEM (coding_system, str->codesys);
2319 return subsidiary_coding_system (coding_system, str->eol_type);
/* Apply coding system CODESYS to decoding stream LSTR.  The stream's
   EOL type is taken from CODESYS unless CODESYS says EOL_AUTODETECT,
   in which case the current EOL type is kept; the decoder state is
   then reset.
   NOTE(review): the statement storing the coding system itself into
   the stream is not visible in this excerpt -- confirm against the
   full file. */
2323 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2325 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2326 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2328 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2329 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2330 reset_decoding_stream (str);
2333 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2334 stream for writing, no automatic code detection will be performed.
2335 The reason for this is that automatic code detection requires a
2336 seekable input. Things will also fail if you open a decoding
2337 stream for reading using a non-fully-specified coding system and
2338 a non-seekable input stream. */
/* Create a decoding lstream opened in MODE whose other end is STREAM.
   A fresh runoff dynarr is allocated and the EOL type starts out as
   EOL_AUTODETECT.  Only when MODE is "r" and STREAM is seekable is
   determine_real_coding_system() called up front; it may update
   CODESYS and the EOL type.  CODESYS is then applied to the new
   stream, and the detection state is seeded with the stream's EOL type
   and an all-ones mask.  The new stream is wrapped in the Lisp object
   OBJ. */
2341 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2344 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2345 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2349 str->other_end = stream;
2350 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2351 str->eol_type = EOL_AUTODETECT;
2352 if (!strcmp (mode, "r")
2353 && Lstream_seekable_p (stream))
2354 /* We can determine the coding system now. */
2355 determine_real_coding_system (stream, &codesys, &str->eol_type);
2356 set_decoding_stream_coding_system (lstr, codesys);
2357 str->decst.eol_type = str->eol_type;
2358 str->decst.mask = ~0;
2359 XSETLSTREAM (obj, lstr);
/* Create a decoding stream for reading ("r") that decodes data taken
   from STREAM according to CODESYS. */
2364 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2366 return make_decoding_stream_1 (stream, codesys, "r");
/* Create a decoding stream for writing ("w") that decodes data
   according to CODESYS and passes the result on to STREAM. */
2370 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2372 return make_decoding_stream_1 (stream, codesys, "w");
2375 /* Note: the decode_coding_* functions all take the same
2376 arguments as mule_decode(), which is to say some SRC data of
2377 size N, which is to be stored into dynamic array DST.
2378 DECODING is the stream within which the decoding is
2379 taking place, but no data is actually read from or
2380 written to that stream; that is handled in decoding_reader()
2381 or decoding_writer(). This allows the same functions to
2382 be used for both reading and writing. */
/* Decode N bytes at SRC into DST according to the coding system of the
   decoding stream DECODING.  If either the coding system type or the
   EOL type is still undetermined, detection is run on this chunk
   first; should that yield a different coding system, the stream is
   switched to it while keeping a previously set CODING_STATE_END flag.
   The data is then dispatched on the coding system type to the
   matching decoder. */
2385 mule_decode (Lstream *decoding, CONST unsigned char *src,
2386 unsigned_char_dynarr *dst, unsigned int n)
2388 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2390 /* If necessary, do encoding-detection now. We do this when
2391 we're a writing stream or a non-seekable reading stream,
2392 meaning that we can't just process the whole input,
2393 rewind, and start over. */
2395 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2396 str->eol_type == EOL_AUTODETECT)
2398 Lisp_Object codesys;
2400 XSETCODING_SYSTEM (codesys, str->codesys);
2401 detect_coding_type (&str->decst, src, n,
2402 CODING_SYSTEM_TYPE (str->codesys) !=
2403 CODESYS_AUTODETECT);
2404 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2405 str->decst.mask != ~0)
2406 /* #### This is cheesy. What we really ought to do is
2407 buffer up a certain amount of data so as to get a
2408 less random result. */
2409 codesys = coding_system_from_mask (str->decst.mask);
2410 str->eol_type = str->decst.eol_type;
2411 if (XCODING_SYSTEM (codesys) != str->codesys)
2413 /* Preserve the CODING_STATE_END flag in case it was set.
2414 If we erase it, bad things might happen. */
2415 int was_end = str->flags & CODING_STATE_END;
2416 set_decoding_stream_coding_system (decoding, codesys);
2418 str->flags |= CODING_STATE_END;
2422 switch (CODING_SYSTEM_TYPE (str->codesys))
2425 case CODESYS_INTERNAL:
2426 Dynarr_add_many (dst, src, n);
2429 case CODESYS_AUTODETECT:
2430 /* If we got this far and still haven't decided on the coding
2431 system, then do no conversion. */
2432 case CODESYS_NO_CONVERSION:
2433 decode_coding_no_conversion (decoding, src, dst, n);
2436 case CODESYS_SHIFT_JIS:
2437 decode_coding_sjis (decoding, src, dst, n);
2440 decode_coding_big5 (decoding, src, dst, n);
2443 decode_coding_ucs4 (decoding, src, dst, n);
2446 decode_coding_utf8 (decoding, src, dst, n);
2449 str->ccl.last_block = str->flags & CODING_STATE_END;
2450 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2452 case CODESYS_ISO2022:
2453 decode_coding_iso2022 (decoding, src, dst, n);
/* Lisp primitive `decode-coding-region'.  The region is validated and
   checked for read-only text, then pumped in chunks of at most 1024
   bytes through a chain of streams: a buffer input stream over the
   region feeds an encoder using the `binary' coding system, which
   feeds a decoder for CODING-SYSTEM, which writes into the buffer at
   the region start.  After each chunk the source text that was just
   consumed is deleted from the buffer.  Finally the streams are closed
   and deleted. */
2461 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2462 Decode the text between START and END which is encoded in CODING-SYSTEM.
2463 This is useful if you've read in encoded text from a file without decoding
2464 it (e.g. you read in a JIS-formatted file but used the `binary' or
2465 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2466 Return length of decoded text.
2467 BUFFER defaults to the current buffer if unspecified.
2469 (start, end, coding_system, buffer))
2472 struct buffer *buf = decode_buffer (buffer, 0);
2473 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2474 Lstream *istr, *ostr;
2475 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2477 get_buffer_range_char (buf, start, end, &b, &e, 0);
2479 barf_if_buffer_read_only (buf, b, e);
2481 coding_system = Fget_coding_system (coding_system);
2482 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2483 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2484 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2486 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2487 Fget_coding_system (Qbinary));
2488 istr = XLSTREAM (instream);
2489 ostr = XLSTREAM (outstream);
2490 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2492 /* The chain of streams looks like this:
2494 [BUFFER] <----- send through
2495 ------> [ENCODE AS BINARY]
2496 ------> [DECODE AS SPECIFIED]
2502 char tempbuf[1024]; /* some random amount */
2503 Bufpos newpos, even_newer_pos;
2504 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2505 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2509 newpos = lisp_buffer_stream_startpos (istr);
2510 Lstream_write (ostr, tempbuf, size_in_bytes);
2511 even_newer_pos = lisp_buffer_stream_startpos (istr);
2512 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2515 Lstream_close (istr);
2516 Lstream_close (ostr);
2518 Lstream_delete (istr);
2519 Lstream_delete (ostr);
2520 Lstream_delete (XLSTREAM (de_outstream));
2521 Lstream_delete (XLSTREAM (lb_outstream));
2526 /************************************************************************/
2527 /* Converting to an external encoding ("encoding") */
2528 /************************************************************************/
2530 /* An encoding stream is an output stream. When you create the
2531 stream, you specify the coding system that governs the encoding
2532 and another stream that the resulting encoded data is to be
2533 sent to, and then start sending data to it. */
/* Fetch the type-specific data of an lstream of type "encoding"; the
   functions below use the result as a struct encoding_stream *. */
2535 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
/* Per-stream state of an encoding lstream.  It is the type-specific
   data of the lstream and is reached through ENCODING_STREAM_DATA(). */
2537 struct encoding_stream
2539 /* Coding system that governs the conversion. */
2540 Lisp_Coding_System *codesys;
2542 /* Stream that we read the unencoded data from or
2543 write the encoded data to. */
2546 /* If we are reading, then we can return only a fixed amount of
2547 data, so if the conversion resulted in too much data, we store it
2548 here for retrieval the next time around. */
2549 unsigned_char_dynarr *runoff;
2551 /* FLAGS holds flags indicating the current state of the encoding.
2552 Some of these flags are dependent on the coding system. */
2555 /* CH holds a partially built-up character. Since we only deal
2556 with one- and two-byte characters at the moment, we only use
2557 this to store the first byte of a two-byte character. */
2560 /* Additional information used by the ISO2022 encoder. */
2563 /* CHARSET holds the character sets currently assigned to the G0
2564 through G3 registers. It is initialized from the array
2565 INITIAL_CHARSET in CODESYS. */
2566 Lisp_Object charset[4];
2568 /* Which registers are currently invoked into the left (GL) and
2569 right (GR) halves of the 8-bit encoding space? */
2570 int register_left, register_right;
2572 /* Whether we need to explicitly designate the charset in the
2573 G? register before using it. It is initialized from the
2574 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2575 unsigned char force_charset_on_output[4];
2577 /* Other state variables that need to be preserved across
2579 Lisp_Object current_charset;
2581 int current_char_boundary;
2584 void (*encode_char) (struct encoding_stream *str, Emchar c,
2585 unsigned_char_dynarr *dst, unsigned int *flags);
2586 void (*finish) (struct encoding_stream *str,
2587 unsigned_char_dynarr *dst, unsigned int *flags);
2589 /* Additional information (the state of the running CCL program)
2590 used by the CCL encoder. */
2591 struct ccl_program ccl;
/* Forward declarations of the lstream methods of the "encoding" stream
   type, followed by the definition of that stream type; its per-stream
   data is a struct encoding_stream. */
2595 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2596 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2598 static int encoding_rewinder (Lstream *stream);
2599 static int encoding_seekable_p (Lstream *stream);
2600 static int encoding_flusher (Lstream *stream);
2601 static int encoding_closer (Lstream *stream);
2603 static Lisp_Object encoding_marker (Lisp_Object stream);
2605 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2606 sizeof (struct encoding_stream));
/* GC mark method of an encoding stream: mark the stream at the other
   end and, if that stream's implementation provides a marker of its
   own, return what that marker returns. */
2609 encoding_marker (Lisp_Object stream)
2611 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2612 Lisp_Object str_obj;
2614 /* We do not need to mark the coding systems or charsets stored
2615 within the stream because they are stored in a global list
2616 and automatically marked. */
2618 XSETLSTREAM (str_obj, str);
2619 mark_object (str_obj);
2620 if (str->imp->marker)
2621 return (str->imp->marker) (str_obj);
2626 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2627 so we read data from the other end, encode it, and store it into DATA. */
/* Reader method of an encoding stream.  Stores up to SIZE bytes of
   encoded output into DATA.  Output is always taken from the runoff;
   when the runoff is empty, more input is read from the other end
   (using DATA itself as scratch space) and encoded into the runoff.
   Returns the number of bytes stored; if nothing was stored, returns
   -1 when an error was recorded and 0 otherwise. */
2630 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2632 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2633 unsigned char *orig_data = data;
2635 int error_occurred = 0;
2637 /* We need to interface to mule_encode(), which expects to take some
2638 amount of data and store the result into a Dynarr. We have
2639 mule_encode() store into str->runoff, and take data from there
2642 /* We loop until we have enough data, reading chunks from the other
2643 end and encoding it. */
2646 /* Take data from the runoff if we can. Make sure to take at
2647 most SIZE bytes, and delete the data from the runoff. */
2648 if (Dynarr_length (str->runoff) > 0)
2650 int chunk = min ((int) size, Dynarr_length (str->runoff));
2651 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2652 Dynarr_delete_many (str->runoff, 0, chunk);
2658 break; /* No more room for data */
2660 if (str->flags & CODING_STATE_END)
2661 /* This means that on the previous iteration, we hit the EOF on
2662 the other end. We loop once more so that mule_encode() can
2663 output any final stuff it may be holding, or any "go back
2664 to a sane state" escape sequences. (This latter makes sense
2665 during encoding.) */
2668 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2669 left of storage in it, so it's OK to read directly into it.
2670 (We'll be overwriting above, after we've encoded it into the
2672 read_size = Lstream_read (str->other_end, data, size);
2679 /* There might be some more end data produced in the translation.
2680 See the comment above. */
2681 str->flags |= CODING_STATE_END;
2682 mule_encode (stream, data, str->runoff, read_size);
2685 if (data == orig_data)
2686 return error_occurred ? -1 : 0;
2688 return data - orig_data;
/* Writer method of an encoding stream.  Encodes SIZE bytes at DATA
   into the runoff, writes as much of the runoff as the other end will
   take, and removes the written part from the runoff. */
2692 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2694 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2697 /* Encode all our data into the runoff, and then attempt to write
2698 it all out to the other end. Remove whatever chunk we succeeded
2700 mule_encode (stream, data, str->runoff, size);
2701 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2702 Dynarr_length (str->runoff));
2704 Dynarr_delete_many (str->runoff, 0, retval);
2705 /* Do NOT return retval. The return value indicates how much
2706 of the incoming data was written, not how many bytes were
/* Reset the encoder state of STR according to its coding system type.
   For ISO2022 the G0..G3 charsets and force-on-output flags are
   reloaded from the coding system, register_left/right are set to 0
   and 1, and current_charset and current_half are cleared; a CCL
   coding system gets its encode program set up again.  The ISO2022,
   UTF-8, UCS-4 and Shift-JIS branches install their encode_char and
   finish callbacks.  Finally the character-boundary counter, the flags
   and the pending character are zeroed. */
2712 reset_encoding_stream (struct encoding_stream *str)
2715 switch (CODING_SYSTEM_TYPE (str->codesys))
2717 case CODESYS_ISO2022:
2721 str->encode_char = &char_encode_iso2022;
2722 str->finish = &char_finish_iso2022;
2723 for (i = 0; i < 4; i++)
2725 str->iso2022.charset[i] =
2726 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2727 str->iso2022.force_charset_on_output[i] =
2728 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2730 str->iso2022.register_left = 0;
2731 str->iso2022.register_right = 1;
2732 str->iso2022.current_charset = Qnil;
2733 str->iso2022.current_half = 0;
2737 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2740 str->encode_char = &char_encode_utf8;
2741 str->finish = &char_finish_utf8;
2744 str->encode_char = &char_encode_ucs4;
2745 str->finish = &char_finish_ucs4;
2747 case CODESYS_SHIFT_JIS:
2748 str->encode_char = &char_encode_shift_jis;
2749 str->finish = &char_finish_shift_jis;
2755 str->iso2022.current_char_boundary = 0;
2756 str->flags = str->ch = 0;
/* Rewind method of an encoding stream: reset the encoder state, empty
   the runoff, and rewind the stream at the other end.  Returns the
   result of Lstream_rewind(). */
2760 encoding_rewinder (Lstream *stream)
2762 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2763 reset_encoding_stream (str);
2764 Dynarr_reset (str->runoff);
2765 return Lstream_rewind (str->other_end);
/* An encoding stream is seekable exactly when the stream at its other
   end is. */
2769 encoding_seekable_p (Lstream *stream)
2771 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2772 return Lstream_seekable_p (str->other_end);
/* Flush method of an encoding stream: simply flushes the stream at the
   other end and returns that result. */
2776 encoding_flusher (Lstream *stream)
2778 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2779 return Lstream_flush (str->other_end);
/* Close method of an encoding stream.  When the stream was opened for
   writing, CODING_STATE_END is set and encoding_writer() is called
   with no data so that the encoder can emit its final output.  The
   runoff is then freed and the stream at the other end is closed; its
   result is returned. */
2783 encoding_closer (Lstream *stream)
2785 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2786 if (stream->flags & LSTREAM_FL_WRITE)
2788 str->flags |= CODING_STATE_END;
2789 encoding_writer (stream, 0, 0);
2791 Dynarr_free (str->runoff);
2792 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system in use by encoding
   stream STREAM. */
2796 encoding_stream_coding_system (Lstream *stream)
2798 Lisp_Object coding_system;
2799 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2801 XSETCODING_SYSTEM (coding_system, str->codesys);
2802 return coding_system;
/* Apply coding system CODESYS to encoding stream LSTR and reset the
   encoder state.
   NOTE(review): the statement that stores CS into the stream is not
   visible in this excerpt -- confirm against the full file. */
2806 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2808 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2809 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2811 reset_encoding_stream (str);
/* Create an encoding lstream opened in MODE whose other end is STREAM:
   allocate its runoff dynarr, record STREAM as the other end, apply
   CODESYS, and wrap the new stream in the Lisp object OBJ. */
2815 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2818 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2819 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2823 str->runoff = Dynarr_new (unsigned_char);
2824 str->other_end = stream;
2825 set_encoding_stream_coding_system (lstr, codesys);
2826 XSETLSTREAM (obj, lstr);
/* Create an encoding stream for reading ("r") that encodes data taken
   from STREAM according to CODESYS. */
2831 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2833 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream for writing ("w") that encodes data
   according to CODESYS and passes the result on to STREAM. */
2837 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2839 return make_encoding_stream_1 (stream, codesys, "w");
2842 /* Convert N bytes of internally-formatted data stored in SRC to an
2843 external format, according to the encoding stream ENCODING.
2844 Store the encoded data into DST.  The work is dispatched on the
     type of the stream's coding system: internal data is appended
     unchanged, no-conversion (and a still-undecided autodetect), Big5
     and CCL have dedicated encoders, and text_encode_generic() serves
     the remaining visible branch. */
2847 mule_encode (Lstream *encoding, CONST unsigned char *src,
2848 unsigned_char_dynarr *dst, unsigned int n)
2850 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2852 switch (CODING_SYSTEM_TYPE (str->codesys))
2855 case CODESYS_INTERNAL:
2856 Dynarr_add_many (dst, src, n);
2859 case CODESYS_AUTODETECT:
2860 /* If we got this far and still haven't decided on the coding
2861 system, then do no conversion. */
2862 case CODESYS_NO_CONVERSION:
2863 encode_coding_no_conversion (encoding, src, dst, n);
2867 encode_coding_big5 (encoding, src, dst, n);
2870 str->ccl.last_block = str->flags & CODING_STATE_END;
2871 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2875 text_encode_generic (encoding, src, dst, n);
/* Lisp primitive `encode-coding-region'.  The region is validated and
   checked for read-only text, then pumped in chunks of at most 1024
   bytes through a chain of streams: a buffer input stream over the
   region feeds an encoder for CODING-SYSTEM, which feeds a decoder
   using the `binary' coding system, which writes into the buffer at
   the region start.  After each chunk the source text that was just
   consumed is deleted from the buffer.  Finally the streams are closed
   and deleted and RETLEN is returned as a Lisp integer. */
2879 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2880 Encode the text between START and END using CODING-SYSTEM.
2881 This will, for example, convert Japanese characters into stuff such as
2882 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2883 text. BUFFER defaults to the current buffer if unspecified.
2885 (start, end, coding_system, buffer))
2888 struct buffer *buf = decode_buffer (buffer, 0);
2889 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2890 Lstream *istr, *ostr;
2891 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2893 get_buffer_range_char (buf, start, end, &b, &e, 0);
2895 barf_if_buffer_read_only (buf, b, e);
2897 coding_system = Fget_coding_system (coding_system);
2898 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2899 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2900 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2901 Fget_coding_system (Qbinary));
2902 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2904 istr = XLSTREAM (instream);
2905 ostr = XLSTREAM (outstream);
2906 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2907 /* The chain of streams looks like this:
2909 [BUFFER] <----- send through
2910 ------> [ENCODE AS SPECIFIED]
2911 ------> [DECODE AS BINARY]
2916 char tempbuf[1024]; /* some random amount */
2917 Bufpos newpos, even_newer_pos;
2918 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2919 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2923 newpos = lisp_buffer_stream_startpos (istr);
2924 Lstream_write (ostr, tempbuf, size_in_bytes);
2925 even_newer_pos = lisp_buffer_stream_startpos (istr);
2926 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2932 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2933 Lstream_close (istr);
2934 Lstream_close (ostr);
2936 Lstream_delete (istr);
2937 Lstream_delete (ostr);
2938 Lstream_delete (XLSTREAM (de_outstream));
2939 Lstream_delete (XLSTREAM (lb_outstream));
2940 return make_int (retlen);
/* Generic encoder driver used with the per-character callbacks stored
   in the encoding stream.  Bytes of SRC are assembled into a character
   code, each continuation byte contributing its low six bits, and
   every completed character is handed to str->encode_char.  When the
   input ends on a character boundary and CODING_STATE_END is set,
   str->finish is called.  The character-boundary counter is carried
   between calls in str->iso2022.current_char_boundary. */
2947 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2948 unsigned_char_dynarr *dst, unsigned int n)
2951 unsigned char char_boundary;
2952 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2953 unsigned int flags = str->flags;
2954 Emchar ch = str->ch;
2956 char_boundary = str->iso2022.current_char_boundary;
2962 if (char_boundary == 0)
2990 (*str->encode_char) (str, c, dst, &flags);
2992 else if (char_boundary == 1)
2994 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3000 ch = (ch << 6) | (c & 0x3f);
3005 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3007 (*str->finish) (str, dst, &flags);
3012 str->iso2022.current_char_boundary = char_boundary;
3016 /************************************************************************/
3017 /* Shift-JIS methods */
3018 /************************************************************************/
3020 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3021 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3022 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3023 encoded by "position-code + 0x80". A character of JISX0208
3024 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
3025 position-codes are divided and shifted so that they fit in the range
3028 --- CODE RANGE of Shift-JIS ---
3029 (character set) (range)
3031 JISX0201-Kana 0xA0 .. 0xDF
3032 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3033 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3034 -------------------------------
3038 /* Is this the first byte of a Shift-JIS two-byte char? */
3040 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3041 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3043 /* Is this the second byte of a Shift-JIS two-byte char? */
3045 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3046 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a single-byte JISX0201-Kana code (0xA1 .. 0xDF)? */
3048 #define BYTE_SJIS_KATAKANA_P(c) \
3049 ((c) >= 0xA1 && (c) <= 0xDF)
/* Detection step for Shift-JIS over a chunk of SRC, keeping its state
   in ST.  In the visible logic ESC, SI and SO are singled out, the
   in_second_byte flag is cleared when a second byte is consumed and is
   set by a byte in 0x80 .. 0x9F or >= 0xE0; the function ends by
   returning CODING_CATEGORY_SHIFT_JIS_MASK.
   NOTE(review): the statements taken on the ESC/SI/SO branch are not
   visible in this excerpt. */
3052 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
3060 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3062 if (st->shift_jis.in_second_byte)
3064 st->shift_jis.in_second_byte = 0;
3068 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3069 st->shift_jis.in_second_byte = 1;
3071 return CODING_CATEGORY_SHIFT_JIS_MASK;
3074 /* Convert Shift-JIS data to internal format.  N bytes at SRC are
     decoded into DST.  A pending first byte followed by a valid second
     byte is converted with DECODE_SJIS and emitted as a JISX0208
     character; if the second byte is not valid, both bytes are emitted
     as binary characters.  Otherwise the byte goes through EOL
     handling and is classified as the first byte of a two-byte
     character, as JISX0201 katakana, or as a plain byte. */
3077 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
3078 unsigned_char_dynarr *dst, unsigned int n)
3081 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3082 unsigned int flags = str->flags;
3083 unsigned int ch = str->ch;
3084 eol_type_t eol_type = str->eol_type;
3092 /* Previous character was first byte of Shift-JIS Kanji char. */
3093 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3095 unsigned char e1, e2;
3097 DECODE_SJIS (ch, c, e1, e2);
3099 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3103 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3104 Dynarr_add (dst, e1);
3105 Dynarr_add (dst, e2);
3110 DECODE_ADD_BINARY_CHAR (ch, dst);
3111 DECODE_ADD_BINARY_CHAR (c, dst);
3117 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3118 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3120 else if (BYTE_SJIS_KATAKANA_P (c))
3123 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3126 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3127 Dynarr_add (dst, c);
3132 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3136 DECODE_ADD_BINARY_CHAR (c, dst);
3138 label_continue_loop:;
3141 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3147 /* Convert internal character representation to Shift_JIS.  This is
     the encode_char callback installed for CODESYS_SHIFT_JIS.  The
     end-of-line branch emits CR and/or the character itself depending
     on the coding system's EOL type.  Other characters are looked up
     in latin-jisx0201 or broken up into charset and position codes:
     katakana-jisx0201 is written as one byte with the high bit set,
     japanese-jisx0208 as two bytes produced by ENCODE_SJIS, and
     anything unrepresentable as '?'. */
3150 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3151 unsigned_char_dynarr *dst, unsigned int *flags)
3153 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3157 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3158 Dynarr_add (dst, '\r');
3159 if (eol_type != EOL_CR)
3160 Dynarr_add (dst, ch);
3164 Lisp_Object charset;
3165 unsigned int c1, c2, s1, s2;
3167 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3169 if (code_point >= 0)
3171 charset = Vcharset_latin_jisx0201;
3177 BREAKUP_CHAR (ch, charset, c1, c2);
3179 if (EQ(charset, Vcharset_katakana_jisx0201))
3181 Dynarr_add (dst, c1 | 0x80);
3185 Dynarr_add (dst, c1);
3187 else if (EQ(charset, Vcharset_japanese_jisx0208))
3189 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3190 Dynarr_add (dst, s1);
3191 Dynarr_add (dst, s2);
3194 Dynarr_add (dst, '?');
/* The finish callback installed for CODESYS_SHIFT_JIS by
   reset_encoding_stream().  No statements of its body are visible in
   this excerpt. */
3199 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3200 unsigned int *flags)
/* Lisp primitive `decode-shift-jis-char'.  Both halves of CODE must be
   integers.  If they form a valid Shift-JIS first/second byte pair,
   the pair is converted with DECODE_SJIS and the corresponding
   japanese-jisx0208 character is returned. */
3204 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3205 Decode a JISX0208 character of Shift-JIS coding-system.
3206 CODE is the character code in Shift-JIS as a cons of two bytes.
3207 Return the corresponding character.
3211 unsigned char c1, c2, s1, s2;
3214 CHECK_INT (XCAR (code));
3215 CHECK_INT (XCDR (code));
3216 s1 = XINT (XCAR (code));
3217 s2 = XINT (XCDR (code));
3218 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3219 BYTE_SJIS_TWO_BYTE_2_P (s2))
3221 DECODE_SJIS (s1, s2, c1, c2);
3222 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3223 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-shift-jis-char'.  CH is broken up into its
   charset and position codes; for a japanese-jisx0208 character the
   two Shift-JIS bytes produced by ENCODE_SJIS are returned as a cons
   of two integers. */
3229 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3230 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3231 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3235 Lisp_Object charset;
3238 CHECK_CHAR_COERCE_INT (ch);
3239 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3240 if (EQ (charset, Vcharset_japanese_jisx0208))
3242 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3243 return Fcons (make_int (s1), make_int (s2));
3250 /************************************************************************/
3252 /************************************************************************/
3254 /* BIG5 is a coding system encoding two character sets: ASCII and
3255 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3256 character set and is encoded in two bytes.
3258 --- CODE RANGE of BIG5 ---
3259 (character set) (range)
3261 Big5 (1st byte) 0xA1 .. 0xFE
3262 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3263 --------------------------
3265 Since the number of characters in Big5 is larger than maximum
3266 characters in Emacs' charset (96x96), it can't be handled as one
3267 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3268 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3269 contains frequently used characters and the latter contains less
3270 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char? */
3272 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3273 ((c) >= 0xA1 && (c) <= 0xFE)
3275 /* Is this the second byte of a Big5 two-byte char? */
3277 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3278 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3280 /* Number of Big5 characters which have the same code in 1st byte:
     0xFF - 0xA1 = 94 second bytes in 0xA1 .. 0xFE plus 0x7F - 0x40 = 63
     second bytes in 0x40 .. 0x7E, i.e. 157. */
3282 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3284 /* Code conversion macros. These are macros because they are used in
3285 inner loops during code conversion.
3287 Note that temporary variables in macros introduce the classic
3288 dynamic-scoping problems with variable names. We use capital-
3289 lettered variables in the assumption that XEmacs does not use
3290 capital letters in variables except in a very formalized way
3293 /* Convert Big5 code (b1, b2) into its internal string representation
3296 /* There is a much simpler way to split the Big5 charset into two.
3297 For the moment I'm going to leave the algorithm as-is because it
3298 claims to separate out the most-used characters into a single
3299 charset, which perhaps will lead to optimizations in various
3302 The way the algorithm works is something like this:
3304 Big5 can be viewed as a 94x157 charset, where the row is
3305 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3306 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3307 the split between low and high column numbers is apparently
3308 meaningless; ascending rows produce less and less frequent chars.
3309 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3310 the first charset, and the upper half (0xC9 .. 0xFE) to the
3311 second. To do the conversion, we convert the character into
3312 a single number where 0 .. 156 is the first row, 157 .. 313
3313 is the second, etc. That way, the characters are ordered by
3314 decreasing frequency. Then we just chop the space in two
3315 and coerce the result into a 94x94 space.
/* DECODE_BIG5 (b1, b2, lb, c1, c2): turn the Big5 byte pair (B1, B2)
   into a leading byte LB and two position bytes C1, C2.  The pair is
   first flattened into one index I, BIG5_SAME_ROW entries per first
   byte; a second byte below 0x7F is offset by 0x40 and one above by
   0x62, which closes the gap between the two second-byte ranges.  For
   LEADING_BYTE_CHINESE_BIG5_2 the index is rebased by the
   (0xC9 - 0xA1) rows that belong to the first charset.  I is then
   split into a row and a column of 0xFF - 0xA1 = 94 entries each, both
   offset by 0xA1. */
3318 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3320 int B1 = b1, B2 = b2; \
3322 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3326 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3330 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3331 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3333 c1 = I / (0xFF - 0xA1) + 0xA1; \
3334 c2 = I % (0xFF - 0xA1) + 0xA1; \
3337 /* Convert the internal string representation of a Big5 character
3338 (lb, c1, c2) into Big5 code (b1, b2).  This is the inverse of
     DECODE_BIG5: C1 and C2 are flattened into an index over a 94-column
     grid, the index is shifted up by (0xC9 - 0xA1) Big5 rows when LB
     is LEADING_BYTE_CHINESE_BIG5_2, and the result is split into a
     first byte and a column of BIG5_SAME_ROW entries.  Columns below
     0x3F map to second bytes from 0x40 up; the rest are offset by
     0x62 so that they start at 0xA1. */
3340 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3342 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3344 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3346 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3348 b1 = I / BIG5_SAME_ROW + 0xA1; \
3349 b2 = I % BIG5_SAME_ROW; \
3350 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Detection step for Big5 over a chunk of SRC, keeping its state in
   ST.  In the visible logic ESC, SI, SO and bytes in 0x80 .. 0xA0 are
   singled out; the in_second_byte flag is cleared when a second byte
   is consumed (a second byte below 0x40 or in 0x80 .. 0xA0 is tested
   for) and is set otherwise.  The function ends by returning
   CODING_CATEGORY_BIG5_MASK.
   NOTE(review): the statements taken on the rejecting branches are not
   visible in this excerpt. */
3354 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3362 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3363 (c >= 0x80 && c <= 0xA0))
3365 if (st->big5.in_second_byte)
3367 st->big5.in_second_byte = 0;
3368 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3372 st->big5.in_second_byte = 1;
3374 return CODING_CATEGORY_BIG5_MASK;
3377 /* Convert Big5 data to internal format.  N bytes at SRC are decoded
     into DST.  A pending first byte followed by a valid second byte is
     converted with DECODE_BIG5 and emitted as three bytes; if the
     second byte is not valid, both bytes are emitted as binary
     characters.  Otherwise the byte goes through EOL handling and is
     either recognized as a Big5 first byte or emitted as a binary
     character. */
3380 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3381 unsigned_char_dynarr *dst, unsigned int n)
3384 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3385 unsigned int flags = str->flags;
3386 unsigned int ch = str->ch;
3387 eol_type_t eol_type = str->eol_type;
3394 /* Previous character was first byte of Big5 char. */
3395 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3397 unsigned char b1, b2, b3;
3398 DECODE_BIG5 (ch, c, b1, b2, b3);
3399 Dynarr_add (dst, b1);
3400 Dynarr_add (dst, b2);
3401 Dynarr_add (dst, b3);
3405 DECODE_ADD_BINARY_CHAR (ch, dst);
3406 DECODE_ADD_BINARY_CHAR (c, dst);
3412 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3413 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3416 DECODE_ADD_BINARY_CHAR (c, dst);
3418 label_continue_loop:;
3421 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3427 /* Convert internally-formatted data to Big5.  N bytes at SRC are
     encoded into DST.  The end-of-line branch emits CR and/or LF
     according to the coding system's EOL type, and ASCII bytes are
     copied through.  Of the leading bytes only the two Big5 charsets
     are recognized; any other leading byte is ignored.  For a Big5
     character the pending state CH is unpacked as (ch >> 8, ch & 0xFF)
     together with the final byte and converted with ENCODE_BIG5 into
     two output bytes. */
3430 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3431 unsigned_char_dynarr *dst, unsigned int n)
3435 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3436 unsigned int flags = str->flags;
3437 unsigned int ch = str->ch;
3438 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3445 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3446 Dynarr_add (dst, '\r');
3447 if (eol_type != EOL_CR)
3448 Dynarr_add (dst, '\n');
3450 else if (BYTE_ASCII_P (c))
3453 Dynarr_add (dst, c);
3455 else if (BUFBYTE_LEADING_BYTE_P (c))
3457 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3458 c == LEADING_BYTE_CHINESE_BIG5_2)
3460 /* A recognized leading byte. */
3462 continue; /* not done with this character. */
3464 /* otherwise just ignore this character. */
3466 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3467 ch == LEADING_BYTE_CHINESE_BIG5_2)
3469 /* Previous char was a recognized leading byte. */
3471 continue; /* not done with this character. */
3475 /* Encountering second byte of a Big5 character. */
3476 unsigned char b1, b2;
3478 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3479 Dynarr_add (dst, b1);
3480 Dynarr_add (dst, b2);
/* Lisp primitive `decode-big5-char'.  Both halves of CODE must be
   integers.  If they form a valid Big5 first/second byte pair, the
   pair is converted with DECODE_BIG5, the charset is looked up from
   the resulting leading byte, and the corresponding character is
   returned. */
3492 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3493 Decode a Big5 character CODE of BIG5 coding-system.
3494 CODE is the character code in BIG5, a cons of two integers.
3495 Return the corresponding character.
3499 unsigned char c1, c2, b1, b2;
3502 CHECK_INT (XCAR (code));
3503 CHECK_INT (XCDR (code));
3504 b1 = XINT (XCAR (code));
3505 b2 = XINT (XCDR (code));
3506 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3507 BYTE_BIG5_TWO_BYTE_2_P (b2))
3509 Charset_ID leading_byte;
3510 Lisp_Object charset;
3511 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3512 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3513 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-big5-char'.  CH is broken up into its charset
   and position codes; if the charset is chinese-big5-1 or
   chinese-big5-2, the two Big5 bytes produced by ENCODE_BIG5 are
   returned as a cons of two integers. */
3519 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3520 Encode the Big5 character CH to BIG5 coding-system.
3521 Return the corresponding character code in Big5.
3525 Lisp_Object charset;
3528 CHECK_CHAR_COERCE_INT (ch);
3529 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3530 if (EQ (charset, Vcharset_chinese_big5_1) ||
3531 EQ (charset, Vcharset_chinese_big5_2))
3533 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3535 return Fcons (make_int (b1), make_int (b2));
3542 /************************************************************************/
3544 /************************************************************************/
/* Detection step for UCS-4 over a chunk of SRC.  It switches on
   st->ucs4.in_byte, resets that field to 0 on one of the branches, and
   ends by returning CODING_CATEGORY_UCS4_MASK.
   NOTE(review): most of the per-byte logic is not visible in this
   excerpt. */
3547 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3553 switch (st->ucs4.in_byte)
3562 st->ucs4.in_byte = 0;
3568 return CODING_CATEGORY_UCS4_MASK;
/* Decode N bytes of UCS-4 at SRC into DST.  CH accumulates the bytes
   of the current character most significant byte first, and COUNTER
   tracks the position inside the current character; both are carried
   in the stream between calls.  A partially assembled character is
   flushed only at end of data.  The end-of-data bit CODING_STATE_END
   is kept in FLAGS, so FLAGS is what must be tested; COUNTER is a byte
   countdown and carries no flag bits. */
3572 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3573 unsigned_char_dynarr *dst, unsigned int n)
3575 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3576 unsigned int flags = str->flags;
3577 unsigned int ch = str->ch;
3578 unsigned char counter = str->counter;
3582 unsigned char c = *src++;
3590 DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3595 ch = ( ch << 8 ) | c;
3599 if (flags & CODING_STATE_END)
3600 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3604 str->counter = counter;
/* Append the character code CH to DST as four bytes, most significant
   byte first.  STR and FLAGS are not used by the statements visible
   here. */
3608 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3609 unsigned_char_dynarr *dst, unsigned int *flags)
3611 Dynarr_add (dst, ch >> 24);
3612 Dynarr_add (dst, ch >> 16);
3613 Dynarr_add (dst, ch >> 8);
3614 Dynarr_add (dst, ch );
3618 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3619 unsigned int *flags)
3624 /************************************************************************/
3626 /************************************************************************/
/* Detector for the UTF-8 coding category.
   ST->utf8.in_byte is set by the visible code to 5, 4, 3, 2 or 1 --
   presumably the number of continuation bytes still expected after a
   lead byte (TODO confirm; the selecting conditions are not visible in
   this copy).  ESC, SI and SO are singled out by an explicit test, and a
   byte that is not of the form 10xxxxxx is caught by the
   continuation-byte check.  The visible return value is
   CODING_CATEGORY_UTF8_MASK. */
3629 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3634 unsigned char c = *src++;
3635 switch (st->utf8.in_byte)
3638 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3641 st->utf8.in_byte = 5;
3643 st->utf8.in_byte = 4;
3645 st->utf8.in_byte = 3;
3647 st->utf8.in_byte = 2;
3649 st->utf8.in_byte = 1;
/* True when C does not have the bit pattern 10xxxxxx. */
3654 if ((c & 0xc0) != 0x80)
3660 return CODING_CATEGORY_UTF8_MASK;
/* Decode UTF-8 encoded bytes from SRC into DST (N is presumably the
   number of input bytes -- confirm).
   FLAGS, CH, EOL_TYPE and COUNTER are loaded from the decoding stream on
   entry.  Each byte is classified against the thresholds 0xf8, 0xf0,
   0xe0 and 0xc0; continuation bytes contribute their low six bits to
   the accumulator CH.  If CODING_STATE_END is set in FLAGS when the
   input is exhausted, whatever is left in CH is flushed with
   DECODE_OUTPUT_PARTIAL_CHAR.  COUNTER is stored back before
   returning. */
3664 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3665 unsigned_char_dynarr *dst, unsigned int n)
3667 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3668 unsigned int flags = str->flags;
3669 unsigned int ch = str->ch;
3670 eol_type_t eol_type = str->eol_type;
3671 unsigned char counter = str->counter;
3675 unsigned char c = *src++;
/* Classification by descending threshold; the action taken for each
   class is not visible in this copy of the file. */
3684 else if ( c >= 0xf8 )
3689 else if ( c >= 0xf0 )
3694 else if ( c >= 0xe0 )
3699 else if ( c >= 0xc0 )
/* Presumably the remaining (single-byte) case: run end-of-line handling,
   then emit C as a code of its own. */
3706 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3707 DECODE_ADD_UCS_CHAR (c, dst);
/* Fold the low six bits of C into CH and emit the finished code. */
3711 ch = ( ch << 6 ) | ( c & 0x3f );
3712 DECODE_ADD_UCS_CHAR (ch, dst);
/* Fold the low six bits of C into CH without emitting anything yet. */
3717 ch = ( ch << 6 ) | ( c & 0x3f );
3720 label_continue_loop:;
3723 if (flags & CODING_STATE_END)
3724 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3728 str->counter = counter;
/* Append the UTF-8 form of character code CH to DST.
   The length is chosen by magnitude:
     CH <= 0x7f       1 byte
     CH <= 0x7ff      2 bytes (lead 0xc0)
     CH <= 0xffff     3 bytes (lead 0xe0)
     CH <= 0x1fffff   4 bytes (lead 0xf0)
     CH <= 0x3ffffff  5 bytes (lead 0xf8)
     otherwise        6 bytes (lead 0xfc)
   Every byte after the lead carries six payload bits ORed with 0x80.
   The first branch handles end-of-line conversion according to the
   coding system's EOL type (its guarding condition is not visible in
   this copy; presumably CH is a newline -- confirm).  FLAGS is not used
   by the statements visible here. */
3732 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3733 unsigned_char_dynarr *dst, unsigned int *flags)
3735 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Emit a CR first unless the EOL type is LF or autodetect; then emit CH
   itself unless the EOL type is CR. */
3739 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3740 Dynarr_add (dst, '\r');
3741 if (eol_type != EOL_CR)
3742 Dynarr_add (dst, ch);
3744 else if (ch <= 0x7f)
3746 Dynarr_add (dst, ch);
3748 else if (ch <= 0x7ff)
3750 Dynarr_add (dst, (ch >> 6) | 0xc0);
3751 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3753 else if (ch <= 0xffff)
3755 Dynarr_add (dst, (ch >> 12) | 0xe0);
3756 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3757 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3759 else if (ch <= 0x1fffff)
3761 Dynarr_add (dst, (ch >> 18) | 0xf0);
3762 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3763 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3764 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3766 else if (ch <= 0x3ffffff)
3768 Dynarr_add (dst, (ch >> 24) | 0xf8);
3769 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3770 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3771 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3772 Dynarr_add (dst, (ch & 0x3f) | 0x80);
/* Everything larger takes the six-byte form. */
3776 Dynarr_add (dst, (ch >> 30) | 0xfc);
3777 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3778 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3779 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3780 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3781 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3786 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3787 unsigned int *flags)
3792 /************************************************************************/
3793 /* ISO2022 methods */
3794 /************************************************************************/
3796 /* The following note describes the coding system ISO2022 briefly.
3797 Since the intention of this note is to help understand the
3798 functions in this file, some parts are NOT ACCURATE or OVERLY
3799 SIMPLIFIED. For thorough understanding, please refer to the
3800 original document of ISO2022.
3802 ISO2022 provides many mechanisms to encode several character sets
3803 in 7-bit and 8-bit environments. For 7-bit environments, all text
3804 is encoded using bytes less than 128. This may make the encoded
3805 text a little bit longer, but the text passes more easily through
3806 several gateways, some of which strip off MSB (Most Significant Bit).
3808 There are two kinds of character sets: control character set and
3809 graphic character set. The former contains control characters such
3810 as `newline' and `escape' to provide control functions (control
3811 functions are also provided by escape sequences). The latter
3812 contains graphic characters such as 'A' and '-'. Emacs recognizes
3813 two control character sets and many graphic character sets.
3815 Graphic character sets are classified into one of the following
3816 four classes, according to the number of bytes (DIMENSION) and
3817 number of characters in one dimension (CHARS) of the set:
3818 - DIMENSION1_CHARS94
3819 - DIMENSION1_CHARS96
3820 - DIMENSION2_CHARS94
3821 - DIMENSION2_CHARS96
3823 In addition, each character set is assigned an identification tag,
3824 unique for each set, called "final character" (denoted as <F>
3825 hereafter). The <F> of each character set is decided by ECMA(*)
3826 when it is registered in ISO. The code range of <F> is 0x30..0x7E
3827 (0x30..0x3F are for private use only).
3829 Note (*): ECMA = European Computer Manufacturers Association
3831 Here are examples of graphic character set [NAME(<F>)]:
3832 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3833 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3834 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3835 o DIMENSION2_CHARS96 -- none for the moment
3837 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3838 C0 [0x00..0x1F] -- control character plane 0
3839 GL [0x20..0x7F] -- graphic character plane 0
3840 C1 [0x80..0x9F] -- control character plane 1
3841 GR [0xA0..0xFF] -- graphic character plane 1
3843 A control character set is directly designated and invoked to C0 or
3844 C1 by an escape sequence. The most common case is that:
3845 - ISO646's control character set is designated/invoked to C0, and
3846 - ISO6429's control character set is designated/invoked to C1,
3847 and usually these designations/invocations are omitted in encoded
3848 text. In a 7-bit environment, only C0 can be used, and a control
3849 character for C1 is encoded by an appropriate escape sequence to
3850 fit into the environment. All control characters for C1 are
3851 defined to have corresponding escape sequences.
3853 A graphic character set is at first designated to one of four
3854 graphic registers (G0 through G3), then these graphic registers are
3855 invoked to GL or GR. These designations and invocations can be
3856 done independently. The most common case is that G0 is invoked to
3857 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3858 these invocations and designations are omitted in encoded text.
3859 In a 7-bit environment, only GL can be used.
3861 When a graphic character set of CHARS94 is invoked to GL, codes
3862 0x20 and 0x7F of the GL area work as control characters SPACE and
3863 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used.
3866 There are two ways of invocation: locking-shift and single-shift.
3867 With locking-shift, the invocation lasts until the next different
3868 invocation, whereas with single-shift, the invocation affects the
3869 following character only and doesn't affect the locking-shift
3870 state. Invocations are done by the following control characters or escape sequences:
3873 ----------------------------------------------------------------------
3874 abbrev function cntrl escape seq description
3875 ----------------------------------------------------------------------
3876 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3877 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3878 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3879 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3880 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3881 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3882 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3883 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3884 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3885 ----------------------------------------------------------------------
3886 (*) These are not used by any known coding system.
3888 Control characters for these functions are defined by macros
3889 ISO_CODE_XXX in `coding.h'.
3891 Designations are done by the following escape sequences:
3892 ----------------------------------------------------------------------
3893 escape sequence description
3894 ----------------------------------------------------------------------
3895 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3896 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3897 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3898 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3899 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3900 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3901 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3902 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3903 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3904 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3905 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3906 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3907 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3908 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3909 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3910 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3911 ----------------------------------------------------------------------
3913 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3914 of dimension 1, chars 94, and final character <F>, etc...
3916 Note (*): Although these designations are not allowed in ISO2022,
3917 Emacs accepts them on decoding, and produces them on encoding
3918 CHARS96 character sets in a coding system which is characterized as
3919 7-bit environment, non-locking-shift, and non-single-shift.
3921 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3922 '(' can be omitted. We refer to this as "short-form" hereafter.
3924 Now you may notice that there are a lot of ways for encoding the
3925 same multilingual text in ISO2022. Actually, there exist many
3926 coding systems such as Compound Text (used in X11's inter client
3927 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3928 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3929 localized platforms), and all of these are variants of ISO2022.
3931 In addition to the above, Emacs handles two more kinds of escape
3932 sequences: ISO6429's direction specification and Emacs' private
3933 sequence for specifying character composition.
3935 ISO6429's direction specification takes the following form:
3936 o CSI ']' -- end of the current direction
3937 o CSI '0' ']' -- end of the current direction
3938 o CSI '1' ']' -- start of left-to-right text
3939 o CSI '2' ']' -- start of right-to-left text
3940 The control character CSI (0x9B: control sequence introducer) is
3941 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3943 Character composition specification takes the following form:
3944 o ESC '0' -- start character composition
3945 o ESC '1' -- end character composition
3946 Since these are not standard escape sequences of any ISO standard,
3947 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its initial condition.
   For each of the four registers the charset is taken from
   CODING_SYSTEM's initial charsets when CODING_SYSTEM is non-nil, and
   Qt is stored as a placeholder charset (presumably in the nil case --
   the `else' is not visible in this copy).  The invalid-designation
   marker of each register is cleared.  After the loop the escape parser
   is returned to ISO_ESC_NOTHING with no buffered escape bytes,
   register 0 is selected for the left half and register 1 for the right
   half, and all direction / literal-output bookkeeping is cleared. */
3950 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3954 for (i = 0; i < 4; i++)
3956 if (!NILP (coding_system))
3958 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3960 iso->charset[i] = Qt;
3961 iso->invalid_designated[i] = 0;
3963 iso->esc = ISO_ESC_NOTHING;
3964 iso->esc_bytes_index = 0;
3965 iso->register_left = 0;
3966 iso->register_right = 1;
3967 iso->switched_dir_and_no_valid_charset_yet = 0;
3968 iso->invalid_switch_dir = 0;
3969 iso->output_direction_sequence = 0;
3970 iso->output_literally = 0;
/* Empty the composite-character buffer if one exists. */
3971 #ifdef ENABLE_COMPOSITE_CHARS
3972 if (iso->composite_chars)
3973 Dynarr_reset (iso->composite_chars);
3978 fit_to_be_escape_quoted (unsigned char c)
3995 /* Parse one byte of an ISO2022 escape sequence.
3996 If the result is an invalid escape sequence, return 0 and
3997 do not change anything in STR. Otherwise, if the result is
3998 an incomplete escape sequence, update ISO2022.ESC and
3999 ISO2022.ESC_BYTES and return -1. Otherwise, update
4000 all the state variables (but not ISO2022.ESC_BYTES) and
return a distinct non-zero value.  (NOTE(review): the end of this
sentence is missing from this copy of the file; confirm the exact
value against the function's return statements.)
4003 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4004 or invocation of an invalid character set and treat that as
4005 an unrecognized escape sequence. */
4008 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4009 unsigned char c, unsigned int *flags,
4010 int check_invalid_charsets)
4012 /* (1) If we're at the end of a designation sequence, CS is the
4013 charset being designated and REG is the register to designate it to.
4016 (2) If we're at the end of a locking-shift sequence, REG is
4017 the register to invoke and HALF (0 == left, 1 == right) is
4018 the half to invoke it into.
4020 (3) If we're at the end of a single-shift sequence, REG is
4021 the register to invoke. */
4022 Lisp_Object cs = Qnil;
4025 /* NOTE: This code uses gotos extensively.
4026 The reason for this is that we're basically implementing
4027 a state machine here, and hierarchical languages like C
4028 don't really provide a clean way of doing this. */
4030 if (! (*flags & CODING_STATE_ESCAPE))
4031 /* At beginning of escape sequence; we need to reset our
4032 escape-state variables. */
4033 iso->esc = ISO_ESC_NOTHING;
4035 iso->output_literally = 0;
4036 iso->output_direction_sequence = 0;
4040 case ISO_ESC_NOTHING:
4041 iso->esc_bytes_index = 0;
4044 case ISO_CODE_ESC: /* Start escape sequence */
4045 *flags |= CODING_STATE_ESCAPE;
4049 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4050 *flags |= CODING_STATE_ESCAPE;
4051 iso->esc = ISO_ESC_5_11;
4054 case ISO_CODE_SO: /* locking shift 1 */
4057 case ISO_CODE_SI: /* locking shift 0 */
4061 case ISO_CODE_SS2: /* single shift */
4064 case ISO_CODE_SS3: /* single shift */
4068 default: /* Other control characters */
4075 /**** single shift ****/
4077 case 'N': /* single shift 2 */
4080 case 'O': /* single shift 3 */
4084 /**** locking shift ****/
4086 case '~': /* locking shift 1 right */
4089 case 'n': /* locking shift 2 */
4092 case '}': /* locking shift 2 right */
4095 case 'o': /* locking shift 3 */
4098 case '|': /* locking shift 3 right */
4102 #ifdef ENABLE_COMPOSITE_CHARS
4103 /**** composite ****/
4106 iso->esc = ISO_ESC_START_COMPOSITE;
4107 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4108 CODING_STATE_COMPOSITE;
4112 iso->esc = ISO_ESC_END_COMPOSITE;
4113 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4114 ~CODING_STATE_COMPOSITE;
4116 #endif /* ENABLE_COMPOSITE_CHARS */
4118 /**** directionality ****/
4121 iso->esc = ISO_ESC_5_11;
4124 /**** designation ****/
4126 case '$': /* multibyte charset prefix */
4127 iso->esc = ISO_ESC_2_4;
/* Intermediate bytes 0x28..0x2F select one of eight consecutive
   ISO_ESC_2_8.. states. */
4131 if (0x28 <= c && c <= 0x2F)
4133 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4137 /* This function is called with CODESYS equal to nil when
4138 doing coding-system detection. */
4140 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4141 && fit_to_be_escape_quoted (c))
4143 iso->esc = ISO_ESC_LITERAL;
4144 *flags &= CODING_STATE_ISO2022_LOCK;
4154 /**** directionality ****/
4156 case ISO_ESC_5_11: /* ISO6429 direction control */
4159 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4160 goto directionality;
4162 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4163 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4164 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4168 case ISO_ESC_5_11_0:
4171 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4172 goto directionality;
4176 case ISO_ESC_5_11_1:
/* NOTE(review): unlike the two cases above, this assigns with "="
   rather than masking with "&=", so every CODING_STATE_ISO2022_LOCK bit
   other than R2L ends up set instead of merely preserved.  Looks
   unintended -- confirm. */
4179 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4180 goto directionality;
4184 case ISO_ESC_5_11_2:
4187 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4188 goto directionality;
4193 iso->esc = ISO_ESC_DIRECTIONALITY;
4194 /* Various junk here to attempt to preserve the direction sequences
4195 literally in the text if they would otherwise be swallowed due
4196 to invalid designations that don't show up as actual charset
4197 changes in the text. */
4198 if (iso->invalid_switch_dir)
4200 /* We already inserted a direction switch literally into the
4201 text. We assume (#### this may not be right) that the
4202 next direction switch is the one going the other way,
4203 and we need to output that literally as well. */
4204 iso->output_literally = 1;
4205 iso->invalid_switch_dir = 0;
4211 /* If we are in the thrall of an invalid designation,
4212 then stick the directionality sequence literally into the
4213 output stream so it ends up in the original text again. */
4214 for (jj = 0; jj < 4; jj++)
4215 if (iso->invalid_designated[jj])
4219 iso->output_literally = 1;
4220 iso->invalid_switch_dir = 1;
4223 /* Indicate that we haven't yet seen a valid designation,
4224 so that if a switch-dir is directly followed by an
4225 invalid designation, both get inserted literally. */
4226 iso->switched_dir_and_no_valid_charset_yet = 1;
4231 /**** designation ****/
4234 if (0x28 <= c && c <= 0x2F)
4236 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
/* Final bytes 0x40..0x42 directly after the multibyte prefix: look up
   a 94x94 charset without an intermediate byte. */
4239 if (0x40 <= c && c <= 0x42)
4241 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4242 *flags & CODING_STATE_R2L ?
4243 CHARSET_RIGHT_TO_LEFT :
4244 CHARSET_LEFT_TO_RIGHT);
4254 if (c < '0' || c > '~')
4255 return 0; /* bad final byte */
/* Derive the charset type and the target register from the pending
   state: the upper four states of each group mean a 96-character set,
   and the low two bits of the offset give the register. */
4257 if (iso->esc >= ISO_ESC_2_8 &&
4258 iso->esc <= ISO_ESC_2_15)
4260 type = ((iso->esc >= ISO_ESC_2_12) ?
4261 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4262 reg = (iso->esc - ISO_ESC_2_8) & 3;
4264 else if (iso->esc >= ISO_ESC_2_4_8 &&
4265 iso->esc <= ISO_ESC_2_4_15)
4267 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4268 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4269 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4273 /* Can this ever be reached? -slb */
4277 cs = CHARSET_BY_ATTRIBUTES (type, c,
4278 *flags & CODING_STATE_R2L ?
4279 CHARSET_RIGHT_TO_LEFT :
4280 CHARSET_LEFT_TO_RIGHT);
/* Remember the byte as part of the escape sequence seen so far. */
4286 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4290 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4291 /* can't invoke something that ain't there. */
4293 iso->esc = ISO_ESC_SINGLE_SHIFT;
4294 *flags &= CODING_STATE_ISO2022_LOCK;
4296 *flags |= CODING_STATE_SS2;
4298 *flags |= CODING_STATE_SS3;
4302 if (check_invalid_charsets &&
4303 !CHARSETP (iso->charset[reg]))
4304 /* can't invoke something that ain't there. */
4307 iso->register_right = reg;
4309 iso->register_left = reg;
4310 *flags &= CODING_STATE_ISO2022_LOCK;
4311 iso->esc = ISO_ESC_LOCKING_SHIFT;
4315 if (NILP (cs) && check_invalid_charsets)
4317 iso->invalid_designated[reg] = 1;
4318 iso->charset[reg] = Vcharset_ascii;
4319 iso->esc = ISO_ESC_DESIGNATE;
4320 *flags &= CODING_STATE_ISO2022_LOCK;
4321 iso->output_literally = 1;
4322 if (iso->switched_dir_and_no_valid_charset_yet)
4324 /* We encountered a switch-direction followed by an
4325 invalid designation. Ensure that the switch-direction
4326 gets outputted; otherwise it will probably get eaten
4327 when the text is written out again. */
4328 iso->switched_dir_and_no_valid_charset_yet = 0;
4329 iso->output_direction_sequence = 1;
4330 /* And make sure that the switch-dir going the other
4331 way gets outputted, as well. */
4332 iso->invalid_switch_dir = 1;
4336 /* This function is called with CODESYS equal to nil when
4337 doing coding-system detection. */
4338 if (!NILP (codesys))
4340 charset_conversion_spec_dynarr *dyn =
4341 XCODING_SYSTEM (codesys)->iso2022.input_conv;
/* Apply the coding system's input charset conversions to CS. */
4347 for (i = 0; i < Dynarr_length (dyn); i++)
4349 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4350 if (EQ (cs, spec->from_charset))
4351 cs = spec->to_charset;
4356 iso->charset[reg] = cs;
4357 iso->esc = ISO_ESC_DESIGNATE;
4358 *flags &= CODING_STATE_ISO2022_LOCK;
4359 if (iso->invalid_designated[reg])
4361 iso->invalid_designated[reg] = 0;
4362 iso->output_literally = 1;
4364 if (iso->switched_dir_and_no_valid_charset_yet)
4365 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Detector for the ISO2022 coding categories.
   On first use the embedded escape parser is reset (with a nil coding
   system) and the candidate mask is set to all five ISO categories:
   ISO_7, ISO_8_DESIGNATE, ISO_8_1, ISO_8_2 and ISO_LOCK_SHIFT.  The mask
   is then narrowed as input is examined:
     - ISO_7 is dropped at the points that count high bytes;
     - an odd count of high bytes with no single shift seen drops
       ISO_8_2;
     - a completed designation drops ISO_8_1 and ISO_8_2;
     - a completed locking shift leaves only ISO_LOCK_SHIFT and stops
       the scan;
     - a completed single shift drops ISO_8_DESIGNATE.
   Escape sequences are recognized by parse_iso2022_esc, called with
   CHECK_INVALID_CHARSETS == 0.
   NOTE(review): the loop header, several conditions and the return
   statement are not visible in this copy of the file. */
4370 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4375 /* #### There are serious deficiencies in the recognition mechanism
4376 here. This needs to be much smarter if it's going to cut it.
4377 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4378 it should be detected as Latin-1.
4379 All the ISO2022 stuff in this file should be synced up with the
4380 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4381 Perhaps we should wait till R2L works in FSF Emacs? */
4383 if (!st->iso2022.initted)
4385 reset_iso2022 (Qnil, &st->iso2022.iso);
4386 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4387 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4388 CODING_CATEGORY_ISO_8_1_MASK |
4389 CODING_CATEGORY_ISO_8_2_MASK |
4390 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4391 st->iso2022.flags = 0;
4392 st->iso2022.high_byte_count = 0;
4393 st->iso2022.saw_single_shift = 0;
4394 st->iso2022.initted = 1;
4397 mask = st->iso2022.mask;
4404 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4405 st->iso2022.high_byte_count++;
4409 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4411 if (st->iso2022.high_byte_count & 1)
4412 /* odd number of high bytes; assume not iso-8-2 */
4413 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4415 st->iso2022.high_byte_count = 0;
4416 st->iso2022.saw_single_shift = 0;
4418 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4420 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4421 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4422 { /* control chars */
4425 /* Allow and ignore control characters that you might
4426 reasonably see in a text file */
4431 case 8: /* backspace */
4432 case 11: /* vertical tab */
4433 case 12: /* form feed */
4434 case 26: /* MS-DOS C-z junk */
4435 case 31: /* '^_' -- for info */
4436 goto label_continue_loop;
4443 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4446 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4447 &st->iso2022.flags, 0))
4449 switch (st->iso2022.iso.esc)
4451 case ISO_ESC_DESIGNATE:
4452 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4453 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4455 case ISO_ESC_LOCKING_SHIFT:
4456 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4457 goto ran_out_of_chars;
4458 case ISO_ESC_SINGLE_SHIFT:
4459 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4460 st->iso2022.saw_single_shift = 1;
4469 goto ran_out_of_chars;
4472 label_continue_loop:;
/* Post-process a detection MASK of ISO2022 categories: while the
   seven-bit category is still a candidate, the three eight-bit
   categories (ISO_8_DESIGNATE, ISO_8_1, ISO_8_2) are removed.
   Presumably returns the adjusted mask -- the return statement is not
   visible in this copy of the file. */
4481 postprocess_iso2022_mask (int mask)
4483 /* #### kind of cheesy */
4484 /* If seven-bit ISO is allowed, then assume that the encoding is
4485 entirely seven-bit and turn off the eight-bit ones. */
4486 if (mask & CODING_CATEGORY_ISO_7_MASK)
4487 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4488 CODING_CATEGORY_ISO_8_1_MASK |
4489 CODING_CATEGORY_ISO_8_2_MASK);
4493 /* If FLAGS is a null pointer or specifies right-to-left motion,
4494 output a switch-dir-to-left-to-right sequence to DST.
4495 Also update FLAGS if it is not a null pointer.
4496 If INTERNAL_P is set, we are outputting in internal format and
4497 need to handle the CSI differently.
The sequence written is the introducer followed by '0' ']'.  The
introducer is ESC '[' for a seven-bit coding system; otherwise it is
the CSI byte, added through DECODE_ADD_BINARY_CHAR when INTERNAL_P is
set and added raw when it is not. */
4500 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4501 unsigned_char_dynarr *dst,
4502 unsigned int *flags,
4505 if (!flags || (*flags & CODING_STATE_R2L))
4507 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4509 Dynarr_add (dst, ISO_CODE_ESC);
4510 Dynarr_add (dst, '[');
4512 else if (internal_p)
4513 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4515 Dynarr_add (dst, ISO_CODE_CSI);
4516 Dynarr_add (dst, '0');
4517 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null here according to the header comment;
   the null guard for this store is not visible in this copy of the
   file -- confirm it exists. */
4519 *flags &= ~CODING_STATE_R2L;
4523 /* If FLAGS is a null pointer or specifies a direction different from
4524 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4525 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4526 sequence to DST. Also update FLAGS if it is not a null pointer.
4527 If INTERNAL_P is set, we are outputting in internal format and
4528 need to handle the CSI differently.
Switching to left-to-right is delegated to
restore_left_to_right_direction.  Switching to right-to-left is done
only when the coding system does not have its NO_ISO6429 property set;
the sequence written is the introducer (ESC '[' for a seven-bit coding
system, otherwise the CSI byte) followed by '2' ']'. */
4531 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4532 unsigned_char_dynarr *dst, unsigned int *flags,
4535 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4536 direction == CHARSET_LEFT_TO_RIGHT)
4537 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4538 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4539 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4540 direction == CHARSET_RIGHT_TO_LEFT)
4542 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4544 Dynarr_add (dst, ISO_CODE_ESC);
4545 Dynarr_add (dst, '[');
4547 else if (internal_p)
4548 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4550 Dynarr_add (dst, ISO_CODE_CSI);
4551 Dynarr_add (dst, '2');
4552 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null here according to the header comment;
   the null guard for this store is not visible in this copy of the
   file -- confirm it exists. */
4554 *flags |= CODING_STATE_R2L;
4558 /* Convert ISO2022-format data to internal format.
Bytes are read from SRC one at a time and handled in one of three ways:
  - while CODING_STATE_ESCAPE is set in FLAGS, the byte is fed to
    parse_iso2022_esc; sequences that are rejected or flagged
    output_literally are copied to DST byte by byte from the saved
    escape bytes;
  - C0/C1 control bytes first flush any pending partial character or
    single shift, then go through end-of-line handling and the escape
    parser;
  - everything else is a graphic byte, resolved against the charset in
    the register selected by a pending single shift or by the left or
    right half the byte belongs to.
FLAGS, CH and EOL_TYPE are loaded from the decoding stream on entry.
If CODING_STATE_END is set in FLAGS once the input is consumed,
pending composite characters and any partial character are flushed. */
4561 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4562 unsigned_char_dynarr *dst, unsigned int n)
4564 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4565 unsigned int flags = str->flags;
4566 unsigned int ch = str->ch;
4567 eol_type_t eol_type = str->eol_type;
4568 #ifdef ENABLE_COMPOSITE_CHARS
4569 unsigned_char_dynarr *real_dst = dst;
4571 Lisp_Object coding_system;
4573 XSETCODING_SYSTEM (coding_system, str->codesys);
4575 #ifdef ENABLE_COMPOSITE_CHARS
4576 if (flags & CODING_STATE_COMPOSITE)
4577 dst = str->iso2022.composite_chars;
4578 #endif /* ENABLE_COMPOSITE_CHARS */
4582 unsigned char c = *src++;
4583 if (flags & CODING_STATE_ESCAPE)
4584 { /* Within ESC sequence */
4585 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4590 switch (str->iso2022.esc)
4592 #ifdef ENABLE_COMPOSITE_CHARS
4593 case ISO_ESC_START_COMPOSITE:
4594 if (str->iso2022.composite_chars)
4595 Dynarr_reset (str->iso2022.composite_chars);
4597 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4598 dst = str->iso2022.composite_chars;
4600 case ISO_ESC_END_COMPOSITE:
4602 Bufbyte comstr[MAX_EMCHAR_LEN];
4604 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4605 Dynarr_length (dst));
4607 len = set_charptr_emchar (comstr, emch);
4608 Dynarr_add_many (dst, comstr, len);
4611 #endif /* ENABLE_COMPOSITE_CHARS */
4613 case ISO_ESC_LITERAL:
4614 COMPOSE_FLUSH_CHARS (str, dst);
4615 DECODE_ADD_BINARY_CHAR (c, dst);
4619 /* Everything else handled already */
4624 /* Attempted error recovery. */
4625 if (str->iso2022.output_direction_sequence)
4626 ensure_correct_direction (flags & CODING_STATE_R2L ?
4627 CHARSET_RIGHT_TO_LEFT :
4628 CHARSET_LEFT_TO_RIGHT,
4629 str->codesys, dst, 0, 1);
4630 /* More error recovery. */
4631 if (!retval || str->iso2022.output_literally)
4633 /* Output the (possibly invalid) sequence */
4635 COMPOSE_FLUSH_CHARS (str, dst);
4636 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4637 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4638 flags &= CODING_STATE_ISO2022_LOCK;
4640 n++, src--;/* Repeat the loop with the same character. */
4643 /* No sense in reprocessing the final byte of the
4644 escape sequence; it could mess things up anyway.
4646 COMPOSE_FLUSH_CHARS (str, dst);
4647 DECODE_ADD_BINARY_CHAR (c, dst);
4652 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4653 { /* Control characters */
4655 /***** Error-handling *****/
4657 /* If we were in the middle of a character, dump out the
4658 partial character. */
4661 COMPOSE_FLUSH_CHARS (str, dst);
4662 DECODE_ADD_BINARY_CHAR (ch, dst);
4666 /* If we just saw a single-shift character, dump it out.
4667 This may dump out the wrong sort of single-shift character,
4668 but at least it will give an indication that something went
4670 if (flags & CODING_STATE_SS2)
4672 COMPOSE_FLUSH_CHARS (str, dst);
4673 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4674 flags &= ~CODING_STATE_SS2;
4676 if (flags & CODING_STATE_SS3)
4678 COMPOSE_FLUSH_CHARS (str, dst);
4679 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4680 flags &= ~CODING_STATE_SS3;
4683 /***** Now handle the control characters. *****/
4689 COMPOSE_FLUSH_CHARS (str, dst);
4690 if (eol_type == EOL_CR)
4691 Dynarr_add (dst, '\n');
4692 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4693 Dynarr_add (dst, c);
4695 flags |= CODING_STATE_CR;
4696 goto label_continue_loop;
4698 else if (flags & CODING_STATE_CR)
4699 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
4701 Dynarr_add (dst, '\r');
4702 flags &= ~CODING_STATE_CR;
4705 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4708 flags &= CODING_STATE_ISO2022_LOCK;
/* Give the control byte to the escape parser; if it is not accepted,
   pass it through unchanged. */
4710 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4712 COMPOSE_FLUSH_CHARS (str, dst);
4713 DECODE_ADD_BINARY_CHAR (c, dst);
4717 { /* Graphic characters */
4718 Lisp_Object charset;
4727 COMPOSE_FLUSH_CHARS (str, dst);
4728 if (eol_type == EOL_CR)
4729 Dynarr_add (dst, '\n');
4730 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4731 Dynarr_add (dst, c);
4733 flags |= CODING_STATE_CR;
4734 goto label_continue_loop;
4736 else if (flags & CODING_STATE_CR)
4737 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
4739 Dynarr_add (dst, '\r');
4740 flags &= ~CODING_STATE_CR;
4743 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4746 /* Now determine the charset. */
4747 reg = ((flags & CODING_STATE_SS2) ? 2
4748 : (flags & CODING_STATE_SS3) ? 3
4749 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4750 : str->iso2022.register_left);
4751 charset = str->iso2022.charset[reg];
4753 /* Error checking: */
4754 if (! CHARSETP (charset)
4755 || str->iso2022.invalid_designated[reg]
4756 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4757 && XCHARSET_CHARS (charset) == 94))
4758 /* Mrmph. We are trying to invoke a register that has no
4759 or an invalid charset in it, or trying to add a character
4760 outside the range of the charset. Insert that char literally
4761 to preserve it for the output. */
4763 COMPOSE_FLUSH_CHARS (str, dst);
4764 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4765 DECODE_ADD_BINARY_CHAR (c, dst);
4770 /* Things are probably hunky-dorey. */
4772 /* Fetch reverse charset, maybe. */
4773 if (((flags & CODING_STATE_R2L) &&
4774 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4776 (!(flags & CODING_STATE_R2L) &&
4777 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4779 Lisp_Object new_charset =
4780 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4781 if (!NILP (new_charset))
4782 charset = new_charset;
4786 if (XCHARSET_DIMENSION (charset) == 1)
4790 COMPOSE_FLUSH_CHARS (str, dst);
4791 DECODE_ADD_BINARY_CHAR (ch, dst);
4794 COMPOSE_ADD_CHAR (str,
4795 MAKE_CHAR (charset, c & 0x7F, 0), dst);
4799 COMPOSE_ADD_CHAR (str,
4800 MAKE_CHAR (charset, ch & 0x7F, c & 0x7F),
/* The output form below is chosen by the charset's representation
   length: an optional private prefix byte, the leading byte, then the
   position byte(s) with the high bit set. */
4807 lb = XCHARSET_LEADING_BYTE (charset);
4808 switch (XCHARSET_REP_BYTES (charset))
4811 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4812 Dynarr_add (dst, c & 0x7F);
4815 case 2: /* one-byte official */
4816 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4817 Dynarr_add (dst, lb);
4818 Dynarr_add (dst, c | 0x80);
4821 case 3: /* one-byte private or two-byte official */
4822 if (XCHARSET_PRIVATE_P (charset))
4824 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4825 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4826 Dynarr_add (dst, lb);
4827 Dynarr_add (dst, c | 0x80);
4833 Dynarr_add (dst, lb);
4834 Dynarr_add (dst, ch | 0x80);
4835 Dynarr_add (dst, c | 0x80);
4843 default: /* two-byte private */
4846 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4847 Dynarr_add (dst, lb);
4848 Dynarr_add (dst, ch | 0x80);
4849 Dynarr_add (dst, c | 0x80);
4859 flags &= CODING_STATE_ISO2022_LOCK;
4862 label_continue_loop:;
4865 if (flags & CODING_STATE_END)
4867 COMPOSE_FLUSH_CHARS (str, dst);
4868 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4875 /***** ISO2022 encoder *****/
4877 /* Designate CHARSET into register REG. */
/* Stores CHARSET in str->iso2022.charset[REG] and appends the bytes of
   the designation sequence to DST: ISO_CODE_ESC, one or two intermediate
   characters (optionally '$' plus an entry of inter94[] or inter96[]
   selected by REG), and finally the charset's final byte.

   NOTE(review): the embedded line numbers skip values throughout this
   function, so the braces, the statements guarded by the two early `if'
   tests, and the lines that choose between the inter94[] and inter96[]
   variants are not visible in this listing. */
4880 iso2022_designate (Lisp_Object charset, unsigned char reg,
4881 struct encoding_stream *str, unsigned_char_dynarr *dst)
/* Intermediate characters, indexed by REG. */
4883 static CONST char inter94[] = "()*+";
4884 static CONST char inter96[] = ",-./";
4885 unsigned short chars;
4886 unsigned char dimension;
4887 unsigned char final;
/* Remember what was designated before, then record the new charset. */
4888 Lisp_Object old_charset = str->iso2022.charset[reg];
4890 str->iso2022.charset[reg] = charset;
4891 if (!CHARSETP (charset))
4892 /* charset might be an initial nil or t. */
/* NOTE(review): the statement guarded by the test above is on a line not
   shown; presumably the function stops here for a non-charset value. */
4894 chars = XCHARSET_CHARS (charset);
4895 dimension = XCHARSET_DIMENSION (charset);
4896 final = XCHARSET_FINAL (charset);
/* True when output is not being forced for REG and the previously
   designated charset has the same chars, dimension and final byte as the
   new one.  NOTE(review): the guarded statement is not shown; presumably
   nothing needs to be emitted in that case. */
4897 if (!str->iso2022.force_charset_on_output[reg] &&
4898 CHARSETP (old_charset) &&
4899 XCHARSET_CHARS (old_charset) == chars &&
4900 XCHARSET_DIMENSION (old_charset) == dimension &&
4901 XCHARSET_FINAL (old_charset) == final)
4904 str->iso2022.force_charset_on_output[reg] = 0;
/* Apply the coding system's output charset conversions: whenever CHARSET
   equals a spec's from_charset it is replaced by that spec's to_charset.
   NOTE(review): `chars', `dimension' and `final' were read before this
   substitution and are not re-read in the visible lines -- confirm that
   this is intended. */
4907 charset_conversion_spec_dynarr *dyn =
4908 str->codesys->iso2022.output_conv;
4914 for (i = 0; i < Dynarr_length (dyn); i++)
4916 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4917 if (EQ (charset, spec->from_charset))
4918 charset = spec->to_charset;
/* Emit the escape sequence itself. */
4923 Dynarr_add (dst, ISO_CODE_ESC);
4928 Dynarr_add (dst, inter94[reg]);
4931 Dynarr_add (dst, '$');
4933 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4936 Dynarr_add (dst, inter94[reg]);
4941 Dynarr_add (dst, inter96[reg]);
4944 Dynarr_add (dst, '$');
4945 Dynarr_add (dst, inter96[reg]);
4949 Dynarr_add (dst, final);
/* If str->iso2022.register_left is not already 0, append ISO_CODE_SI to
   DST and record 0 in register_left.  Does nothing otherwise. */
4953 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4955 if (str->iso2022.register_left != 0)
4957 Dynarr_add (dst, ISO_CODE_SI);
4958 str->iso2022.register_left = 0;
/* Counterpart of ensure_normal_shift: if str->iso2022.register_left is
   not already 1, append ISO_CODE_SO to DST and record 1 in
   register_left.  Does nothing otherwise. */
4963 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4965 if (str->iso2022.register_left != 1)
4967 Dynarr_add (dst, ISO_CODE_SO);
4968 str->iso2022.register_left = 1;
/* Encode the single character CH for the ISO2022 coding system attached
   to STR, appending the resulting bytes to DST.  FLAGS is handed on to
   the direction helpers (restore_left_to_right_direction,
   ensure_correct_direction).  The charset and half used for the previous
   character are read from str->iso2022.current_charset / current_half on
   entry and written back on exit.

   NOTE(review): the embedded line numbers skip values throughout this
   function; the `if'/`else' lines that separate the ASCII, 0x80-0x9f and
   general cases, the assignments to `reg' and `half', and the `case'
   labels of the register-invocation switch are not visible here. */
4973 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4974 unsigned_char_dynarr *dst, unsigned int *flags)
4976 unsigned char charmask;
4977 Lisp_Coding_System* codesys = str->codesys;
4978 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4980 Lisp_Object charset = str->iso2022.current_charset;
4981 int half = str->iso2022.current_half;
4982 unsigned int byte1, byte2;
4986 restore_left_to_right_direction (codesys, dst, flags, 0);
4988 /* Make sure G0 contains ASCII */
4989 if ((ch > ' ' && ch < ISO_CODE_DEL)
4990 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4992 ensure_normal_shift (str, dst);
4993 iso2022_designate (Vcharset_ascii, 0, str, dst);
4996 /* If necessary, restore everything to the default state
4998 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5000 restore_left_to_right_direction (codesys, dst, flags, 0);
5002 ensure_normal_shift (str, dst);
5004 for (i = 0; i < 4; i++)
5006 Lisp_Object initial_charset =
5007 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5008 iso2022_designate (initial_charset, i, str, dst);
5013 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5014 Dynarr_add (dst, '\r');
5015 if (eol_type != EOL_CR)
5016 Dynarr_add (dst, ch);
5020 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5021 && fit_to_be_escape_quoted (ch))
5022 Dynarr_add (dst, ISO_CODE_ESC);
5023 Dynarr_add (dst, ch);
5026 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5028 charmask = (half == 0 ? 0x00 : 0x80);
5030 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5031 && fit_to_be_escape_quoted (ch))
5032 Dynarr_add (dst, ISO_CODE_ESC);
5033 /* you asked for it ... */
5034 Dynarr_add (dst, ch);
5040 /* Now determine which register to use. */
/* Each of the four registers is tried in turn: a register matches when
   the charset currently designated to it, or (second test, partly not
   shown) the coding system's initial charset for it, yields a
   non-negative code point for CH.  A two-dimensional charset's code
   point is split into byte1 (high 8 bits) and byte2 (low 8 bits). */
5042 for (i = 0; i < 4; i++)
5046 if ((CHARSETP (charset = str->iso2022.charset[i])
5047 && ((code_point = charset_code_point (charset, ch)) >= 0))
5051 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5052 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5054 if (XCHARSET_DIMENSION (charset) == 1)
5059 else /* if (XCHARSET_DIMENSION (charset) == 2) */
5061 byte1 = code_point >> 8;
5062 byte2 = code_point & 255;
/* Fallback search.  The global Vdefault_coded_charset_priority_list is
   saved, then repeatedly shortened to the tail after the charset that
   BREAKUP_CHAR last produced, until a charset with a final byte
   (XCHARSET_FINAL) turns up or the list is empty.  If the charset from
   the last breakup has no final byte, Vcharset_ascii is used.  The saved
   list is put back afterwards.
   NOTE(review): the condition under which this path is taken is on
   lines not shown. */
5070 Lisp_Object original_default_coded_charset_priority_list
5071 = Vdefault_coded_charset_priority_list;
5073 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5075 BREAKUP_CHAR (ch, charset, byte1, byte2);
5076 if (XCHARSET_FINAL (charset))
5078 Vdefault_coded_charset_priority_list
5079 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5080 Vdefault_coded_charset_priority_list));
5082 BREAKUP_CHAR (ch, charset, byte1, byte2);
5083 if (!XCHARSET_FINAL (charset))
5085 charset = Vcharset_ascii;
5089 Vdefault_coded_charset_priority_list
5090 = original_default_coded_charset_priority_list;
5092 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5093 codesys, dst, flags, 0);
/* Choose the register to designate CHARSET into, based on the charset's
   graphic value, which of registers 1-3 are non-nil, and the `seven' /
   `lock-shift' properties.  NOTE(review): the assignments to `reg' that
   these tests select are on lines not shown. */
5097 if (XCHARSET_GRAPHIC (charset) != 0)
5099 if (!NILP (str->iso2022.charset[1]) &&
5100 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5101 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5103 else if (!NILP (str->iso2022.charset[2]))
5105 else if (!NILP (str->iso2022.charset[3]))
5114 iso2022_designate (charset, reg, str, dst);
5116 /* Now invoke that register. */
/* The visible statements cover: a normal shift; a shift-out when the
   coding system is seven-bit; and two single-shift forms, emitted as
   ESC 'N' / ESC 'O' for seven-bit coding systems and as ISO_CODE_SS2 /
   ISO_CODE_SS3 otherwise.  NOTE(review): which value of `reg' selects
   which form is on lines not shown. */
5120 ensure_normal_shift (str, dst);
5124 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5126 ensure_shift_out (str, dst);
5133 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5135 Dynarr_add (dst, ISO_CODE_ESC);
5136 Dynarr_add (dst, 'N');
5141 Dynarr_add (dst, ISO_CODE_SS2);
5146 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5148 Dynarr_add (dst, ISO_CODE_ESC);
5149 Dynarr_add (dst, 'O');
5154 Dynarr_add (dst, ISO_CODE_SS3);
/* charmask sets the high bit of every emitted byte when `half' is
   non-zero; one or two bytes are written depending on the charset's
   dimension. */
5162 charmask = (half == 0 ? 0x00 : 0x80);
5164 switch (XCHARSET_DIMENSION (charset))
5167 Dynarr_add (dst, byte1 | charmask);
5170 Dynarr_add (dst, byte1 | charmask);
5171 Dynarr_add (dst, byte2 | charmask);
/* Cache the charset and half just used for the next call. */
5177 str->iso2022.current_charset = charset;
5178 str->iso2022.current_half = half;
/* Finish an ISO2022-encoded stream: append to DST whatever is needed to
   return to left-to-right direction and the normal shift state, then
   designate the coding system's initial charset into each of the four
   registers (iso2022_designate decides whether bytes are emitted). */
5182 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5183 unsigned int *flags)
5185 Lisp_Coding_System* codesys = str->codesys;
5188 restore_left_to_right_direction (codesys, dst, flags, 0);
5189 ensure_normal_shift (str, dst);
5190 for (i = 0; i < 4; i++)
5192 Lisp_Object initial_charset
5193 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5194 iso2022_designate (initial_charset, i, str, dst);
5199 /************************************************************************/
5200 /* No-conversion methods */
5201 /************************************************************************/
5203 /* This is used when reading in "binary" files -- i.e. files that may
5204 contain all 256 possible byte values and that are not to be
5205 interpreted as being in any particular decoding. */
/* DECODING is the decoding lstream, whose saved flags, partial character
   and EOL type are loaded into locals below; SRC and N describe the
   input bytes and DST collects the output.
   NOTE(review): the loop header, the fetch of each byte into `c' and any
   write-back of the locals into the stream state are on lines not shown
   in this listing. */
5207 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5208 unsigned_char_dynarr *dst, unsigned int n)
5211 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5212 unsigned int flags = str->flags;
5213 unsigned int ch = str->ch;
5214 eol_type_t eol_type = str->eol_type;
/* Per byte: give the EOL macro a chance to handle it (it can presumably
   jump to label_continue_loop), otherwise add the byte to DST as a
   binary character. */
5220 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5221 DECODE_ADD_BINARY_CHAR (c, dst);
5222 label_continue_loop:;
5225 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encoder counterpart of decode_coding_no_conversion: turns the internal
   byte sequence at SRC (N bytes) into external bytes appended to DST,
   applying the coding system's EOL type to newlines.  Two variants are
   present, separated by the `#else' below.  NOTE(review): the opening
   `#ifdef' is on a line not shown, presumably `#ifdef UTF2000'.

   The first variant tracks a multi-byte sequence in `ch' and
   `char_boundary', classifying lead bytes by the thresholds 0xc0, 0xe0,
   0xf0 and 0xf8, and persists char_boundary in
   str->iso2022.current_char_boundary.

   The second variant copies ASCII bytes through, passes characters led
   by LEADING_BYTE_LATIN_ISO8859_1 through unchanged, subtracts 0x20 from
   those led by LEADING_BYTE_CONTROL_1, and writes '~' for any other
   leading byte, whose following bytes are ignored.

   NOTE(review): the loop header, the byte fetch and several assignments
   are on lines not shown in this listing. */
5232 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5233 unsigned_char_dynarr *dst, unsigned int n)
5236 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5237 unsigned int flags = str->flags;
5238 unsigned int ch = str->ch;
5239 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5241 unsigned char char_boundary = str->iso2022.current_char_boundary;
5248 if (char_boundary == 0)
5254 else if ( c >= 0xf8 )
5259 else if ( c >= 0xf0 )
5264 else if ( c >= 0xe0 )
5269 else if ( c >= 0xc0 )
/* Newline handling: CR is written unless the EOL type is LF or
   autodetect, and the character itself unless the EOL type is CR. */
5279 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5280 Dynarr_add (dst, '\r');
5281 if (eol_type != EOL_CR)
5282 Dynarr_add (dst, c);
5285 Dynarr_add (dst, c);
5288 else if (char_boundary == 1)
/* Last continuation byte: fold its low six bits into `ch' and write the
   low eight bits of the result. */
5290 ch = ( ch << 6 ) | ( c & 0x3f );
5291 Dynarr_add (dst, ch & 0xff);
5296 ch = ( ch << 6 ) | ( c & 0x3f );
5299 #else /* not UTF2000 */
5302 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5303 Dynarr_add (dst, '\r');
5304 if (eol_type != EOL_CR)
5305 Dynarr_add (dst, '\n');
5308 else if (BYTE_ASCII_P (c))
5311 Dynarr_add (dst, c);
5313 else if (BUFBYTE_LEADING_BYTE_P (c))
5316 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5317 c == LEADING_BYTE_CONTROL_1)
5320 Dynarr_add (dst, '~'); /* untranslatable character */
5324 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5325 Dynarr_add (dst, c);
5326 else if (ch == LEADING_BYTE_CONTROL_1)
5329 Dynarr_add (dst, c - 0x20);
5331 /* else it should be the second or third byte of an
5332 untranslatable character, so ignore it */
5335 #endif /* not UTF2000 */
5341 str->iso2022.current_char_boundary = char_boundary;
5346 /************************************************************************/
5347 /* Simple internal/external functions */
5348 /************************************************************************/
/* Scratch buffers shared by convert_to_external_format and
   convert_from_external_format respectively.  Each is created on first
   use and reset at the start of every call, so the data returned by one
   call is overwritten by the next call to the same function. */
5350 static Extbyte_dynarr *conversion_out_dynarr;
5351 static Bufbyte_dynarr *conversion_in_dynarr;
5353 /* Determine coding system from coding format */
5355 /* #### not correct for all values of `fmt'! */
/* For FORMAT_FILENAME and FORMAT_TERMINAL the result depends on
   Vfile_name_coding_system: when it is nil or `binary' a separate result
   is returned (on a line not shown -- presumably Qnil, since callers
   test the result with NILP); otherwise it is looked up with
   Fget_coding_system.  A further case returns the `ctext' coding system.
   NOTE(review): the `switch' line, that case label and any default
   branch are on lines not shown in this listing. */
5357 external_data_format_to_coding_system (enum external_data_format fmt)
5361 case FORMAT_FILENAME:
5362 case FORMAT_TERMINAL:
5363 if (EQ (Vfile_name_coding_system, Qnil) ||
5364 EQ (Vfile_name_coding_system, Qbinary))
5367 return Fget_coding_system (Vfile_name_coding_system);
5370 return Fget_coding_system (Qctext);
/* Convert the internal-format bytes at PTR into the external format
   selected by FMT.  The result is accumulated in the file-static
   conversion_out_dynarr: its length (not counting the zero byte added at
   the end) is stored through len_out, and a pointer to its first element
   is returned.  Because that dynarr is reset on every call, the returned
   data is only valid until the next call.

   NOTE(review): the declarations of `len' and `len_out', the loop
   headers and the assignment target of make_encoding_output_stream are
   on lines not shown in this listing. */
5378 convert_to_external_format (CONST Bufbyte *ptr,
5381 enum external_data_format fmt)
5383 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the shared output buffer on first use, otherwise empty it. */
5385 if (!conversion_out_dynarr)
5386 conversion_out_dynarr = Dynarr_new (Extbyte);
5388 Dynarr_reset (conversion_out_dynarr);
/* No coding system: convert by hand, one external byte per character.
   NOTE(review): both visible expressions may read *(ptr+1); whether that
   is always inside [ptr, end) depends on the input being well formed --
   verify against callers. */
5390 if (NILP (coding_system))
5392 CONST Bufbyte *end = ptr + len;
5398 (*ptr < 0xc0) ? *ptr :
5399 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5402 (BYTE_ASCII_P (*ptr)) ? *ptr :
5403 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5404 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5407 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5411 #ifdef ERROR_CHECK_BUFPOS
5412 assert (ptr == end);
/* Otherwise run the data through an encoding stream: a fixed-buffer
   input stream over PTR is copied, in chunks of at most sizeof (tempbuf)
   bytes, into an encoding output stream layered on a dynarr output
   stream that fills conversion_out_dynarr. */
5417 Lisp_Object instream, outstream, da_outstream;
5418 Lstream *istr, *ostr;
5419 struct gcpro gcpro1, gcpro2, gcpro3;
5420 char tempbuf[1024]; /* some random amount */
5422 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5423 da_outstream = make_dynarr_output_stream
5424 ((unsigned_char_dynarr *) conversion_out_dynarr);
5426 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5427 istr = XLSTREAM (instream);
5428 ostr = XLSTREAM (outstream);
5429 GCPRO3 (instream, outstream, da_outstream);
/* NOTE(review): the read count is held in an `int' here but in an
   `ssize_t' in convert_from_external_format. */
5432 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5435 Lstream_write (ostr, tempbuf, size_in_bytes);
5437 Lstream_close (istr);
5438 Lstream_close (ostr);
5440 Lstream_delete (istr);
5441 Lstream_delete (ostr);
5442 Lstream_delete (XLSTREAM (da_outstream));
/* The length is taken before the terminator is added, so the reported
   length excludes the trailing zero byte. */
5445 *len_out = Dynarr_length (conversion_out_dynarr);
5446 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5447 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert the external-format bytes at PTR into internal format, using
   the coding system selected by FMT.  The result is accumulated in the
   file-static conversion_in_dynarr: its length (not counting the zero
   byte added at the end) is stored through len_out, and a pointer to its
   first element is returned.  Because that dynarr is reset on every
   call, the returned data is only valid until the next call.

   NOTE(review): the declarations of `len' and `len_out', the fetch of
   each byte into `c', the pump-loop header and the assignment target of
   make_decoding_output_stream are on lines not shown in this listing. */
5451 convert_from_external_format (CONST Extbyte *ptr,
5454 enum external_data_format fmt)
5456 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the shared input buffer on first use, otherwise empty it. */
5458 if (!conversion_in_dynarr)
5459 conversion_in_dynarr = Dynarr_new (Bufbyte);
5461 Dynarr_reset (conversion_in_dynarr);
/* No coding system: add every input byte as a binary character. */
5463 if (NILP (coding_system))
5465 CONST Extbyte *end = ptr + len;
5466 for (; ptr < end; ptr++)
5469 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
/* Otherwise copy the input, in chunks of at most sizeof (tempbuf) bytes,
   into a decoding output stream layered on a dynarr output stream that
   fills conversion_in_dynarr. */
5474 Lisp_Object instream, outstream, da_outstream;
5475 Lstream *istr, *ostr;
5476 struct gcpro gcpro1, gcpro2, gcpro3;
5477 char tempbuf[1024]; /* some random amount */
5479 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5480 da_outstream = make_dynarr_output_stream
5481 ((unsigned_char_dynarr *) conversion_in_dynarr);
5483 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5484 istr = XLSTREAM (instream);
5485 ostr = XLSTREAM (outstream);
5486 GCPRO3 (instream, outstream, da_outstream);
5489 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5492 Lstream_write (ostr, tempbuf, size_in_bytes);
5494 Lstream_close (istr);
5495 Lstream_close (ostr);
5497 Lstream_delete (istr);
5498 Lstream_delete (ostr);
5499 Lstream_delete (XLSTREAM (da_outstream));
/* The length is taken before the terminator is added, so the reported
   length excludes the trailing zero byte. */
5502 *len_out = Dynarr_length (conversion_in_dynarr);
5503 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5504 return Dynarr_atp (conversion_in_dynarr, 0);
5508 /************************************************************************/
5509 /* Initialization */
5510 /************************************************************************/
/* Symbol-level initialization for this file.  In order, it: defines the
   `coding-system-error' error with Qio_error as its parent; applies
   DEFSUBR to each Lisp primitive of the file; and applies defsymbol to
   the symbols used for coding-system types, property names, EOL types
   and the per-category entries of coding_category_symbol[].
   NOTE(review): the name-string arguments of the
   coding_category_symbol[] defsymbol calls, and any conditional
   compilation lines, are on lines not shown in this listing. */
5513 syms_of_file_coding (void)
5515 deferror (&Qcoding_system_error, "coding-system-error",
5516 "Coding-system error", Qio_error);
/* Lisp primitives. */
5518 DEFSUBR (Fcoding_system_p);
5519 DEFSUBR (Ffind_coding_system);
5520 DEFSUBR (Fget_coding_system);
5521 DEFSUBR (Fcoding_system_list);
5522 DEFSUBR (Fcoding_system_name);
5523 DEFSUBR (Fmake_coding_system);
5524 DEFSUBR (Fcopy_coding_system);
5525 DEFSUBR (Fdefine_coding_system_alias);
5526 DEFSUBR (Fsubsidiary_coding_system);
5528 DEFSUBR (Fcoding_system_type);
5529 DEFSUBR (Fcoding_system_doc_string);
5531 DEFSUBR (Fcoding_system_charset);
5533 DEFSUBR (Fcoding_system_property);
5535 DEFSUBR (Fcoding_category_list);
5536 DEFSUBR (Fset_coding_priority_list);
5537 DEFSUBR (Fcoding_priority_list);
5538 DEFSUBR (Fset_coding_category_system);
5539 DEFSUBR (Fcoding_category_system);
5541 DEFSUBR (Fdetect_coding_region);
5542 DEFSUBR (Fdecode_coding_region);
5543 DEFSUBR (Fencode_coding_region);
5545 DEFSUBR (Fdecode_shift_jis_char);
5546 DEFSUBR (Fencode_shift_jis_char);
5547 DEFSUBR (Fdecode_big5_char);
5548 DEFSUBR (Fencode_big5_char);
/* Predicate and coding-system type symbols. */
5550 defsymbol (&Qcoding_systemp, "coding-system-p");
5551 defsymbol (&Qno_conversion, "no-conversion");
5552 defsymbol (&Qraw_text, "raw-text");
5554 defsymbol (&Qbig5, "big5");
5555 defsymbol (&Qshift_jis, "shift-jis");
5556 defsymbol (&Qucs4, "ucs-4");
5557 defsymbol (&Qutf8, "utf-8");
5558 defsymbol (&Qccl, "ccl");
5559 defsymbol (&Qiso2022, "iso2022");
/* Property names accepted for every coding system. */
5561 defsymbol (&Qmnemonic, "mnemonic");
5562 defsymbol (&Qeol_type, "eol-type");
5563 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5564 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
/* EOL type values and the per-EOL subsidiary property names. */
5566 defsymbol (&Qcr, "cr");
5567 defsymbol (&Qlf, "lf");
5568 defsymbol (&Qcrlf, "crlf");
5569 defsymbol (&Qeol_cr, "eol-cr");
5570 defsymbol (&Qeol_lf, "eol-lf");
5571 defsymbol (&Qeol_crlf, "eol-crlf");
/* ISO2022 property names. */
5573 defsymbol (&Qcharset_g0, "charset-g0");
5574 defsymbol (&Qcharset_g1, "charset-g1");
5575 defsymbol (&Qcharset_g2, "charset-g2");
5576 defsymbol (&Qcharset_g3, "charset-g3");
5577 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5578 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5579 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5580 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5581 defsymbol (&Qno_iso6429, "no-iso6429");
5582 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5583 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5585 defsymbol (&Qshort, "short");
5586 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5587 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5588 defsymbol (&Qseven, "seven");
5589 defsymbol (&Qlock_shift, "lock-shift");
5590 defsymbol (&Qescape_quoted, "escape-quoted");
5593 defsymbol (&Qdisable_composition, "disable-composition");
/* CCL property names. */
5595 defsymbol (&Qencode, "encode");
5596 defsymbol (&Qdecode, "decode");
/* `ctext' and the symbols naming each coding category. */
5599 defsymbol (&Qctext, "ctext");
5600 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5602 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5604 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5606 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5608 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5610 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5612 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5614 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5616 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5619 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Applies LSTREAM_HAS_METHOD to the `decoding' and `encoding' lstream
   types for the same seven methods each: reader, writer, rewinder,
   seekable_p, flusher, closer and marker. */
5624 lstream_type_create_file_coding (void)
5626 LSTREAM_HAS_METHOD (decoding, reader);
5627 LSTREAM_HAS_METHOD (decoding, writer);
5628 LSTREAM_HAS_METHOD (decoding, rewinder);
5629 LSTREAM_HAS_METHOD (decoding, seekable_p);
5630 LSTREAM_HAS_METHOD (decoding, flusher);
5631 LSTREAM_HAS_METHOD (decoding, closer);
5632 LSTREAM_HAS_METHOD (decoding, marker);
5634 LSTREAM_HAS_METHOD (encoding, reader);
5635 LSTREAM_HAS_METHOD (encoding, writer);
5636 LSTREAM_HAS_METHOD (encoding, rewinder);
5637 LSTREAM_HAS_METHOD (encoding, seekable_p);
5638 LSTREAM_HAS_METHOD (encoding, flusher);
5639 LSTREAM_HAS_METHOD (encoding, closer);
5640 LSTREAM_HAS_METHOD (encoding, marker);
/* Variable-level initialization for this file.  Allocates the
   file_coding_dump structure `fcd' and passes it to dumpstruct, gives
   every coding category a nil coding system and an identity priority
   order, provides the `file-coding' feature, and defines the Lisp
   variables below, each with its initial value.
   NOTE(review): the closing line of each variable's doc comment, and
   the declaration of `i', are on lines not shown in this listing. */
5644 vars_of_file_coding (void)
5648 fcd = xnew (struct file_coding_dump);
5649 dumpstruct (&fcd, &fcd_description);
5651 /* Initialize to something reasonable ... */
5652 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5654 fcd->coding_category_system[i] = Qnil;
5655 fcd->coding_category_by_priority[i] = i;
5658 Fprovide (intern ("file-coding"));
5660 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5661 Coding system used for TTY keyboard input.
5662 Not used under a windowing system.
5664 Vkeyboard_coding_system = Qnil;
5666 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5667 Coding system used for TTY display output.
5668 Not used under a windowing system.
5670 Vterminal_coding_system = Qnil;
5672 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5673 Overriding coding system used when reading a file or process.
5674 You should *bind* this, not set it. If this is non-nil, it specifies
5675 the coding system that will be used when a file or process is read
5676 in, and overrides `buffer-file-coding-system-for-read',
5677 `insert-file-contents-pre-hook', etc. Use those variables instead of
5678 this one for permanent changes to the environment.
5680 Vcoding_system_for_read = Qnil;
5682 DEFVAR_LISP ("coding-system-for-write",
5683 &Vcoding_system_for_write /*
5684 Overriding coding system used when writing a file or process.
5685 You should *bind* this, not set it. If this is non-nil, it specifies
5686 the coding system that will be used when a file or process is written
5687 out, and overrides `buffer-file-coding-system',
5688 `write-region-pre-hook', etc. Use those variables instead of this one
5689 for permanent changes to the environment.
5691 Vcoding_system_for_write = Qnil;
5693 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5694 Coding system used to convert pathnames when accessing files.
5696 Vfile_name_coding_system = Qnil;
5698 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5699 Non-nil means the buffer contents are regarded as multi-byte form
5700 of characters, not a binary code. This affects the display, file I/O,
5701 and behaviors of various editing commands.
5703 Setting this to nil does not do anything.
5705 enable_multibyte_characters = 1;
5709 complex_vars_of_file_coding (void)
5711 staticpro (&Vcoding_system_hash_table);
5712 Vcoding_system_hash_table =
5713 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5715 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5716 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5718 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5720 struct codesys_prop csp; \
5722 csp.prop_type = (Prop_Type); \
5723 Dynarr_add (the_codesys_prop_dynarr, csp); \
5726 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5727 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5728 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5729 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5730 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5731 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5732 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5734 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5735 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5736 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5737 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5738 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5739 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5740 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5741 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5742 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5743 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5744 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5745 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5746 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5747 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5748 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5749 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5750 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5752 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5753 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5755 /* Need to create this here or we're really screwed. */
5757 (Qraw_text, Qno_conversion,
5758 build_string ("Raw text, which means it converts only line-break-codes."),
5759 list2 (Qmnemonic, build_string ("Raw")));
5762 (Qbinary, Qno_conversion,
5763 build_string ("Binary, which means it does not convert anything."),
5764 list4 (Qeol_type, Qlf,
5765 Qmnemonic, build_string ("Binary")));
5770 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5771 list2 (Qmnemonic, build_string ("UTF8")));
5774 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5776 /* Need this for bootstrapping */
5777 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5778 Fget_coding_system (Qraw_text);
5781 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5782 = Fget_coding_system (Qutf8);