1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
69 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 },
74 static const struct struct_description fcd_description = {
75 sizeof(struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qctext, Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
104 Lisp_Object Qdisable_composition;
106 Lisp_Object Qencode, Qdecode;
108 Lisp_Object Vcoding_system_hash_table;
110 int enable_multibyte_characters;
113 /* Additional information used by the ISO2022 decoder and detector. */
114 struct iso2022_decoder
116 /* CHARSET holds the character sets currently assigned to the G0
117 through G3 variables. It is initialized from the array
118 INITIAL_CHARSET in CODESYS. */
119 Lisp_Object charset[4];
121 /* Which registers are currently invoked into the left (GL) and
122 right (GR) halves of the 8-bit encoding space? */
123 int register_left, register_right;
125 /* ISO_ESC holds a value indicating part of an escape sequence
126 that has already been seen. */
127 enum iso_esc_flag esc;
129 /* This records the bytes we've seen so far in an escape sequence,
130 in case the sequence is invalid (we spit out the bytes unchanged). */
131 unsigned char esc_bytes[8];
133 /* Index for next byte to store in ISO escape sequence. */
136 #ifdef ENABLE_COMPOSITE_CHARS
137 /* Stuff seen so far when composing a string. */
138 unsigned_char_dynarr *composite_chars;
141 /* If we saw an invalid designation sequence for a particular
142 register, we flag it here and switch to ASCII. The next time we
143 see a valid designation for this register, we turn off the flag
144 and do the designation normally, but pretend the sequence was
145 invalid. The effect of all this is that (most of the time) the
146 escape sequences for both the switch to the unknown charset, and
147 the switch back to the known charset, get inserted literally into
148 the buffer and saved out as such. The hope is that we can
149 preserve the escape sequences so that the resulting written out
150 file makes sense. If we don't do any of this, the designation
151 to the invalid charset will be preserved but that switch back
152 to the known charset will probably get eaten because it was
153 the same charset that was already present in the register. */
154 unsigned char invalid_designated[4];
156 /* We try to do similar things as above for direction-switching
157 sequences. If we encountered a direction switch while an
158 invalid designation was present, or an invalid designation
159 just after a direction switch (i.e. no valid designation
160 encountered yet), we insert the direction-switch escape
161 sequence literally into the output stream, and later on
162 insert the corresponding direction-restoring escape sequence
164 unsigned int switched_dir_and_no_valid_charset_yet :1;
165 unsigned int invalid_switch_dir :1;
167 /* Tells the decoder to output the escape sequence literally
168 even though it was valid. Used in the games we play to
169 avoid lossage when we encounter invalid designations. */
170 unsigned int output_literally :1;
171 /* We encountered a direction switch followed by an invalid
172 designation. We didn't output the direction switch
173 literally because we didn't know about the invalid designation;
174 but we have to do so now. */
175 unsigned int output_direction_sequence :1;
178 EXFUN (Fcopy_coding_system, 2);
180 struct detection_state;
183 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
184 unsigned_char_dynarr *dst, unsigned int n);
186 static int detect_coding_sjis (struct detection_state *st,
187 CONST unsigned char *src,
189 static void decode_coding_sjis (Lstream *decoding,
190 CONST unsigned char *src,
191 unsigned_char_dynarr *dst,
193 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
194 unsigned_char_dynarr *dst, unsigned int *flags);
195 void char_finish_shift_jis (struct encoding_stream *str,
196 unsigned_char_dynarr *dst, unsigned int *flags);
198 static int detect_coding_big5 (struct detection_state *st,
199 CONST unsigned char *src,
201 static void decode_coding_big5 (Lstream *decoding,
202 CONST unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
204 static void encode_coding_big5 (Lstream *encoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst, unsigned int n);
207 static int detect_coding_ucs4 (struct detection_state *st,
208 CONST unsigned char *src,
210 static void decode_coding_ucs4 (Lstream *decoding,
211 CONST unsigned char *src,
212 unsigned_char_dynarr *dst, unsigned int n);
213 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
214 unsigned_char_dynarr *dst, unsigned int *flags);
215 void char_finish_ucs4 (struct encoding_stream *str,
216 unsigned_char_dynarr *dst, unsigned int *flags);
218 static int detect_coding_utf8 (struct detection_state *st,
219 CONST unsigned char *src,
221 static void decode_coding_utf8 (Lstream *decoding,
222 CONST unsigned char *src,
223 unsigned_char_dynarr *dst, unsigned int n);
224 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
225 unsigned_char_dynarr *dst, unsigned int *flags);
226 void char_finish_utf8 (struct encoding_stream *str,
227 unsigned_char_dynarr *dst, unsigned int *flags);
229 static int postprocess_iso2022_mask (int mask);
230 static void reset_iso2022 (Lisp_Object coding_system,
231 struct iso2022_decoder *iso);
232 static int detect_coding_iso2022 (struct detection_state *st,
233 CONST unsigned char *src,
235 static void decode_coding_iso2022 (Lstream *decoding,
236 CONST unsigned char *src,
237 unsigned_char_dynarr *dst, unsigned int n);
238 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
239 unsigned_char_dynarr *dst, unsigned int *flags);
240 void char_finish_iso2022 (struct encoding_stream *str,
241 unsigned_char_dynarr *dst, unsigned int *flags);
243 static void decode_coding_no_conversion (Lstream *decoding,
244 CONST unsigned char *src,
245 unsigned_char_dynarr *dst,
247 static void encode_coding_no_conversion (Lstream *encoding,
248 CONST unsigned char *src,
249 unsigned_char_dynarr *dst,
251 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
252 unsigned_char_dynarr *dst, unsigned int n);
253 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
254 unsigned_char_dynarr *dst, unsigned int n);
256 typedef struct codesys_prop codesys_prop;
265 Dynarr_declare (codesys_prop);
266 } codesys_prop_dynarr;
268 static const struct lrecord_description codesys_prop_description_1[] = {
269 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
273 static const struct struct_description codesys_prop_description = {
274 sizeof(codesys_prop),
275 codesys_prop_description_1
278 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
279 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
283 static const struct struct_description codesys_prop_dynarr_description = {
284 sizeof(codesys_prop_dynarr),
285 codesys_prop_dynarr_description_1
288 codesys_prop_dynarr *the_codesys_prop_dynarr;
290 enum codesys_prop_enum
293 CODESYS_PROP_ISO2022,
298 /************************************************************************/
299 /* Coding system functions */
300 /************************************************************************/
302 static Lisp_Object mark_coding_system (Lisp_Object);
303 static void print_coding_system (Lisp_Object, Lisp_Object, int);
304 static void finalize_coding_system (void *header, int for_disksave);
307 static const struct lrecord_description ccs_description_1[] = {
308 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
312 static const struct struct_description ccs_description = {
313 sizeof(charset_conversion_spec),
317 static const struct lrecord_description ccsd_description_1[] = {
318 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
322 static const struct struct_description ccsd_description = {
323 sizeof(charset_conversion_spec_dynarr),
328 static const struct lrecord_description coding_system_description[] = {
329 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
330 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
331 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
333 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
334 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
335 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
336 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
341 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
342 mark_coding_system, print_coding_system,
343 finalize_coding_system,
344 0, 0, coding_system_description,
345 struct Lisp_Coding_System);
/* GC mark method for coding-system objects (the marker slot given to
   DEFINE_LRECORD_IMPLEMENTATION).  Marks the Lisp_Object slots of OBJ:
   name, doc string, mnemonic, the three eol subsidiaries, the
   type-specific slots handled in the switch, and the pre-write
   conversion.  The post-read conversion is returned instead of being
   marked here (presumably the caller marks the returned object --
   confirm against the lrecord marker protocol). */
348 mark_coding_system (Lisp_Object obj)
350 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
352 mark_object (CODING_SYSTEM_NAME (codesys));
353 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
354 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
355 mark_object (CODING_SYSTEM_EOL_LF (codesys));
356 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
357 mark_object (CODING_SYSTEM_EOL_CR (codesys));
/* Type-specific slots. */
359 switch (CODING_SYSTEM_TYPE (codesys))
363 case CODESYS_ISO2022:
/* Initial charsets of the four registers G0 through G3. */
364 for (i = 0; i < 4; i++)
365 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
/* The conversion dynarrs may be null; when present, both charsets of
   every conversion spec are marked. */
366 if (codesys->iso2022.input_conv)
368 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
370 struct charset_conversion_spec *ccs =
371 Dynarr_atp (codesys->iso2022.input_conv, i);
372 mark_object (ccs->from_charset);
373 mark_object (ccs->to_charset);
376 if (codesys->iso2022.output_conv)
378 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
380 struct charset_conversion_spec *ccs =
381 Dynarr_atp (codesys->iso2022.output_conv, i);
382 mark_object (ccs->from_charset);
383 mark_object (ccs->to_charset);
/* NOTE(review): the case label guarding the two CCL slots is not visible
   in this excerpt -- presumably CODESYS_CCL; confirm. */
389 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
390 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
397 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
398 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding-system objects (the printer slot given to
   DEFINE_LRECORD_IMPLEMENTATION).  Writes the text #<coding_system NAME>
   to PRINTCHARFUN, where NAME is the name slot printed through
   print_internal.
   NOTE(review): the condition guarding the error call, and the argument
   that fills its 0x%x, are not visible in this excerpt -- presumably a
   readable-print check; confirm against the full file. */
402 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
405 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
407 error ("printing unreadable object #<coding_system 0x%x>",
410 write_c_string ("#<coding_system ", printcharfun);
411 print_internal (c->name, printcharfun, 1);
412 write_c_string (">", printcharfun);
/* Finalizer for coding-system objects (the finalizer slot given to
   DEFINE_LRECORD_IMPLEMENTATION).  HEADER points at the coding system.
   When FOR_DISKSAVE is zero, an ISO2022 coding system has its input and
   output charset-conversion dynarrs freed, and each pointer is reset to
   0 after the free. */
416 finalize_coding_system (void *header, int for_disksave)
418 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
419 /* Since coding systems never go away, this function is not
420 necessary. But it would be necessary if we changed things
421 so that coding systems could go away. */
422 if (!for_disksave) /* see comment in lstream.c */
424 switch (CODING_SYSTEM_TYPE (c))
427 case CODESYS_ISO2022:
/* Both dynarrs are optional, so each is tested before being freed. */
428 if (c->iso2022.input_conv)
430 Dynarr_free (c->iso2022.input_conv);
431 c->iso2022.input_conv = 0;
433 if (c->iso2022.output_conv)
435 Dynarr_free (c->iso2022.output_conv);
436 c->iso2022.output_conv = 0;
/* Translate the Lisp symbol SYMBOL into an eol_type value:
     nil   -> EOL_AUTODETECT
     Qlf   -> EOL_LF
     Qcrlf -> EOL_CRLF
     Qcr   -> EOL_CR
   SYMBOL is first checked with CHECK_SYMBOL; any symbol other than the
   four above signals the error Unrecognized eol type. */
447 symbol_to_eol_type (Lisp_Object symbol)
449 CHECK_SYMBOL (symbol);
450 if (NILP (symbol)) return EOL_AUTODETECT;
451 if (EQ (symbol, Qlf)) return EOL_LF;
452 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
453 if (EQ (symbol, Qcr)) return EOL_CR;
455 signal_simple_error ("Unrecognized eol type", symbol);
/* Only here to give every path a return value. */
456 return EOL_AUTODETECT; /* not reached */
/* Map an eol_type value back to the symbol that names it:
   EOL_LF -> Qlf, EOL_CRLF -> Qcrlf, EOL_CR -> Qcr, and
   EOL_AUTODETECT -> Qnil.  This is the inverse of the mapping applied by
   symbol_to_eol_type.  Handling of any other value is not visible in
   this excerpt. */
460 eol_type_to_symbol (enum eol_type type)
465 case EOL_LF: return Qlf;
466 case EOL_CRLF: return Qcrlf;
467 case EOL_CR: return Qcr;
468 case EOL_AUTODETECT: return Qnil;
/* Create the three end-of-line subsidiaries of CODESYS and record them
   in CODESYS itself.  For each of the suffixes -unix, -dos and -mac the
   DEFINE_SUB_CODESYS macro defined in this function:
     - writes the suffix after the base name held in the scratch buffer
       codesys_name and interns the result;
     - copies CODESYS under that symbol with Fcopy_coding_system;
     - sets the eol type of the copy (EOL_LF, EOL_CRLF and EOL_CR
       respectively);
     - gives the copy a mnemonic built from the mnemonic of CODESYS
       followed by the abbreviation argument (empty, :T, :t);
     - stores the copy in the matching CODING_SYSTEM_EOL_* slot of
       CODESYS.
   NOTE(review): the mnemonic steps are presumably performed only when
   CODESYS has a string mnemonic; the guarding lines are not visible in
   this excerpt. */
473 setup_eol_coding_systems (Lisp_Coding_System *codesys)
475 Lisp_Object codesys_obj;
476 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* len + 7 leaves room for the longest suffix appended later (-unix,
   five bytes) and the terminating NUL. */
477 char *codesys_name = (char *) alloca (len + 7);
479 char *codesys_mnemonic=0;
481 Lisp_Object codesys_name_sym, sub_codesys_obj;
485 XSETCODING_SYSTEM (codesys_obj, codesys);
/* The base name is copied once; each macro expansion only rewrites the
   bytes starting at offset len. */
487 memcpy (codesys_name,
488 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
/* Same scheme for the mnemonic, set up only when it is a string. */
490 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
492 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
493 codesys_mnemonic = (char *) alloca (mlen + 7);
494 memcpy (codesys_mnemonic,
495 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
/* Type is pasted onto CODING_SYSTEM_ to select the slot of CODESYS that
   receives the new subsidiary. */
498 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
499 strcpy (codesys_name + len, "-" op_sys); \
501 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
502 codesys_name_sym = intern (codesys_name); \
503 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
504 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
506 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
507 build_string (codesys_mnemonic); \
508 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
511 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
512 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
513 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
516 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
517 Return t if OBJECT is a coding system.
518 A coding system is an object that defines how text containing multiple
519 character sets is encoded into a stream of (typically 8-bit) bytes.
520 The coding system is used to decode the stream into a series of
521 characters (which may be from multiple charsets) when the text is read
522 from a file or process, and is used to encode the text back into the
523 same format when it is written out to a file or process.
525 For example, many ISO2022-compliant coding systems (such as Compound
526 Text, which is used for inter-client data under the X Window System)
527 use escape sequences to switch between different charsets -- Japanese
528 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
529 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
530 `make-coding-system' for more information.
532 Coding systems are normally identified using a symbol, and the
533 symbol is accepted in place of the actual coding system object whenever
534 a coding system is called for. (This is similar to how faces work.)
538 return CODING_SYSTEMP (object) ? Qt : Qnil;
541 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
542 Retrieve the coding system of the given name.
544 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
545 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
546 If there is no such coding system, nil is returned. Otherwise the
547 associated coding system object is returned.
549 (coding_system_or_name))
551 if (CODING_SYSTEMP (coding_system_or_name))
552 return coding_system_or_name;
554 if (NILP (coding_system_or_name))
555 coding_system_or_name = Qbinary;
557 CHECK_SYMBOL (coding_system_or_name);
559 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
562 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
563 Retrieve the coding system of the given name.
564 Same as `find-coding-system' except that if there is no such
565 coding system, an error is signaled instead of returning nil.
569 Lisp_Object coding_system = Ffind_coding_system (name);
571 if (NILP (coding_system))
572 signal_simple_error ("No such coding system", name);
573 return coding_system;
576 /* We store the coding systems in a hash table, with the name as the key and the
577 actual coding system object as the value. Occasionally we need them
578 in list form; the routines below build that list. */
/* Closure passed to add_coding_system_to_list_mapper through its void *
   argument; it carries the address of the list being accumulated. */
579 struct coding_system_list_closure
581 Lisp_Object *coding_system_list;
/* Hash-table mapping callback used by Fcoding_system_list via
   elisp_maphash.  Pushes KEY (an entry key of the coding-system hash
   table) onto the front of the list addressed by the closure.  VALUE is
   not used in the visible lines.  The return statement is not visible in
   this excerpt. */
585 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
586 void *coding_system_list_closure)
588 /* This function can GC */
589 struct coding_system_list_closure *cscl =
590 (struct coding_system_list_closure *) coding_system_list_closure;
591 Lisp_Object *coding_system_list = cscl->coding_system_list;
/* Prepend, so the resulting list is in reverse visiting order. */
593 *coding_system_list = Fcons (key, *coding_system_list);
597 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
598 Return a list of the names of all defined coding systems.
602 Lisp_Object coding_system_list = Qnil;
604 struct coding_system_list_closure coding_system_list_closure;
606 GCPRO1 (coding_system_list);
607 coding_system_list_closure.coding_system_list = &coding_system_list;
608 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
609 &coding_system_list_closure);
612 return coding_system_list;
615 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
616 Return the name of the given coding system.
620 coding_system = Fget_coding_system (coding_system);
621 return XCODING_SYSTEM_NAME (coding_system);
624 static Lisp_Coding_System *
625 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
627 Lisp_Coding_System *codesys =
628 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
630 zero_lcrecord (codesys);
631 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
632 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
633 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
634 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
635 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
636 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
637 CODING_SYSTEM_TYPE (codesys) = type;
638 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
640 if (type == CODESYS_ISO2022)
643 for (i = 0; i < 4; i++)
644 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
646 else if (type == CODESYS_CCL)
648 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
649 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
652 CODING_SYSTEM_NAME (codesys) = name;
658 /* Given a list of charset conversion specs as specified in a Lisp
659 program, parse it into STORE_HERE. */
/* Validate each element of SPEC_LIST and append it to STORE_HERE as a
   charset_conversion_spec.
   STORE_HERE -- dynarr receiving the parsed specs (appended with
                 Dynarr_add).
   SPEC_LIST  -- Lisp list whose elements are two-element lists
                 (FROM TO) of charsets.
   Signals Invalid charset conversion spec for a malformed element, and
   Attempted conversion between different charset types when the two
   charsets differ in XCHARSET_CHARS or XCHARSET_DIMENSION.  Fget_charset
   is used to resolve both entries. */
662 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
663 Lisp_Object spec_list)
667 EXTERNAL_LIST_LOOP (rest, spec_list)
669 Lisp_Object car = XCAR (rest);
670 Lisp_Object from, to;
671 struct charset_conversion_spec spec;
/* Require exactly a two-element proper list. */
673 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
674 signal_simple_error ("Invalid charset conversion spec", car);
675 from = Fget_charset (XCAR (car));
676 to = Fget_charset (XCAR (XCDR (car)));
/* Source and destination must agree in both chars and dimension. */
677 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
678 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
679 signal_simple_error_2
680 ("Attempted conversion between different charset types",
682 spec.from_charset = from;
683 spec.to_charset = to;
685 Dynarr_add (store_here, spec);
689 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
690 specs, return the equivalent as the Lisp programmer would see it.
692 If LOAD_HERE is 0, return Qnil. */
/* Convert the dynarr LOAD_HERE into a Lisp list of two-element lists
   (FROM-CHARSET TO-CHARSET), in dynarr order.  Elements are consed onto
   the front of the result while iterating, so the list is reversed once
   with Fnreverse before being returned.
   NOTE(review): the handling of a null LOAD_HERE is not visible in this
   excerpt. */
695 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
702 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
704 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
705 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
708 return Fnreverse (result);
713 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
714 Register symbol NAME as a coding system.
716 TYPE describes the conversion method used and should be one of
719 Automatic conversion. XEmacs attempts to detect the coding system
722 No conversion. Use this for binary files and such. On output,
723 graphic characters that are not in ASCII or Latin-1 will be
724 replaced by a ?. (For a no-conversion-encoded buffer, these
725 characters will only be present if you explicitly insert them.)
727 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
729 ISO 10646 UCS-4 encoding.
731 ISO 10646 UTF-8 encoding.
733 Any ISO2022-compliant encoding. Among other things, this includes
734 JIS (the Japanese encoding commonly used for e-mail), EUC (the
735 standard Unix encoding for Japanese and other languages), and
736 Compound Text (the encoding used in X11). You can specify more
737 specific information about the conversion with the PROPS argument.
739 Big5 (the encoding commonly used for Taiwanese).
741 The conversion is performed using a user-written pseudo-code
742 program. CCL (Code Conversion Language) is the name of this
745 Write out or read in the raw contents of the memory representing
746 the buffer's text. This is primarily useful for debugging
747 purposes, and is only enabled when XEmacs has been compiled with
748 DEBUG_XEMACS defined (via the --debug configure option).
749 WARNING: Reading in a file using 'internal conversion can result
750 in an internal inconsistency in the memory representing a
751 buffer's text, which will produce unpredictable results and may
752 cause XEmacs to crash. Under normal circumstances you should
753 never use 'internal conversion.
755 DOC-STRING is a string describing the coding system.
757 PROPS is a property list, describing the specific nature of the
758 coding system. Recognized properties are:
761 String to be displayed in the modeline when this coding system is
765 End-of-line conversion to be used. It should be one of
768 Automatically detect the end-of-line type (LF, CRLF,
769 or CR). Also generate subsidiary coding systems named
770 `NAME-unix', `NAME-dos', and `NAME-mac', that are
771 identical to this coding system but have an EOL-TYPE
772 value of 'lf, 'crlf, and 'cr, respectively.
774 The end of a line is marked externally using ASCII LF.
775 Since this is also the way that XEmacs represents an
776 end-of-line internally, specifying this option results
777 in no end-of-line conversion. This is the standard
778 format for Unix text files.
780 The end of a line is marked externally using ASCII
781 CRLF. This is the standard format for MS-DOS text
784 The end of a line is marked externally using ASCII CR.
785 This is the standard format for Macintosh text files.
787 Automatically detect the end-of-line type but do not
788 generate subsidiary coding systems. (This value is
789 converted to nil when stored internally, and
790 `coding-system-property' will return nil.)
792 'post-read-conversion
793 Function called after a file has been read in, to perform the
794 decoding. Called with two arguments, BEG and END, denoting
795 a region of the current buffer to be decoded.
797 'pre-write-conversion
798 Function called before a file is written out, to perform the
799 encoding. Called with two arguments, BEG and END, denoting
800 a region of the current buffer to be encoded.
803 The following additional properties are recognized if TYPE is 'iso2022:
809 The character set initially designated to the G0 - G3 registers.
810 The value should be one of
812 -- A charset object (designate that character set)
813 -- nil (do not ever use this register)
814 -- t (no character set is initially designated to
815 the register, but may be later on; this automatically
816 sets the corresponding `force-g*-on-output' property)
822 If non-nil, send an explicit designation sequence on output before
823 using the specified register.
826 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
827 "ESC $ B" on output in place of the full designation sequences
828 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
831 If non-nil, don't designate ASCII to G0 at each end of line on output.
832 Setting this to non-nil also suppresses other state-resetting that
833 normally happens at the end of a line.
836 If non-nil, don't designate ASCII to G0 before control chars on output.
839 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
843 If non-nil, use locking-shift (SO/SI) instead of single-shift
844 or designation by escape sequence.
847 If non-nil, don't use ISO6429's direction specification.
850 If non-nil, literal control characters that are the same as
851 the beginning of a recognized ISO2022 or ISO6429 escape sequence
852 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
853 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
854 so that they can be properly distinguished from an escape sequence.
855 (Note that doing this results in a non-portable encoding.) This
856 encoding flag is used for byte-compiled files. Note that ESC
857 is a good choice for a quoting character because there are no
858 escape sequences whose second byte is a character from the Control-0
859 or Control-1 character sets; this is explicitly disallowed by the
862 'input-charset-conversion
863 A list of conversion specifications, specifying conversion of
864 characters in one charset to another when decoding is performed.
865 Each specification is a list of two elements: the source charset,
866 and the destination charset.
868 'output-charset-conversion
869 A list of conversion specifications, specifying conversion of
870 characters in one charset to another when encoding is performed.
871 The form of each specification is the same as for
872 'input-charset-conversion.
875 The following additional properties are recognized (and required)
879 CCL program used for decoding (converting to internal format).
882 CCL program used for encoding (converting to external format).
884 (name, type, doc_string, props))
886 Lisp_Coding_System *codesys;
887 Lisp_Object rest, key, value;
888 enum coding_system_type ty;
889 int need_to_setup_eol_systems = 1;
891 /* Convert type to constant */
892 if (NILP (type) || EQ (type, Qundecided))
893 { ty = CODESYS_AUTODETECT; }
895 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
896 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
897 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
898 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
899 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
900 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
902 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
904 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
907 signal_simple_error ("Invalid coding system type", type);
911 codesys = allocate_coding_system (ty, name);
/* A nil DOC-STRING is stored as the empty string. */
913 if (NILP (doc_string))
914 doc_string = build_string ("");
916 CHECK_STRING (doc_string);
917 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
/* Walk PROPS as a property list.  Properties common to every type are
   tried first; the remaining keys are accepted only for the iso2022 and
   ccl types, and any other key signals Unrecognized property. */
919 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
921 if (EQ (key, Qmnemonic))
924 CHECK_STRING (value);
925 CODING_SYSTEM_MNEMONIC (codesys) = value;
928 else if (EQ (key, Qeol_type))
/* The subsidiary -unix, -dos and -mac systems are created after the loop
   only when the supplied eol-type value is nil, or when no eol-type is
   given at all (the flag starts out as 1). */
930 need_to_setup_eol_systems = NILP (value);
933 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
936 else if (EQ (key, Qpost_read_conversion))
937 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
938 else if (EQ (key, Qpre_write_conversion))
939 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
941 else if (EQ (key, Qdisable_composition))
942 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
945 else if (ty == CODESYS_ISO2022)
/* For the initial charsets, t and nil are stored as given; any other
   value is resolved with Fget_charset. */
947 #define FROB_INITIAL_CHARSET(charset_num) \
948 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
949 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
951 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
952 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
953 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
954 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
/* The force-on-output and flag properties are stored as booleans: any
   non-nil value counts as true. */
956 #define FROB_FORCE_CHARSET(charset_num) \
957 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
959 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
960 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
961 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
962 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
964 #define FROB_BOOLEAN_PROPERTY(prop) \
965 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
967 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
968 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
969 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
970 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
971 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
972 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
973 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
/* Each conversion list is parsed into a newly allocated dynarr.
   NOTE(review): a property given twice would overwrite the pointer
   without freeing the earlier dynarr -- confirm whether that can occur. */
975 else if (EQ (key, Qinput_charset_conversion))
977 codesys->iso2022.input_conv =
978 Dynarr_new (charset_conversion_spec);
979 parse_charset_conversion_specs (codesys->iso2022.input_conv,
982 else if (EQ (key, Qoutput_charset_conversion))
984 codesys->iso2022.output_conv =
985 Dynarr_new (charset_conversion_spec);
986 parse_charset_conversion_specs (codesys->iso2022.output_conv,
990 signal_simple_error ("Unrecognized property", key);
/* This branch tests the TYPE argument directly; the conversion chain at
   the top assigns CODESYS_CCL to ty for exactly this value. */
992 else if (EQ (type, Qccl))
/* The decode and encode programs must be vectors. */
994 if (EQ (key, Qdecode))
996 CHECK_VECTOR (value);
997 CODING_SYSTEM_CCL_DECODE (codesys) = value;
999 else if (EQ (key, Qencode))
1001 CHECK_VECTOR (value);
1002 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
1005 signal_simple_error ("Unrecognized property", key);
1009 signal_simple_error ("Unrecognized property", key);
1012 if (need_to_setup_eol_systems)
1013 setup_eol_coding_systems (codesys);
/* Register the new coding system under NAME in the global table. */
1016 Lisp_Object codesys_obj;
1017 XSETCODING_SYSTEM (codesys_obj, codesys);
1018 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* Lisp primitive `copy-coding-system'.  OLD-CODING-SYSTEM is resolved
   with Fget_coding_system and NEW-NAME is looked up with
   Ffind_coding_system.  When NEW-NAME names nothing yet, a coding
   system of the same type as the old one is allocated and registered
   in Vcoding_system_hash_table.  Everything after the `header' member
   is then copied from the old object with memcpy, the `name' slot is
   reset to NEW-NAME, and the new coding-system object is returned.
   NOTE(review): because the copy is a raw memcpy, pointer members
   such as iso2022.input_conv and iso2022.output_conv end up shared by
   both objects; confirm that this is intended.  */
1023 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1024 Copy OLD-CODING-SYSTEM to NEW-NAME.
1025 If NEW-NAME does not name an existing coding system, a new one will
1028 (old_coding_system, new_name))
1030 Lisp_Object new_coding_system;
1031 old_coding_system = Fget_coding_system (old_coding_system);
1032 new_coding_system = Ffind_coding_system (new_name);
1033 if (NILP (new_coding_system))
1035 XSETCODING_SYSTEM (new_coding_system,
1036 allocate_coding_system
1037 (XCODING_SYSTEM_TYPE (old_coding_system),
1039 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1043 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1044 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1045 memcpy (((char *) to ) + sizeof (to->header),
1046 ((char *) from) + sizeof (from->header),
1047 sizeof (*from) - sizeof (from->header));
1048 to->name = new_name;
1050 return new_coding_system;
/* Lisp primitive `define-coding-system-alias'.  ALIAS must be a
   symbol that does not already name a coding system, otherwise an
   error is signalled.  CODING-SYSTEM is resolved with
   Fget_coding_system and stored under ALIAS in
   Vcoding_system_hash_table.  When the coding system's EOL type is
   EOL_AUTODETECT, the FROB macro calls this function again for each
   non-nil subsidiary, naming the new alias by appending a suffix
   (such as "-dos" for the CRLF subsidiary) to ALIAS's symbol name.  */
1053 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1054 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1056 (alias, coding_system))
1058 CHECK_SYMBOL (alias);
1059 if (!NILP (Ffind_coding_system (alias)))
1060 signal_simple_error ("Symbol already names a coding system", alias);
1061 coding_system = Fget_coding_system (coding_system);
1062 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1064 /* Set up aliases for subsidiaries. */
1065 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1068 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1069 #define FROB(type, name) \
1071 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1072 if (!NILP (subsidiary)) \
1073 Fdefine_coding_system_alias \
1074 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1077 FROB (CRLF, "-dos");
1081 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1082 but it doesn't look intentional, so I'd rather return something
1083 meaningful or nothing at all. */
/* Return the variant of CODING_SYSTEM that uses end-of-line
   convention TYPE.  If CODING_SYSTEM does not have EOL_AUTODETECT as
   its own EOL type it is returned unchanged; the same happens when
   TYPE is EOL_AUTODETECT or when the slot for the requested
   subsidiary is nil.  */
1088 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1090 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1091 Lisp_Object new_coding_system;
/* A coding system with a fixed EOL type is returned as is.  */
1093 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1094 return coding_system;
1098 case EOL_AUTODETECT: return coding_system;
1099 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1100 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1101 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
/* Fall back to the parent when the chosen subsidiary slot is nil.  */
1105 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* Lisp primitive `subsidiary-coding-system'.  Resolves CODING-SYSTEM
   with Fget_coding_system, converts the EOL-TYPE symbol with
   symbol_to_eol_type, and delegates to subsidiary_coding_system.  */
1108 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1109 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1111 (coding_system, eol_type))
1113 coding_system = Fget_coding_system (coding_system);
1115 return subsidiary_coding_system (coding_system,
1116 symbol_to_eol_type (eol_type));
1120 /************************************************************************/
1121 /* Coding system accessors */
1122 /************************************************************************/
/* Lisp primitive `coding-system-doc-string'.  Resolves CODING-SYSTEM
   with Fget_coding_system and returns its doc-string slot.  */
1124 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1125 Return the doc string for CODING-SYSTEM.
1129 coding_system = Fget_coding_system (coding_system);
1130 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* Lisp primitive `coding-system-type'.  Resolves CODING-SYSTEM with
   Fget_coding_system and translates its internal CODESYS_* type code
   into the corresponding Lisp symbol (for instance
   CODESYS_AUTODETECT is reported as Qundecided).  */
1133 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1134 Return the type of CODING-SYSTEM.
1138 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1141 case CODESYS_AUTODETECT: return Qundecided;
1143 case CODESYS_SHIFT_JIS: return Qshift_jis;
1144 case CODESYS_ISO2022: return Qiso2022;
1145 case CODESYS_BIG5: return Qbig5;
1146 case CODESYS_UCS4: return Qucs4;
1147 case CODESYS_UTF8: return Qutf8;
1148 case CODESYS_CCL: return Qccl;
1150 case CODESYS_NO_CONVERSION: return Qno_conversion;
1152 case CODESYS_INTERNAL: return Qinternal;
/* Return the name of the initial charset stored for index GNUM in the
   ISO2022 data of CODING_SYSTEM, or Qnil when that slot does not hold
   a charset object (CHARSETP fails).  */
1159 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1162 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1164 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp primitive `coding-system-charset'.  Resolves CODING-SYSTEM
   with Fget_coding_system and passes GNUM, converted with XINT, to
   coding_system_charset.
   NOTE(review): confirm that GNUM is type- and range-checked before
   XINT is applied to it.  */
1167 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1168 Return initial charset of CODING-SYSTEM designated to GNUM.
1171 (coding_system, gnum))
1173 coding_system = Fget_coding_system (coding_system);
1176 return coding_system_charset (coding_system, XINT (gnum));
/* Lisp primitive `coding-system-property'.  PROP is first validated
   against the_codesys_prop_dynarr: a property registered as
   CODESYS_PROP_ISO2022 or CODESYS_PROP_CCL is rejected unless the
   coding system has the matching type, and an unknown property
   signals "Unrecognized property".  The value is then fetched by an
   if/else chain: the generic properties (name, type, doc-string,
   mnemonic, eol-type, the three EOL subsidiaries and the
   post-read/pre-write conversions) come first, followed by the
   ISO2022-only and the CCL-only properties.  The ISO2022 flag
   properties are reported as Qt or Qnil.  */
1180 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1181 Return the PROP property of CODING-SYSTEM.
1183 (coding_system, prop))
1186 enum coding_system_type type;
1188 coding_system = Fget_coding_system (coding_system);
1189 CHECK_SYMBOL (prop);
1190 type = XCODING_SYSTEM_TYPE (coding_system);
1192 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1193 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1196 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1198 case CODESYS_PROP_ALL_OK:
1201 case CODESYS_PROP_ISO2022:
1202 if (type != CODESYS_ISO2022)
1204 ("Property only valid in ISO2022 coding systems",
1208 case CODESYS_PROP_CCL:
1209 if (type != CODESYS_CCL)
1211 ("Property only valid in CCL coding systems",
1221 signal_simple_error ("Unrecognized property", prop);
1223 if (EQ (prop, Qname))
1224 return XCODING_SYSTEM_NAME (coding_system);
1225 else if (EQ (prop, Qtype))
1226 return Fcoding_system_type (coding_system);
1227 else if (EQ (prop, Qdoc_string))
1228 return XCODING_SYSTEM_DOC_STRING (coding_system);
1229 else if (EQ (prop, Qmnemonic))
1230 return XCODING_SYSTEM_MNEMONIC (coding_system);
1231 else if (EQ (prop, Qeol_type))
1232 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1233 else if (EQ (prop, Qeol_lf))
1234 return XCODING_SYSTEM_EOL_LF (coding_system);
1235 else if (EQ (prop, Qeol_crlf))
1236 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1237 else if (EQ (prop, Qeol_cr))
1238 return XCODING_SYSTEM_EOL_CR (coding_system);
1239 else if (EQ (prop, Qpost_read_conversion))
1240 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1241 else if (EQ (prop, Qpre_write_conversion))
1242 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1244 else if (type == CODESYS_ISO2022)
1246 if (EQ (prop, Qcharset_g0))
1247 return coding_system_charset (coding_system, 0);
1248 else if (EQ (prop, Qcharset_g1))
1249 return coding_system_charset (coding_system, 1);
1250 else if (EQ (prop, Qcharset_g2))
1251 return coding_system_charset (coding_system, 2);
1252 else if (EQ (prop, Qcharset_g3))
1253 return coding_system_charset (coding_system, 3);
1255 #define FORCE_CHARSET(charset_num) \
1256 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1257 (coding_system, charset_num) ? Qt : Qnil)
1259 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1260 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1261 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1262 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1264 #define LISP_BOOLEAN(prop) \
1265 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1267 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1268 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1269 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1270 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1271 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1272 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1273 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1275 else if (EQ (prop, Qinput_charset_conversion))
1277 unparse_charset_conversion_specs
1278 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1279 else if (EQ (prop, Qoutput_charset_conversion))
1281 unparse_charset_conversion_specs
1282 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1286 else if (type == CODESYS_CCL)
1288 if (EQ (prop, Qdecode))
1289 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1290 else if (EQ (prop, Qencode))
1291 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1299 return Qnil; /* not reached */
1303 /************************************************************************/
1304 /* Coding category functions */
1305 /************************************************************************/
/* Look up the coding-category SYMBOL in coding_category_symbol[],
   scanning indices 0 through CODING_CATEGORY_LAST.  SYMBOL must be a
   symbol (CHECK_SYMBOL); a symbol that matches no entry signals
   "Unrecognized coding category".  Callers use the result as an index
   into the per-category tables.  */
1308 decode_coding_category (Lisp_Object symbol)
1312 CHECK_SYMBOL (symbol);
1313 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1314 if (EQ (coding_category_symbol[i], symbol))
1317 signal_simple_error ("Unrecognized coding category", symbol);
1318 return 0; /* not reached */
/* Lisp primitive `coding-category-list'.  Builds the list by consing
   coding_category_symbol[i] for i from CODING_CATEGORY_LAST down to
   0, so the result is in ascending index order.  */
1321 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1322 Return a list of all recognized coding categories.
1327 Lisp_Object list = Qnil;
1329 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1330 list = Fcons (coding_category_symbol[i], list);
/* Lisp primitive `set-coding-priority-list'.  Works in three passes
   over a local category_to_priority[] table: categories named in
   LIST receive priorities in list order (a repeated category signals
   "Duplicate coding category in list"), the remaining categories are
   appended in their current order from
   fcd->coding_category_by_priority, and finally the table is inverted
   back into fcd->coding_category_by_priority.  */
1334 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1335 Change the priority order of the coding categories.
1336 LIST should be list of coding categories, in descending order of
1337 priority. Unspecified coding categories will be lower in priority
1338 than all specified ones, in the same relative order they were in
1343 int category_to_priority[CODING_CATEGORY_LAST + 1];
1347 /* First generate a list that maps coding categories to priorities. */
1349 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1350 category_to_priority[i] = -1;
1352 /* Highest priority comes from the specified list. */
1354 EXTERNAL_LIST_LOOP (rest, list)
1356 int cat = decode_coding_category (XCAR (rest));
1358 if (category_to_priority[cat] >= 0)
1359 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1360 category_to_priority[cat] = i++;
1363 /* Now go through the existing categories by priority to retrieve
1364 the categories not yet specified and preserve their priority
1366 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1368 int cat = fcd->coding_category_by_priority[j];
1369 if (category_to_priority[cat] < 0)
1370 category_to_priority[cat] = i++;
1373 /* Now we need to construct the inverse of the mapping we just
1376 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1377 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1379 /* Phew! That was confusing. */
/* Lisp primitive `coding-priority-list'.  Conses the category symbols
   while walking fcd->coding_category_by_priority from its last entry
   to its first, so the resulting list starts with the entry at
   index 0 of the priority table.  */
1383 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1384 Return a list of coding categories in descending order of priority.
1389 Lisp_Object list = Qnil;
1391 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1392 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
/* Lisp primitive `set-coding-category-system'.  Decodes
   CODING-CATEGORY with decode_coding_category, resolves CODING-SYSTEM
   with Fget_coding_system and stores the result in
   fcd->coding_category_system for that category.  */
1397 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1398 Change the coding system associated with a coding category.
1400 (coding_category, coding_system))
1402 int cat = decode_coding_category (coding_category);
1404 coding_system = Fget_coding_system (coding_system);
1405 fcd->coding_category_system[cat] = coding_system;
/* Lisp primitive `coding-category-system'.  Decodes CODING-CATEGORY
   with decode_coding_category, fetches the coding system stored for
   it in fcd->coding_category_system, and returns that coding system's
   name (XCODING_SYSTEM_NAME).  */
1409 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1410 Return the coding system associated with a coding category.
1414 int cat = decode_coding_category (coding_category);
1415 Lisp_Object sys = fcd->coding_category_system[cat];
1418 return XCODING_SYSTEM_NAME (sys);
1423 /************************************************************************/
1424 /* Detecting the encoding of data */
1425 /************************************************************************/
/* State handed to the detection routines (detect_eol_type,
   detect_coding_type and the per-encoding detectors).  It is passed
   by pointer on every call, so what one call records is available to
   the next call on the following chunk of data.  */
1427 struct detection_state
1429 enum eol_type eol_type;
1465 struct iso2022_decoder iso;
1467 int high_byte_count;
1468 unsigned int saw_single_shift:1;
/* Predicate used by the ASCII scan in detect_coding_type: a control
   character below 0x20 for which this test fails makes the scan
   record that non-ASCII data has been seen.  */
1481 acceptable_control_char_p (int c)
1485 /* Allow and ignore control characters that you might
1486 reasonably see in a text file */
1491 case 8: /* backspace */
1492 case 11: /* vertical tab */
1493 case 12: /* form feed */
1494 case 26: /* MS-DOS C-z junk */
1495 case 31: /* '^_' -- for info */
/* Return nonzero when MASK has no more than one bit set, i.e. when it
   is zero or a power of two.  Clearing the lowest set bit with
   mask & (mask - 1) leaves zero exactly in those cases.  */
1503 mask_has_at_most_one_bit_p (int mask)
1505 /* Perhaps the only thing useful you learn from intensive Microsoft
1506 technical interviews */
1507 return (mask & (mask - 1)) == 0;
/* Examine a chunk of text starting at SRC and return the EOL type it
   implies, or EOL_AUTODETECT when the chunk does not settle the
   question.  The flags st->eol.just_saw_cr and st->eol.seen_anything
   live in ST, so they persist from one call to the next.  */
1510 static enum eol_type
1511 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1521 if (st->eol.just_saw_cr)
1523 else if (st->eol.seen_anything)
1526 else if (st->eol.just_saw_cr)
1529 st->eol.just_saw_cr = 1;
1531 st->eol.just_saw_cr = 0;
1532 st->eol.seen_anything = 1;
/* Nothing conclusive was found in this chunk.  */
1535 return EOL_AUTODETECT;
1538 /* Attempt to determine the encoding and EOL type of the given text.
1539 Before calling this function for the first time, you must initialize
1540 st->eol_type as appropriate and initialize st->mask to ~0.
1542 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1545 st->mask holds the determined coding category mask, or ~0 if only
1546 ASCII has been seen so far.
1550 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1551 is present in st->mask
1552 1 == definitive answers are here for both st->eol_type and st->mask
1556 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1557 unsigned int n, int just_do_eol)
1561 if (st->eol_type == EOL_AUTODETECT)
1562 st->eol_type = detect_eol_type (st, src, n);
1565 return st->eol_type != EOL_AUTODETECT;
1567 if (!st->seen_non_ascii)
1569 for (; n; n--, src++)
1572 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1574 st->seen_non_ascii = 1;
1576 st->shift_jis.mask = ~0;
1580 st->iso2022.mask = ~0;
1590 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1591 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1592 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1593 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1594 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1595 st->big5.mask = detect_coding_big5 (st, src, n);
1596 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1597 st->utf8.mask = detect_coding_utf8 (st, src, n);
1598 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1599 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1602 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1603 | st->utf8.mask | st->ucs4.mask;
1606 int retval = mask_has_at_most_one_bit_p (st->mask);
1607 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1608 return retval && st->eol_type != EOL_AUTODETECT;
/* Choose a coding system for the detection result MASK.  In the
   ASCII case the default value of `buffer-file-coding-system' is
   used, resolved through Ffind_coding_system; an invalid default
   produces a warning and is reset to nil, and raw-text serves as
   fallback.  Otherwise MASK is passed through
   postprocess_iso2022_mask and the categories are tried in priority
   order, returning the coding system of the first allowed category
   that has one assigned; raw-text is returned when none qualifies.  */
1613 coding_system_from_mask (int mask)
1617 /* If the file was entirely or basically ASCII, use the
1618 default value of `buffer-file-coding-system'. */
1619 Lisp_Object retval =
1620 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1623 retval = Ffind_coding_system (retval);
1627 (Qbad_variable, Qwarning,
1628 "Invalid `default-buffer-file-coding-system', set to nil");
1629 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1633 retval = Fget_coding_system (Qraw_text);
1641 mask = postprocess_iso2022_mask (mask);
1643 /* Look through the coding categories by priority and find
1644 the first one that is allowed. */
1645 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1647 cat = fcd->coding_category_by_priority[i];
/* Category CAT is allowed when bit CAT of MASK is set.  */
1648 if ((mask & (1 << cat)) &&
1649 !NILP (fcd->coding_category_system[cat]))
1653 return fcd->coding_category_system[cat];
1655 return Fget_coding_system (Qraw_text);
1659 /* Given a seekable read stream and potential coding system and EOL type
1660 as specified, do any autodetection that is called for. If the
1661 coding system and/or EOL type are not `autodetect', they will be left
1662 alone; but this function will never return an autodetect coding system
1665 This function does not automatically fetch subsidiary coding systems;
1666 that should be unnecessary with the explicit eol-type argument. */
/* Length of a string constant, not counting the terminating NUL.  */
1668 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1671 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1672 enum eol_type *eol_type_in_out)
1674 struct detection_state decst;
/* An unspecified EOL type defaults to the one recorded in the coding
   system.  */
1676 if (*eol_type_in_out == EOL_AUTODETECT)
1677 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1680 decst.eol_type = *eol_type_in_out;
1683 /* If autodetection is called for, do it now. */
1684 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1685 || *eol_type_in_out == EOL_AUTODETECT)
1688 Lisp_Object coding_system = Qnil;
1690 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1693 /* Look for initial "-*-"; mode line prefix */
1695 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1700 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1702 Extbyte *local_vars_beg = p + 3;
1703 /* Look for final "-*-"; mode line suffix */
1704 for (p = local_vars_beg,
1705 scan_end = buf + nread - LENGTH ("-*-");
1710 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1712 Extbyte *suffix = p;
1713 /* Look for "coding:" */
1714 for (p = local_vars_beg,
1715 scan_end = suffix - LENGTH ("coding:?");
1718 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1719 && (p == local_vars_beg
1720 || (*(p-1) == ' ' ||
1726 p += LENGTH ("coding:");
1727 while (*p == ' ' || *p == '\t') p++;
1729 /* Get coding system name */
1730 save = *suffix; *suffix = '\0';
1731 /* Characters valid in a MIME charset name (rfc 1521),
1732 and in a Lisp symbol name. */
1733 n = strspn ( (char *) p,
1734 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1735 "abcdefghijklmnopqrstuvwxyz"
1741 save = p[n]; p[n] = '\0';
1743 Ffind_coding_system (intern ((char *) p));
/* No coding system was found in the text.  Run detect_coding_type on
   the data read; its last argument (just_do_eol) is nonzero when the
   coding system itself is not autodetect, so that only the EOL type
   is examined.  */
1753 if (NILP (coding_system))
1756 if (detect_coding_type (&decst, buf, nread,
1757 XCODING_SYSTEM_TYPE (*codesys_in_out)
1758 != CODESYS_AUTODETECT))
1760 nread = Lstream_read (stream, buf, sizeof (buf));
1766 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1767 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1770 if (detect_coding_type (&decst, buf, nread, 1))
1772 nread = Lstream_read (stream, buf, sizeof (buf));
1778 *eol_type_in_out = decst.eol_type;
1779 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1781 if (NILP (coding_system))
1782 *codesys_in_out = coding_system_from_mask (decst.mask);
1784 *codesys_in_out = coding_system;
1788 /* If we absolutely can't determine the EOL type, just assume LF. */
1789 if (*eol_type_in_out == EOL_AUTODETECT)
1790 *eol_type_in_out = EOL_LF;
/* Detection consumed input from STREAM; rewind it.  */
1792 Lstream_rewind (stream);
/* Lisp primitive `detect-coding-region'.  The region is read through
   a Lisp buffer input stream wrapped in an encoding stream for the
   `binary' coding system, and the data is handed to
   detect_coding_type through a 4096-byte buffer.  A mask that is
   still ~0 yields a subsidiary of `undecided'.  Otherwise the mask is
   post-processed with postprocess_iso2022_mask and the priority table
   is walked from its last entry to its first, consing up the coding
   system of every category whose bit is set in the mask; a non-nil
   coding system is replaced by its subsidiary for the detected EOL
   type.  The encoding stream is closed and both streams are deleted
   afterwards.  */
1795 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1796 Detect coding system of the text in the region between START and END.
1797 Returned a list of possible coding systems ordered by priority.
1798 If only ASCII characters are found, it returns 'undecided or one of
1799 its subsidiary coding systems according to a detected end-of-line
1800 type. Optional arg BUFFER defaults to the current buffer.
1802 (start, end, buffer))
1804 Lisp_Object val = Qnil;
1805 struct buffer *buf = decode_buffer (buffer, 0);
1807 Lisp_Object instream, lb_instream;
1808 Lstream *istr, *lb_istr;
1809 struct detection_state decst;
1810 struct gcpro gcpro1, gcpro2;
1812 get_buffer_range_char (buf, start, end, &b, &e, 0);
1813 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1814 lb_istr = XLSTREAM (lb_instream);
1815 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1816 istr = XLSTREAM (instream);
1817 GCPRO2 (instream, lb_instream);
1819 decst.eol_type = EOL_AUTODETECT;
1823 unsigned char random_buffer[4096];
1824 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1828 if (detect_coding_type (&decst, random_buffer, nread, 0))
1832 if (decst.mask == ~0)
1833 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1841 decst.mask = postprocess_iso2022_mask (decst.mask);
1843 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1845 int sys = fcd->coding_category_by_priority[i];
1846 if (decst.mask & (1 << sys))
1848 Lisp_Object codesys = fcd->coding_category_system[sys];
1849 if (!NILP (codesys))
1850 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1851 val = Fcons (codesys, val);
1855 Lstream_close (istr);
1857 Lstream_delete (istr);
1858 Lstream_delete (lb_istr);
1863 /************************************************************************/
1864 /* Converting to internal Mule format ("decoding") */
1865 /************************************************************************/
1867 /* A decoding stream is a stream used for decoding text (i.e.
1868 converting from some external format to internal format).
1869 The decoding-stream object keeps track of the actual coding
1870 stream, the stream that is at the other end, and data that
1871 needs to be persistent across the lifetime of the stream. */
1873 /* Handle the EOL stuff related to just-read-in character C.
1874 EOL_TYPE is the EOL type of the coding stream.
1875 FLAGS is the current value of FLAGS in the coding stream, and may
1876 be modified by this macro. (The macro only looks at the
1877 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1878 bytes are to be written. You need to also define a local goto
1879 label "label_continue_loop" that is at the end of the main
1880 character-reading loop.
1882 If C is a CR character, then this macro handles it entirely and
1883 jumps to label_continue_loop. Otherwise, this macro does not add
1884 anything to DST, and continues normally. You should continue
1885 processing C normally after this macro. */
1887 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1891 if (eol_type == EOL_CR) \
1892 Dynarr_add (dst, '\n'); \
1893 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1894 Dynarr_add (dst, c); \
1896 flags |= CODING_STATE_CR; \
1897 goto label_continue_loop; \
1899 else if (flags & CODING_STATE_CR) \
1900 { /* eol_type == EOL_CRLF */ \
1902 Dynarr_add (dst, '\r'); \
1903 flags &= ~CODING_STATE_CR; \
1907 /* C should be a binary character in the range 0 - 255; convert
1908 to internal format and add to Dynarr DST.
In this variant an ASCII byte is stored unchanged and any other byte
is written as two bytes: (c >> 6) | 0xc0 followed by
(c & 0x3f) | 0x80. */
1911 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1913 if (BYTE_ASCII_P (c)) \
1914 Dynarr_add (dst, c); \
1917 Dynarr_add (dst, (c >> 6) | 0xc0); \
1918 Dynarr_add (dst, (c & 0x3f) | 0x80); \
/* Append character C to DST as a variable-length byte sequence.  The
   first branch stores C as a single byte; larger values take two
   bytes up to 0x7ff, three up to 0xffff, four up to 0x1fffff, five up
   to 0x3ffffff and six beyond that.  In the multi-byte forms the
   first byte combines the highest bits of C with a length marker
   (0xc0, 0xe0, 0xf0, 0xf8 or 0xfc) and every following byte holds six
   bits of C ORed with 0x80.  */
1923 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1927 Dynarr_add (dst, c);
1929 else if ( c <= 0x7ff )
1931 Dynarr_add (dst, (c >> 6) | 0xc0);
1932 Dynarr_add (dst, (c & 0x3f) | 0x80);
1934 else if ( c <= 0xffff )
1936 Dynarr_add (dst, (c >> 12) | 0xe0);
1937 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1938 Dynarr_add (dst, (c & 0x3f) | 0x80);
1940 else if ( c <= 0x1fffff )
1942 Dynarr_add (dst, (c >> 18) | 0xf0);
1943 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1944 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1945 Dynarr_add (dst, (c & 0x3f) | 0x80);
1947 else if ( c <= 0x3ffffff )
1949 Dynarr_add (dst, (c >> 24) | 0xf8);
1950 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1951 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1952 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1953 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Largest form: six bytes.  */
1957 Dynarr_add (dst, (c >> 30) | 0xfc);
1958 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1959 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1960 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1961 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1962 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Variant of DECODE_ADD_BINARY_CHAR that emits leading bytes: an
   ASCII byte is stored unchanged, a byte satisfying BYTE_C1_P is
   stored as LEADING_BYTE_CONTROL_1 followed by C + 0x20, and any
   other byte as LEADING_BYTE_LATIN_ISO8859_1 followed by C itself.  */
1966 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1968 if (BYTE_ASCII_P (c)) \
1969 Dynarr_add (dst, c); \
1970 else if (BYTE_C1_P (c)) \
1972 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1973 Dynarr_add (dst, c + 0x20); \
1977 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1978 Dynarr_add (dst, c); \
/* Emit the partial character CH through DECODE_ADD_BINARY_CHAR.  Note
   that the macro takes only CH: it refers to a variable named `dst'
   that must exist in the scope where it is expanded.  */
1983 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1987 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* End-of-input handling for the decoders: when FLAGS has
   CODING_STATE_END set, output the partial character CH and, if
   CODING_STATE_CR is also set, add a literal '\r' to DST.  */
1992 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1994 if (flags & CODING_STATE_END) \
1996 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1997 if (flags & CODING_STATE_CR) \
1998 Dynarr_add (dst, '\r'); \
/* Fetch the struct decoding_stream stored as the type-specific data
   of Lstream STREAM.  */
2002 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Per-stream data of a decoding Lstream.  */
2004 struct decoding_stream
2006 /* Coding system that governs the conversion. */
2007 Lisp_Coding_System *codesys;
2009 /* Stream that we read the encoded data from or
2010 write the decoded data to. */
2013 /* If we are reading, then we can return only a fixed amount of
2014 data, so if the conversion resulted in too much data, we store it
2015 here for retrieval the next time around. */
2016 unsigned_char_dynarr *runoff;
2018 /* FLAGS holds flags indicating the current state of the decoding.
2019 Some of these flags are dependent on the coding system. */
2022 /* CH holds a partially built-up character. Since we only deal
2023 with one- and two-byte characters at the moment, we only use
2024 this to store the first byte of a two-byte character. */
2027 /* EOL_TYPE specifies the type of end-of-line conversion that
2028 currently applies. We need to keep this separate from the
2029 EOL type stored in CODESYS because the latter might indicate
2030 automatic EOL-type detection while the former will always
2031 indicate a particular EOL type. */
2032 enum eol_type eol_type;
2034 /* Additional ISO2022 information. We define the structure above
2035 because it's also needed by the detection routines. */
2036 struct iso2022_decoder iso2022;
2038 /* Additional information (the state of the running CCL program)
2039 used by the CCL decoder. */
2040 struct ccl_program ccl;
2042 /* counter for UTF-8 or UCS-4 */
2043 unsigned char counter;
/* Composition state maintained by COMPOSE_ADD_CHAR and emptied by
   COMPOSE_FLUSH_CHARS: the number of buffered characters, the
   characters themselves, and the composition table reached so far
   (Qnil when no composition is in progress).  */
2046 unsigned combined_char_count;
2047 Emchar combined_chars[16];
2048 Lisp_Object combining_table;
/* Detection state kept with the stream; make_decoding_stream_1
   initializes its eol_type and mask.  */
2050 struct detection_state decst;
2054 extern Lisp_Object Vcharacter_composition_table;
/* Write out every character buffered in str->combined_chars with
   DECODE_ADD_UCS_CHAR, then reset the composition state: the count
   goes back to 0 and combining_table to Qnil.  */
2057 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2061 for (i = 0; i < str->combined_char_count; i++)
2062 DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2063 str->combined_char_count = 0;
2064 str->combining_table = Qnil;
/* Feed CHARACTER through the composition machinery of STR.  If
   composition is disabled for the coding system, CHARACTER is written
   to DST at once.  With no composition in progress
   (str->combining_table is not a char code table), CHARACTER is
   looked up in Vcharacter_composition_table and is either written
   out directly or buffered as the first element of a new sequence.
   With a composition in progress, CHARACTER is looked up in the
   current combining table; depending on the result, the buffered
   sequence is replaced by a composed character (which is written out
   or becomes the start of a new sequence), extended by CHARACTER, or
   flushed and followed by CHARACTER.  */
2068 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2069 Emchar character, unsigned_char_dynarr* dst)
2071 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2072 DECODE_ADD_UCS_CHAR (character, dst);
2073 else if (!CHAR_CODE_TABLE_P (str->combining_table))
2076 = get_char_code_table (character, Vcharacter_composition_table);
2079 DECODE_ADD_UCS_CHAR (character, dst);
2082 str->combined_chars[0] = character;
2083 str->combined_char_count = 1;
2084 str->combining_table = ret;
2090 = get_char_code_table (character, str->combining_table);
2094 Emchar char2 = XCHARVAL (ret);
2095 ret = get_char_code_table (char2, Vcharacter_composition_table);
2098 DECODE_ADD_UCS_CHAR (char2, dst);
2099 str->combined_char_count = 0;
2100 str->combining_table = Qnil;
2104 str->combined_chars[0] = char2;
2105 str->combined_char_count = 1;
2106 str->combining_table = ret;
2109 else if (CHAR_CODE_TABLE_P (ret))
/* NOTE(review): no check of combined_char_count against the
   16-element combined_chars array is visible here; confirm that a
   sequence can never grow past the end of the array.  */
2111 str->combined_chars[str->combined_char_count++] = character;
2112 str->combining_table = ret;
2116 COMPOSE_FLUSH_CHARS (str, dst);
2117 DECODE_ADD_UCS_CHAR (character, dst);
2121 #else /* not UTF2000 */
/* Without UTF2000 there is no composition state: flushing expands to
   nothing and adding a character writes it out directly.  */
2122 #define COMPOSE_FLUSH_CHARS(str, dst)
2123 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2124 #endif /* UTF2000 */
/* Forward declarations of the Lstream methods implemented below for
   decoding streams, followed by the definition of the "decoding"
   stream type, whose per-stream data is a struct decoding_stream.  */
2126 static ssize_t decoding_reader (Lstream *stream,
2127 unsigned char *data, size_t size);
2128 static ssize_t decoding_writer (Lstream *stream,
2129 CONST unsigned char *data, size_t size);
2130 static int decoding_rewinder (Lstream *stream);
2131 static int decoding_seekable_p (Lstream *stream);
2132 static int decoding_flusher (Lstream *stream);
2133 static int decoding_closer (Lstream *stream);
2135 static Lisp_Object decoding_marker (Lisp_Object stream);
2137 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2138 sizeof (struct decoding_stream));
/* Marking method for decoding streams: marks the stream at the other
   end and, when that stream's implementation has a marker of its own,
   returns the result of calling it on that stream.  */
2141 decoding_marker (Lisp_Object stream)
2143 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2144 Lisp_Object str_obj;
2146 /* We do not need to mark the coding systems or charsets stored
2147 within the stream because they are stored in a global list
2148 and automatically marked. */
2150 XSETLSTREAM (str_obj, str);
2151 mark_object (str_obj);
2152 if (str->imp->marker)
2153 return (str->imp->marker) (str_obj);
2158 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2159 so we read data from the other end, decode it, and store it into DATA.
Returns the number of bytes stored; when nothing could be stored the
result is -1 if an error occurred and 0 otherwise. */
2162 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2164 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2165 unsigned char *orig_data = data;
2167 int error_occurred = 0;
2169 /* We need to interface to mule_decode(), which expects to take some
2170 amount of data and store the result into a Dynarr. We have
2171 mule_decode() store into str->runoff, and take data from there
2174 /* We loop until we have enough data, reading chunks from the other
2175 end and decoding it. */
2178 /* Take data from the runoff if we can. Make sure to take at
2179 most SIZE bytes, and delete the data from the runoff. */
2180 if (Dynarr_length (str->runoff) > 0)
2182 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2183 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2184 Dynarr_delete_many (str->runoff, 0, chunk);
2190 break; /* No more room for data */
2192 if (str->flags & CODING_STATE_END)
2193 /* This means that on the previous iteration, we hit the EOF on
2194 the other end. We loop once more so that mule_decode() can
2195 output any final stuff it may be holding, or any "go back
2196 to a sane state" escape sequences. (This latter makes sense
2197 during encoding.) */
2200 /* Exhausted the runoff, so get some more. DATA has at least
2201 SIZE bytes left of storage in it, so it's OK to read directly
2202 into it. (We'll be overwriting above, after we've decoded it
2203 into the runoff.) */
2204 read_size = Lstream_read (str->other_end, data, size);
2211 /* There might be some more end data produced in the translation.
2212 See the comment above. */
2213 str->flags |= CODING_STATE_END;
2214 mule_decode (stream, data, str->runoff, read_size);
2217 if (data - orig_data == 0)
2218 return error_occurred ? -1 : 0;
2220 return data - orig_data;
/* Write method for decoding streams: decode SIZE bytes of DATA into
   str->runoff with mule_decode, hand the whole runoff to the other
   end with Lstream_write, and delete from the runoff the number of
   bytes that Lstream_write reported.  */
2224 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2226 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2229 /* Decode all our data into the runoff, and then attempt to write
2230 it all out to the other end. Remove whatever chunk we succeeded
2232 mule_decode (stream, data, str->runoff, size);
2233 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2234 Dynarr_length (str->runoff));
2236 Dynarr_delete_many (str->runoff, 0, retval);
2237 /* Do NOT return retval. The return value indicates how much
2238 of the incoming data was written, not how many bytes were
/* Return STR to its initial decoding state: reinitialize the ISO2022
   decoder or the CCL program according to the type of the coding
   system, clear the composition state, and zero `flags' and `ch'.  */
2244 reset_decoding_stream (struct decoding_stream *str)
2247 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2249 Lisp_Object coding_system;
2250 XSETCODING_SYSTEM (coding_system, str->codesys);
2251 reset_iso2022 (coding_system, &str->iso2022);
2253 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2255 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2260 str->combined_char_count = 0;
2261 str->combining_table = Qnil;
2263 str->flags = str->ch = 0;
/* Rewind method: reset the decoding state, discard the buffered
   runoff, and rewind the stream at the other end, returning the
   result of that rewind.  */
2267 decoding_rewinder (Lstream *stream)
2269 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2270 reset_decoding_stream (str);
2271 Dynarr_reset (str->runoff);
2272 return Lstream_rewind (str->other_end);
/* Seekable-p method: a decoding stream reports whatever
   Lstream_seekable_p says about the stream at the other end.  */
2276 decoding_seekable_p (Lstream *stream)
2278 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2279 return Lstream_seekable_p (str->other_end);
/* Flush method: flushing a decoding stream flushes the stream at the
   other end and returns that result.  */
2283 decoding_flusher (Lstream *stream)
2285 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2286 return Lstream_flush (str->other_end);
/* Close method.  For a stream opened for writing, CODING_STATE_END is
   set and decoding_writer is called with no data.  Then the runoff is
   freed (as is the ISO2022 composite-character dynarr, if any, when
   ENABLE_COMPOSITE_CHARS is defined) and the stream at the other end
   is closed; the result of that close is returned.  */
2290 decoding_closer (Lstream *stream)
2292 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2293 if (stream->flags & LSTREAM_FL_WRITE)
2295 str->flags |= CODING_STATE_END;
2296 decoding_writer (stream, 0, 0);
2298 Dynarr_free (str->runoff);
2300 #ifdef ENABLE_COMPOSITE_CHARS
2301 if (str->iso2022.composite_chars)
2302 Dynarr_free (str->iso2022.composite_chars);
2305 return Lstream_close (str->other_end);
/* Return the coding system of decoding stream STREAM as a Lisp
   object, passed through subsidiary_coding_system with the stream's
   current EOL type.  */
2309 decoding_stream_coding_system (Lstream *stream)
2311 Lisp_Object coding_system;
2312 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2314 XSETCODING_SYSTEM (coding_system, str->codesys);
2315 return subsidiary_coding_system (coding_system, str->eol_type);
/* Apply CODESYS to decoding stream LSTR.  The stream's eol_type is
   taken from CODESYS only when CODESYS has a definite EOL type (not
   EOL_AUTODETECT); the decoding state is then reset with
   reset_decoding_stream.  */
2319 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2321 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2322 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2324 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2325 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2326 reset_decoding_stream (str);
2329 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2330 stream for writing, no automatic code detection will be performed.
2331 The reason for this is that automatic code detection requires a
2332 seekable input. Things will also fail if you open a decoding
2333 stream for reading using a non-fully-specified coding system and
2334 a non-seekable input stream. */
2337 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2340 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2341 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2345 str->other_end = stream;
2346 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2347 str->eol_type = EOL_AUTODETECT;
2348 if (!strcmp (mode, "r")
2349 && Lstream_seekable_p (stream))
2350 /* We can determine the coding system now. */
2351 determine_real_coding_system (stream, &codesys, &str->eol_type);
2352 set_decoding_stream_coding_system (lstr, codesys);
2353 str->decst.eol_type = str->eol_type;
2354 str->decst.mask = ~0;
2355 XSETLSTREAM (obj, lstr);
2360 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2362 return make_decoding_stream_1 (stream, codesys, "r");
2366 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2368 return make_decoding_stream_1 (stream, codesys, "w");
2371 /* Note: the decode_coding_* functions all take the same
2372 arguments as mule_decode(), which is to say some SRC data of
2373 size N, which is to be stored into dynamic array DST.
2374 DECODING is the stream within which the decoding is
2375 taking place, but no data is actually read from or
2376 written to that stream; that is handled in decoding_reader()
2377 or decoding_writer(). This allows the same functions to
2378 be used for both reading and writing. */
2381 mule_decode (Lstream *decoding, CONST unsigned char *src,
2382 unsigned_char_dynarr *dst, unsigned int n)
2384 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2386 /* If necessary, do encoding-detection now. We do this when
2387 we're a writing stream or a non-seekable reading stream,
2388 meaning that we can't just process the whole input,
2389 rewind, and start over. */
2391 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2392 str->eol_type == EOL_AUTODETECT)
2394 Lisp_Object codesys;
2396 XSETCODING_SYSTEM (codesys, str->codesys);
2397 detect_coding_type (&str->decst, src, n,
2398 CODING_SYSTEM_TYPE (str->codesys) !=
2399 CODESYS_AUTODETECT);
2400 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2401 str->decst.mask != ~0)
2402 /* #### This is cheesy. What we really ought to do is
2403 buffer up a certain amount of data so as to get a
2404 less random result. */
2405 codesys = coding_system_from_mask (str->decst.mask);
2406 str->eol_type = str->decst.eol_type;
2407 if (XCODING_SYSTEM (codesys) != str->codesys)
2409 /* Preserve the CODING_STATE_END flag in case it was set.
2410 If we erase it, bad things might happen. */
2411 int was_end = str->flags & CODING_STATE_END;
2412 set_decoding_stream_coding_system (decoding, codesys);
2414 str->flags |= CODING_STATE_END;
2418 switch (CODING_SYSTEM_TYPE (str->codesys))
2421 case CODESYS_INTERNAL:
2422 Dynarr_add_many (dst, src, n);
2425 case CODESYS_AUTODETECT:
2426 /* If we got this far and still haven't decided on the coding
2427 system, then do no conversion. */
2428 case CODESYS_NO_CONVERSION:
2429 decode_coding_no_conversion (decoding, src, dst, n);
2432 case CODESYS_SHIFT_JIS:
2433 decode_coding_sjis (decoding, src, dst, n);
2436 decode_coding_big5 (decoding, src, dst, n);
2439 decode_coding_ucs4 (decoding, src, dst, n);
2442 decode_coding_utf8 (decoding, src, dst, n);
2445 str->ccl.last_block = str->flags & CODING_STATE_END;
2446 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2448 case CODESYS_ISO2022:
2449 decode_coding_iso2022 (decoding, src, dst, n);
2457 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2458 Decode the text between START and END which is encoded in CODING-SYSTEM.
2459 This is useful if you've read in encoded text from a file without decoding
2460 it (e.g. you read in a JIS-formatted file but used the `binary' or
2461 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2462 Return length of decoded text.
2463 BUFFER defaults to the current buffer if unspecified.
2465 (start, end, coding_system, buffer))
2468 struct buffer *buf = decode_buffer (buffer, 0);
2469 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2470 Lstream *istr, *ostr;
2471 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2473 get_buffer_range_char (buf, start, end, &b, &e, 0);
2475 barf_if_buffer_read_only (buf, b, e);
2477 coding_system = Fget_coding_system (coding_system);
2478 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2479 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2480 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2482 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2483 Fget_coding_system (Qbinary));
2484 istr = XLSTREAM (instream);
2485 ostr = XLSTREAM (outstream);
2486 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2488 /* The chain of streams looks like this:
2490 [BUFFER] <----- send through
2491 ------> [ENCODE AS BINARY]
2492 ------> [DECODE AS SPECIFIED]
2498 char tempbuf[1024]; /* some random amount */
2499 Bufpos newpos, even_newer_pos;
2500 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2501 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2505 newpos = lisp_buffer_stream_startpos (istr);
2506 Lstream_write (ostr, tempbuf, size_in_bytes);
2507 even_newer_pos = lisp_buffer_stream_startpos (istr);
2508 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2511 Lstream_close (istr);
2512 Lstream_close (ostr);
2514 Lstream_delete (istr);
2515 Lstream_delete (ostr);
2516 Lstream_delete (XLSTREAM (de_outstream));
2517 Lstream_delete (XLSTREAM (lb_outstream));
2522 /************************************************************************/
2523 /* Converting to an external encoding ("encoding") */
2524 /************************************************************************/
2526 /* An encoding stream is an output stream. When you create the
2527 stream, you specify the coding system that governs the encoding
2528 and another stream that the resulting encoded data is to be
2529 sent to, and then start sending data to it. */
2531 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2533 struct encoding_stream
2535 /* Coding system that governs the conversion. */
2536 Lisp_Coding_System *codesys;
2538 /* Stream that we read the unencoded (internal) data from or
2539 write the encoded data to. */
2542 /* If we are reading, then we can return only a fixed amount of
2543 data, so if the conversion resulted in too much data, we store it
2544 here for retrieval the next time around. */
2545 unsigned_char_dynarr *runoff;
2547 /* FLAGS holds flags indicating the current state of the encoding.
2548 Some of these flags are dependent on the coding system. */
2551 /* CH holds a partially built-up character. Since we only deal
2552 with one- and two-byte characters at the moment, we only use
2553 this to store the first byte of a two-byte character. */
2556 /* Additional information used by the ISO2022 encoder. */
2559 /* CHARSET holds the character sets currently assigned to the G0
2560 through G3 registers. It is initialized from the array
2561 INITIAL_CHARSET in CODESYS. */
2562 Lisp_Object charset[4];
2564 /* Which registers are currently invoked into the left (GL) and
2565 right (GR) halves of the 8-bit encoding space? */
2566 int register_left, register_right;
2568 /* Whether we need to explicitly designate the charset in the
2569 G? register before using it. It is initialized from the
2570 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2571 unsigned char force_charset_on_output[4];
2573 /* Other state variables that need to be preserved across
2575 Lisp_Object current_charset;
2577 int current_char_boundary;
2580 void (*encode_char) (struct encoding_stream *str, Emchar c,
2581 unsigned_char_dynarr *dst, unsigned int *flags);
2582 void (*finish) (struct encoding_stream *str,
2583 unsigned_char_dynarr *dst, unsigned int *flags);
2585 /* Additional information (the state of the running CCL program)
2586 used by the CCL encoder. */
2587 struct ccl_program ccl;
2591 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2592 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2594 static int encoding_rewinder (Lstream *stream);
2595 static int encoding_seekable_p (Lstream *stream);
2596 static int encoding_flusher (Lstream *stream);
2597 static int encoding_closer (Lstream *stream);
2599 static Lisp_Object encoding_marker (Lisp_Object stream);
2601 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2602 sizeof (struct encoding_stream));
2605 encoding_marker (Lisp_Object stream)
2607 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2608 Lisp_Object str_obj;
2610 /* We do not need to mark the coding systems or charsets stored
2611 within the stream because they are stored in a global list
2612 and automatically marked. */
2614 XSETLSTREAM (str_obj, str);
2615 mark_object (str_obj);
2616 if (str->imp->marker)
2617 return (str->imp->marker) (str_obj);
2622 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2623 so we read data from the other end, encode it, and store it into DATA. */
2626 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2628 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2629 unsigned char *orig_data = data;
2631 int error_occurred = 0;
2633 /* We need to interface to mule_encode(), which expects to take some
2634 amount of data and store the result into a Dynarr. We have
2635 mule_encode() store into str->runoff, and take data from there
2638 /* We loop until we have enough data, reading chunks from the other
2639 end and encoding it. */
2642 /* Take data from the runoff if we can. Make sure to take at
2643 most SIZE bytes, and delete the data from the runoff. */
2644 if (Dynarr_length (str->runoff) > 0)
2646 int chunk = min ((int) size, Dynarr_length (str->runoff));
2647 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2648 Dynarr_delete_many (str->runoff, 0, chunk);
2654 break; /* No more room for data */
2656 if (str->flags & CODING_STATE_END)
2657 /* This means that on the previous iteration, we hit the EOF on
2658 the other end. We loop once more so that mule_encode() can
2659 output any final stuff it may be holding, or any "go back
2660 to a sane state" escape sequences. (This latter makes sense
2661 during encoding.) */
2664 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2665 of storage left in it, so it's OK to read directly into it.
2666 (We'll be overwriting above, after we've encoded it into the
2668 read_size = Lstream_read (str->other_end, data, size);
2675 /* There might be some more end data produced in the translation.
2676 See the comment above. */
2677 str->flags |= CODING_STATE_END;
2678 mule_encode (stream, data, str->runoff, read_size);
2681 if (data == orig_data)
2682 return error_occurred ? -1 : 0;
2684 return data - orig_data;
2688 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2690 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2693 /* Encode all our data into the runoff, and then attempt to write
2694 it all out to the other end. Remove whatever chunk we succeeded
2696 mule_encode (stream, data, str->runoff, size);
2697 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2698 Dynarr_length (str->runoff));
2700 Dynarr_delete_many (str->runoff, 0, retval);
2701 /* Do NOT return retval. The return value indicates how much
2702 of the incoming data was written, not how many bytes were
2708 reset_encoding_stream (struct encoding_stream *str)
2711 switch (CODING_SYSTEM_TYPE (str->codesys))
2713 case CODESYS_ISO2022:
2717 str->encode_char = &char_encode_iso2022;
2718 str->finish = &char_finish_iso2022;
2719 for (i = 0; i < 4; i++)
2721 str->iso2022.charset[i] =
2722 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2723 str->iso2022.force_charset_on_output[i] =
2724 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2726 str->iso2022.register_left = 0;
2727 str->iso2022.register_right = 1;
2728 str->iso2022.current_charset = Qnil;
2729 str->iso2022.current_half = 0;
2733 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2736 str->encode_char = &char_encode_utf8;
2737 str->finish = &char_finish_utf8;
2740 str->encode_char = &char_encode_ucs4;
2741 str->finish = &char_finish_ucs4;
2743 case CODESYS_SHIFT_JIS:
2744 str->encode_char = &char_encode_shift_jis;
2745 str->finish = &char_finish_shift_jis;
2751 str->iso2022.current_char_boundary = 0;
2752 str->flags = str->ch = 0;
2756 encoding_rewinder (Lstream *stream)
2758 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2759 reset_encoding_stream (str);
2760 Dynarr_reset (str->runoff);
2761 return Lstream_rewind (str->other_end);
2765 encoding_seekable_p (Lstream *stream)
2767 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2768 return Lstream_seekable_p (str->other_end);
2772 encoding_flusher (Lstream *stream)
2774 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2775 return Lstream_flush (str->other_end);
2779 encoding_closer (Lstream *stream)
2781 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2782 if (stream->flags & LSTREAM_FL_WRITE)
2784 str->flags |= CODING_STATE_END;
2785 encoding_writer (stream, 0, 0);
2787 Dynarr_free (str->runoff);
2788 return Lstream_close (str->other_end);
2792 encoding_stream_coding_system (Lstream *stream)
2794 Lisp_Object coding_system;
2795 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2797 XSETCODING_SYSTEM (coding_system, str->codesys);
2798 return coding_system;
2802 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2804 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2805 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2807 reset_encoding_stream (str);
2811 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2814 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2815 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2819 str->runoff = Dynarr_new (unsigned_char);
2820 str->other_end = stream;
2821 set_encoding_stream_coding_system (lstr, codesys);
2822 XSETLSTREAM (obj, lstr);
2827 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2829 return make_encoding_stream_1 (stream, codesys, "r");
2833 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2835 return make_encoding_stream_1 (stream, codesys, "w");
2838 /* Convert N bytes of internally-formatted data stored in SRC to an
2839 external format, according to the encoding stream ENCODING.
2840 Store the encoded data into DST. */
2843 mule_encode (Lstream *encoding, CONST unsigned char *src,
2844 unsigned_char_dynarr *dst, unsigned int n)
2846 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2848 switch (CODING_SYSTEM_TYPE (str->codesys))
2851 case CODESYS_INTERNAL:
2852 Dynarr_add_many (dst, src, n);
2855 case CODESYS_AUTODETECT:
2856 /* If we got this far and still haven't decided on the coding
2857 system, then do no conversion. */
2858 case CODESYS_NO_CONVERSION:
2859 encode_coding_no_conversion (encoding, src, dst, n);
2863 encode_coding_big5 (encoding, src, dst, n);
2866 str->ccl.last_block = str->flags & CODING_STATE_END;
2867 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2871 text_encode_generic (encoding, src, dst, n);
2875 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2876 Encode the text between START and END using CODING-SYSTEM.
2877 This will, for example, convert Japanese characters into stuff such as
2878 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2879 text. BUFFER defaults to the current buffer if unspecified.
2881 (start, end, coding_system, buffer))
2884 struct buffer *buf = decode_buffer (buffer, 0);
2885 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2886 Lstream *istr, *ostr;
2887 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2889 get_buffer_range_char (buf, start, end, &b, &e, 0);
2891 barf_if_buffer_read_only (buf, b, e);
2893 coding_system = Fget_coding_system (coding_system);
2894 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2895 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2896 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2897 Fget_coding_system (Qbinary));
2898 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2900 istr = XLSTREAM (instream);
2901 ostr = XLSTREAM (outstream);
2902 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2903 /* The chain of streams looks like this:
2905 [BUFFER] <----- send through
2906 ------> [ENCODE AS SPECIFIED]
2907 ------> [DECODE AS BINARY]
2912 char tempbuf[1024]; /* some random amount */
2913 Bufpos newpos, even_newer_pos;
2914 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2915 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2919 newpos = lisp_buffer_stream_startpos (istr);
2920 Lstream_write (ostr, tempbuf, size_in_bytes);
2921 even_newer_pos = lisp_buffer_stream_startpos (istr);
2922 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2928 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2929 Lstream_close (istr);
2930 Lstream_close (ostr);
2932 Lstream_delete (istr);
2933 Lstream_delete (ostr);
2934 Lstream_delete (XLSTREAM (de_outstream));
2935 Lstream_delete (XLSTREAM (lb_outstream));
2936 return make_int (retlen);
2943 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2944 unsigned_char_dynarr *dst, unsigned int n)
2947 unsigned char char_boundary;
2948 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2949 unsigned int flags = str->flags;
2950 Emchar ch = str->ch;
2952 char_boundary = str->iso2022.current_char_boundary;
2958 if (char_boundary == 0)
2986 (*str->encode_char) (str, c, dst, &flags);
2988 else if (char_boundary == 1)
2990 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2996 ch = (ch << 6) | (c & 0x3f);
3001 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3003 (*str->finish) (str, dst, &flags);
3008 str->iso2022.current_char_boundary = char_boundary;
3012 /************************************************************************/
3013 /* Shift-JIS methods */
3014 /************************************************************************/
3016 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3017 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3018 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3019 encoded by "position-code + 0x80". A character of JISX0208
3020 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
3021 position-codes are divided and shifted so that they fit in the range
3024 --- CODE RANGE of Shift-JIS ---
3025 (character set) (range)
3027 JISX0201-Kana 0xA1 .. 0xDF
3028 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3029 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3030 -------------------------------
3034 /* Is this the first byte of a Shift-JIS two-byte char? */
3036 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3037 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3039 /* Is this the second byte of a Shift-JIS two-byte char? */
3041 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3042 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3044 #define BYTE_SJIS_KATAKANA_P(c) \
3045 ((c) >= 0xA1 && (c) <= 0xDF)
3048 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
3056 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3058 if (st->shift_jis.in_second_byte)
3060 st->shift_jis.in_second_byte = 0;
3064 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3065 st->shift_jis.in_second_byte = 1;
3067 return CODING_CATEGORY_SHIFT_JIS_MASK;
3070 /* Convert Shift-JIS data to internal format. */
3073 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
3074 unsigned_char_dynarr *dst, unsigned int n)
3077 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3078 unsigned int flags = str->flags;
3079 unsigned int ch = str->ch;
3080 eol_type_t eol_type = str->eol_type;
3088 /* Previous character was first byte of Shift-JIS Kanji char. */
3089 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3091 unsigned char e1, e2;
3093 DECODE_SJIS (ch, c, e1, e2);
3095 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3099 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3100 Dynarr_add (dst, e1);
3101 Dynarr_add (dst, e2);
3106 DECODE_ADD_BINARY_CHAR (ch, dst);
3107 DECODE_ADD_BINARY_CHAR (c, dst);
3113 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3114 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3116 else if (BYTE_SJIS_KATAKANA_P (c))
3119 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3122 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3123 Dynarr_add (dst, c);
3128 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3132 DECODE_ADD_BINARY_CHAR (c, dst);
3134 label_continue_loop:;
3137 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3143 /* Convert internal character representation to Shift_JIS. */
3146 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3147 unsigned_char_dynarr *dst, unsigned int *flags)
3149 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3153 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3154 Dynarr_add (dst, '\r');
3155 if (eol_type != EOL_CR)
3156 Dynarr_add (dst, ch);
3160 Lisp_Object charset;
3161 unsigned int c1, c2, s1, s2;
3163 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3165 if (code_point >= 0)
3167 charset = Vcharset_latin_jisx0201;
3173 BREAKUP_CHAR (ch, charset, c1, c2);
3175 if (EQ(charset, Vcharset_katakana_jisx0201))
3177 Dynarr_add (dst, c1 | 0x80);
3181 Dynarr_add (dst, c1);
3183 else if (EQ(charset, Vcharset_japanese_jisx0208))
3185 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3186 Dynarr_add (dst, s1);
3187 Dynarr_add (dst, s2);
3190 Dynarr_add (dst, '?');
3195 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3196 unsigned int *flags)
3200 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3201 Decode a JISX0208 character of Shift-JIS coding-system.
3202 CODE is the character code in Shift-JIS as a cons of two bytes.
3203 Return the corresponding character.
3207 unsigned char c1, c2, s1, s2;
3210 CHECK_INT (XCAR (code));
3211 CHECK_INT (XCDR (code));
3212 s1 = XINT (XCAR (code));
3213 s2 = XINT (XCDR (code));
3214 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3215 BYTE_SJIS_TWO_BYTE_2_P (s2))
3217 DECODE_SJIS (s1, s2, c1, c2);
3218 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3219 c1 & 0x7F, c2 & 0x7F));
3225 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3226 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3227 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3231 Lisp_Object charset;
3234 CHECK_CHAR_COERCE_INT (ch);
3235 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3236 if (EQ (charset, Vcharset_japanese_jisx0208))
3238 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3239 return Fcons (make_int (s1), make_int (s2));
3246 /************************************************************************/
3248 /************************************************************************/
3250 /* BIG5 is a coding system encoding two character sets: ASCII and
3251 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3252 character set and is encoded in two bytes.
3254 --- CODE RANGE of BIG5 ---
3255 (character set) (range)
3257 Big5 (1st byte) 0xA1 .. 0xFE
3258 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3259 --------------------------
3261 Since the number of characters in Big5 is larger than maximum
3262 characters in Emacs' charset (96x96), it can't be handled as one
3263 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3264 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3265 contains frequently used characters and the latter contains less
3266 frequently used characters. */
3268 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3269 ((c) >= 0xA1 && (c) <= 0xFE)
3271 /* Is this the second byte of a Big5 two-byte char? */
3273 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3274 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3276 /* Number of Big5 characters which have the same code in 1st byte. */
3278 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3280 /* Code conversion macros. These are macros because they are used in
3281 inner loops during code conversion.
3283 Note that temporary variables in macros introduce the classic
3284 dynamic-scoping problems with variable names. We use capital-
3285 lettered variables in the assumption that XEmacs does not use
3286 capital letters in variables except in a very formalized way
3289 /* Convert Big5 code (b1, b2) into its internal string representation
3292 /* There is a much simpler way to split the Big5 charset into two.
3293 For the moment I'm going to leave the algorithm as-is because it
3294 claims to separate out the most-used characters into a single
3295 charset, which perhaps will lead to optimizations in various
3298 The way the algorithm works is something like this:
3300 Big5 can be viewed as a 94x157 charset, where the row is
3301 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3302 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3303 the split between low and high column numbers is apparently
3304 meaningless; ascending rows produce less and less frequent chars.
3305 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3306 the first charset, and the upper half (0xC9 .. 0xFE) to the
3307 second. To do the conversion, we convert the character into
3308 a single number where 0 .. 156 is the first row, 157 .. 313
3309 is the second, etc. That way, the characters are ordered by
3310 decreasing frequency. Then we just chop the space in two
3311 and coerce the result into a 94x94 space.
3314 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3316 int B1 = b1, B2 = b2; \
3318 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3322 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3326 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3327 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3329 c1 = I / (0xFF - 0xA1) + 0xA1; \
3330 c2 = I % (0xFF - 0xA1) + 0xA1; \
3333 /* Convert the internal string representation of a Big5 character
3334 (lb, c1, c2) into Big5 code (b1, b2). */
3336 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3338 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3340 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3342 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3344 b1 = I / BIG5_SAME_ROW + 0xA1; \
3345 b2 = I % BIG5_SAME_ROW; \
3346 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3350 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3358 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3359 (c >= 0x80 && c <= 0xA0))
3361 if (st->big5.in_second_byte)
3363 st->big5.in_second_byte = 0;
3364 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3368 st->big5.in_second_byte = 1;
3370 return CODING_CATEGORY_BIG5_MASK;
3373 /* Convert Big5 data to internal format. */
3376 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3377 unsigned_char_dynarr *dst, unsigned int n)
3380 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3381 unsigned int flags = str->flags;
3382 unsigned int ch = str->ch;
3383 eol_type_t eol_type = str->eol_type;
3390 /* Previous character was first byte of Big5 char. */
3391 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3393 unsigned char b1, b2, b3;
3394 DECODE_BIG5 (ch, c, b1, b2, b3);
3395 Dynarr_add (dst, b1);
3396 Dynarr_add (dst, b2);
3397 Dynarr_add (dst, b3);
3401 DECODE_ADD_BINARY_CHAR (ch, dst);
3402 DECODE_ADD_BINARY_CHAR (c, dst);
3408 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3409 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3412 DECODE_ADD_BINARY_CHAR (c, dst);
3414 label_continue_loop:;
3417 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3423 /* Convert internally-formatted data to Big5. */
3426 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3427 unsigned_char_dynarr *dst, unsigned int n)
3431 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3432 unsigned int flags = str->flags;
3433 unsigned int ch = str->ch;
3434 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3441 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3442 Dynarr_add (dst, '\r');
3443 if (eol_type != EOL_CR)
3444 Dynarr_add (dst, '\n');
3446 else if (BYTE_ASCII_P (c))
3449 Dynarr_add (dst, c);
3451 else if (BUFBYTE_LEADING_BYTE_P (c))
3453 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3454 c == LEADING_BYTE_CHINESE_BIG5_2)
3456 /* A recognized leading byte. */
3458 continue; /* not done with this character. */
3460 /* otherwise just ignore this character. */
3462 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3463 ch == LEADING_BYTE_CHINESE_BIG5_2)
3465 /* Previous char was a recognized leading byte. */
3467 continue; /* not done with this character. */
3471 /* Encountering second byte of a Big5 character. */
3472 unsigned char b1, b2;
3474 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3475 Dynarr_add (dst, b1);
3476 Dynarr_add (dst, b2);
3488 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3489 Decode a Big5 character CODE of BIG5 coding-system.
3490 CODE is the character code in BIG5, a cons of two integers.
3491 Return the corresponding character.
3495 unsigned char c1, c2, b1, b2;
3498 CHECK_INT (XCAR (code));
3499 CHECK_INT (XCDR (code));
3500 b1 = XINT (XCAR (code));
3501 b2 = XINT (XCDR (code));
3502 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3503 BYTE_BIG5_TWO_BYTE_2_P (b2))
3505 Charset_ID leading_byte;
3506 Lisp_Object charset;
3507 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3508 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3509 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3515 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3516 Encode the Big5 character CH to BIG5 coding-system.
3517 Return the corresponding character code in Big5.
3521 Lisp_Object charset;
3524 CHECK_CHAR_COERCE_INT (ch);
3525 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3526 if (EQ (charset, Vcharset_chinese_big5_1) ||
3527 EQ (charset, Vcharset_chinese_big5_2))
3529 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3531 return Fcons (make_int (b1), make_int (b2));
3538 /************************************************************************/
3540 /************************************************************************/
3543 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3549 switch (st->ucs4.in_byte)
3558 st->ucs4.in_byte = 0;
3564 return CODING_CATEGORY_UCS4_MASK;
3568 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3569 unsigned_char_dynarr *dst, unsigned int n)
3571 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3572 unsigned int flags = str->flags;
3573 unsigned int ch = str->ch;
3574 unsigned char counter = str->counter;
3578 unsigned char c = *src++;
3586 DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3591 ch = ( ch << 8 ) | c;
3595 if (counter & CODING_STATE_END)
3596 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3600 str->counter = counter;
/* Append CH to DST as four bytes, most significant byte first.
   DST is an unsigned char dynarr, so presumably only the low 8 bits
   of each shifted value are kept.  STR and FLAGS are not used in the
   lines visible here.  */
3604 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3605 unsigned_char_dynarr *dst, unsigned int *flags)
3607 Dynarr_add (dst, ch >> 24);
3608 Dynarr_add (dst, ch >> 16);
3609 Dynarr_add (dst, ch >> 8);
3610 Dynarr_add (dst, ch );
3614 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3615 unsigned int *flags)
3620 /************************************************************************/
3622 /************************************************************************/
/* Detection routine for the UTF-8 coding category.
   Reads SRC one byte at a time and dispatches on ST->utf8.in_byte.
   NOTE(review): in_byte looks like the number of continuation bytes
   still expected (it is set to 5..1 below, presumably after the
   lead-byte tests, which are not visible here) -- confirm.  */
3625 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3630 unsigned char c = *src++;
3631 switch (st->utf8.in_byte)
/* ESC, SI and SO are ISO 2022 control codes.  NOTE(review): presumably
   seeing one rules UTF-8 out; the action taken is not visible here.  */
3634 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3637 st->utf8.in_byte = 5;
3639 st->utf8.in_byte = 4;
3641 st->utf8.in_byte = 3;
3643 st->utf8.in_byte = 2;
3645 st->utf8.in_byte = 1;
/* True when C does not have the continuation-byte pattern 10xxxxxx.  */
3650 if ((c & 0xc0) != 0x80)
3656 return CODING_CATEGORY_UTF8_MASK;
/* Convert UTF-8 data in SRC to internal format, appending to DST.
   The decoder state (FLAGS, the partially assembled character CH,
   the end-of-line type and COUNTER) is loaded from the decoding
   stream on entry; COUNTER is stored back on exit.  */
3660 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3661 unsigned_char_dynarr *dst, unsigned int n)
3663 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3664 unsigned int flags = str->flags;
3665 unsigned int ch = str->ch;
3666 eol_type_t eol_type = str->eol_type;
3667 unsigned char counter = str->counter;
3671 unsigned char c = *src++;
/* Classify a lead byte by its high bits, longest sequences first.
   NOTE(review): the branch bodies are not visible here; presumably
   each stores the payload bits in CH and sets COUNTER -- confirm.  */
3680 else if ( c >= 0xf8 )
3685 else if ( c >= 0xf0 )
3690 else if ( c >= 0xe0 )
3695 else if ( c >= 0xc0 )
/* C is a complete character on its own: run the end-of-line
   handling, then emit it.  */
3702 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3703 DECODE_ADD_UCS_CHAR (c, dst);
/* Fold in the low six bits of a continuation byte and emit the
   finished code point.  */
3707 ch = ( ch << 6 ) | ( c & 0x3f );
3708 DECODE_ADD_UCS_CHAR (ch, dst);
/* Fold in the low six bits of a continuation byte; more to come.  */
3713 ch = ( ch << 6 ) | ( c & 0x3f );
3716 label_continue_loop:;
/* At end of stream, flush whatever partial character is left.  */
3719 if (flags & CODING_STATE_END)
3720 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3724 str->counter = counter;
/* Append the UTF-8 encoding of CH to DST.  Uses the original one- to
   six-byte scheme, so values above 0x1fffff get five or six bytes.
   The first branch performs end-of-line conversion according to the
   EOL type of STR's coding system.  */
3728 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3729 unsigned_char_dynarr *dst, unsigned int *flags)
3731 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* A CR is written for every EOL type except LF and autodetect, and
   CH itself is written unless the EOL type is CR.
   NOTE(review): presumably this branch is entered only when CH is a
   newline -- the guarding test is not visible here.  */
3735 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3736 Dynarr_add (dst, '\r');
3737 if (eol_type != EOL_CR)
3738 Dynarr_add (dst, ch);
/* Up to 7 bits: one byte, unchanged.  */
3740 else if (ch <= 0x7f)
3742 Dynarr_add (dst, ch);
/* Up to 11 bits: 110xxxxx 10xxxxxx.  */
3744 else if (ch <= 0x7ff)
3746 Dynarr_add (dst, (ch >> 6) | 0xc0);
3747 Dynarr_add (dst, (ch & 0x3f) | 0x80);
/* Up to 16 bits: 0xe0 lead byte plus two continuation bytes.  */
3749 else if (ch <= 0xffff)
3751 Dynarr_add (dst, (ch >> 12) | 0xe0);
3752 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3753 Dynarr_add (dst, (ch & 0x3f) | 0x80);
/* Up to 21 bits: 0xf0 lead byte plus three continuation bytes.  */
3755 else if (ch <= 0x1fffff)
3757 Dynarr_add (dst, (ch >> 18) | 0xf0);
3758 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3759 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3760 Dynarr_add (dst, (ch & 0x3f) | 0x80);
/* Up to 26 bits: 0xf8 lead byte plus four continuation bytes.  */
3762 else if (ch <= 0x3ffffff)
3764 Dynarr_add (dst, (ch >> 24) | 0xf8);
3765 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3766 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3767 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3768 Dynarr_add (dst, (ch & 0x3f) | 0x80);
/* Anything larger: 0xfc lead byte plus five continuation bytes.  */
3772 Dynarr_add (dst, (ch >> 30) | 0xfc);
3773 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3774 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3775 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3776 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3777 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3782 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3783 unsigned int *flags)
3788 /************************************************************************/
3789 /* ISO2022 methods */
3790 /************************************************************************/
3792 /* The following note describes the coding system ISO2022 briefly.
3793 Since the intention of this note is to help understand the
3794 functions in this file, some parts are NOT ACCURATE or OVERLY
3795 SIMPLIFIED. For thorough understanding, please refer to the
3796 original document of ISO2022.
3798 ISO2022 provides many mechanisms to encode several character sets
3799 in 7-bit and 8-bit environments. For 7-bit environments, all text
3800 is encoded using bytes less than 128. This may make the encoded
3801 text a little bit longer, but the text passes more easily through
3802 several gateways, some of which strip off MSB (Most Significant Bit).
3804 There are two kinds of character sets: control character set and
3805 graphic character set. The former contains control characters such
3806 as `newline' and `escape' to provide control functions (control
3807 functions are also provided by escape sequences). The latter
3808 contains graphic characters such as 'A' and '-'. Emacs recognizes
3809 two control character sets and many graphic character sets.
3811 Graphic character sets are classified into one of the following
3812 four classes, according to the number of bytes (DIMENSION) and
3813 number of characters in one dimension (CHARS) of the set:
3814 - DIMENSION1_CHARS94
3815 - DIMENSION1_CHARS96
3816 - DIMENSION2_CHARS94
3817 - DIMENSION2_CHARS96
3819 In addition, each character set is assigned an identification tag,
3820 unique for each set, called "final character" (denoted as <F>
3821 hereafter). The <F> of each character set is decided by ECMA(*)
3822 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3823 (0x30..0x3F are for private use only).
3825 Note (*): ECMA = European Computer Manufacturers Association
3827 Here are examples of graphic character set [NAME(<F>)]:
3828 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3829 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3830 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3831 o DIMENSION2_CHARS96 -- none for the moment
3833 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3834 C0 [0x00..0x1F] -- control character plane 0
3835 GL [0x20..0x7F] -- graphic character plane 0
3836 C1 [0x80..0x9F] -- control character plane 1
3837 GR [0xA0..0xFF] -- graphic character plane 1
3839 A control character set is directly designated and invoked to C0 or
3840 C1 by an escape sequence. The most common case is that:
3841 - ISO646's control character set is designated/invoked to C0, and
3842 - ISO6429's control character set is designated/invoked to C1,
3843 and usually these designations/invocations are omitted in encoded
3844 text. In a 7-bit environment, only C0 can be used, and a control
3845 character for C1 is encoded by an appropriate escape sequence to
3846 fit into the environment. All control characters for C1 are
3847 defined to have corresponding escape sequences.
3849 A graphic character set is at first designated to one of four
3850 graphic registers (G0 through G3), then these graphic registers are
3851 invoked to GL or GR. These designations and invocations can be
3852 done independently. The most common case is that G0 is invoked to
3853 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3854 these invocations and designations are omitted in encoded text.
3855 In a 7-bit environment, only GL can be used.
3857 When a graphic character set of CHARS94 is invoked to GL, codes
3858 0x20 and 0x7F of the GL area work as control characters SPACE and
3859 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used.
3862 There are two ways of invocation: locking-shift and single-shift.
3863 With locking-shift, the invocation lasts until the next different
3864 invocation, whereas with single-shift, the invocation affects the
3865 following character only and doesn't affect the locking-shift
3866 state. Invocations are done by the following control characters or escape sequences:
3869 ----------------------------------------------------------------------
3870 abbrev function cntrl escape seq description
3871 ----------------------------------------------------------------------
3872 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3873 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3874 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3875 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3876 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3877 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3878 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3879 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3880 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3881 ----------------------------------------------------------------------
3882 (*) These are not used by any known coding system.
3884 Control characters for these functions are defined by macros
3885 ISO_CODE_XXX in `coding.h'.
3887 Designations are done by the following escape sequences:
3888 ----------------------------------------------------------------------
3889 escape sequence description
3890 ----------------------------------------------------------------------
3891 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3892 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3893 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3894 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3895 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3896 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3897 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3898 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3899 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3900 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3901 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3902 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3903 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3904 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3905 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3906 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3907 ----------------------------------------------------------------------
3909 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3910 of dimension 1, chars 94, and final character <F>, etc...
3912 Note (*): Although these designations are not allowed in ISO2022,
3913 Emacs accepts them on decoding, and produces them on encoding
3914 CHARS96 character sets in a coding system which is characterized as
3915 7-bit environment, non-locking-shift, and non-single-shift.
3917 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3918 '(' can be omitted. We refer to this as "short-form" hereafter.
3920 Now you may notice that there are a lot of ways for encoding the
3921 same multilingual text in ISO2022. Actually, there exist many
3922 coding systems such as Compound Text (used in X11's inter client
3923 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3924 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3925 localized platforms), and all of these are variants of ISO2022.
3927 In addition to the above, Emacs handles two more kinds of escape
3928 sequences: ISO6429's direction specification and Emacs' private
3929 sequence for specifying character composition.
3931 ISO6429's direction specification takes the following form:
3932 o CSI ']' -- end of the current direction
3933 o CSI '0' ']' -- end of the current direction
3934 o CSI '1' ']' -- start of left-to-right text
3935 o CSI '2' ']' -- start of right-to-left text
3936 The control character CSI (0x9B: control sequence introducer) is
3937 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3939 Character composition specification takes the following form:
3940 o ESC '0' -- start character composition
3941 o ESC '1' -- end character composition
3942 Since these are not standard escape sequences of any ISO standard,
3943 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its initial condition.
   CODING_SYSTEM supplies the initial charset of each of the four
   registers; it may be nil, in which case (judging by the visible
   lines) the registers are set to Qt instead.  */
3946 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3950 for (i = 0; i < 4; i++)
/* With a real coding system, take its initial charset for register I.  */
3952 if (!NILP (coding_system))
3954 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3956 iso->charset[i] = Qt;
3957 iso->invalid_designated[i] = 0;
/* No escape sequence in progress and no bytes remembered.  */
3959 iso->esc = ISO_ESC_NOTHING;
3960 iso->esc_bytes_index = 0;
/* Register 0 in the left half and register 1 in the right half.  */
3961 iso->register_left = 0;
3962 iso->register_right = 1;
/* Clear the error-recovery bookkeeping.  */
3963 iso->switched_dir_and_no_valid_charset_yet = 0;
3964 iso->invalid_switch_dir = 0;
3965 iso->output_direction_sequence = 0;
3966 iso->output_literally = 0;
3967 #ifdef ENABLE_COMPOSITE_CHARS
/* Empty the composition buffer if one has been allocated.  */
3968 if (iso->composite_chars)
3969 Dynarr_reset (iso->composite_chars);
3974 fit_to_be_escape_quoted (unsigned char c)
3991 /* Parse one byte of an ISO2022 escape sequence.
3992 If the result is an invalid escape sequence, return 0 and
3993 do not change anything in STR. Otherwise, if the result is
3994 an incomplete escape sequence, update ISO2022.ESC and
3995 ISO2022.ESC_BYTES and return -1. Otherwise, update
3996 all the state variables (but not ISO2022.ESC_BYTES) and
3999 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4000 or invocation of an invalid character set and treat that as
4001 an unrecognized escape sequence. */
/* Arguments, as used in the code below:
   CODESYS  the coding system, or nil during coding-system detection;
            when non-nil it supplies the escape-quoted setting and
            the input charset conversions.
   ISO      decoder state, updated in place.
   C        the byte to consume.
   FLAGS    in/out CODING_STATE_* bits of the caller.  */
4004 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4005 unsigned char c, unsigned int *flags,
4006 int check_invalid_charsets)
4008 /* (1) If we're at the end of a designation sequence, CS is the
4009 charset being designated and REG is the register to designate
4012 (2) If we're at the end of a locking-shift sequence, REG is
4013 the register to invoke and HALF (0 == left, 1 == right) is
4014 the half to invoke it into.
4016 (3) If we're at the end of a single-shift sequence, REG is
4017 the register to invoke. */
4018 Lisp_Object cs = Qnil;
4021 /* NOTE: This code does goto's all over the place.
4022 The reason for this is that we're basically implementing
4023 a state machine here, and hierarchical languages like C
4024 don't really provide a clean way of doing this. */
4026 if (! (*flags & CODING_STATE_ESCAPE))
4027 /* At beginning of escape sequence; we need to reset our
4028 escape-state variables. */
4029 iso->esc = ISO_ESC_NOTHING;
4031 iso->output_literally = 0;
4032 iso->output_direction_sequence = 0;
4036 case ISO_ESC_NOTHING:
4037 iso->esc_bytes_index = 0;
4040 case ISO_CODE_ESC: /* Start escape sequence */
4041 *flags |= CODING_STATE_ESCAPE;
4045 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4046 *flags |= CODING_STATE_ESCAPE;
4047 iso->esc = ISO_ESC_5_11;
4050 case ISO_CODE_SO: /* locking shift 1 */
4053 case ISO_CODE_SI: /* locking shift 0 */
4057 case ISO_CODE_SS2: /* single shift */
4060 case ISO_CODE_SS3: /* single shift */
4064 default: /* Other control characters */
4071 /**** single shift ****/
4073 case 'N': /* single shift 2 */
4076 case 'O': /* single shift 3 */
4080 /**** locking shift ****/
4082 case '~': /* locking shift 1 right */
4085 case 'n': /* locking shift 2 */
4088 case '}': /* locking shift 2 right */
4091 case 'o': /* locking shift 3 */
4094 case '|': /* locking shift 3 right */
4098 #ifdef ENABLE_COMPOSITE_CHARS
4099 /**** composite ****/
4102 iso->esc = ISO_ESC_START_COMPOSITE;
/* Keep only the locked state bits and switch composition on.  */
4103 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4104 CODING_STATE_COMPOSITE;
4108 iso->esc = ISO_ESC_END_COMPOSITE;
/* Keep only the locked state bits and switch composition off.  */
4109 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4110 ~CODING_STATE_COMPOSITE;
4112 #endif /* ENABLE_COMPOSITE_CHARS */
4114 /**** directionality ****/
4117 iso->esc = ISO_ESC_5_11;
4120 /**** designation ****/
4122 case '$': /* multibyte charset prefix */
4123 iso->esc = ISO_ESC_2_4;
/* 0x28..0x2F are the intermediate bytes ( ) * + , - . and slash;
   map each onto the matching state counted from ISO_ESC_2_8.  */
4127 if (0x28 <= c && c <= 0x2F)
4129 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4133 /* This function is called with CODESYS equal to nil when
4134 doing coding-system detection. */
4136 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4137 && fit_to_be_escape_quoted (c))
4139 iso->esc = ISO_ESC_LITERAL;
4140 *flags &= CODING_STATE_ISO2022_LOCK;
4150 /**** directionality ****/
4152 case ISO_ESC_5_11: /* ISO6429 direction control */
/* Drop the right-to-left bit along with every bit outside
   CODING_STATE_ISO2022_LOCK.  */
4155 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4156 goto directionality;
4158 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4159 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4160 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4164 case ISO_ESC_5_11_0:
4167 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4168 goto directionality;
4172 case ISO_ESC_5_11_1:
/* Mask the existing flags exactly as the two cases above do.  A plain
   assignment here would turn on every bit of
   CODING_STATE_ISO2022_LOCK other than R2L, whatever the previous
   state was.  */
4175 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4176 goto directionality;
4180 case ISO_ESC_5_11_2:
/* Switch to right-to-left, preserving the other locked bits.  */
4183 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4184 goto directionality;
4189 iso->esc = ISO_ESC_DIRECTIONALITY;
4190 /* Various junk here to attempt to preserve the direction sequences
4191 literally in the text if they would otherwise be swallowed due
4192 to invalid designations that don't show up as actual charset
4193 changes in the text. */
4194 if (iso->invalid_switch_dir)
4196 /* We already inserted a direction switch literally into the
4197 text. We assume (#### this may not be right) that the
4198 next direction switch is the one going the other way,
4199 and we need to output that literally as well. */
4200 iso->output_literally = 1;
4201 iso->invalid_switch_dir = 0;
4207 /* If we are in the thrall of an invalid designation,
4208 then stick the directionality sequence literally into the
4209 output stream so it ends up in the original text again. */
4210 for (jj = 0; jj < 4; jj++)
4211 if (iso->invalid_designated[jj])
4215 iso->output_literally = 1;
4216 iso->invalid_switch_dir = 1;
4219 /* Indicate that we haven't yet seen a valid designation,
4220 so that if a switch-dir is directly followed by an
4221 invalid designation, both get inserted literally. */
4222 iso->switched_dir_and_no_valid_charset_yet = 1;
4227 /**** designation ****/
/* After ESC '$': the same intermediate bytes, mapped onto the states
   counted from ISO_ESC_2_4_8.  */
4230 if (0x28 <= c && c <= 0x2F)
4232 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
/* Short form: final byte '@', 'A' or 'B' directly after ESC '$'
   names a 94x94 charset without an intermediate byte.  */
4235 if (0x40 <= c && c <= 0x42)
4237 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4238 *flags & CODING_STATE_R2L ?
4239 CHARSET_RIGHT_TO_LEFT :
4240 CHARSET_LEFT_TO_RIGHT);
4250 if (c < '0' || c > '~')
4251 return 0; /* bad final byte */
/* The low two bits of the state offset select the register; the upper
   four states of each group are the 96-character variants.  */
4253 if (iso->esc >= ISO_ESC_2_8 &&
4254 iso->esc <= ISO_ESC_2_15)
4256 type = ((iso->esc >= ISO_ESC_2_12) ?
4257 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4258 reg = (iso->esc - ISO_ESC_2_8) & 3;
4260 else if (iso->esc >= ISO_ESC_2_4_8 &&
4261 iso->esc <= ISO_ESC_2_4_15)
4263 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4264 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4265 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4269 /* Can this ever be reached? -slb */
/* Look the charset up by type, final byte and current direction.  */
4273 cs = CHARSET_BY_ATTRIBUTES (type, c,
4274 *flags & CODING_STATE_R2L ?
4275 CHARSET_RIGHT_TO_LEFT :
4276 CHARSET_LEFT_TO_RIGHT);
/* Remember the byte so the sequence can later be output literally.  */
4282 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4286 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4287 /* can't invoke something that ain't there. */
4289 iso->esc = ISO_ESC_SINGLE_SHIFT;
4290 *flags &= CODING_STATE_ISO2022_LOCK;
4292 *flags |= CODING_STATE_SS2;
4294 *flags |= CODING_STATE_SS3;
4298 if (check_invalid_charsets &&
4299 !CHARSETP (iso->charset[reg]))
4300 /* can't invoke something that ain't there. */
4303 iso->register_right = reg;
4305 iso->register_left = reg;
4306 *flags &= CODING_STATE_ISO2022_LOCK;
4307 iso->esc = ISO_ESC_LOCKING_SHIFT;
4311 if (NILP (cs) && check_invalid_charsets)
/* Unknown charset: mark the register as invalidly designated, fall
   back to ASCII for it and have the sequence output literally.  */
4313 iso->invalid_designated[reg] = 1;
4314 iso->charset[reg] = Vcharset_ascii;
4315 iso->esc = ISO_ESC_DESIGNATE;
4316 *flags &= CODING_STATE_ISO2022_LOCK;
4317 iso->output_literally = 1;
4318 if (iso->switched_dir_and_no_valid_charset_yet)
4320 /* We encountered a switch-direction followed by an
4321 invalid designation. Ensure that the switch-direction
4322 gets outputted; otherwise it will probably get eaten
4323 when the text is written out again. */
4324 iso->switched_dir_and_no_valid_charset_yet = 0;
4325 iso->output_direction_sequence = 1;
4326 /* And make sure that the switch-dir going the other
4327 way gets outputted, as well. */
4328 iso->invalid_switch_dir = 1;
4332 /* This function is called with CODESYS equal to nil when
4333 doing coding-system detection. */
4334 if (!NILP (codesys))
4336 charset_conversion_spec_dynarr *dyn =
4337 XCODING_SYSTEM (codesys)->iso2022.input_conv;
/* Apply the coding system's input charset conversions.  */
4343 for (i = 0; i < Dynarr_length (dyn); i++)
4345 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4346 if (EQ (cs, spec->from_charset))
4347 cs = spec->to_charset;
4352 iso->charset[reg] = cs;
4353 iso->esc = ISO_ESC_DESIGNATE;
4354 *flags &= CODING_STATE_ISO2022_LOCK;
/* A valid designation replaces an earlier invalid one in this
   register; the sequence is still output literally.  */
4355 if (iso->invalid_designated[reg])
4357 iso->invalid_designated[reg] = 0;
4358 iso->output_literally = 1;
4360 if (iso->switched_dir_and_no_valid_charset_yet)
4361 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Detection routine for the ISO 2022 coding categories.
   Keeps a mask of the categories that are still possible in
   ST->iso2022.mask and narrows it as bytes from SRC are examined.
   Escape sequences and shifts are recognized by running the bytes
   through parse_iso2022_esc with a nil coding system.  */
4366 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4371 /* #### There are serious deficiencies in the recognition mechanism
4372 here. This needs to be much smarter if it's going to cut it.
4373 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4374 it should be detected as Latin-1.
4375 All the ISO2022 stuff in this file should be synced up with the
4376 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4377 Perhaps we should wait till R2L works in FSF Emacs? */
/* First call for this detection state: start from a freshly reset
   decoder and assume every ISO 2022 category is still possible.  */
4379 if (!st->iso2022.initted)
4381 reset_iso2022 (Qnil, &st->iso2022.iso);
4382 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4383 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4384 CODING_CATEGORY_ISO_8_1_MASK |
4385 CODING_CATEGORY_ISO_8_2_MASK |
4386 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4387 st->iso2022.flags = 0;
4388 st->iso2022.high_byte_count = 0;
4389 st->iso2022.saw_single_shift = 0;
4390 st->iso2022.initted = 1;
4393 mask = st->iso2022.mask;
/* NOTE(review): presumably reached for a byte with the high bit set;
   it rules out the 7-bit category and is counted -- the guarding test
   is not visible here.  */
4400 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4401 st->iso2022.high_byte_count++;
4405 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4407 if (st->iso2022.high_byte_count & 1)
4408 /* odd number of high bytes; assume not iso-8-2 */
4409 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4411 st->iso2022.high_byte_count = 0;
4412 st->iso2022.saw_single_shift = 0;
4414 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4416 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4417 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4418 { /* control chars */
4421 /* Allow and ignore control characters that you might
4422 reasonably see in a text file */
4427 case 8: /* backspace */
4428 case 11: /* vertical tab */
4429 case 12: /* form feed */
4430 case 26: /* MS-DOS C-z junk */
4431 case 31: /* '^_' -- for info */
4432 goto label_continue_loop;
/* Inside an escape sequence, or at a control character: let the
   escape parser consume the byte (nil coding system, no charset
   validation) and narrow the mask according to what it recognized.  */
4439 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4442 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4443 &st->iso2022.flags, 0))
4445 switch (st->iso2022.iso.esc)
4447 case ISO_ESC_DESIGNATE:
4448 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4449 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4451 case ISO_ESC_LOCKING_SHIFT:
4452 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4453 goto ran_out_of_chars;
4454 case ISO_ESC_SINGLE_SHIFT:
4455 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4456 st->iso2022.saw_single_shift = 1;
4465 goto ran_out_of_chars;
4468 label_continue_loop:;
/* Tidy up a detection MASK: when the seven-bit ISO category is still
   set, clear the three eight-bit ISO categories.  */
4477 postprocess_iso2022_mask (int mask)
4479 /* #### kind of cheesy */
4480 /* If seven-bit ISO is allowed, then assume that the encoding is
4481 entirely seven-bit and turn off the eight-bit ones. */
4482 if (mask & CODING_CATEGORY_ISO_7_MASK)
4483 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4484 CODING_CATEGORY_ISO_8_1_MASK |
4485 CODING_CATEGORY_ISO_8_2_MASK);
4489 /* If FLAGS is a null pointer or specifies right-to-left motion,
4490 output a switch-dir-to-left-to-right sequence to DST.
4491 Also update FLAGS if it is not a null pointer.
4492 If INTERNAL_P is set, we are outputting in internal format and
4493 need to handle the CSI differently. */
4496 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4497 unsigned_char_dynarr *dst,
4498 unsigned int *flags,
4501 if (!flags || (*flags & CODING_STATE_R2L))
/* Seven-bit coding systems spell the CSI as ESC '['.  */
4503 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4505 Dynarr_add (dst, ISO_CODE_ESC);
4506 Dynarr_add (dst, '[');
/* In internal format the CSI byte goes through
   DECODE_ADD_BINARY_CHAR; otherwise it is added as a raw byte.  */
4508 else if (internal_p)
4509 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4511 Dynarr_add (dst, ISO_CODE_CSI);
/* '0' ']' after the CSI: end of the current direction.  */
4512 Dynarr_add (dst, '0');
4513 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null per the contract above; the null
   guard is presumably on a line not visible in this excerpt.  */
4515 *flags &= ~CODING_STATE_R2L;
4519 /* If FLAGS is a null pointer or specifies a direction different from
4520 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4521 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4522 sequence to DST. Also update FLAGS if it is not a null pointer.
4523 If INTERNAL_P is set, we are outputting in internal format and
4524 need to handle the CSI differently. */
4527 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4528 unsigned_char_dynarr *dst, unsigned int *flags,
/* Left-to-right wanted while (possibly) in right-to-left: delegate.  */
4531 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4532 direction == CHARSET_LEFT_TO_RIGHT)
4533 restore_left_to_right_direction (codesys, dst, flags, internal_p);
/* Right-to-left wanted while (possibly) in left-to-right; skipped
   entirely for coding systems flagged as not using ISO 6429.  */
4534 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4535 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4536 direction == CHARSET_RIGHT_TO_LEFT)
/* Seven-bit coding systems spell the CSI as ESC '['.  */
4538 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4540 Dynarr_add (dst, ISO_CODE_ESC);
4541 Dynarr_add (dst, '[');
4543 else if (internal_p)
4544 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4546 Dynarr_add (dst, ISO_CODE_CSI);
/* '2' ']' after the CSI: start of right-to-left text.  */
4547 Dynarr_add (dst, '2');
4548 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null per the contract above; the null
   guard is presumably on a line not visible in this excerpt.  */
4550 *flags |= CODING_STATE_R2L;
4554 /* Convert ISO2022-format data to internal format. */
/* Reads bytes from SRC and appends the decoded text to DST.  The
   state (FLAGS, the pending first byte CH, the end-of-line type and
   the ISO2022 register/escape state in str->iso2022) lives in the
   decoding stream, so escape sequences and two-byte characters may
   straddle calls.  Each byte is handled as one of: a continuation of
   an escape sequence, a control character, or a graphic character.
   Bytes that cannot be interpreted are passed through literally
   rather than dropped.  */
4557 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4558 unsigned_char_dynarr *dst, unsigned int n)
4560 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4561 unsigned int flags = str->flags;
4562 unsigned int ch = str->ch;
4563 eol_type_t eol_type = str->eol_type;
4564 #ifdef ENABLE_COMPOSITE_CHARS
4565 unsigned_char_dynarr *real_dst = dst;
4567 Lisp_Object coding_system;
4569 XSETCODING_SYSTEM (coding_system, str->codesys);
4571 #ifdef ENABLE_COMPOSITE_CHARS
/* While composing, output is collected in the composition buffer.  */
4572 if (flags & CODING_STATE_COMPOSITE)
4573 dst = str->iso2022.composite_chars;
4574 #endif /* ENABLE_COMPOSITE_CHARS */
4578 unsigned char c = *src++;
4579 if (flags & CODING_STATE_ESCAPE)
4580 { /* Within ESC sequence */
4581 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4586 switch (str->iso2022.esc)
4588 #ifdef ENABLE_COMPOSITE_CHARS
4589 case ISO_ESC_START_COMPOSITE:
/* Reuse the composition buffer if there is one, else create it.  */
4590 if (str->iso2022.composite_chars)
4591 Dynarr_reset (str->iso2022.composite_chars);
4593 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4594 dst = str->iso2022.composite_chars;
4596 case ISO_ESC_END_COMPOSITE:
4598 Bufbyte comstr[MAX_EMCHAR_LEN];
4600 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4601 Dynarr_length (dst));
4603 len = set_charptr_emchar (comstr, emch);
4604 Dynarr_add_many (dst, comstr, len);
4607 #endif /* ENABLE_COMPOSITE_CHARS */
4609 case ISO_ESC_LITERAL:
4610 COMPOSE_FLUSH_CHARS (str, dst);
4611 DECODE_ADD_BINARY_CHAR (c, dst);
4615 /* Everything else handled already */
4620 /* Attempted error recovery. */
4621 if (str->iso2022.output_direction_sequence)
4622 ensure_correct_direction (flags & CODING_STATE_R2L ?
4623 CHARSET_RIGHT_TO_LEFT :
4624 CHARSET_LEFT_TO_RIGHT,
4625 str->codesys, dst, 0, 1);
4626 /* More error recovery. */
4627 if (!retval || str->iso2022.output_literally)
4629 /* Output the (possibly invalid) sequence */
4631 COMPOSE_FLUSH_CHARS (str, dst);
4632 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4633 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4634 flags &= CODING_STATE_ISO2022_LOCK;
4636 n++, src--;/* Repeat the loop with the same character. */
4639 /* No sense in reprocessing the final byte of the
4640 escape sequence; it could mess things up anyway.
4642 COMPOSE_FLUSH_CHARS (str, dst);
4643 DECODE_ADD_BINARY_CHAR (c, dst);
4648 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4649 { /* Control characters */
4651 /***** Error-handling *****/
4653 /* If we were in the middle of a character, dump out the
4654 partial character. */
4657 COMPOSE_FLUSH_CHARS (str, dst);
4658 DECODE_ADD_BINARY_CHAR (ch, dst);
4662 /* If we just saw a single-shift character, dump it out.
4663 This may dump out the wrong sort of single-shift character,
4664 but at least it will give an indication that something went
4666 if (flags & CODING_STATE_SS2)
4668 COMPOSE_FLUSH_CHARS (str, dst);
4669 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4670 flags &= ~CODING_STATE_SS2;
4672 if (flags & CODING_STATE_SS3)
4674 COMPOSE_FLUSH_CHARS (str, dst);
4675 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4676 flags &= ~CODING_STATE_SS3;
4679 /***** Now handle the control characters. *****/
4685 COMPOSE_FLUSH_CHARS (str, dst);
/* EOL_CR: emit a newline instead.  Otherwise the byte is passed
   through, except in CRLF mode with no CR pending, where it is
   (presumably, the else is not visible) only remembered via
   CODING_STATE_CR.  */
4686 if (eol_type == EOL_CR)
4687 Dynarr_add (dst, '\n');
4688 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4689 Dynarr_add (dst, c);
4691 flags |= CODING_STATE_CR;
4692 goto label_continue_loop;
4694 else if (flags & CODING_STATE_CR)
4695 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
4697 Dynarr_add (dst, '\r');
4698 flags &= ~CODING_STATE_CR;
4701 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4704 flags &= CODING_STATE_ISO2022_LOCK;
/* The control character may start an escape sequence or be a shift;
   if the parser rejects it, pass it through literally.  */
4706 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4708 COMPOSE_FLUSH_CHARS (str, dst);
4709 DECODE_ADD_BINARY_CHAR (c, dst);
4713 { /* Graphic characters */
4714 Lisp_Object charset;
4723 COMPOSE_FLUSH_CHARS (str, dst);
4724 if (eol_type == EOL_CR)
4725 Dynarr_add (dst, '\n');
4726 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4727 Dynarr_add (dst, c);
4729 flags |= CODING_STATE_CR;
4730 goto label_continue_loop;
4732 else if (flags & CODING_STATE_CR)
4733 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
4735 Dynarr_add (dst, '\r');
4736 flags &= ~CODING_STATE_CR;
4739 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4742 /* Now determine the charset. */
/* A pending single shift selects register 2 or 3.  Otherwise a
   non-ASCII byte uses the register invoked into the right half and
   an ASCII byte the one invoked into the left half.  */
4743 reg = ((flags & CODING_STATE_SS2) ? 2
4744 : (flags & CODING_STATE_SS3) ? 3
4745 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4746 : str->iso2022.register_left);
4747 charset = str->iso2022.charset[reg];
4749 /* Error checking: */
4750 if (! CHARSETP (charset)
4751 || str->iso2022.invalid_designated[reg]
4752 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4753 && XCHARSET_CHARS (charset) == 94))
4754 /* Mrmph. We are trying to invoke a register that has no
4755 or an invalid charset in it, or trying to add a character
4756 outside the range of the charset. Insert that char literally
4757 to preserve it for the output. */
4759 COMPOSE_FLUSH_CHARS (str, dst);
4760 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4761 DECODE_ADD_BINARY_CHAR (c, dst);
4766 /* Things are probably hunky-dory. */
4768 /* Fetch reverse charset, maybe. */
4769 if (((flags & CODING_STATE_R2L) &&
4770 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4772 (!(flags & CODING_STATE_R2L) &&
4773 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4775 Lisp_Object new_charset =
4776 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4777 if (!NILP (new_charset))
4778 charset = new_charset;
/* A one-dimensional charset yields a character from this byte alone;
   the other MAKE_CHAR call combines the saved byte CH with C.  */
4782 if (XCHARSET_DIMENSION (charset) == 1)
4786 COMPOSE_FLUSH_CHARS (str, dst);
4787 DECODE_ADD_BINARY_CHAR (ch, dst);
4790 COMPOSE_ADD_CHAR (str,
4791 MAKE_CHAR (charset, c & 0x7F, 0), dst);
4795 COMPOSE_ADD_CHAR (str,
4796 MAKE_CHAR (charset, ch & 0x7F, c & 0x7F),
4803 lb = XCHARSET_LEADING_BYTE (charset);
4804 switch (XCHARSET_REP_BYTES (charset))
4807 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4808 Dynarr_add (dst, c & 0x7F);
4811 case 2: /* one-byte official */
4812 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4813 Dynarr_add (dst, lb);
4814 Dynarr_add (dst, c | 0x80);
4817 case 3: /* one-byte private or two-byte official */
4818 if (XCHARSET_PRIVATE_P (charset))
4820 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4821 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4822 Dynarr_add (dst, lb);
4823 Dynarr_add (dst, c | 0x80);
4829 Dynarr_add (dst, lb);
4830 Dynarr_add (dst, ch | 0x80);
4831 Dynarr_add (dst, c | 0x80);
4839 default: /* two-byte private */
4842 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4843 Dynarr_add (dst, lb);
4844 Dynarr_add (dst, ch | 0x80);
4845 Dynarr_add (dst, c | 0x80);
4855 flags &= CODING_STATE_ISO2022_LOCK;
4858 label_continue_loop:;
/* End of stream: flush the composition buffer and any half-read
   character.  */
4861 if (flags & CODING_STATE_END)
4863 COMPOSE_FLUSH_CHARS (str, dst);
4864 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4871 /***** ISO2022 encoder *****/
4873 /* Designate CHARSET into register REG. */
/* Records CHARSET as the content of register REG in the encoder state
   and appends the designating escape sequence to DST: ESC, an
   optional '$', an intermediate byte chosen by REG from "()*+" or
   ",-./", and the charset's final byte.  The equality test below
   compares the new charset with the one previously in the register,
   presumably to skip a redundant sequence unless output is forced.  */
4876 iso2022_designate (Lisp_Object charset, unsigned char reg,
4877 struct encoding_stream *str, unsigned_char_dynarr *dst)
4879 static CONST char inter94[] = "()*+";
4880 static CONST char inter96[] = ",-./";
4881 unsigned short chars;
4882 unsigned char dimension;
4883 unsigned char final;
4884 Lisp_Object old_charset = str->iso2022.charset[reg];
4886 str->iso2022.charset[reg] = charset;
4887 if (!CHARSETP (charset))
4888 /* charset might be an initial nil or t. */
4890 chars = XCHARSET_CHARS (charset);
4891 dimension = XCHARSET_DIMENSION (charset);
4892 final = XCHARSET_FINAL (charset);
/* Same size, dimension and final byte as what the register already
   held, and no forced output pending for this register.  */
4893 if (!str->iso2022.force_charset_on_output[reg] &&
4894 CHARSETP (old_charset) &&
4895 XCHARSET_CHARS (old_charset) == chars &&
4896 XCHARSET_DIMENSION (old_charset) == dimension &&
4897 XCHARSET_FINAL (old_charset) == final)
4900 str->iso2022.force_charset_on_output[reg] = 0;
4903 charset_conversion_spec_dynarr *dyn =
4904 str->codesys->iso2022.output_conv;
/* Apply the coding system's output charset conversions.
   NOTE(review): chars, dimension and final were read before this
   substitution and nothing visible re-reads them afterwards --
   confirm the conversion is meant to affect the emitted sequence.  */
4910 for (i = 0; i < Dynarr_length (dyn); i++)
4912 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4913 if (EQ (charset, spec->from_charset))
4914 charset = spec->to_charset;
4919 Dynarr_add (dst, ISO_CODE_ESC);
4924 Dynarr_add (dst, inter94[reg]);
4927 Dynarr_add (dst, '$');
/* The intermediate byte is written here unless the coding system
   uses the short form (remaining conditions are not visible).  */
4929 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4932 Dynarr_add (dst, inter94[reg]);
4937 Dynarr_add (dst, inter96[reg]);
4940 Dynarr_add (dst, '$');
4941 Dynarr_add (dst, inter96[reg]);
4945 Dynarr_add (dst, final);
/* Put the encoder back into the unshifted state: when the stream's
   register_left is not already 0, append ISO_CODE_SI to DST and
   record 0 in STR->iso2022.register_left.  */
4949 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4951 if (str->iso2022.register_left != 0)
4953 Dynarr_add (dst, ISO_CODE_SI);
4954 str->iso2022.register_left = 0;
/* Put the encoder into the shifted-out state: when the stream's
   register_left is not already 1, append ISO_CODE_SO to DST and
   record 1 in STR->iso2022.register_left.  */
4959 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4961 if (str->iso2022.register_left != 1)
4963 Dynarr_add (dst, ISO_CODE_SO);
4964 str->iso2022.register_left = 1;
/* Encode the single character CH for the ISO2022 encoding stream STR,
   appending the resulting bytes to DST.  The charset and half saved in
   STR->iso2022 (current_charset, current_half) are loaded on entry and
   written back at the end.

   Statements visible here, in order:
   - text direction is restored to left-to-right, and ASCII is
     designated into register 0 when CH lies strictly between ' ' and
     ISO_CODE_DEL or the coding system's no-ascii-cntl flag is off;
   - for '\n' (no-ascii-eol flag off) direction and shift state are
     reset and registers 0-3 are re-designated to the coding system's
     initial charsets; '\r' is written unless the eol type is EOL_LF or
     EOL_AUTODETECT, and CH itself unless it is EOL_CR;
   - characters 0x80..0x9f are written as-is, preceded by ISO_CODE_ESC
     when the coding system is escape-quoted and
     fit_to_be_escape_quoted (ch) holds;
   - otherwise registers 0-3 are searched (currently designated
     charset, then the initial charset) for one where
     charset_code_point (charset, ch) is >= 0; failing that,
     Vdefault_coded_charset_priority_list is walked until BREAKUP_CHAR
     yields a charset with a non-zero final byte, with Vcharset_ascii
     as the last resort;
   - the chosen charset is designated, its register is invoked
     (SI / SO, or a single shift written either as ISO_CODE_ESC plus
     'N' / 'O' under a seven-bit test or as ISO_CODE_SS2 /
     ISO_CODE_SS3), and byte1 / byte2 are written OR'ed with charmask.

   NOTE(review): many of the tests and braces that separate these paths
   are missing from this listing, so the grouping above is inferred
   from statement order -- confirm against the full source.  */
4969 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4970 unsigned_char_dynarr *dst, unsigned int *flags)
4972 unsigned char charmask;
4973 Lisp_Coding_System* codesys = str->codesys;
4974 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4976 Lisp_Object charset = str->iso2022.current_charset;
4977 int half = str->iso2022.current_half;
4978 unsigned int byte1, byte2;
4982 restore_left_to_right_direction (codesys, dst, flags, 0);
4984 /* Make sure G0 contains ASCII */
4985 if ((ch > ' ' && ch < ISO_CODE_DEL)
4986 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4988 ensure_normal_shift (str, dst);
4989 iso2022_designate (Vcharset_ascii, 0, str, dst);
4992 /* If necessary, restore everything to the default state
4994 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4996 restore_left_to_right_direction (codesys, dst, flags, 0);
4998 ensure_normal_shift (str, dst);
5000 for (i = 0; i < 4; i++)
5002 Lisp_Object initial_charset =
5003 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5004 iso2022_designate (initial_charset, i, str, dst);
5009 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5010 Dynarr_add (dst, '\r');
5011 if (eol_type != EOL_CR)
5012 Dynarr_add (dst, ch);
5016 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5017 && fit_to_be_escape_quoted (ch))
5018 Dynarr_add (dst, ISO_CODE_ESC);
5019 Dynarr_add (dst, ch);
5022 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5024 charmask = (half == 0 ? 0x00 : 0x80);
5026 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5027 && fit_to_be_escape_quoted (ch))
5028 Dynarr_add (dst, ISO_CODE_ESC);
5029 /* you asked for it ... */
5030 Dynarr_add (dst, ch);
5036 /* Now determine which register to use. */
5038 for (i = 0; i < 4; i++)
5042 if ((CHARSETP (charset = str->iso2022.charset[i])
5043 && ((code_point = charset_code_point (charset, ch)) >= 0))
5047 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5048 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5050 if (XCHARSET_DIMENSION (charset) == 1)
5055 else /* if (XCHARSET_DIMENSION (charset) == 2) */
5057 byte1 = code_point >> 8;
5058 byte2 = code_point & 255;
/* Fallback search: the global priority list is shortened step by step
   while looking for a charset with a non-zero final byte, so the
   original value is saved here and put back afterwards.  */
5066 Lisp_Object original_default_coded_charset_priority_list
5067 = Vdefault_coded_charset_priority_list;
5069 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5071 BREAKUP_CHAR (ch, charset, byte1, byte2);
5072 if (XCHARSET_FINAL (charset))
5074 Vdefault_coded_charset_priority_list
5075 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5076 Vdefault_coded_charset_priority_list));
5078 BREAKUP_CHAR (ch, charset, byte1, byte2);
5079 if (!XCHARSET_FINAL (charset))
5081 charset = Vcharset_ascii;
/* Restore the global priority list saved above.  */
5085 Vdefault_coded_charset_priority_list
5086 = original_default_coded_charset_priority_list;
5088 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5089 codesys, dst, flags, 0);
5093 if (XCHARSET_GRAPHIC (charset) != 0)
5095 if (!NILP (str->iso2022.charset[1]) &&
5096 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5097 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5099 else if (!NILP (str->iso2022.charset[2]))
5101 else if (!NILP (str->iso2022.charset[3]))
5110 iso2022_designate (charset, reg, str, dst);
5112 /* Now invoke that register. */
5116 ensure_normal_shift (str, dst);
5120 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5122 ensure_shift_out (str, dst);
5129 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5131 Dynarr_add (dst, ISO_CODE_ESC);
5132 Dynarr_add (dst, 'N');
5137 Dynarr_add (dst, ISO_CODE_SS2);
5142 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5144 Dynarr_add (dst, ISO_CODE_ESC);
5145 Dynarr_add (dst, 'O');
5150 Dynarr_add (dst, ISO_CODE_SS3);
/* A non-zero half sets the high bit (0x80) on every byte written
   below; half 0 leaves the bytes unchanged.  */
5158 charmask = (half == 0 ? 0x00 : 0x80);
5160 switch (XCHARSET_DIMENSION (charset))
5163 Dynarr_add (dst, byte1 | charmask);
5166 Dynarr_add (dst, byte1 | charmask);
5167 Dynarr_add (dst, byte2 | charmask);
/* Save the state for the next character.  */
5173 str->iso2022.current_charset = charset;
5174 str->iso2022.current_half = half;
/* Return the ISO2022 output of STR to its initial state, appending
   whatever bytes that takes to DST: restore left-to-right direction,
   go back to the unshifted state, then re-designate each of the four
   registers to the coding system's initial charset for that register.
   Presumably called once, when encoding is finished -- TODO confirm
   against the caller.  */
5178 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5179 unsigned int *flags)
5181 Lisp_Coding_System* codesys = str->codesys;
5184 restore_left_to_right_direction (codesys, dst, flags, 0);
5185 ensure_normal_shift (str, dst);
5186 for (i = 0; i < 4; i++)
5188 Lisp_Object initial_charset
5189 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5190 iso2022_designate (initial_charset, i, str, dst);
5195 /************************************************************************/
5196 /* No-conversion methods */
5197 /************************************************************************/
5199 /* This is used when reading in "binary" files -- i.e. files that may
5200 contain all 256 possible byte values and that are not to be
5201 interpreted as being in any particular decoding.

   Decoder for the no-conversion coding system type.  The saved flags,
   partial character and eol type are loaded from the decoding stream's
   data.  Each input byte C goes through DECODE_HANDLE_EOL_TYPE and is
   then added to DST with DECODE_ADD_BINARY_CHAR;
   DECODE_HANDLE_END_OF_CONVERSION runs after the loop.

   NOTE(review): the loop header over SRC / N and the write-back of
   flags and ch to the stream are not visible in this listing.  */
5203 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5204 unsigned_char_dynarr *dst, unsigned int n)
5207 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5208 unsigned int flags = str->flags;
5209 unsigned int ch = str->ch;
5210 eol_type_t eol_type = str->eol_type;
5216 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5217 DECODE_ADD_BINARY_CHAR (c, dst);
/* Jump target with an empty statement; presumably used by the eol
   macro above to skip the rest of an iteration -- TODO confirm.  */
5218 label_continue_loop:;
5221 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encoder for the no-conversion coding system type: turn internal
   text from SRC into external bytes appended to DST.

   Two implementations are selected by the preprocessor (the opening
   conditional is not visible here; the #else / #endif comments name
   UTF2000):

   - UTF2000 branch: a small state machine driven by char_boundary,
     which is loaded from and stored back to
     STR->iso2022.current_char_boundary.  With char_boundary == 0 the
     byte C is classified by the thresholds 0xf8 / 0xf0 / 0xe0 / 0xc0;
     otherwise the low six bits of C are shifted into CH.  When
     char_boundary == 1 the low eight bits of CH are written.
     Presumably char_boundary counts the trailing bytes still expected;
     the lines that update it are missing -- TODO confirm.

   - non-UTF2000 branch: ASCII bytes are copied through.  After a
     LEADING_BYTE_LATIN_ISO8859_1 lead byte the following byte is
     written unchanged; after LEADING_BYTE_CONTROL_1 it is written
     minus 0x20.  Any other leading byte writes '~' as a placeholder.

   In both branches '\r' is written unless the eol type is EOL_LF or
   EOL_AUTODETECT, and the newline byte unless the eol type is EOL_CR.
   The tests guarding that eol sequence are on lines missing from this
   listing.  */
5228 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5229 unsigned_char_dynarr *dst, unsigned int n)
5232 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5233 unsigned int flags = str->flags;
5234 unsigned int ch = str->ch;
5235 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5237 unsigned char char_boundary = str->iso2022.current_char_boundary;
5244 if (char_boundary == 0)
5250 else if ( c >= 0xf8 )
5255 else if ( c >= 0xf0 )
5260 else if ( c >= 0xe0 )
5265 else if ( c >= 0xc0 )
5275 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5276 Dynarr_add (dst, '\r');
5277 if (eol_type != EOL_CR)
5278 Dynarr_add (dst, c);
5281 Dynarr_add (dst, c);
5284 else if (char_boundary == 1)
5286 ch = ( ch << 6 ) | ( c & 0x3f );
5287 Dynarr_add (dst, ch & 0xff);
5292 ch = ( ch << 6 ) | ( c & 0x3f );
5295 #else /* not UTF2000 */
5298 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5299 Dynarr_add (dst, '\r');
5300 if (eol_type != EOL_CR)
5301 Dynarr_add (dst, '\n');
5304 else if (BYTE_ASCII_P (c))
5307 Dynarr_add (dst, c);
5309 else if (BUFBYTE_LEADING_BYTE_P (c))
5312 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5313 c == LEADING_BYTE_CONTROL_1)
5316 Dynarr_add (dst, '~'); /* untranslatable character */
5320 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5321 Dynarr_add (dst, c);
5322 else if (ch == LEADING_BYTE_CONTROL_1)
5325 Dynarr_add (dst, c - 0x20);
5327 /* else it should be the second or third byte of an
5328 untranslatable character, so ignore it */
5331 #endif /* not UTF2000 */
5337 str->iso2022.current_char_boundary = char_boundary;
5342 /************************************************************************/
5343 /* Simple internal/external functions */
5344 /************************************************************************/
/* Scratch arrays holding the most recent result of
   convert_to_external_format and convert_from_external_format,
   respectively.  Each is created on first use and reset at the start
   of every call, so a pointer returned by those functions is only good
   until the next call of the same function.  */
5346 static Extbyte_dynarr *conversion_out_dynarr;
5347 static Bufbyte_dynarr *conversion_in_dynarr;
5349 /* Determine coding system from coding format

   For FORMAT_FILENAME and FORMAT_TERMINAL the answer depends on
   Vfile_name_coding_system: it is tested against Qnil and Qbinary, and
   otherwise looked up with Fget_coding_system.  Another path returns
   the `ctext' coding system.  Callers in this file test the result
   with NILP and convert byte-by-byte themselves when it is nil.

   NOTE(review): the switch header, the remaining case labels and the
   value returned for the nil / binary case are not visible in this
   listing -- confirm against the full source.  */
5351 /* #### not correct for all values of `fmt'! */
5353 external_data_format_to_coding_system (enum external_data_format fmt)
5357 case FORMAT_FILENAME:
5358 case FORMAT_TERMINAL:
5359 if (EQ (Vfile_name_coding_system, Qnil) ||
5360 EQ (Vfile_name_coding_system, Qbinary))
5363 return Fget_coding_system (Vfile_name_coding_system);
5366 return Fget_coding_system (Qctext);
/* Convert internal text starting at PTR to the external format FMT.
   `len' (the input byte count, see `ptr + len') and `len_out' are
   parameters declared on lines missing from this listing.

   Returns a pointer to the first element of the file-static
   conversion_out_dynarr.  That array is reset at the start of every
   call, so the result must be used or copied before calling this
   function again.  *len_out receives the converted length, taken
   before the terminating zero byte is appended.

   When no coding system applies (NILP), the bytes are converted in
   place by the visible conditional expressions: ASCII passes through,
   and two-byte Control-1 / Latin-1 sequences collapse to one byte.
   Otherwise the input is read from a fixed-buffer input stream into
   tempbuf and written to an encoding output stream layered over a
   dynarr output stream; all three streams are GC-protected, then
   closed and deleted.  */
5374 convert_to_external_format (CONST Bufbyte *ptr,
5377 enum external_data_format fmt)
5379 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5381 if (!conversion_out_dynarr)
5382 conversion_out_dynarr = Dynarr_new (Extbyte);
5384 Dynarr_reset (conversion_out_dynarr);
5386 if (NILP (coding_system))
5388 CONST Bufbyte *end = ptr + len;
5394 (*ptr < 0xc0) ? *ptr :
5395 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5398 (BYTE_ASCII_P (*ptr)) ? *ptr :
5399 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5400 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5403 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5407 #ifdef ERROR_CHECK_BUFPOS
5408 assert (ptr == end);
5413 Lisp_Object instream, outstream, da_outstream;
5414 Lstream *istr, *ostr;
5415 struct gcpro gcpro1, gcpro2, gcpro3;
5416 char tempbuf[1024]; /* some random amount */
5418 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5419 da_outstream = make_dynarr_output_stream
5420 ((unsigned_char_dynarr *) conversion_out_dynarr);
5422 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5423 istr = XLSTREAM (instream);
5424 ostr = XLSTREAM (outstream);
5425 GCPRO3 (instream, outstream, da_outstream);
/* NOTE(review): declared int here but ssize_t in
   convert_from_external_format -- confirm which type Lstream_read
   returns.  */
5428 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5431 Lstream_write (ostr, tempbuf, size_in_bytes);
5433 Lstream_close (istr);
5434 Lstream_close (ostr);
5436 Lstream_delete (istr);
5437 Lstream_delete (ostr);
5438 Lstream_delete (XLSTREAM (da_outstream));
/* The length is reported before the terminator is added, so it does
   not count the trailing zero byte.  */
5441 *len_out = Dynarr_length (conversion_out_dynarr);
5442 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5443 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert external data starting at PTR, in format FMT, to internal
   text.  `len' (the input byte count, see `ptr + len') and `len_out'
   are parameters declared on lines missing from this listing.

   Returns a pointer to the first element of the file-static
   conversion_in_dynarr.  That array is reset at the start of every
   call, so the result must be used or copied before calling this
   function again.  *len_out receives the converted length, taken
   before the terminating zero byte is appended.

   When no coding system applies (NILP), every input byte is added
   with DECODE_ADD_BINARY_CHAR.  Otherwise the input is read from a
   fixed-buffer input stream into tempbuf and written to a decoding
   output stream layered over a dynarr output stream; all three
   streams are GC-protected, then closed and deleted.  */
5447 convert_from_external_format (CONST Extbyte *ptr,
5450 enum external_data_format fmt)
5452 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5454 if (!conversion_in_dynarr)
5455 conversion_in_dynarr = Dynarr_new (Bufbyte);
5457 Dynarr_reset (conversion_in_dynarr);
5459 if (NILP (coding_system))
5461 CONST Extbyte *end = ptr + len;
5462 for (; ptr < end; ptr++)
5465 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5470 Lisp_Object instream, outstream, da_outstream;
5471 Lstream *istr, *ostr;
5472 struct gcpro gcpro1, gcpro2, gcpro3;
5473 char tempbuf[1024]; /* some random amount */
5475 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5476 da_outstream = make_dynarr_output_stream
5477 ((unsigned_char_dynarr *) conversion_in_dynarr);
5479 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5480 istr = XLSTREAM (instream);
5481 ostr = XLSTREAM (outstream);
5482 GCPRO3 (instream, outstream, da_outstream);
5485 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5488 Lstream_write (ostr, tempbuf, size_in_bytes);
5490 Lstream_close (istr);
5491 Lstream_close (ostr);
5493 Lstream_delete (istr);
5494 Lstream_delete (ostr);
5495 Lstream_delete (XLSTREAM (da_outstream));
/* The length is reported before the terminator is added, so it does
   not count the trailing zero byte.  */
5498 *len_out = Dynarr_length (conversion_in_dynarr);
5499 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5500 return Dynarr_atp (conversion_in_dynarr, 0);
5504 /************************************************************************/
5505 /* Initialization */
5506 /************************************************************************/
/* Register the Lisp-visible names defined by this file:
   - the `coding-system-error' error symbol (deferror, with Qio_error
     passed as its last argument);
   - the coding-system primitives (DEFSUBR);
   - the symbols used as coding system types, property names and eol
     type names (defsymbol);
   - one symbol per coding category, stored in
     coding_category_symbol[].
   NOTE(review): the string arguments of the coding-category defsymbol
   calls, and any preprocessor conditionals around groups of entries,
   are on lines missing from this listing.  */
5509 syms_of_file_coding (void)
5511 deferror (&Qcoding_system_error, "coding-system-error",
5512 "Coding-system error", Qio_error);
5514 DEFSUBR (Fcoding_system_p);
5515 DEFSUBR (Ffind_coding_system);
5516 DEFSUBR (Fget_coding_system);
5517 DEFSUBR (Fcoding_system_list);
5518 DEFSUBR (Fcoding_system_name);
5519 DEFSUBR (Fmake_coding_system);
5520 DEFSUBR (Fcopy_coding_system);
5521 DEFSUBR (Fdefine_coding_system_alias);
5522 DEFSUBR (Fsubsidiary_coding_system);
5524 DEFSUBR (Fcoding_system_type);
5525 DEFSUBR (Fcoding_system_doc_string);
5527 DEFSUBR (Fcoding_system_charset);
5529 DEFSUBR (Fcoding_system_property);
5531 DEFSUBR (Fcoding_category_list);
5532 DEFSUBR (Fset_coding_priority_list);
5533 DEFSUBR (Fcoding_priority_list);
5534 DEFSUBR (Fset_coding_category_system);
5535 DEFSUBR (Fcoding_category_system);
5537 DEFSUBR (Fdetect_coding_region);
5538 DEFSUBR (Fdecode_coding_region);
5539 DEFSUBR (Fencode_coding_region);
5541 DEFSUBR (Fdecode_shift_jis_char);
5542 DEFSUBR (Fencode_shift_jis_char);
5543 DEFSUBR (Fdecode_big5_char);
5544 DEFSUBR (Fencode_big5_char);
5546 defsymbol (&Qcoding_systemp, "coding-system-p");
5547 defsymbol (&Qno_conversion, "no-conversion");
5548 defsymbol (&Qraw_text, "raw-text");
5550 defsymbol (&Qbig5, "big5");
5551 defsymbol (&Qshift_jis, "shift-jis");
5552 defsymbol (&Qucs4, "ucs-4");
5553 defsymbol (&Qutf8, "utf-8");
5554 defsymbol (&Qccl, "ccl");
5555 defsymbol (&Qiso2022, "iso2022");
5557 defsymbol (&Qmnemonic, "mnemonic");
5558 defsymbol (&Qeol_type, "eol-type");
5559 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5560 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5562 defsymbol (&Qcr, "cr");
5563 defsymbol (&Qlf, "lf");
5564 defsymbol (&Qcrlf, "crlf");
5565 defsymbol (&Qeol_cr, "eol-cr");
5566 defsymbol (&Qeol_lf, "eol-lf");
5567 defsymbol (&Qeol_crlf, "eol-crlf");
5569 defsymbol (&Qcharset_g0, "charset-g0");
5570 defsymbol (&Qcharset_g1, "charset-g1");
5571 defsymbol (&Qcharset_g2, "charset-g2");
5572 defsymbol (&Qcharset_g3, "charset-g3");
5573 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5574 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5575 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5576 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5577 defsymbol (&Qno_iso6429, "no-iso6429");
5578 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5579 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5581 defsymbol (&Qshort, "short");
5582 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5583 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5584 defsymbol (&Qseven, "seven");
5585 defsymbol (&Qlock_shift, "lock-shift");
5586 defsymbol (&Qescape_quoted, "escape-quoted");
5589 defsymbol (&Qdisable_composition, "disable-composition");
5591 defsymbol (&Qencode, "encode");
5592 defsymbol (&Qdecode, "decode");
5595 defsymbol (&Qctext, "ctext");
5596 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5598 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5600 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5602 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5604 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5606 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5608 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5610 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5612 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5615 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare the lstream methods provided for the two stream types
   defined in this file.  Both `decoding' and `encoding' register the
   same seven: reader, writer, rewinder, seekable_p, flusher, closer
   and marker.  */
5620 lstream_type_create_file_coding (void)
5622 LSTREAM_HAS_METHOD (decoding, reader);
5623 LSTREAM_HAS_METHOD (decoding, writer);
5624 LSTREAM_HAS_METHOD (decoding, rewinder);
5625 LSTREAM_HAS_METHOD (decoding, seekable_p);
5626 LSTREAM_HAS_METHOD (decoding, flusher);
5627 LSTREAM_HAS_METHOD (decoding, closer);
5628 LSTREAM_HAS_METHOD (decoding, marker);
5630 LSTREAM_HAS_METHOD (encoding, reader);
5631 LSTREAM_HAS_METHOD (encoding, writer);
5632 LSTREAM_HAS_METHOD (encoding, rewinder);
5633 LSTREAM_HAS_METHOD (encoding, seekable_p);
5634 LSTREAM_HAS_METHOD (encoding, flusher);
5635 LSTREAM_HAS_METHOD (encoding, closer);
5636 LSTREAM_HAS_METHOD (encoding, marker);
5640 vars_of_file_coding (void)
5644 fcd = xnew (struct file_coding_dump);
5645 dumpstruct (&fcd, &fcd_description);
5647 /* Initialize to something reasonable ... */
5648 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5650 fcd->coding_category_system[i] = Qnil;
5651 fcd->coding_category_by_priority[i] = i;
5654 Fprovide (intern ("file-coding"));
5656 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5657 Coding system used for TTY keyboard input.
5658 Not used under a windowing system.
5660 Vkeyboard_coding_system = Qnil;
5662 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5663 Coding system used for TTY display output.
5664 Not used under a windowing system.
5666 Vterminal_coding_system = Qnil;
5668 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5669 Overriding coding system used when reading a file or process.
5670 You should *bind* this, not set it. If this is non-nil, it specifies
5671 the coding system that will be used when a file or process is read
5672 in, and overrides `buffer-file-coding-system-for-read',
5673 `insert-file-contents-pre-hook', etc. Use those variables instead of
5674 this one for permanent changes to the environment.
5676 Vcoding_system_for_read = Qnil;
5678 DEFVAR_LISP ("coding-system-for-write",
5679 &Vcoding_system_for_write /*
5680 Overriding coding system used when writing a file or process.
5681 You should *bind* this, not set it. If this is non-nil, it specifies
5682 the coding system that will be used when a file or process is written
5683 out, and overrides `buffer-file-coding-system',
5684 `write-region-pre-hook', etc. Use those variables instead of this one
5685 for permanent changes to the environment.
5687 Vcoding_system_for_write = Qnil;
5689 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5690 Coding system used to convert pathnames when accessing files.
5692 Vfile_name_coding_system = Qnil;
5694 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5695 Non-nil means the buffer contents are regarded as multi-byte form
5696 of characters, not a binary code. This affects the display, file I/O,
5697 and behaviors of various editing commands.
5699 Setting this to nil does not do anything.
5701 enable_multibyte_characters = 1;
5705 complex_vars_of_file_coding (void)
5707 staticpro (&Vcoding_system_hash_table);
5708 Vcoding_system_hash_table =
5709 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5711 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5712 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5714 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5716 struct codesys_prop csp; \
5718 csp.prop_type = (Prop_Type); \
5719 Dynarr_add (the_codesys_prop_dynarr, csp); \
5722 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5723 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5724 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5725 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5726 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5727 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5728 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5730 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5731 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5732 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5733 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5734 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5735 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5736 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5737 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5738 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5739 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5740 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5741 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5742 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5743 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5744 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5745 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5746 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5748 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5749 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5751 /* Need to create this here or we're really screwed. */
5753 (Qraw_text, Qno_conversion,
5754 build_string ("Raw text, which means it converts only line-break-codes."),
5755 list2 (Qmnemonic, build_string ("Raw")));
5758 (Qbinary, Qno_conversion,
5759 build_string ("Binary, which means it does not convert anything."),
5760 list4 (Qeol_type, Qlf,
5761 Qmnemonic, build_string ("Binary")));
5766 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5767 list2 (Qmnemonic, build_string ("UTF8")));
5770 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5772 /* Need this for bootstrapping */
5773 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5774 Fget_coding_system (Qraw_text);
5777 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5778 = Fget_coding_system (Qutf8);