1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
52 struct file_coding_dump {
53 /* Coding system currently associated with each coding category. */
54 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
56 /* Table of all coding categories in decreasing order of priority.
57 This describes a permutation of the possible coding categories. */
58 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
60 Lisp_Object ucs_to_mule_table[65536];
63 static const struct lrecord_description fcd_description_1[] = {
64 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
65 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 },
69 static const struct struct_description fcd_description = {
70 sizeof(struct file_coding_dump),
74 Lisp_Object mule_to_ucs_table;
76 Lisp_Object Qcoding_systemp;
78 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
79 /* Qinternal in general.c */
81 Lisp_Object Qmnemonic, Qeol_type;
82 Lisp_Object Qcr, Qcrlf, Qlf;
83 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
84 Lisp_Object Qpost_read_conversion;
85 Lisp_Object Qpre_write_conversion;
88 Lisp_Object Qucs4, Qutf8;
89 Lisp_Object Qbig5, Qshift_jis;
90 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
91 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
92 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
93 Lisp_Object Qno_iso6429;
94 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
95 Lisp_Object Qctext, Qescape_quoted;
96 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
98 Lisp_Object Qencode, Qdecode;
100 Lisp_Object Vcoding_system_hash_table;
102 int enable_multibyte_characters;
105 /* Additional information used by the ISO2022 decoder and detector. */
106 struct iso2022_decoder
108 /* CHARSET holds the character sets currently assigned to the G0
109 through G3 variables. It is initialized from the array
110 INITIAL_CHARSET in CODESYS. */
111 Lisp_Object charset[4];
113 /* Which registers are currently invoked into the left (GL) and
114 right (GR) halves of the 8-bit encoding space? */
115 int register_left, register_right;
117 /* ISO_ESC holds a value indicating part of an escape sequence
118 that has already been seen. */
119 enum iso_esc_flag esc;
121 /* This records the bytes we've seen so far in an escape sequence,
122 in case the sequence is invalid (we spit out the bytes unchanged). */
123 unsigned char esc_bytes[8];
125 /* Index for next byte to store in ISO escape sequence. */
128 #ifdef ENABLE_COMPOSITE_CHARS
129 /* Stuff seen so far when composing a string. */
130 unsigned_char_dynarr *composite_chars;
133 /* If we saw an invalid designation sequence for a particular
134 register, we flag it here and switch to ASCII. The next time we
135 see a valid designation for this register, we turn off the flag
136 and do the designation normally, but pretend the sequence was
137 invalid. The effect of all this is that (most of the time) the
138 escape sequences for both the switch to the unknown charset, and
139 the switch back to the known charset, get inserted literally into
140 the buffer and saved out as such. The hope is that we can
141 preserve the escape sequences so that the resulting written out
142 file makes sense. If we don't do any of this, the designation
143 to the invalid charset will be preserved but that switch back
144 to the known charset will probably get eaten because it was
145 the same charset that was already present in the register. */
146 unsigned char invalid_designated[4];
148 /* We try to do similar things as above for direction-switching
149 sequences. If we encountered a direction switch while an
150 invalid designation was present, or an invalid designation
151 just after a direction switch (i.e. no valid designation
152 encountered yet), we insert the direction-switch escape
153 sequence literally into the output stream, and later on
154 insert the corresponding direction-restoring escape sequence
156 unsigned int switched_dir_and_no_valid_charset_yet :1;
157 unsigned int invalid_switch_dir :1;
159 /* Tells the decoder to output the escape sequence literally
160 even though it was valid. Used in the games we play to
161 avoid lossage when we encounter invalid designations. */
162 unsigned int output_literally :1;
163 /* We encountered a direction switch followed by an invalid
164 designation. We didn't output the direction switch
165 literally because we didn't know about the invalid designation;
166 but we have to do so now. */
167 unsigned int output_direction_sequence :1;
170 EXFUN (Fcopy_coding_system, 2);
172 struct detection_state;
173 static int detect_coding_sjis (struct detection_state *st,
174 CONST unsigned char *src,
176 static void decode_coding_sjis (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst,
180 static void encode_coding_sjis (Lstream *encoding,
181 CONST unsigned char *src,
182 unsigned_char_dynarr *dst,
184 static int detect_coding_big5 (struct detection_state *st,
185 CONST unsigned char *src,
187 static void decode_coding_big5 (Lstream *decoding,
188 CONST unsigned char *src,
189 unsigned_char_dynarr *dst, unsigned int n);
190 static void encode_coding_big5 (Lstream *encoding,
191 CONST unsigned char *src,
192 unsigned_char_dynarr *dst, unsigned int n);
193 static int detect_coding_ucs4 (struct detection_state *st,
194 CONST unsigned char *src,
196 static void decode_coding_ucs4 (Lstream *decoding,
197 CONST unsigned char *src,
198 unsigned_char_dynarr *dst, unsigned int n);
199 static void encode_coding_ucs4 (Lstream *encoding,
200 CONST unsigned char *src,
201 unsigned_char_dynarr *dst, unsigned int n);
202 static int detect_coding_utf8 (struct detection_state *st,
203 CONST unsigned char *src,
205 static void decode_coding_utf8 (Lstream *decoding,
206 CONST unsigned char *src,
207 unsigned_char_dynarr *dst, unsigned int n);
208 static void encode_coding_utf8 (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst, unsigned int n);
211 static int postprocess_iso2022_mask (int mask);
212 static void reset_iso2022 (Lisp_Object coding_system,
213 struct iso2022_decoder *iso);
214 static int detect_coding_iso2022 (struct detection_state *st,
215 CONST unsigned char *src,
217 static void decode_coding_iso2022 (Lstream *decoding,
218 CONST unsigned char *src,
219 unsigned_char_dynarr *dst, unsigned int n);
220 static void encode_coding_iso2022 (Lstream *encoding,
221 CONST unsigned char *src,
222 unsigned_char_dynarr *dst, unsigned int n);
224 static void decode_coding_no_conversion (Lstream *decoding,
225 CONST unsigned char *src,
226 unsigned_char_dynarr *dst,
228 static void encode_coding_no_conversion (Lstream *encoding,
229 CONST unsigned char *src,
230 unsigned_char_dynarr *dst,
232 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
233 unsigned_char_dynarr *dst, unsigned int n);
234 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
235 unsigned_char_dynarr *dst, unsigned int n);
237 typedef struct codesys_prop codesys_prop;
246 Dynarr_declare (codesys_prop);
247 } codesys_prop_dynarr;
249 static const struct lrecord_description codesys_prop_description_1[] = {
250 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
254 static const struct struct_description codesys_prop_description = {
255 sizeof(codesys_prop),
256 codesys_prop_description_1
259 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
260 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
264 static const struct struct_description codesys_prop_dynarr_description = {
265 sizeof(codesys_prop_dynarr),
266 codesys_prop_dynarr_description_1
269 codesys_prop_dynarr *the_codesys_prop_dynarr;
271 enum codesys_prop_enum
274 CODESYS_PROP_ISO2022,
279 /************************************************************************/
280 /* Coding system functions */
281 /************************************************************************/
283 static Lisp_Object mark_coding_system (Lisp_Object);
284 static void print_coding_system (Lisp_Object, Lisp_Object, int);
285 static void finalize_coding_system (void *header, int for_disksave);
288 static const struct lrecord_description ccs_description_1[] = {
289 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
293 static const struct struct_description ccs_description = {
294 sizeof(charset_conversion_spec),
298 static const struct lrecord_description ccsd_description_1[] = {
299 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
303 static const struct struct_description ccsd_description = {
304 sizeof(charset_conversion_spec_dynarr),
309 static const struct lrecord_description coding_system_description[] = {
310 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
311 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
312 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
314 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
315 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
316 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
317 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
322 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
323 mark_coding_system, print_coding_system,
324 finalize_coding_system,
325 0, 0, coding_system_description,
326 struct Lisp_Coding_System);
329 mark_coding_system (Lisp_Object obj)
331 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
333 mark_object (CODING_SYSTEM_NAME (codesys));
334 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
335 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
336 mark_object (CODING_SYSTEM_EOL_LF (codesys));
337 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
338 mark_object (CODING_SYSTEM_EOL_CR (codesys));
340 switch (CODING_SYSTEM_TYPE (codesys))
344 case CODESYS_ISO2022:
345 for (i = 0; i < 4; i++)
346 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
347 if (codesys->iso2022.input_conv)
349 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
351 struct charset_conversion_spec *ccs =
352 Dynarr_atp (codesys->iso2022.input_conv, i);
353 mark_object (ccs->from_charset);
354 mark_object (ccs->to_charset);
357 if (codesys->iso2022.output_conv)
359 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
361 struct charset_conversion_spec *ccs =
362 Dynarr_atp (codesys->iso2022.output_conv, i);
363 mark_object (ccs->from_charset);
364 mark_object (ccs->to_charset);
370 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
371 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
378 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
379 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
383 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
386 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
388 error ("printing unreadable object #<coding_system 0x%x>",
391 write_c_string ("#<coding_system ", printcharfun);
392 print_internal (c->name, printcharfun, 1);
393 write_c_string (">", printcharfun);
397 finalize_coding_system (void *header, int for_disksave)
399 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
400 /* Since coding systems never go away, this function is not
401 necessary. But it would be necessary if we changed things
402 so that coding systems could go away. */
403 if (!for_disksave) /* see comment in lstream.c */
405 switch (CODING_SYSTEM_TYPE (c))
408 case CODESYS_ISO2022:
409 if (c->iso2022.input_conv)
411 Dynarr_free (c->iso2022.input_conv);
412 c->iso2022.input_conv = 0;
414 if (c->iso2022.output_conv)
416 Dynarr_free (c->iso2022.output_conv);
417 c->iso2022.output_conv = 0;
428 symbol_to_eol_type (Lisp_Object symbol)
430 CHECK_SYMBOL (symbol);
431 if (NILP (symbol)) return EOL_AUTODETECT;
432 if (EQ (symbol, Qlf)) return EOL_LF;
433 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
434 if (EQ (symbol, Qcr)) return EOL_CR;
436 signal_simple_error ("Unrecognized eol type", symbol);
437 return EOL_AUTODETECT; /* not reached */
441 eol_type_to_symbol (enum eol_type type)
446 case EOL_LF: return Qlf;
447 case EOL_CRLF: return Qcrlf;
448 case EOL_CR: return Qcr;
449 case EOL_AUTODETECT: return Qnil;
454 setup_eol_coding_systems (Lisp_Coding_System *codesys)
456 Lisp_Object codesys_obj;
457 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
458 char *codesys_name = (char *) alloca (len + 7);
460 char *codesys_mnemonic=0;
462 Lisp_Object codesys_name_sym, sub_codesys_obj;
466 XSETCODING_SYSTEM (codesys_obj, codesys);
468 memcpy (codesys_name,
469 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
471 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
473 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
474 codesys_mnemonic = (char *) alloca (mlen + 7);
475 memcpy (codesys_mnemonic,
476 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
479 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
480 strcpy (codesys_name + len, "-" op_sys); \
482 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
483 codesys_name_sym = intern (codesys_name); \
484 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
485 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
487 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
488 build_string (codesys_mnemonic); \
489 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
492 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
493 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
494 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
497 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
498 Return t if OBJECT is a coding system.
499 A coding system is an object that defines how text containing multiple
500 character sets is encoded into a stream of (typically 8-bit) bytes.
501 The coding system is used to decode the stream into a series of
502 characters (which may be from multiple charsets) when the text is read
503 from a file or process, and is used to encode the text back into the
504 same format when it is written out to a file or process.
506 For example, many ISO2022-compliant coding systems (such as Compound
507 Text, which is used for inter-client data under the X Window System)
508 use escape sequences to switch between different charsets -- Japanese
509 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
510 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
511 `make-coding-system' for more information.
513 Coding systems are normally identified using a symbol, and the
514 symbol is accepted in place of the actual coding system object whenever
515 a coding system is called for. (This is similar to how faces work.)
519 return CODING_SYSTEMP (object) ? Qt : Qnil;
522 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
523 Retrieve the coding system of the given name.
525 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
526 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
527 If there is no such coding system, nil is returned. Otherwise the
528 associated coding system object is returned.
530 (coding_system_or_name))
532 if (CODING_SYSTEMP (coding_system_or_name))
533 return coding_system_or_name;
535 if (NILP (coding_system_or_name))
536 coding_system_or_name = Qbinary;
538 CHECK_SYMBOL (coding_system_or_name);
540 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
543 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
544 Retrieve the coding system of the given name.
545 Same as `find-coding-system' except that if there is no such
546 coding system, an error is signaled instead of returning nil.
550 Lisp_Object coding_system = Ffind_coding_system (name);
552 if (NILP (coding_system))
553 signal_simple_error ("No such coding system", name);
554 return coding_system;
557 /* We store the coding systems in hash tables with the names as the key and the
558 actual coding system object as the value. Occasionally we need to use them
559 in a list format. These routines provide us with that. */
560 struct coding_system_list_closure
562 Lisp_Object *coding_system_list;
566 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
567 void *coding_system_list_closure)
569 /* This function can GC */
570 struct coding_system_list_closure *cscl =
571 (struct coding_system_list_closure *) coding_system_list_closure;
572 Lisp_Object *coding_system_list = cscl->coding_system_list;
574 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
575 *coding_system_list);
579 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
580 Return a list of the names of all defined coding systems.
584 Lisp_Object coding_system_list = Qnil;
586 struct coding_system_list_closure coding_system_list_closure;
588 GCPRO1 (coding_system_list);
589 coding_system_list_closure.coding_system_list = &coding_system_list;
590 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
591 &coding_system_list_closure);
594 return coding_system_list;
597 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
598 Return the name of the given coding system.
602 coding_system = Fget_coding_system (coding_system);
603 return XCODING_SYSTEM_NAME (coding_system);
606 static Lisp_Coding_System *
607 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
609 Lisp_Coding_System *codesys =
610 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
612 zero_lcrecord (codesys);
613 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
614 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
615 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
616 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
617 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
618 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
619 CODING_SYSTEM_TYPE (codesys) = type;
620 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
622 if (type == CODESYS_ISO2022)
625 for (i = 0; i < 4; i++)
626 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
628 else if (type == CODESYS_CCL)
630 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
631 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
634 CODING_SYSTEM_NAME (codesys) = name;
640 /* Given a list of charset conversion specs as specified in a Lisp
641 program, parse it into STORE_HERE. */
644 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
645 Lisp_Object spec_list)
649 EXTERNAL_LIST_LOOP (rest, spec_list)
651 Lisp_Object car = XCAR (rest);
652 Lisp_Object from, to;
653 struct charset_conversion_spec spec;
655 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
656 signal_simple_error ("Invalid charset conversion spec", car);
657 from = Fget_charset (XCAR (car));
658 to = Fget_charset (XCAR (XCDR (car)));
659 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
660 signal_simple_error_2
661 ("Attempted conversion between different charset types",
663 spec.from_charset = from;
664 spec.to_charset = to;
666 Dynarr_add (store_here, spec);
670 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
671 specs, return the equivalent as the Lisp programmer would see it.
673 If LOAD_HERE is 0, return Qnil. */
676 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
683 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
685 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
686 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
689 return Fnreverse (result);
694 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
695 Register symbol NAME as a coding system.
697 TYPE describes the conversion method used and should be one of
700 Automatic conversion. XEmacs attempts to detect the coding system
703 No conversion. Use this for binary files and such. On output,
704 graphic characters that are not in ASCII or Latin-1 will be
705 replaced by a ?. (For a no-conversion-encoded buffer, these
706 characters will only be present if you explicitly insert them.)
708 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
710 ISO 10646 UCS-4 encoding.
712 ISO 10646 UTF-8 encoding.
714 Any ISO2022-compliant encoding. Among other things, this includes
715 JIS (the Japanese encoding commonly used for e-mail), EUC (the
716 standard Unix encoding for Japanese and other languages), and
717 Compound Text (the encoding used in X11). You can specify more
718 specific information about the conversion with the FLAGS argument.
720 Big5 (the encoding commonly used for Taiwanese).
722 The conversion is performed using a user-written pseudo-code
723 program. CCL (Code Conversion Language) is the name of this
726 Write out or read in the raw contents of the memory representing
727 the buffer's text. This is primarily useful for debugging
728 purposes, and is only enabled when XEmacs has been compiled with
729 DEBUG_XEMACS defined (via the --debug configure option).
730 WARNING: Reading in a file using 'internal conversion can result
731 in an internal inconsistency in the memory representing a
732 buffer's text, which will produce unpredictable results and may
733 cause XEmacs to crash. Under normal circumstances you should
734 never use 'internal conversion.
736 DOC-STRING is a string describing the coding system.
738 PROPS is a property list, describing the specific nature of the
739 character set. Recognized properties are:
742 String to be displayed in the modeline when this coding system is
746 End-of-line conversion to be used. It should be one of
749 Automatically detect the end-of-line type (LF, CRLF,
750 or CR). Also generate subsidiary coding systems named
751 `NAME-unix', `NAME-dos', and `NAME-mac', that are
752 identical to this coding system but have an EOL-TYPE
753 value of 'lf, 'crlf, and 'cr, respectively.
755 The end of a line is marked externally using ASCII LF.
756 Since this is also the way that XEmacs represents an
757 end-of-line internally, specifying this option results
758 in no end-of-line conversion. This is the standard
759 format for Unix text files.
761 The end of a line is marked externally using ASCII
762 CRLF. This is the standard format for MS-DOS text
765 The end of a line is marked externally using ASCII CR.
766 This is the standard format for Macintosh text files.
768 Automatically detect the end-of-line type but do not
769 generate subsidiary coding systems. (This value is
770 converted to nil when stored internally, and
771 `coding-system-property' will return nil.)
773 'post-read-conversion
774 Function called after a file has been read in, to perform the
775 decoding. Called with two arguments, BEG and END, denoting
776 a region of the current buffer to be decoded.
778 'pre-write-conversion
779 Function called before a file is written out, to perform the
780 encoding. Called with two arguments, BEG and END, denoting
781 a region of the current buffer to be encoded.
784 The following additional properties are recognized if TYPE is 'iso2022:
790 The character set initially designated to the G0 - G3 registers.
791 The value should be one of
793 -- A charset object (designate that character set)
794 -- nil (do not ever use this register)
795 -- t (no character set is initially designated to
796 the register, but may be later on; this automatically
797 sets the corresponding `force-g*-on-output' property)
803 If non-nil, send an explicit designation sequence on output before
804 using the specified register.
807 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
808 "ESC $ B" on output in place of the full designation sequences
809 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
812 If non-nil, don't designate ASCII to G0 at each end of line on output.
813 Setting this to non-nil also suppresses other state-resetting that
814 normally happens at the end of a line.
817 If non-nil, don't designate ASCII to G0 before control chars on output.
820 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
824 If non-nil, use locking-shift (SO/SI) instead of single-shift
825 or designation by escape sequence.
828 If non-nil, don't use ISO6429's direction specification.
831 If non-nil, literal control characters that are the same as
832 the beginning of a recognized ISO2022 or ISO6429 escape sequence
833 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
834 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
835 so that they can be properly distinguished from an escape sequence.
836 (Note that doing this results in a non-portable encoding.) This
837 encoding flag is used for byte-compiled files. Note that ESC
838 is a good choice for a quoting character because there are no
839 escape sequences whose second byte is a character from the Control-0
840 or Control-1 character sets; this is explicitly disallowed by the
843 'input-charset-conversion
844 A list of conversion specifications, specifying conversion of
845 characters in one charset to another when decoding is performed.
846 Each specification is a list of two elements: the source charset,
847 and the destination charset.
849 'output-charset-conversion
850 A list of conversion specifications, specifying conversion of
851 characters in one charset to another when encoding is performed.
852 The form of each specification is the same as for
853 'input-charset-conversion.
856 The following additional properties are recognized (and required)
860 CCL program used for decoding (converting to internal format).
863 CCL program used for encoding (converting to external format).
865 (name, type, doc_string, props))
867 Lisp_Coding_System *codesys;
868 Lisp_Object rest, key, value;
869 enum coding_system_type ty;
870 int need_to_setup_eol_systems = 1;
872 /* Convert type to constant */
873 if (NILP (type) || EQ (type, Qundecided))
874 { ty = CODESYS_AUTODETECT; }
876 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
877 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
878 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
879 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
880 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
881 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
883 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
885 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
888 signal_simple_error ("Invalid coding system type", type);
892 codesys = allocate_coding_system (ty, name);
894 if (NILP (doc_string))
895 doc_string = build_string ("");
897 CHECK_STRING (doc_string);
898 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
900 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
902 if (EQ (key, Qmnemonic))
905 CHECK_STRING (value);
906 CODING_SYSTEM_MNEMONIC (codesys) = value;
909 else if (EQ (key, Qeol_type))
911 need_to_setup_eol_systems = NILP (value);
914 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
917 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
918 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
920 else if (ty == CODESYS_ISO2022)
922 #define FROB_INITIAL_CHARSET(charset_num) \
923 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
924 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
926 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
927 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
928 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
929 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
931 #define FROB_FORCE_CHARSET(charset_num) \
932 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
934 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
935 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
936 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
937 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
939 #define FROB_BOOLEAN_PROPERTY(prop) \
940 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
942 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
943 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
944 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
945 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
946 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
947 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
948 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
950 else if (EQ (key, Qinput_charset_conversion))
952 codesys->iso2022.input_conv =
953 Dynarr_new (charset_conversion_spec);
954 parse_charset_conversion_specs (codesys->iso2022.input_conv,
957 else if (EQ (key, Qoutput_charset_conversion))
959 codesys->iso2022.output_conv =
960 Dynarr_new (charset_conversion_spec);
961 parse_charset_conversion_specs (codesys->iso2022.output_conv,
965 signal_simple_error ("Unrecognized property", key);
967 else if (EQ (type, Qccl))
969 if (EQ (key, Qdecode))
971 CHECK_VECTOR (value);
972 CODING_SYSTEM_CCL_DECODE (codesys) = value;
974 else if (EQ (key, Qencode))
976 CHECK_VECTOR (value);
977 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
980 signal_simple_error ("Unrecognized property", key);
984 signal_simple_error ("Unrecognized property", key);
987 if (need_to_setup_eol_systems)
988 setup_eol_coding_systems (codesys);
991 Lisp_Object codesys_obj;
992 XSETCODING_SYSTEM (codesys_obj, codesys);
993 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
998 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
999 Copy OLD-CODING-SYSTEM to NEW-NAME.
1000 If NEW-NAME does not name an existing coding system, a new one will
1003 (old_coding_system, new_name))
1005 Lisp_Object new_coding_system;
1006 old_coding_system = Fget_coding_system (old_coding_system);
1007 new_coding_system = Ffind_coding_system (new_name);
1008 if (NILP (new_coding_system))
1010 XSETCODING_SYSTEM (new_coding_system,
1011 allocate_coding_system
1012 (XCODING_SYSTEM_TYPE (old_coding_system),
1014 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1018 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1019 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1020 memcpy (((char *) to ) + sizeof (to->header),
1021 ((char *) from) + sizeof (from->header),
1022 sizeof (*from) - sizeof (from->header));
1023 to->name = new_name;
1025 return new_coding_system;
1028 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1029 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1031 (alias, coding_system))
1033 CHECK_SYMBOL (alias);
1034 if (!NILP (Ffind_coding_system (alias)))
1035 signal_simple_error ("Symbol already names a coding system", alias);
1036 coding_system = Fget_coding_system (coding_system);
1037 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1039 /* Set up aliases for subsidiaries. */
1040 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1043 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1044 #define FROB(type, name) \
1046 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1047 if (!NILP (subsidiary)) \
1048 Fdefine_coding_system_alias \
1049 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1052 FROB (CRLF, "-dos");
1056 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1057 but it doesn't look intentional, so I'd rather return something
1058 meaningful or nothing at all. */
1063 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1065 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1066 Lisp_Object new_coding_system;
1068 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1069 return coding_system;
1073 case EOL_AUTODETECT: return coding_system;
1074 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1075 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1076 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1080 return NILP (new_coding_system) ? coding_system : new_coding_system;
1083 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1084 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1086 (coding_system, eol_type))
1088 coding_system = Fget_coding_system (coding_system);
1090 return subsidiary_coding_system (coding_system,
1091 symbol_to_eol_type (eol_type));
1095 /************************************************************************/
1096 /* Coding system accessors */
1097 /************************************************************************/
1099 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1100 Return the doc string for CODING-SYSTEM.
1104 coding_system = Fget_coding_system (coding_system);
1105 return XCODING_SYSTEM_DOC_STRING (coding_system);
1108 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1109 Return the type of CODING-SYSTEM.
1113 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1116 case CODESYS_AUTODETECT: return Qundecided;
1118 case CODESYS_SHIFT_JIS: return Qshift_jis;
1119 case CODESYS_ISO2022: return Qiso2022;
1120 case CODESYS_BIG5: return Qbig5;
1121 case CODESYS_UCS4: return Qucs4;
1122 case CODESYS_UTF8: return Qutf8;
1123 case CODESYS_CCL: return Qccl;
1125 case CODESYS_NO_CONVERSION: return Qno_conversion;
1127 case CODESYS_INTERNAL: return Qinternal;
1134 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1137 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1139 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1142 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1143 Return initial charset of CODING-SYSTEM designated to GNUM.
1146 (coding_system, gnum))
1148 coding_system = Fget_coding_system (coding_system);
1151 return coding_system_charset (coding_system, XINT (gnum));
1155 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1156 Return the PROP property of CODING-SYSTEM.
1158 (coding_system, prop))
1161 enum coding_system_type type;
1163 coding_system = Fget_coding_system (coding_system);
1164 CHECK_SYMBOL (prop);
1165 type = XCODING_SYSTEM_TYPE (coding_system);
1167 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1168 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1171 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1173 case CODESYS_PROP_ALL_OK:
1176 case CODESYS_PROP_ISO2022:
1177 if (type != CODESYS_ISO2022)
1179 ("Property only valid in ISO2022 coding systems",
1183 case CODESYS_PROP_CCL:
1184 if (type != CODESYS_CCL)
1186 ("Property only valid in CCL coding systems",
1196 signal_simple_error ("Unrecognized property", prop);
1198 if (EQ (prop, Qname))
1199 return XCODING_SYSTEM_NAME (coding_system);
1200 else if (EQ (prop, Qtype))
1201 return Fcoding_system_type (coding_system);
1202 else if (EQ (prop, Qdoc_string))
1203 return XCODING_SYSTEM_DOC_STRING (coding_system);
1204 else if (EQ (prop, Qmnemonic))
1205 return XCODING_SYSTEM_MNEMONIC (coding_system);
1206 else if (EQ (prop, Qeol_type))
1207 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1208 else if (EQ (prop, Qeol_lf))
1209 return XCODING_SYSTEM_EOL_LF (coding_system);
1210 else if (EQ (prop, Qeol_crlf))
1211 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1212 else if (EQ (prop, Qeol_cr))
1213 return XCODING_SYSTEM_EOL_CR (coding_system);
1214 else if (EQ (prop, Qpost_read_conversion))
1215 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1216 else if (EQ (prop, Qpre_write_conversion))
1217 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1219 else if (type == CODESYS_ISO2022)
1221 if (EQ (prop, Qcharset_g0))
1222 return coding_system_charset (coding_system, 0);
1223 else if (EQ (prop, Qcharset_g1))
1224 return coding_system_charset (coding_system, 1);
1225 else if (EQ (prop, Qcharset_g2))
1226 return coding_system_charset (coding_system, 2);
1227 else if (EQ (prop, Qcharset_g3))
1228 return coding_system_charset (coding_system, 3);
1230 #define FORCE_CHARSET(charset_num) \
1231 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1232 (coding_system, charset_num) ? Qt : Qnil)
1234 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1235 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1236 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1237 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1239 #define LISP_BOOLEAN(prop) \
1240 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1242 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1243 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1244 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1245 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1246 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1247 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1248 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1250 else if (EQ (prop, Qinput_charset_conversion))
1252 unparse_charset_conversion_specs
1253 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1254 else if (EQ (prop, Qoutput_charset_conversion))
1256 unparse_charset_conversion_specs
1257 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1261 else if (type == CODESYS_CCL)
1263 if (EQ (prop, Qdecode))
1264 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1265 else if (EQ (prop, Qencode))
1266 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1274 return Qnil; /* not reached */
1278 /************************************************************************/
1279 /* Coding category functions */
1280 /************************************************************************/
1283 decode_coding_category (Lisp_Object symbol)
1287 CHECK_SYMBOL (symbol);
1288 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1289 if (EQ (coding_category_symbol[i], symbol))
1292 signal_simple_error ("Unrecognized coding category", symbol);
1293 return 0; /* not reached */
1296 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1297 Return a list of all recognized coding categories.
1302 Lisp_Object list = Qnil;
1304 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1305 list = Fcons (coding_category_symbol[i], list);
1309 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1310 Change the priority order of the coding categories.
1311 LIST should be list of coding categories, in descending order of
1312 priority. Unspecified coding categories will be lower in priority
1313 than all specified ones, in the same relative order they were in
1318 int category_to_priority[CODING_CATEGORY_LAST + 1];
1322 /* First generate a list that maps coding categories to priorities. */
1324 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1325 category_to_priority[i] = -1;
1327 /* Highest priority comes from the specified list. */
1329 EXTERNAL_LIST_LOOP (rest, list)
1331 int cat = decode_coding_category (XCAR (rest));
1333 if (category_to_priority[cat] >= 0)
1334 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1335 category_to_priority[cat] = i++;
1338 /* Now go through the existing categories by priority to retrieve
1339 the categories not yet specified and preserve their priority
1341 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1343 int cat = fcd->coding_category_by_priority[j];
1344 if (category_to_priority[cat] < 0)
1345 category_to_priority[cat] = i++;
1348 /* Now we need to construct the inverse of the mapping we just
1351 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1352 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1354 /* Phew! That was confusing. */
1358 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1359 Return a list of coding categories in descending order of priority.
1364 Lisp_Object list = Qnil;
1366 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1367 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1372 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1373 Change the coding system associated with a coding category.
1375 (coding_category, coding_system))
1377 int cat = decode_coding_category (coding_category);
1379 coding_system = Fget_coding_system (coding_system);
1380 fcd->coding_category_system[cat] = coding_system;
1384 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1385 Return the coding system associated with a coding category.
1389 int cat = decode_coding_category (coding_category);
1390 Lisp_Object sys = fcd->coding_category_system[cat];
1393 return XCODING_SYSTEM_NAME (sys);
1398 /************************************************************************/
1399 /* Detecting the encoding of data */
1400 /************************************************************************/
1402 struct detection_state
1404 enum eol_type eol_type;
1440 struct iso2022_decoder iso;
1442 int high_byte_count;
1443 unsigned int saw_single_shift:1;
1456 acceptable_control_char_p (int c)
1460 /* Allow and ignore control characters that you might
1461 reasonably see in a text file */
1466 case 8: /* backspace */
1467 case 11: /* vertical tab */
1468 case 12: /* form feed */
1469 case 26: /* MS-DOS C-z junk */
1470 case 31: /* '^_' -- for info */
1478 mask_has_at_most_one_bit_p (int mask)
1480 /* Perhaps the only thing useful you learn from intensive Microsoft
1481 technical interviews */
1482 return (mask & (mask - 1)) == 0;
1485 static enum eol_type
1486 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1495 st->eol.just_saw_cr = 1;
1500 if (st->eol.just_saw_cr)
1502 else if (st->eol.seen_anything)
1505 else if (st->eol.just_saw_cr)
1507 st->eol.just_saw_cr = 0;
1509 st->eol.seen_anything = 1;
1512 return EOL_AUTODETECT;
1515 /* Attempt to determine the encoding and EOL type of the given text.
1516 Before calling this function for the first type, you must initialize
1517 st->eol_type as appropriate and initialize st->mask to ~0.
1519 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1522 st->mask holds the determined coding category mask, or ~0 if only
1523 ASCII has been seen so far.
1527 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1528 is present in st->mask
1529 1 == definitive answers are here for both st->eol_type and st->mask
1533 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1534 unsigned int n, int just_do_eol)
1538 if (st->eol_type == EOL_AUTODETECT)
1539 st->eol_type = detect_eol_type (st, src, n);
1542 return st->eol_type != EOL_AUTODETECT;
1544 if (!st->seen_non_ascii)
1546 for (; n; n--, src++)
1549 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1551 st->seen_non_ascii = 1;
1553 st->shift_jis.mask = ~0;
1557 st->iso2022.mask = ~0;
1567 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1568 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1569 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1570 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1571 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1572 st->big5.mask = detect_coding_big5 (st, src, n);
1573 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1574 st->utf8.mask = detect_coding_utf8 (st, src, n);
1575 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1576 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1579 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1580 | st->utf8.mask | st->ucs4.mask;
1583 int retval = mask_has_at_most_one_bit_p (st->mask);
1584 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1585 return retval && st->eol_type != EOL_AUTODETECT;
1590 coding_system_from_mask (int mask)
1594 /* If the file was entirely or basically ASCII, use the
1595 default value of `buffer-file-coding-system'. */
1596 Lisp_Object retval =
1597 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1600 retval = Ffind_coding_system (retval);
1604 (Qbad_variable, Qwarning,
1605 "Invalid `default-buffer-file-coding-system', set to nil");
1606 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1610 retval = Fget_coding_system (Qraw_text);
1618 mask = postprocess_iso2022_mask (mask);
1620 /* Look through the coding categories by priority and find
1621 the first one that is allowed. */
1622 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1624 cat = fcd->coding_category_by_priority[i];
1625 if ((mask & (1 << cat)) &&
1626 !NILP (fcd->coding_category_system[cat]))
1630 return fcd->coding_category_system[cat];
1632 return Fget_coding_system (Qraw_text);
1636 /* Given a seekable read stream and potential coding system and EOL type
1637 as specified, do any autodetection that is called for. If the
1638 coding system and/or EOL type are not autodetect, they will be left
1639 alone; but this function will never return an autodetect coding system
1642 This function does not automatically fetch subsidiary coding systems;
1643 that should be unnecessary with the explicit eol-type argument. */
1646 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1647 enum eol_type *eol_type_in_out)
1649 struct detection_state decst;
1651 if (*eol_type_in_out == EOL_AUTODETECT)
1652 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1655 decst.eol_type = *eol_type_in_out;
1658 /* If autodetection is called for, do it now. */
1659 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1660 *eol_type_in_out == EOL_AUTODETECT)
1662 unsigned char random_buffer[4096];
1664 Lisp_Object coding_system = Qnil;
1666 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1669 unsigned char *cp = random_buffer;
1671 while (cp < random_buffer + nread)
1673 if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1674 (*cp++ == 'o') && (cp < random_buffer + nread) &&
1675 (*cp++ == 'd') && (cp < random_buffer + nread) &&
1676 (*cp++ == 'i') && (cp < random_buffer + nread) &&
1677 (*cp++ == 'n') && (cp < random_buffer + nread) &&
1678 (*cp++ == 'g') && (cp < random_buffer + nread) &&
1679 (*cp++ == ':') && (cp < random_buffer + nread))
1681 unsigned char coding_system_name[4096 - 6];
1682 unsigned char *np = coding_system_name;
1684 while ( (cp < random_buffer + nread)
1685 && ((*cp == ' ') || (*cp == '\t')) )
1689 while ( (cp < random_buffer + nread) &&
1690 (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1696 = Ffind_coding_system (intern ((char *) coding_system_name));
1700 if (EQ(coding_system, Qnil))
1702 if (detect_coding_type (&decst, random_buffer, nread,
1703 XCODING_SYSTEM_TYPE (*codesys_in_out)
1704 != CODESYS_AUTODETECT))
1706 nread = Lstream_read (stream,
1707 random_buffer, sizeof (random_buffer));
1712 *eol_type_in_out = decst.eol_type;
1713 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1715 if (EQ(coding_system, Qnil))
1716 *codesys_in_out = coding_system_from_mask (decst.mask);
1718 *codesys_in_out = coding_system;
1721 /* If we absolutely can't determine the EOL type, just assume LF. */
1722 if (*eol_type_in_out == EOL_AUTODETECT)
1723 *eol_type_in_out = EOL_LF;
1725 Lstream_rewind (stream);
1728 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1729 Detect coding system of the text in the region between START and END.
1730 Returned a list of possible coding systems ordered by priority.
1731 If only ASCII characters are found, it returns 'undecided or one of
1732 its subsidiary coding systems according to a detected end-of-line
1733 type. Optional arg BUFFER defaults to the current buffer.
1735 (start, end, buffer))
1737 Lisp_Object val = Qnil;
1738 struct buffer *buf = decode_buffer (buffer, 0);
1740 Lisp_Object instream, lb_instream;
1741 Lstream *istr, *lb_istr;
1742 struct detection_state decst;
1743 struct gcpro gcpro1, gcpro2;
1745 get_buffer_range_char (buf, start, end, &b, &e, 0);
1746 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1747 lb_istr = XLSTREAM (lb_instream);
1748 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1749 istr = XLSTREAM (instream);
1750 GCPRO2 (instream, lb_instream);
1752 decst.eol_type = EOL_AUTODETECT;
1756 unsigned char random_buffer[4096];
1757 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1761 if (detect_coding_type (&decst, random_buffer, nread, 0))
1765 if (decst.mask == ~0)
1766 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1774 decst.mask = postprocess_iso2022_mask (decst.mask);
1776 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1778 int sys = fcd->coding_category_by_priority[i];
1779 if (decst.mask & (1 << sys))
1781 Lisp_Object codesys = fcd->coding_category_system[sys];
1782 if (!NILP (codesys))
1783 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1784 val = Fcons (codesys, val);
1788 Lstream_close (istr);
1790 Lstream_delete (istr);
1791 Lstream_delete (lb_istr);
1796 /************************************************************************/
1797 /* Converting to internal Mule format ("decoding") */
1798 /************************************************************************/
1800 /* A decoding stream is a stream used for decoding text (i.e.
1801 converting from some external format to internal format).
1802 The decoding-stream object keeps track of the actual coding
1803 stream, the stream that is at the other end, and data that
1804 needs to be persistent across the lifetime of the stream. */
1806 /* Handle the EOL stuff related to just-read-in character C.
1807 EOL_TYPE is the EOL type of the coding stream.
1808 FLAGS is the current value of FLAGS in the coding stream, and may
1809 be modified by this macro. (The macro only looks at the
1810 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1811 bytes are to be written. You need to also define a local goto
1812 label "label_continue_loop" that is at the end of the main
1813 character-reading loop.
1815 If C is a CR character, then this macro handles it entirely and
1816 jumps to label_continue_loop. Otherwise, this macro does not add
1817 anything to DST, and continues normally. You should continue
1818 processing C normally after this macro. */
1820 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1824 if (eol_type == EOL_CR) \
1825 Dynarr_add (dst, '\n'); \
1826 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1827 Dynarr_add (dst, c); \
1829 flags |= CODING_STATE_CR; \
1830 goto label_continue_loop; \
1832 else if (flags & CODING_STATE_CR) \
1833 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1835 Dynarr_add (dst, '\r'); \
1836 flags &= ~CODING_STATE_CR; \
1840 /* C should be a binary character in the range 0 - 255; convert
1841 to internal format and add to Dynarr DST. */
1843 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1845 if (BYTE_ASCII_P (c)) \
1846 Dynarr_add (dst, c); \
1847 else if (BYTE_C1_P (c)) \
1849 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1850 Dynarr_add (dst, c + 0x20); \
1854 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1855 Dynarr_add (dst, c); \
1859 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1863 DECODE_ADD_BINARY_CHAR (ch, dst); \
1868 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1870 if (flags & CODING_STATE_END) \
1872 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1873 if (flags & CODING_STATE_CR) \
1874 Dynarr_add (dst, '\r'); \
1878 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1880 struct decoding_stream
1882 /* Coding system that governs the conversion. */
1883 Lisp_Coding_System *codesys;
1885 /* Stream that we read the encoded data from or
1886 write the decoded data to. */
1889 /* If we are reading, then we can return only a fixed amount of
1890 data, so if the conversion resulted in too much data, we store it
1891 here for retrieval the next time around. */
1892 unsigned_char_dynarr *runoff;
1894 /* FLAGS holds flags indicating the current state of the decoding.
1895 Some of these flags are dependent on the coding system. */
1898 /* CH holds a partially built-up character. Since we only deal
1899 with one- and two-byte characters at the moment, we only use
1900 this to store the first byte of a two-byte character. */
1903 /* EOL_TYPE specifies the type of end-of-line conversion that
1904 currently applies. We need to keep this separate from the
1905 EOL type stored in CODESYS because the latter might indicate
1906 automatic EOL-type detection while the former will always
1907 indicate a particular EOL type. */
1908 enum eol_type eol_type;
1910 /* Additional ISO2022 information. We define the structure above
1911 because it's also needed by the detection routines. */
1912 struct iso2022_decoder iso2022;
1914 /* Additional information (the state of the running CCL program)
1915 used by the CCL decoder. */
1916 struct ccl_program ccl;
1918 /* counter for UTF-8 or UCS-4 */
1919 unsigned char counter;
1921 struct detection_state decst;
1924 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1925 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1926 static int decoding_rewinder (Lstream *stream);
1927 static int decoding_seekable_p (Lstream *stream);
1928 static int decoding_flusher (Lstream *stream);
1929 static int decoding_closer (Lstream *stream);
1931 static Lisp_Object decoding_marker (Lisp_Object stream);
1933 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1934 sizeof (struct decoding_stream));
1937 decoding_marker (Lisp_Object stream)
1939 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1940 Lisp_Object str_obj;
1942 /* We do not need to mark the coding systems or charsets stored
1943 within the stream because they are stored in a global list
1944 and automatically marked. */
1946 XSETLSTREAM (str_obj, str);
1947 mark_object (str_obj);
1948 if (str->imp->marker)
1949 return (str->imp->marker) (str_obj);
1954 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1955 so we read data from the other end, decode it, and store it into DATA. */
1958 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1960 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1961 unsigned char *orig_data = data;
1963 int error_occurred = 0;
1965 /* We need to interface to mule_decode(), which expects to take some
1966 amount of data and store the result into a Dynarr. We have
1967 mule_decode() store into str->runoff, and take data from there
1970 /* We loop until we have enough data, reading chunks from the other
1971 end and decoding it. */
1974 /* Take data from the runoff if we can. Make sure to take at
1975 most SIZE bytes, and delete the data from the runoff. */
1976 if (Dynarr_length (str->runoff) > 0)
1978 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1979 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1980 Dynarr_delete_many (str->runoff, 0, chunk);
1986 break; /* No more room for data */
1988 if (str->flags & CODING_STATE_END)
1989 /* This means that on the previous iteration, we hit the EOF on
1990 the other end. We loop once more so that mule_decode() can
1991 output any final stuff it may be holding, or any "go back
1992 to a sane state" escape sequences. (This latter makes sense
1993 during encoding.) */
1996 /* Exhausted the runoff, so get some more. DATA has at least
1997 SIZE bytes left of storage in it, so it's OK to read directly
1998 into it. (We'll be overwriting above, after we've decoded it
1999 into the runoff.) */
2000 read_size = Lstream_read (str->other_end, data, size);
2007 /* There might be some more end data produced in the translation.
2008 See the comment above. */
2009 str->flags |= CODING_STATE_END;
2010 mule_decode (stream, data, str->runoff, read_size);
2013 if (data - orig_data == 0)
2014 return error_occurred ? -1 : 0;
2016 return data - orig_data;
2020 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2022 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2025 /* Decode all our data into the runoff, and then attempt to write
2026 it all out to the other end. Remove whatever chunk we succeeded
2028 mule_decode (stream, data, str->runoff, size);
2029 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2030 Dynarr_length (str->runoff));
2032 Dynarr_delete_many (str->runoff, 0, retval);
2033 /* Do NOT return retval. The return value indicates how much
2034 of the incoming data was written, not how many bytes were
2040 reset_decoding_stream (struct decoding_stream *str)
2043 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2045 Lisp_Object coding_system;
2046 XSETCODING_SYSTEM (coding_system, str->codesys);
2047 reset_iso2022 (coding_system, &str->iso2022);
2049 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2051 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2055 str->flags = str->ch = 0;
2059 decoding_rewinder (Lstream *stream)
2061 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2062 reset_decoding_stream (str);
2063 Dynarr_reset (str->runoff);
2064 return Lstream_rewind (str->other_end);
2068 decoding_seekable_p (Lstream *stream)
2070 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2071 return Lstream_seekable_p (str->other_end);
2075 decoding_flusher (Lstream *stream)
2077 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2078 return Lstream_flush (str->other_end);
2082 decoding_closer (Lstream *stream)
2084 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2085 if (stream->flags & LSTREAM_FL_WRITE)
2087 str->flags |= CODING_STATE_END;
2088 decoding_writer (stream, 0, 0);
2090 Dynarr_free (str->runoff);
2092 #ifdef ENABLE_COMPOSITE_CHARS
2093 if (str->iso2022.composite_chars)
2094 Dynarr_free (str->iso2022.composite_chars);
2097 return Lstream_close (str->other_end);
2101 decoding_stream_coding_system (Lstream *stream)
2103 Lisp_Object coding_system;
2104 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2106 XSETCODING_SYSTEM (coding_system, str->codesys);
2107 return subsidiary_coding_system (coding_system, str->eol_type);
2111 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2113 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2114 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2116 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2117 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2118 reset_decoding_stream (str);
2121 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2122 stream for writing, no automatic code detection will be performed.
2123 The reason for this is that automatic code detection requires a
2124 seekable input. Things will also fail if you open a decoding
2125 stream for reading using a non-fully-specified coding system and
2126 a non-seekable input stream. */
2129 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2132 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2133 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2137 str->other_end = stream;
2138 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2139 str->eol_type = EOL_AUTODETECT;
2140 if (!strcmp (mode, "r")
2141 && Lstream_seekable_p (stream))
2142 /* We can determine the coding system now. */
2143 determine_real_coding_system (stream, &codesys, &str->eol_type);
2144 set_decoding_stream_coding_system (lstr, codesys);
2145 str->decst.eol_type = str->eol_type;
2146 str->decst.mask = ~0;
2147 XSETLSTREAM (obj, lstr);
2152 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2154 return make_decoding_stream_1 (stream, codesys, "r");
2158 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2160 return make_decoding_stream_1 (stream, codesys, "w");
2163 /* Note: the decode_coding_* functions all take the same
2164 arguments as mule_decode(), which is to say some SRC data of
2165 size N, which is to be stored into dynamic array DST.
2166 DECODING is the stream within which the decoding is
2167 taking place, but no data is actually read from or
2168 written to that stream; that is handled in decoding_reader()
2169 or decoding_writer(). This allows the same functions to
2170 be used for both reading and writing. */
2173 mule_decode (Lstream *decoding, CONST unsigned char *src,
2174 unsigned_char_dynarr *dst, unsigned int n)
2176 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2178 /* If necessary, do encoding-detection now. We do this when
2179 we're a writing stream or a non-seekable reading stream,
2180 meaning that we can't just process the whole input,
2181 rewind, and start over. */
2183 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2184 str->eol_type == EOL_AUTODETECT)
2186 Lisp_Object codesys;
2188 XSETCODING_SYSTEM (codesys, str->codesys);
2189 detect_coding_type (&str->decst, src, n,
2190 CODING_SYSTEM_TYPE (str->codesys) !=
2191 CODESYS_AUTODETECT);
2192 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2193 str->decst.mask != ~0)
2194 /* #### This is cheesy. What we really ought to do is
2195 buffer up a certain amount of data so as to get a
2196 less random result. */
2197 codesys = coding_system_from_mask (str->decst.mask);
2198 str->eol_type = str->decst.eol_type;
2199 if (XCODING_SYSTEM (codesys) != str->codesys)
2201 /* Preserve the CODING_STATE_END flag in case it was set.
2202 If we erase it, bad things might happen. */
2203 int was_end = str->flags & CODING_STATE_END;
2204 set_decoding_stream_coding_system (decoding, codesys);
2206 str->flags |= CODING_STATE_END;
2210 switch (CODING_SYSTEM_TYPE (str->codesys))
2213 case CODESYS_INTERNAL:
2214 Dynarr_add_many (dst, src, n);
2217 case CODESYS_AUTODETECT:
2218 /* If we got this far and still haven't decided on the coding
2219 system, then do no conversion. */
2220 case CODESYS_NO_CONVERSION:
2221 decode_coding_no_conversion (decoding, src, dst, n);
2224 case CODESYS_SHIFT_JIS:
2225 decode_coding_sjis (decoding, src, dst, n);
2228 decode_coding_big5 (decoding, src, dst, n);
2231 decode_coding_ucs4 (decoding, src, dst, n);
2234 decode_coding_utf8 (decoding, src, dst, n);
2237 str->ccl.last_block = str->flags & CODING_STATE_END;
2238 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2240 case CODESYS_ISO2022:
2241 decode_coding_iso2022 (decoding, src, dst, n);
2249 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2250 Decode the text between START and END which is encoded in CODING-SYSTEM.
2251 This is useful if you've read in encoded text from a file without decoding
2252 it (e.g. you read in a JIS-formatted file but used the `binary' or
2253 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2254 Return length of decoded text.
2255 BUFFER defaults to the current buffer if unspecified.
2257 (start, end, coding_system, buffer))
2260 struct buffer *buf = decode_buffer (buffer, 0);
2261 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2262 Lstream *istr, *ostr;
2263 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2265 get_buffer_range_char (buf, start, end, &b, &e, 0);
2267 barf_if_buffer_read_only (buf, b, e);
2269 coding_system = Fget_coding_system (coding_system);
2270 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2271 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2272 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2274 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2275 Fget_coding_system (Qbinary));
2276 istr = XLSTREAM (instream);
2277 ostr = XLSTREAM (outstream);
2278 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2280 /* The chain of streams looks like this:
2282 [BUFFER] <----- send through
2283 ------> [ENCODE AS BINARY]
2284 ------> [DECODE AS SPECIFIED]
2290 char tempbuf[1024]; /* some random amount */
2291 Bufpos newpos, even_newer_pos;
2292 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2293 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2297 newpos = lisp_buffer_stream_startpos (istr);
2298 Lstream_write (ostr, tempbuf, size_in_bytes);
2299 even_newer_pos = lisp_buffer_stream_startpos (istr);
2300 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2303 Lstream_close (istr);
2304 Lstream_close (ostr);
2306 Lstream_delete (istr);
2307 Lstream_delete (ostr);
2308 Lstream_delete (XLSTREAM (de_outstream));
2309 Lstream_delete (XLSTREAM (lb_outstream));
2314 /************************************************************************/
2315 /* Converting to an external encoding ("encoding") */
2316 /************************************************************************/
2318 /* An encoding stream is an output stream. When you create the
2319 stream, you specify the coding system that governs the encoding
2320 and another stream that the resulting encoded data is to be
2321 sent to, and then start sending data to it. */
2323 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2325 struct encoding_stream
2327 /* Coding system that governs the conversion. */
2328 Lisp_Coding_System *codesys;
2330 /* Stream that we read the encoded data from or
2331 write the decoded data to. */
2334 /* If we are reading, then we can return only a fixed amount of
2335 data, so if the conversion resulted in too much data, we store it
2336 here for retrieval the next time around. */
2337 unsigned_char_dynarr *runoff;
2339 /* FLAGS holds flags indicating the current state of the encoding.
2340 Some of these flags are dependent on the coding system. */
2343 /* CH holds a partially built-up character. Since we only deal
2344 with one- and two-byte characters at the moment, we only use
2345 this to store the first byte of a two-byte character. */
2348 /* Additional information used by the ISO2022 encoder. */
2351 /* CHARSET holds the character sets currently assigned to the G0
2352 through G3 registers. It is initialized from the array
2353 INITIAL_CHARSET in CODESYS. */
2354 Lisp_Object charset[4];
2356 /* Which registers are currently invoked into the left (GL) and
2357 right (GR) halves of the 8-bit encoding space? */
2358 int register_left, register_right;
2360 /* Whether we need to explicitly designate the charset in the
2361 G? register before using it. It is initialized from the
2362 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2363 unsigned char force_charset_on_output[4];
2365 /* Other state variables that need to be preserved across
2367 Lisp_Object current_charset;
2369 int current_char_boundary;
2372 /* Additional information (the state of the running CCL program)
2373 used by the CCL encoder. */
2374 struct ccl_program ccl;
2378 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2379 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2381 static int encoding_rewinder (Lstream *stream);
2382 static int encoding_seekable_p (Lstream *stream);
2383 static int encoding_flusher (Lstream *stream);
2384 static int encoding_closer (Lstream *stream);
2386 static Lisp_Object encoding_marker (Lisp_Object stream);
2388 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2389 sizeof (struct encoding_stream));
2392 encoding_marker (Lisp_Object stream)
2394 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2395 Lisp_Object str_obj;
2397 /* We do not need to mark the coding systems or charsets stored
2398 within the stream because they are stored in a global list
2399 and automatically marked. */
2401 XSETLSTREAM (str_obj, str);
2402 mark_object (str_obj);
2403 if (str->imp->marker)
2404 return (str->imp->marker) (str_obj);
2409 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2410 so we read data from the other end, encode it, and store it into DATA. */
2413 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2415 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2416 unsigned char *orig_data = data;
2418 int error_occurred = 0;
2420 /* We need to interface to mule_encode(), which expects to take some
2421 amount of data and store the result into a Dynarr. We have
2422 mule_encode() store into str->runoff, and take data from there
2425 /* We loop until we have enough data, reading chunks from the other
2426 end and encoding it. */
2429 /* Take data from the runoff if we can. Make sure to take at
2430 most SIZE bytes, and delete the data from the runoff. */
2431 if (Dynarr_length (str->runoff) > 0)
2433 int chunk = min ((int) size, Dynarr_length (str->runoff));
2434 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2435 Dynarr_delete_many (str->runoff, 0, chunk);
2441 break; /* No more room for data */
2443 if (str->flags & CODING_STATE_END)
2444 /* This means that on the previous iteration, we hit the EOF on
2445 the other end. We loop once more so that mule_encode() can
2446 output any final stuff it may be holding, or any "go back
2447 to a sane state" escape sequences. (This latter makes sense
2448 during encoding.) */
2451 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2452 left of storage in it, so it's OK to read directly into it.
2453 (We'll be overwriting above, after we've encoded it into the
2455 read_size = Lstream_read (str->other_end, data, size);
2462 /* There might be some more end data produced in the translation.
2463 See the comment above. */
2464 str->flags |= CODING_STATE_END;
2465 mule_encode (stream, data, str->runoff, read_size);
2468 if (data == orig_data)
2469 return error_occurred ? -1 : 0;
2471 return data - orig_data;
2475 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2477 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2480 /* Encode all our data into the runoff, and then attempt to write
2481 it all out to the other end. Remove whatever chunk we succeeded
2483 mule_encode (stream, data, str->runoff, size);
2484 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2485 Dynarr_length (str->runoff));
2487 Dynarr_delete_many (str->runoff, 0, retval);
2488 /* Do NOT return retval. The return value indicates how much
2489 of the incoming data was written, not how many bytes were
2495 reset_encoding_stream (struct encoding_stream *str)
2498 switch (CODING_SYSTEM_TYPE (str->codesys))
2500 case CODESYS_ISO2022:
2504 for (i = 0; i < 4; i++)
2506 str->iso2022.charset[i] =
2507 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2508 str->iso2022.force_charset_on_output[i] =
2509 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2511 str->iso2022.register_left = 0;
2512 str->iso2022.register_right = 1;
2513 str->iso2022.current_charset = Qnil;
2514 str->iso2022.current_half = 0;
2515 str->iso2022.current_char_boundary = 1;
2519 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2526 str->flags = str->ch = 0;
2530 encoding_rewinder (Lstream *stream)
2532 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2533 reset_encoding_stream (str);
2534 Dynarr_reset (str->runoff);
2535 return Lstream_rewind (str->other_end);
2539 encoding_seekable_p (Lstream *stream)
2541 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2542 return Lstream_seekable_p (str->other_end);
2546 encoding_flusher (Lstream *stream)
2548 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2549 return Lstream_flush (str->other_end);
2553 encoding_closer (Lstream *stream)
2555 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2556 if (stream->flags & LSTREAM_FL_WRITE)
2558 str->flags |= CODING_STATE_END;
2559 encoding_writer (stream, 0, 0);
2561 Dynarr_free (str->runoff);
2562 return Lstream_close (str->other_end);
2566 encoding_stream_coding_system (Lstream *stream)
2568 Lisp_Object coding_system;
2569 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2571 XSETCODING_SYSTEM (coding_system, str->codesys);
2572 return coding_system;
2576 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2578 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2579 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2581 reset_encoding_stream (str);
2585 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2588 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2589 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2593 str->runoff = Dynarr_new (unsigned_char);
2594 str->other_end = stream;
2595 set_encoding_stream_coding_system (lstr, codesys);
2596 XSETLSTREAM (obj, lstr);
2601 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2603 return make_encoding_stream_1 (stream, codesys, "r");
2607 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2609 return make_encoding_stream_1 (stream, codesys, "w");
2612 /* Convert N bytes of internally-formatted data stored in SRC to an
2613 external format, according to the encoding stream ENCODING.
2614 Store the encoded data into DST. */
2617 mule_encode (Lstream *encoding, CONST unsigned char *src,
2618 unsigned_char_dynarr *dst, unsigned int n)
2620 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2622 switch (CODING_SYSTEM_TYPE (str->codesys))
2625 case CODESYS_INTERNAL:
2626 Dynarr_add_many (dst, src, n);
2629 case CODESYS_AUTODETECT:
2630 /* If we got this far and still haven't decided on the coding
2631 system, then do no conversion. */
2632 case CODESYS_NO_CONVERSION:
2633 encode_coding_no_conversion (encoding, src, dst, n);
2636 case CODESYS_SHIFT_JIS:
2637 encode_coding_sjis (encoding, src, dst, n);
2640 encode_coding_big5 (encoding, src, dst, n);
2643 encode_coding_ucs4 (encoding, src, dst, n);
2646 encode_coding_utf8 (encoding, src, dst, n);
2649 str->ccl.last_block = str->flags & CODING_STATE_END;
2650 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2652 case CODESYS_ISO2022:
2653 encode_coding_iso2022 (encoding, src, dst, n);
2661 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2662 Encode the text between START and END using CODING-SYSTEM.
2663 This will, for example, convert Japanese characters into stuff such as
2664 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2665 text. BUFFER defaults to the current buffer if unspecified.
2667 (start, end, coding_system, buffer))
2670 struct buffer *buf = decode_buffer (buffer, 0);
2671 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2672 Lstream *istr, *ostr;
2673 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2675 get_buffer_range_char (buf, start, end, &b, &e, 0);
2677 barf_if_buffer_read_only (buf, b, e);
2679 coding_system = Fget_coding_system (coding_system);
2680 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2681 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2682 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2683 Fget_coding_system (Qbinary));
2684 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2686 istr = XLSTREAM (instream);
2687 ostr = XLSTREAM (outstream);
2688 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2689 /* The chain of streams looks like this:
2691 [BUFFER] <----- send through
2692 ------> [ENCODE AS SPECIFIED]
2693 ------> [DECODE AS BINARY]
2698 char tempbuf[1024]; /* some random amount */
2699 Bufpos newpos, even_newer_pos;
2700 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2701 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2705 newpos = lisp_buffer_stream_startpos (istr);
2706 Lstream_write (ostr, tempbuf, size_in_bytes);
2707 even_newer_pos = lisp_buffer_stream_startpos (istr);
2708 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2714 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2715 Lstream_close (istr);
2716 Lstream_close (ostr);
2718 Lstream_delete (istr);
2719 Lstream_delete (ostr);
2720 Lstream_delete (XLSTREAM (de_outstream));
2721 Lstream_delete (XLSTREAM (lb_outstream));
2722 return make_int (retlen);
2728 /************************************************************************/
2729 /* Shift-JIS methods */
2730 /************************************************************************/
2732 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2733 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2734 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2735 encoded by "position-code + 0x80". A character of JISX0208
2736 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2737 position-codes are divided and shifted so that it fit in the range
2740 --- CODE RANGE of Shift-JIS ---
2741 (character set) (range)
2743 JISX0201-Kana 0xA0 .. 0xDF
2744 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2745 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2746 -------------------------------
2750 /* Is this the first byte of a Shift-JIS two-byte char? */
2752 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2753 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2755 /* Is this the second byte of a Shift-JIS two-byte char? */
2757 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2758 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2760 #define BYTE_SJIS_KATAKANA_P(c) \
2761 ((c) >= 0xA1 && (c) <= 0xDF)
2764 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2772 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2774 if (st->shift_jis.in_second_byte)
2776 st->shift_jis.in_second_byte = 0;
2780 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2781 st->shift_jis.in_second_byte = 1;
2783 return CODING_CATEGORY_SHIFT_JIS_MASK;
2786 /* Convert Shift-JIS data to internal format. */
2789 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2790 unsigned_char_dynarr *dst, unsigned int n)
2793 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2794 unsigned int flags = str->flags;
2795 unsigned int ch = str->ch;
2796 eol_type_t eol_type = str->eol_type;
2804 /* Previous character was first byte of Shift-JIS Kanji char. */
2805 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2807 unsigned char e1, e2;
2809 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2810 DECODE_SJIS (ch, c, e1, e2);
2811 Dynarr_add (dst, e1);
2812 Dynarr_add (dst, e2);
2816 DECODE_ADD_BINARY_CHAR (ch, dst);
2817 DECODE_ADD_BINARY_CHAR (c, dst);
2823 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2824 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2826 else if (BYTE_SJIS_KATAKANA_P (c))
2828 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2829 Dynarr_add (dst, c);
2832 DECODE_ADD_BINARY_CHAR (c, dst);
2834 label_continue_loop:;
2837 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2843 /* Convert internally-formatted data to Shift-JIS. */
2846 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2847 unsigned_char_dynarr *dst, unsigned int n)
2850 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2851 unsigned int flags = str->flags;
2852 unsigned int ch = str->ch;
2853 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2860 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2861 Dynarr_add (dst, '\r');
2862 if (eol_type != EOL_CR)
2863 Dynarr_add (dst, '\n');
2866 else if (BYTE_ASCII_P (c))
2868 Dynarr_add (dst, c);
2871 else if (BUFBYTE_LEADING_BYTE_P (c))
2872 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2873 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2874 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2877 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2879 Dynarr_add (dst, c);
2882 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2883 ch == LEADING_BYTE_JAPANESE_JISX0208)
2887 unsigned char j1, j2;
2888 ENCODE_SJIS (ch, c, j1, j2);
2889 Dynarr_add (dst, j1);
2890 Dynarr_add (dst, j2);
2900 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2901 Decode a JISX0208 character of Shift-JIS coding-system.
2902 CODE is the character code in Shift-JIS as a cons of type bytes.
2903 Return the corresponding character.
2907 unsigned char c1, c2, s1, s2;
2910 CHECK_INT (XCAR (code));
2911 CHECK_INT (XCDR (code));
2912 s1 = XINT (XCAR (code));
2913 s2 = XINT (XCDR (code));
2914 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2915 BYTE_SJIS_TWO_BYTE_2_P (s2))
2917 DECODE_SJIS (s1, s2, c1, c2);
2918 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2919 c1 & 0x7F, c2 & 0x7F));
2925 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2926 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2927 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2931 Lisp_Object charset;
2934 CHECK_CHAR_COERCE_INT (ch);
2935 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2936 if (EQ (charset, Vcharset_japanese_jisx0208))
2938 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2939 return Fcons (make_int (s1), make_int (s2));
2946 /************************************************************************/
2948 /************************************************************************/
2950 /* BIG5 is a coding system encoding two character sets: ASCII and
2951 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2952 character set and is encoded in two-byte.
2954 --- CODE RANGE of BIG5 ---
2955 (character set) (range)
2957 Big5 (1st byte) 0xA1 .. 0xFE
2958 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2959 --------------------------
2961 Since the number of characters in Big5 is larger than maximum
2962 characters in Emacs' charset (96x96), it can't be handled as one
2963 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2964 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2965 contains frequently used characters and the latter contains less
2966 frequently used characters. */
2968 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2969 ((c) >= 0xA1 && (c) <= 0xFE)
2971 /* Is this the second byte of a Shift-JIS two-byte char? */
2973 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2974 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2976 /* Number of Big5 characters which have the same code in 1st byte. */
2978 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2980 /* Code conversion macros. These are macros because they are used in
2981 inner loops during code conversion.
2983 Note that temporary variables in macros introduce the classic
2984 dynamic-scoping problems with variable names. We use capital-
2985 lettered variables in the assumption that XEmacs does not use
2986 capital letters in variables except in a very formalized way
2989 /* Convert Big5 code (b1, b2) into its internal string representation
2992 /* There is a much simpler way to split the Big5 charset into two.
2993 For the moment I'm going to leave the algorithm as-is because it
2994 claims to separate out the most-used characters into a single
2995 charset, which perhaps will lead to optimizations in various
2998 The way the algorithm works is something like this:
3000 Big5 can be viewed as a 94x157 charset, where the row is
3001 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3002 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3003 the split between low and high column numbers is apparently
3004 meaningless; ascending rows produce less and less frequent chars.
3005 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3006 the first charset, and the upper half (0xC9 .. 0xFE) to the
3007 second. To do the conversion, we convert the character into
3008 a single number where 0 .. 156 is the first row, 157 .. 313
3009 is the second, etc. That way, the characters are ordered by
3010 decreasing frequency. Then we just chop the space in two
3011 and coerce the result into a 94x94 space.
3014 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3016 int B1 = b1, B2 = b2; \
3018 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3022 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3026 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3027 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3029 c1 = I / (0xFF - 0xA1) + 0xA1; \
3030 c2 = I % (0xFF - 0xA1) + 0xA1; \
3033 /* Convert the internal string representation of a Big5 character
3034 (lb, c1, c2) into Big5 code (b1, b2). */
3036 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3038 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3040 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3042 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3044 b1 = I / BIG5_SAME_ROW + 0xA1; \
3045 b2 = I % BIG5_SAME_ROW; \
3046 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3050 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3058 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3059 (c >= 0x80 && c <= 0xA0))
3061 if (st->big5.in_second_byte)
3063 st->big5.in_second_byte = 0;
3064 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3068 st->big5.in_second_byte = 1;
3070 return CODING_CATEGORY_BIG5_MASK;
3073 /* Convert Big5 data to internal format. */
3076 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3077 unsigned_char_dynarr *dst, unsigned int n)
3080 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3081 unsigned int flags = str->flags;
3082 unsigned int ch = str->ch;
3083 eol_type_t eol_type = str->eol_type;
3090 /* Previous character was first byte of Big5 char. */
3091 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3093 unsigned char b1, b2, b3;
3094 DECODE_BIG5 (ch, c, b1, b2, b3);
3095 Dynarr_add (dst, b1);
3096 Dynarr_add (dst, b2);
3097 Dynarr_add (dst, b3);
3101 DECODE_ADD_BINARY_CHAR (ch, dst);
3102 DECODE_ADD_BINARY_CHAR (c, dst);
3108 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3109 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3112 DECODE_ADD_BINARY_CHAR (c, dst);
3114 label_continue_loop:;
3117 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3123 /* Convert internally-formatted data to Big5. */
3126 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3127 unsigned_char_dynarr *dst, unsigned int n)
3130 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3131 unsigned int flags = str->flags;
3132 unsigned int ch = str->ch;
3133 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3140 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3141 Dynarr_add (dst, '\r');
3142 if (eol_type != EOL_CR)
3143 Dynarr_add (dst, '\n');
3145 else if (BYTE_ASCII_P (c))
3148 Dynarr_add (dst, c);
3150 else if (BUFBYTE_LEADING_BYTE_P (c))
3152 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3153 c == LEADING_BYTE_CHINESE_BIG5_2)
3155 /* A recognized leading byte. */
3157 continue; /* not done with this character. */
3159 /* otherwise just ignore this character. */
3161 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3162 ch == LEADING_BYTE_CHINESE_BIG5_2)
3164 /* Previous char was a recognized leading byte. */
3166 continue; /* not done with this character. */
3170 /* Encountering second byte of a Big5 character. */
3171 unsigned char b1, b2;
3173 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3174 Dynarr_add (dst, b1);
3175 Dynarr_add (dst, b2);
3186 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3187 Decode a Big5 character CODE of BIG5 coding-system.
3188 CODE is the character code in BIG5, a cons of two integers.
3189 Return the corresponding character.
3193 unsigned char c1, c2, b1, b2;
3196 CHECK_INT (XCAR (code));
3197 CHECK_INT (XCDR (code));
3198 b1 = XINT (XCAR (code));
3199 b2 = XINT (XCDR (code));
3200 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3201 BYTE_BIG5_TWO_BYTE_2_P (b2))
3204 Lisp_Object charset;
3205 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3206 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3207 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3213 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3214 Encode the Big5 character CH to BIG5 coding-system.
3215 Return the corresponding character code in Big5.
3219 Lisp_Object charset;
3222 CHECK_CHAR_COERCE_INT (ch);
3223 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3224 if (EQ (charset, Vcharset_chinese_big5_1) ||
3225 EQ (charset, Vcharset_chinese_big5_2))
3227 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3229 return Fcons (make_int (b1), make_int (b2));
3236 /************************************************************************/
3239 /* UCS-4 character codes are implemented as nonnegative integers. */
3241 /************************************************************************/
3244 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3245 Map UCS-4 code CODE to Mule character CHARACTER.
3247 Return T on success, NIL on failure.
3253 CHECK_CHAR (character);
3257 if (c < sizeof (fcd->ucs_to_mule_table))
3259 fcd->ucs_to_mule_table[c] = character;
3267 ucs_to_char (unsigned long code)
3269 if (code < sizeof (fcd->ucs_to_mule_table))
3271 return fcd->ucs_to_mule_table[code];
3273 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3278 c = code % (94 * 94);
3280 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3281 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3282 CHARSET_LEFT_TO_RIGHT),
3283 c / 94 + 33, c % 94 + 33));
3289 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3290 Return Mule character corresponding to UCS code CODE (a positive integer).
3294 CHECK_NATNUM (code);
3295 return ucs_to_char (XINT (code));
3298 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3299 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3303 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3304 Fset_char_ucs is more restrictive on index arg, but should
3305 check code arg in a char_table method. */
3306 CHECK_CHAR (character);
3307 CHECK_NATNUM (code);
3308 return Fput_char_table (character, code, mule_to_ucs_table);
3311 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3312 Return the UCS code (a positive integer) corresponding to CHARACTER.
3316 return Fget_char_table (character, mule_to_ucs_table);
3319 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3320 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3321 is not found, instead.
3322 #### do something more appropriate (use blob?)
3323 Danger, Will Robinson! Data loss. Should we signal user? */
3325 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3327 Lisp_Object chr = ucs_to_char (ch);
3331 Bufbyte work[MAX_EMCHAR_LEN];
3336 simple_set_charptr_emchar (work, ch) :
3337 non_ascii_set_charptr_emchar (work, ch);
3338 Dynarr_add_many (dst, work, len);
3342 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3343 Dynarr_add (dst, 34 + 128);
3344 Dynarr_add (dst, 46 + 128);
3348 static unsigned long
3349 mule_char_to_ucs4 (Lisp_Object charset,
3350 unsigned char h, unsigned char l)
3353 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3360 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3361 (XCHARSET_CHARS (charset) == 94) )
3363 unsigned char final = XCHARSET_FINAL (charset);
3365 if ( ('@' <= final) && (final < 0x7f) )
3367 return 0xe00000 + (final - '@') * 94 * 94
3368 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3382 encode_ucs4 (Lisp_Object charset,
3383 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3385 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3386 Dynarr_add (dst, code >> 24);
3387 Dynarr_add (dst, (code >> 16) & 255);
3388 Dynarr_add (dst, (code >> 8) & 255);
3389 Dynarr_add (dst, code & 255);
3393 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3399 switch (st->ucs4.in_byte)
3408 st->ucs4.in_byte = 0;
3414 return CODING_CATEGORY_UCS4_MASK;
3418 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3419 unsigned_char_dynarr *dst, unsigned int n)
3421 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3422 unsigned int flags = str->flags;
3423 unsigned int ch = str->ch;
3424 unsigned char counter = str->counter;
3428 unsigned char c = *src++;
3436 decode_ucs4 ( ( ch << 8 ) | c, dst);
3441 ch = ( ch << 8 ) | c;
3445 if (counter & CODING_STATE_END)
3446 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3450 str->counter = counter;
3454 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3455 unsigned_char_dynarr *dst, unsigned int n)
3457 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3458 unsigned int flags = str->flags;
3459 unsigned int ch = str->ch;
3460 unsigned char char_boundary = str->iso2022.current_char_boundary;
3461 Lisp_Object charset = str->iso2022.current_charset;
3463 #ifdef ENABLE_COMPOSITE_CHARS
3464 /* flags for handling composite chars. We do a little switcharoo
3465 on the source while we're outputting the composite char. */
3466 unsigned int saved_n = 0;
3467 CONST unsigned char *saved_src = NULL;
3468 int in_composite = 0;
3475 unsigned char c = *src++;
3477 if (BYTE_ASCII_P (c))
3478 { /* Processing ASCII character */
3480 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3483 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3484 { /* Processing Leading Byte */
3486 charset = CHARSET_BY_LEADING_BYTE (c);
3487 if (LEADING_BYTE_PREFIX_P(c))
3492 { /* Processing Non-ASCII character */
3494 if (EQ (charset, Vcharset_control_1))
3496 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3500 switch (XCHARSET_REP_BYTES (charset))
3503 encode_ucs4 (charset, c, 0, dst);
3506 if (XCHARSET_PRIVATE_P (charset))
3508 encode_ucs4 (charset, c, 0, dst);
3513 #ifdef ENABLE_COMPOSITE_CHARS
3514 if (EQ (charset, Vcharset_composite))
3518 /* #### Bother! We don't know how to
3520 Dynarr_add (dst, 0);
3521 Dynarr_add (dst, 0);
3522 Dynarr_add (dst, 0);
3523 Dynarr_add (dst, '~');
3527 Emchar emch = MAKE_CHAR (Vcharset_composite,
3528 ch & 0x7F, c & 0x7F);
3529 Lisp_Object lstr = composite_char_string (emch);
3533 src = XSTRING_DATA (lstr);
3534 n = XSTRING_LENGTH (lstr);
3538 #endif /* ENABLE_COMPOSITE_CHARS */
3540 encode_ucs4(charset, ch, c, dst);
3553 encode_ucs4 (charset, ch, c, dst);
3569 #ifdef ENABLE_COMPOSITE_CHARS
3575 goto back_to_square_n; /* Wheeeeeeeee ..... */
3577 #endif /* ENABLE_COMPOSITE_CHARS */
3581 str->iso2022.current_char_boundary = char_boundary;
3582 str->iso2022.current_charset = charset;
3584 /* Verbum caro factum est! */
3588 /************************************************************************/
3590 /************************************************************************/
3593 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3598 unsigned char c = *src++;
3599 switch (st->utf8.in_byte)
3602 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3605 st->utf8.in_byte = 5;
3607 st->utf8.in_byte = 4;
3609 st->utf8.in_byte = 3;
3611 st->utf8.in_byte = 2;
3613 st->utf8.in_byte = 1;
3618 if ((c & 0xc0) != 0x80)
3624 return CODING_CATEGORY_UTF8_MASK;
3628 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3629 unsigned_char_dynarr *dst, unsigned int n)
3631 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3632 unsigned int flags = str->flags;
3633 unsigned int ch = str->ch;
3634 eol_type_t eol_type = str->eol_type;
3635 unsigned char counter = str->counter;
3639 unsigned char c = *src++;
3648 else if ( c >= 0xf8 )
3653 else if ( c >= 0xf0 )
3658 else if ( c >= 0xe0 )
3663 else if ( c >= 0xc0 )
3670 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3671 decode_ucs4 (c, dst);
3675 ch = ( ch << 6 ) | ( c & 0x3f );
3676 decode_ucs4 (ch, dst);
3681 ch = ( ch << 6 ) | ( c & 0x3f );
3684 label_continue_loop:;
3687 if (flags & CODING_STATE_END)
3688 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3692 str->counter = counter;
3696 encode_utf8 (Lisp_Object charset,
3697 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3699 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3702 Dynarr_add (dst, code);
3704 else if ( code <= 0x7ff )
3706 Dynarr_add (dst, (code >> 6) | 0xc0);
3707 Dynarr_add (dst, (code & 0x3f) | 0x80);
3709 else if ( code <= 0xffff )
3711 Dynarr_add (dst, (code >> 12) | 0xe0);
3712 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3713 Dynarr_add (dst, (code & 0x3f) | 0x80);
3715 else if ( code <= 0x1fffff )
3717 Dynarr_add (dst, (code >> 18) | 0xf0);
3718 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3719 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3720 Dynarr_add (dst, (code & 0x3f) | 0x80);
3722 else if ( code <= 0x3ffffff )
3724 Dynarr_add (dst, (code >> 24) | 0xf8);
3725 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3726 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3727 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3728 Dynarr_add (dst, (code & 0x3f) | 0x80);
3732 Dynarr_add (dst, (code >> 30) | 0xfc);
3733 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3734 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3735 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3736 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3737 Dynarr_add (dst, (code & 0x3f) | 0x80);
3742 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3743 unsigned_char_dynarr *dst, unsigned int n)
3745 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3746 unsigned int flags = str->flags;
3747 unsigned int ch = str->ch;
3748 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3749 unsigned char char_boundary = str->iso2022.current_char_boundary;
3750 Lisp_Object charset = str->iso2022.current_charset;
3752 #ifdef ENABLE_COMPOSITE_CHARS
3753 /* flags for handling composite chars. We do a little switcharoo
3754 on the source while we're outputting the composite char. */
3755 unsigned int saved_n = 0;
3756 CONST unsigned char *saved_src = NULL;
3757 int in_composite = 0;
3760 #endif /* ENABLE_COMPOSITE_CHARS */
3764 unsigned char c = *src++;
3766 if (BYTE_ASCII_P (c))
3767 { /* Processing ASCII character */
3771 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3772 Dynarr_add (dst, '\r');
3773 if (eol_type != EOL_CR)
3774 Dynarr_add (dst, c);
3777 encode_utf8 (Vcharset_ascii, c, 0, dst);
3780 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3781 { /* Processing Leading Byte */
3783 charset = CHARSET_BY_LEADING_BYTE (c);
3784 if (LEADING_BYTE_PREFIX_P(c))
3789 { /* Processing Non-ASCII character */
3791 if (EQ (charset, Vcharset_control_1))
3793 encode_utf8 (Vcharset_control_1, c, 0, dst);
3797 switch (XCHARSET_REP_BYTES (charset))
3800 encode_utf8 (charset, c, 0, dst);
3803 if (XCHARSET_PRIVATE_P (charset))
3805 encode_utf8 (charset, c, 0, dst);
3810 #ifdef ENABLE_COMPOSITE_CHARS
3811 if (EQ (charset, Vcharset_composite))
3815 /* #### Bother! We don't know how to
3817 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3821 Emchar emch = MAKE_CHAR (Vcharset_composite,
3822 ch & 0x7F, c & 0x7F);
3823 Lisp_Object lstr = composite_char_string (emch);
3827 src = XSTRING_DATA (lstr);
3828 n = XSTRING_LENGTH (lstr);
3832 #endif /* ENABLE_COMPOSITE_CHARS */
3834 encode_utf8 (charset, ch, c, dst);
3847 encode_utf8 (charset, ch, c, dst);
3863 #ifdef ENABLE_COMPOSITE_CHARS
3869 goto back_to_square_n; /* Wheeeeeeeee ..... */
3875 str->iso2022.current_char_boundary = char_boundary;
3876 str->iso2022.current_charset = charset;
3878 /* Verbum caro factum est! */
3882 /************************************************************************/
3883 /* ISO2022 methods */
3884 /************************************************************************/
3886 /* The following note describes the coding system ISO2022 briefly.
3887 Since the intention of this note is to help understand the
3888 functions in this file, some parts are NOT ACCURATE or OVERLY
3889 SIMPLIFIED. For thorough understanding, please refer to the
3890 original document of ISO2022.
3892 ISO2022 provides many mechanisms to encode several character sets
3893 in 7-bit and 8-bit environments. For 7-bit environments, all text
3894 is encoded using bytes less than 128. This may make the encoded
3895 text a little bit longer, but the text passes more easily through
3896 several gateways, some of which strip off MSB (Most Signigant Bit).
3898 There are two kinds of character sets: control character set and
3899 graphic character set. The former contains control characters such
3900 as `newline' and `escape' to provide control functions (control
3901 functions are also provided by escape sequences). The latter
3902 contains graphic characters such as 'A' and '-'. Emacs recognizes
3903 two control character sets and many graphic character sets.
3905 Graphic character sets are classified into one of the following
3906 four classes, according to the number of bytes (DIMENSION) and
3907 number of characters in one dimension (CHARS) of the set:
3908 - DIMENSION1_CHARS94
3909 - DIMENSION1_CHARS96
3910 - DIMENSION2_CHARS94
3911 - DIMENSION2_CHARS96
3913 In addition, each character set is assigned an identification tag,
3914 unique for each set, called "final character" (denoted as <F>
3915 hereafter). The <F> of each character set is decided by ECMA(*)
3916 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3917 (0x30..0x3F are for private use only).
3919 Note (*): ECMA = European Computer Manufacturers Association
3921 Here are examples of graphic character set [NAME(<F>)]:
3922 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3923 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3924 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3925 o DIMENSION2_CHARS96 -- none for the moment
3927 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3928 C0 [0x00..0x1F] -- control character plane 0
3929 GL [0x20..0x7F] -- graphic character plane 0
3930 C1 [0x80..0x9F] -- control character plane 1
3931 GR [0xA0..0xFF] -- graphic character plane 1
3933 A control character set is directly designated and invoked to C0 or
3934 C1 by an escape sequence. The most common case is that:
3935 - ISO646's control character set is designated/invoked to C0, and
3936 - ISO6429's control character set is designated/invoked to C1,
3937 and usually these designations/invocations are omitted in encoded
3938 text. In a 7-bit environment, only C0 can be used, and a control
3939 character for C1 is encoded by an appropriate escape sequence to
3940 fit into the environment. All control characters for C1 are
3941 defined to have corresponding escape sequences.
3943 A graphic character set is at first designated to one of four
3944 graphic registers (G0 through G3), then these graphic registers are
3945 invoked to GL or GR. These designations and invocations can be
3946 done independently. The most common case is that G0 is invoked to
3947 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3948 these invocations and designations are omitted in encoded text.
3949 In a 7-bit environment, only GL can be used.
3951 When a graphic character set of CHARS94 is invoked to GL, codes
3952 0x20 and 0x7F of the GL area work as control characters SPACE and
3953 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3956 There are two ways of invocation: locking-shift and single-shift.
3957 With locking-shift, the invocation lasts until the next different
3958 invocation, whereas with single-shift, the invocation affects the
3959 following character only and doesn't affect the locking-shift
3960 state. Invocations are done by the following control characters or
3963 ----------------------------------------------------------------------
3964 abbrev function cntrl escape seq description
3965 ----------------------------------------------------------------------
3966 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3967 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3968 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3969 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3970 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3971 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3972 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3973 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3974 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3975 ----------------------------------------------------------------------
3976 (*) These are not used by any known coding system.
3978 Control characters for these functions are defined by macros
3979 ISO_CODE_XXX in `coding.h'.
3981 Designations are done by the following escape sequences:
3982 ----------------------------------------------------------------------
3983 escape sequence description
3984 ----------------------------------------------------------------------
3985 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3986 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3987 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3988 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3989 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3990 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3991 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3992 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3993 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3994 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3995 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3996 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3997 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3998 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3999 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4000 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4001 ----------------------------------------------------------------------
4003 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4004 of dimension 1, chars 94, and final character <F>, etc...
4006 Note (*): Although these designations are not allowed in ISO2022,
4007 Emacs accepts them on decoding, and produces them on encoding
4008 CHARS96 character sets in a coding system which is characterized as
4009 7-bit environment, non-locking-shift, and non-single-shift.
4011 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4012 '(' can be omitted. We refer to this as "short-form" hereafter.
4014 Now you may notice that there are a lot of ways for encoding the
4015 same multilingual text in ISO2022. Actually, there exist many
4016 coding systems such as Compound Text (used in X11's inter client
4017 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4018 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4019 localized platforms), and all of these are variants of ISO2022.
4021 In addition to the above, Emacs handles two more kinds of escape
4022 sequences: ISO6429's direction specification and Emacs' private
4023 sequence for specifying character composition.
4025 ISO6429's direction specification takes the following form:
4026 o CSI ']' -- end of the current direction
4027 o CSI '0' ']' -- end of the current direction
4028 o CSI '1' ']' -- start of left-to-right text
4029 o CSI '2' ']' -- start of right-to-left text
4030 The control character CSI (0x9B: control sequence introducer) is
4031 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4033 Character composition specification takes the following form:
4034 o ESC '0' -- start character composition
4035 o ESC '1' -- end character composition
4036 Since these are not standard escape sequences of any ISO standard,
4037 their use with these meanings is restricted to Emacs only. */
4040 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4044 for (i = 0; i < 4; i++)
4046 if (!NILP (coding_system))
4048 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4050 iso->charset[i] = Qt;
4051 iso->invalid_designated[i] = 0;
4053 iso->esc = ISO_ESC_NOTHING;
4054 iso->esc_bytes_index = 0;
4055 iso->register_left = 0;
4056 iso->register_right = 1;
4057 iso->switched_dir_and_no_valid_charset_yet = 0;
4058 iso->invalid_switch_dir = 0;
4059 iso->output_direction_sequence = 0;
4060 iso->output_literally = 0;
4061 #ifdef ENABLE_COMPOSITE_CHARS
4062 if (iso->composite_chars)
4063 Dynarr_reset (iso->composite_chars);
4068 fit_to_be_escape_quoted (unsigned char c)
4085 /* Parse one byte of an ISO2022 escape sequence.
4086 If the result is an invalid escape sequence, return 0 and
4087 do not change anything in STR. Otherwise, if the result is
4088 an incomplete escape sequence, update ISO2022.ESC and
4089 ISO2022.ESC_BYTES and return -1. Otherwise, update
4090 all the state variables (but not ISO2022.ESC_BYTES) and
4093 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4094 or invocation of an invalid character set and treat that as
4095 an unrecognized escape sequence. */
4098 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4099 unsigned char c, unsigned int *flags,
4100 int check_invalid_charsets)
4102 /* (1) If we're at the end of a designation sequence, CS is the
4103 charset being designated and REG is the register to designate
4106 (2) If we're at the end of a locking-shift sequence, REG is
4107 the register to invoke and HALF (0 == left, 1 == right) is
4108 the half to invoke it into.
4110 (3) If we're at the end of a single-shift sequence, REG is
4111 the register to invoke. */
4112 Lisp_Object cs = Qnil;
4115 /* NOTE: This code does goto's all over the fucking place.
4116 The reason for this is that we're basically implementing
4117 a state machine here, and hierarchical languages like C
4118 don't really provide a clean way of doing this. */
4120 if (! (*flags & CODING_STATE_ESCAPE))
4121 /* At beginning of escape sequence; we need to reset our
4122 escape-state variables. */
4123 iso->esc = ISO_ESC_NOTHING;
4125 iso->output_literally = 0;
4126 iso->output_direction_sequence = 0;
4130 case ISO_ESC_NOTHING:
4131 iso->esc_bytes_index = 0;
4134 case ISO_CODE_ESC: /* Start escape sequence */
4135 *flags |= CODING_STATE_ESCAPE;
4139 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4140 *flags |= CODING_STATE_ESCAPE;
4141 iso->esc = ISO_ESC_5_11;
4144 case ISO_CODE_SO: /* locking shift 1 */
4147 case ISO_CODE_SI: /* locking shift 0 */
4151 case ISO_CODE_SS2: /* single shift */
4154 case ISO_CODE_SS3: /* single shift */
4158 default: /* Other control characters */
4165 /**** single shift ****/
4167 case 'N': /* single shift 2 */
4170 case 'O': /* single shift 3 */
4174 /**** locking shift ****/
4176 case '~': /* locking shift 1 right */
4179 case 'n': /* locking shift 2 */
4182 case '}': /* locking shift 2 right */
4185 case 'o': /* locking shift 3 */
4188 case '|': /* locking shift 3 right */
4192 #ifdef ENABLE_COMPOSITE_CHARS
4193 /**** composite ****/
4196 iso->esc = ISO_ESC_START_COMPOSITE;
4197 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4198 CODING_STATE_COMPOSITE;
4202 iso->esc = ISO_ESC_END_COMPOSITE;
4203 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4204 ~CODING_STATE_COMPOSITE;
4206 #endif /* ENABLE_COMPOSITE_CHARS */
4208 /**** directionality ****/
4211 iso->esc = ISO_ESC_5_11;
4214 /**** designation ****/
4216 case '$': /* multibyte charset prefix */
4217 iso->esc = ISO_ESC_2_4;
4221 if (0x28 <= c && c <= 0x2F)
4223 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4227 /* This function is called with CODESYS equal to nil when
4228 doing coding-system detection. */
4230 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4231 && fit_to_be_escape_quoted (c))
4233 iso->esc = ISO_ESC_LITERAL;
4234 *flags &= CODING_STATE_ISO2022_LOCK;
4244 /**** directionality ****/
4246 case ISO_ESC_5_11: /* ISO6429 direction control */
4249 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4250 goto directionality;
4252 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4253 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4254 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4258 case ISO_ESC_5_11_0:
4261 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4262 goto directionality;
4266 case ISO_ESC_5_11_1:
4269 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4270 goto directionality;
4274 case ISO_ESC_5_11_2:
4277 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4278 goto directionality;
4283 iso->esc = ISO_ESC_DIRECTIONALITY;
4284 /* Various junk here to attempt to preserve the direction sequences
4285 literally in the text if they would otherwise be swallowed due
4286 to invalid designations that don't show up as actual charset
4287 changes in the text. */
4288 if (iso->invalid_switch_dir)
4290 /* We already inserted a direction switch literally into the
4291 text. We assume (#### this may not be right) that the
4292 next direction switch is the one going the other way,
4293 and we need to output that literally as well. */
4294 iso->output_literally = 1;
4295 iso->invalid_switch_dir = 0;
4301 /* If we are in the thrall of an invalid designation,
4302 then stick the directionality sequence literally into the
4303 output stream so it ends up in the original text again. */
4304 for (jj = 0; jj < 4; jj++)
4305 if (iso->invalid_designated[jj])
4309 iso->output_literally = 1;
4310 iso->invalid_switch_dir = 1;
4313 /* Indicate that we haven't yet seen a valid designation,
4314 so that if a switch-dir is directly followed by an
4315 invalid designation, both get inserted literally. */
4316 iso->switched_dir_and_no_valid_charset_yet = 1;
4321 /**** designation ****/
4324 if (0x28 <= c && c <= 0x2F)
4326 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4329 if (0x40 <= c && c <= 0x42)
4331 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4332 *flags & CODING_STATE_R2L ?
4333 CHARSET_RIGHT_TO_LEFT :
4334 CHARSET_LEFT_TO_RIGHT);
4344 if (c < '0' || c > '~')
4345 return 0; /* bad final byte */
4347 if (iso->esc >= ISO_ESC_2_8 &&
4348 iso->esc <= ISO_ESC_2_15)
4350 type = ((iso->esc >= ISO_ESC_2_12) ?
4351 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4352 reg = (iso->esc - ISO_ESC_2_8) & 3;
4354 else if (iso->esc >= ISO_ESC_2_4_8 &&
4355 iso->esc <= ISO_ESC_2_4_15)
4357 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4358 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4359 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4363 /* Can this ever be reached? -slb */
4367 cs = CHARSET_BY_ATTRIBUTES (type, c,
4368 *flags & CODING_STATE_R2L ?
4369 CHARSET_RIGHT_TO_LEFT :
4370 CHARSET_LEFT_TO_RIGHT);
4376 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4380 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4381 /* can't invoke something that ain't there. */
4383 iso->esc = ISO_ESC_SINGLE_SHIFT;
4384 *flags &= CODING_STATE_ISO2022_LOCK;
4386 *flags |= CODING_STATE_SS2;
4388 *flags |= CODING_STATE_SS3;
4392 if (check_invalid_charsets &&
4393 !CHARSETP (iso->charset[reg]))
4394 /* can't invoke something that ain't there. */
4397 iso->register_right = reg;
4399 iso->register_left = reg;
4400 *flags &= CODING_STATE_ISO2022_LOCK;
4401 iso->esc = ISO_ESC_LOCKING_SHIFT;
4405 if (NILP (cs) && check_invalid_charsets)
4407 iso->invalid_designated[reg] = 1;
4408 iso->charset[reg] = Vcharset_ascii;
4409 iso->esc = ISO_ESC_DESIGNATE;
4410 *flags &= CODING_STATE_ISO2022_LOCK;
4411 iso->output_literally = 1;
4412 if (iso->switched_dir_and_no_valid_charset_yet)
4414 /* We encountered a switch-direction followed by an
4415 invalid designation. Ensure that the switch-direction
4416 gets outputted; otherwise it will probably get eaten
4417 when the text is written out again. */
4418 iso->switched_dir_and_no_valid_charset_yet = 0;
4419 iso->output_direction_sequence = 1;
4420 /* And make sure that the switch-dir going the other
4421 way gets outputted, as well. */
4422 iso->invalid_switch_dir = 1;
4426 /* This function is called with CODESYS equal to nil when
4427 doing coding-system detection. */
4428 if (!NILP (codesys))
4430 charset_conversion_spec_dynarr *dyn =
4431 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4437 for (i = 0; i < Dynarr_length (dyn); i++)
4439 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4440 if (EQ (cs, spec->from_charset))
4441 cs = spec->to_charset;
4446 iso->charset[reg] = cs;
4447 iso->esc = ISO_ESC_DESIGNATE;
4448 *flags &= CODING_STATE_ISO2022_LOCK;
4449 if (iso->invalid_designated[reg])
4451 iso->invalid_designated[reg] = 0;
4452 iso->output_literally = 1;
4454 if (iso->switched_dir_and_no_valid_charset_yet)
4455 iso->switched_dir_and_no_valid_charset_yet = 0;
4460 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4465 /* #### There are serious deficiencies in the recognition mechanism
4466 here. This needs to be much smarter if it's going to cut it.
4467 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4468 it should be detected as Latin-1.
4469 All the ISO2022 stuff in this file should be synced up with the
4470 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4471 Perhaps we should wait till R2L works in FSF Emacs? */
4473 if (!st->iso2022.initted)
4475 reset_iso2022 (Qnil, &st->iso2022.iso);
4476 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4477 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4478 CODING_CATEGORY_ISO_8_1_MASK |
4479 CODING_CATEGORY_ISO_8_2_MASK |
4480 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4481 st->iso2022.flags = 0;
4482 st->iso2022.high_byte_count = 0;
4483 st->iso2022.saw_single_shift = 0;
4484 st->iso2022.initted = 1;
4487 mask = st->iso2022.mask;
4494 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4495 st->iso2022.high_byte_count++;
4499 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4501 if (st->iso2022.high_byte_count & 1)
4502 /* odd number of high bytes; assume not iso-8-2 */
4503 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4505 st->iso2022.high_byte_count = 0;
4506 st->iso2022.saw_single_shift = 0;
4508 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4510 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4511 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4512 { /* control chars */
4515 /* Allow and ignore control characters that you might
4516 reasonably see in a text file */
4521 case 8: /* backspace */
4522 case 11: /* vertical tab */
4523 case 12: /* form feed */
4524 case 26: /* MS-DOS C-z junk */
4525 case 31: /* '^_' -- for info */
4526 goto label_continue_loop;
4533 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4536 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4537 &st->iso2022.flags, 0))
4539 switch (st->iso2022.iso.esc)
4541 case ISO_ESC_DESIGNATE:
4542 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4543 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4545 case ISO_ESC_LOCKING_SHIFT:
4546 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4547 goto ran_out_of_chars;
4548 case ISO_ESC_SINGLE_SHIFT:
4549 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4550 st->iso2022.saw_single_shift = 1;
4559 goto ran_out_of_chars;
4562 label_continue_loop:;
4571 postprocess_iso2022_mask (int mask)
4573 /* #### kind of cheesy */
4574 /* If seven-bit ISO is allowed, then assume that the encoding is
4575 entirely seven-bit and turn off the eight-bit ones. */
4576 if (mask & CODING_CATEGORY_ISO_7_MASK)
4577 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4578 CODING_CATEGORY_ISO_8_1_MASK |
4579 CODING_CATEGORY_ISO_8_2_MASK);
4583 /* If FLAGS is a null pointer or specifies right-to-left motion,
4584 output a switch-dir-to-left-to-right sequence to DST.
4585 Also update FLAGS if it is not a null pointer.
4586 If INTERNAL_P is set, we are outputting in internal format and
4587 need to handle the CSI differently. */
4590 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4591 unsigned_char_dynarr *dst,
4592 unsigned int *flags,
4595 if (!flags || (*flags & CODING_STATE_R2L))
4597 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4599 Dynarr_add (dst, ISO_CODE_ESC);
4600 Dynarr_add (dst, '[');
4602 else if (internal_p)
4603 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4605 Dynarr_add (dst, ISO_CODE_CSI);
4606 Dynarr_add (dst, '0');
4607 Dynarr_add (dst, ']');
4609 *flags &= ~CODING_STATE_R2L;
4613 /* If FLAGS is a null pointer or specifies a direction different from
4614 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4615 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4616 sequence to DST. Also update FLAGS if it is not a null pointer.
4617 If INTERNAL_P is set, we are outputting in internal format and
4618 need to handle the CSI differently. */
4621 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4622 unsigned_char_dynarr *dst, unsigned int *flags,
4625 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4626 direction == CHARSET_LEFT_TO_RIGHT)
4627 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4628 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4629 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4630 direction == CHARSET_RIGHT_TO_LEFT)
4632 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4634 Dynarr_add (dst, ISO_CODE_ESC);
4635 Dynarr_add (dst, '[');
4637 else if (internal_p)
4638 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4640 Dynarr_add (dst, ISO_CODE_CSI);
4641 Dynarr_add (dst, '2');
4642 Dynarr_add (dst, ']');
4644 *flags |= CODING_STATE_R2L;
4648 /* Convert ISO2022-format data to internal format. */
4651 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4652 unsigned_char_dynarr *dst, unsigned int n)
4654 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4655 unsigned int flags = str->flags;
4656 unsigned int ch = str->ch;
4657 eol_type_t eol_type = str->eol_type;
4658 #ifdef ENABLE_COMPOSITE_CHARS
4659 unsigned_char_dynarr *real_dst = dst;
4661 Lisp_Object coding_system;
4663 XSETCODING_SYSTEM (coding_system, str->codesys);
4665 #ifdef ENABLE_COMPOSITE_CHARS
4666 if (flags & CODING_STATE_COMPOSITE)
4667 dst = str->iso2022.composite_chars;
4668 #endif /* ENABLE_COMPOSITE_CHARS */
4672 unsigned char c = *src++;
4673 if (flags & CODING_STATE_ESCAPE)
4674 { /* Within ESC sequence */
4675 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4680 switch (str->iso2022.esc)
4682 #ifdef ENABLE_COMPOSITE_CHARS
4683 case ISO_ESC_START_COMPOSITE:
4684 if (str->iso2022.composite_chars)
4685 Dynarr_reset (str->iso2022.composite_chars);
4687 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4688 dst = str->iso2022.composite_chars;
4690 case ISO_ESC_END_COMPOSITE:
4692 Bufbyte comstr[MAX_EMCHAR_LEN];
4694 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4695 Dynarr_length (dst));
4697 len = set_charptr_emchar (comstr, emch);
4698 Dynarr_add_many (dst, comstr, len);
4701 #endif /* ENABLE_COMPOSITE_CHARS */
4703 case ISO_ESC_LITERAL:
4704 DECODE_ADD_BINARY_CHAR (c, dst);
4708 /* Everything else handled already */
4713 /* Attempted error recovery. */
4714 if (str->iso2022.output_direction_sequence)
4715 ensure_correct_direction (flags & CODING_STATE_R2L ?
4716 CHARSET_RIGHT_TO_LEFT :
4717 CHARSET_LEFT_TO_RIGHT,
4718 str->codesys, dst, 0, 1);
4719 /* More error recovery. */
4720 if (!retval || str->iso2022.output_literally)
4722 /* Output the (possibly invalid) sequence */
4724 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4725 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4726 flags &= CODING_STATE_ISO2022_LOCK;
4728 n++, src--;/* Repeat the loop with the same character. */
4731 /* No sense in reprocessing the final byte of the
4732 escape sequence; it could mess things up anyway.
4734 DECODE_ADD_BINARY_CHAR (c, dst);
4739 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4740 { /* Control characters */
4742 /***** Error-handling *****/
4744 /* If we were in the middle of a character, dump out the
4745 partial character. */
4746 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4748 /* If we just saw a single-shift character, dump it out.
4749 This may dump out the wrong sort of single-shift character,
4750 but least it will give an indication that something went
4752 if (flags & CODING_STATE_SS2)
4754 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4755 flags &= ~CODING_STATE_SS2;
4757 if (flags & CODING_STATE_SS3)
4759 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4760 flags &= ~CODING_STATE_SS3;
4763 /***** Now handle the control characters. *****/
4766 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4768 flags &= CODING_STATE_ISO2022_LOCK;
4770 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4771 DECODE_ADD_BINARY_CHAR (c, dst);
4774 { /* Graphic characters */
4775 Lisp_Object charset;
4779 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4781 /* Now determine the charset. */
4782 reg = ((flags & CODING_STATE_SS2) ? 2
4783 : (flags & CODING_STATE_SS3) ? 3
4784 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4785 : str->iso2022.register_left);
4786 charset = str->iso2022.charset[reg];
4788 /* Error checking: */
4789 if (! CHARSETP (charset)
4790 || str->iso2022.invalid_designated[reg]
4791 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4792 && XCHARSET_CHARS (charset) == 94))
4793 /* Mrmph. We are trying to invoke a register that has no
4794 or an invalid charset in it, or trying to add a character
4795 outside the range of the charset. Insert that char literally
4796 to preserve it for the output. */
4798 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4799 DECODE_ADD_BINARY_CHAR (c, dst);
4804 /* Things are probably hunky-dorey. */
4806 /* Fetch reverse charset, maybe. */
4807 if (((flags & CODING_STATE_R2L) &&
4808 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4810 (!(flags & CODING_STATE_R2L) &&
4811 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4813 Lisp_Object new_charset =
4814 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4815 if (!NILP (new_charset))
4816 charset = new_charset;
4819 lb = XCHARSET_LEADING_BYTE (charset);
4820 switch (XCHARSET_REP_BYTES (charset))
4823 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4824 Dynarr_add (dst, c & 0x7F);
4827 case 2: /* one-byte official */
4828 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4829 Dynarr_add (dst, lb);
4830 Dynarr_add (dst, c | 0x80);
4833 case 3: /* one-byte private or two-byte official */
4834 if (XCHARSET_PRIVATE_P (charset))
4836 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4837 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4838 Dynarr_add (dst, lb);
4839 Dynarr_add (dst, c | 0x80);
4845 Dynarr_add (dst, lb);
4846 Dynarr_add (dst, ch | 0x80);
4847 Dynarr_add (dst, c | 0x80);
4855 default: /* two-byte private */
4858 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4859 Dynarr_add (dst, lb);
4860 Dynarr_add (dst, ch | 0x80);
4861 Dynarr_add (dst, c | 0x80);
4870 flags &= CODING_STATE_ISO2022_LOCK;
4873 label_continue_loop:;
4876 if (flags & CODING_STATE_END)
4877 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4884 /***** ISO2022 encoder *****/
4886 /* Designate CHARSET into register REG. */
4889 iso2022_designate (Lisp_Object charset, unsigned char reg,
4890 struct encoding_stream *str, unsigned_char_dynarr *dst)
4892 static CONST char inter94[] = "()*+";
4893 static CONST char inter96[] = ",-./";
4895 unsigned char final;
4896 Lisp_Object old_charset = str->iso2022.charset[reg];
4898 str->iso2022.charset[reg] = charset;
4899 if (!CHARSETP (charset))
4900 /* charset might be an initial nil or t. */
4902 type = XCHARSET_TYPE (charset);
4903 final = XCHARSET_FINAL (charset);
4904 if (!str->iso2022.force_charset_on_output[reg] &&
4905 CHARSETP (old_charset) &&
4906 XCHARSET_TYPE (old_charset) == type &&
4907 XCHARSET_FINAL (old_charset) == final)
4910 str->iso2022.force_charset_on_output[reg] = 0;
4913 charset_conversion_spec_dynarr *dyn =
4914 str->codesys->iso2022.output_conv;
4920 for (i = 0; i < Dynarr_length (dyn); i++)
4922 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4923 if (EQ (charset, spec->from_charset))
4924 charset = spec->to_charset;
4929 Dynarr_add (dst, ISO_CODE_ESC);
4932 case CHARSET_TYPE_94:
4933 Dynarr_add (dst, inter94[reg]);
4935 case CHARSET_TYPE_96:
4936 Dynarr_add (dst, inter96[reg]);
4938 case CHARSET_TYPE_94X94:
4939 Dynarr_add (dst, '$');
4941 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4944 Dynarr_add (dst, inter94[reg]);
4946 case CHARSET_TYPE_96X96:
4947 Dynarr_add (dst, '$');
4948 Dynarr_add (dst, inter96[reg]);
4951 Dynarr_add (dst, final);
4955 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4957 if (str->iso2022.register_left != 0)
4959 Dynarr_add (dst, ISO_CODE_SI);
4960 str->iso2022.register_left = 0;
4965 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4967 if (str->iso2022.register_left != 1)
4969 Dynarr_add (dst, ISO_CODE_SO);
4970 str->iso2022.register_left = 1;
4974 /* Convert internally-formatted data to ISO2022 format. */
4977 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4978 unsigned_char_dynarr *dst, unsigned int n)
4980 unsigned char charmask, c;
4981 unsigned char char_boundary;
4982 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4983 unsigned int flags = str->flags;
4984 unsigned int ch = str->ch;
4985 Lisp_Coding_System *codesys = str->codesys;
4986 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4988 Lisp_Object charset;
4991 #ifdef ENABLE_COMPOSITE_CHARS
4992 /* flags for handling composite chars. We do a little switcharoo
4993 on the source while we're outputting the composite char. */
4994 unsigned int saved_n = 0;
4995 CONST unsigned char *saved_src = NULL;
4996 int in_composite = 0;
4997 #endif /* ENABLE_COMPOSITE_CHARS */
4999 char_boundary = str->iso2022.current_char_boundary;
5000 charset = str->iso2022.current_charset;
5001 half = str->iso2022.current_half;
5003 #ifdef ENABLE_COMPOSITE_CHARS
5010 if (BYTE_ASCII_P (c))
5011 { /* Processing ASCII character */
5014 restore_left_to_right_direction (codesys, dst, &flags, 0);
5016 /* Make sure G0 contains ASCII */
5017 if ((c > ' ' && c < ISO_CODE_DEL) ||
5018 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5020 ensure_normal_shift (str, dst);
5021 iso2022_designate (Vcharset_ascii, 0, str, dst);
5024 /* If necessary, restore everything to the default state
5027 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5029 restore_left_to_right_direction (codesys, dst, &flags, 0);
5031 ensure_normal_shift (str, dst);
5033 for (i = 0; i < 4; i++)
5035 Lisp_Object initial_charset =
5036 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5037 iso2022_designate (initial_charset, i, str, dst);
5042 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5043 Dynarr_add (dst, '\r');
5044 if (eol_type != EOL_CR)
5045 Dynarr_add (dst, c);
5049 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5050 && fit_to_be_escape_quoted (c))
5051 Dynarr_add (dst, ISO_CODE_ESC);
5052 Dynarr_add (dst, c);
5057 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5058 { /* Processing Leading Byte */
5060 charset = CHARSET_BY_LEADING_BYTE (c);
5061 if (LEADING_BYTE_PREFIX_P(c))
5063 else if (!EQ (charset, Vcharset_control_1)
5064 #ifdef ENABLE_COMPOSITE_CHARS
5065 && !EQ (charset, Vcharset_composite)
5071 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5072 codesys, dst, &flags, 0);
5074 /* Now determine which register to use. */
5076 for (i = 0; i < 4; i++)
5078 if (EQ (charset, str->iso2022.charset[i]) ||
5080 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5089 if (XCHARSET_GRAPHIC (charset) != 0)
5091 if (!NILP (str->iso2022.charset[1]) &&
5092 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5093 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5095 else if (!NILP (str->iso2022.charset[2]))
5097 else if (!NILP (str->iso2022.charset[3]))
5106 iso2022_designate (charset, reg, str, dst);
5108 /* Now invoke that register. */
5112 ensure_normal_shift (str, dst);
5117 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5119 ensure_shift_out (str, dst);
5127 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5129 Dynarr_add (dst, ISO_CODE_ESC);
5130 Dynarr_add (dst, 'N');
5135 Dynarr_add (dst, ISO_CODE_SS2);
5141 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5143 Dynarr_add (dst, ISO_CODE_ESC);
5144 Dynarr_add (dst, 'O');
5149 Dynarr_add (dst, ISO_CODE_SS3);
5161 { /* Processing Non-ASCII character */
5162 charmask = (half == 0 ? 0x7F : 0xFF);
5164 if (EQ (charset, Vcharset_control_1))
5166 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5167 && fit_to_be_escape_quoted (c))
5168 Dynarr_add (dst, ISO_CODE_ESC);
5169 /* you asked for it ... */
5170 Dynarr_add (dst, c - 0x20);
5174 switch (XCHARSET_REP_BYTES (charset))
5177 Dynarr_add (dst, c & charmask);
5180 if (XCHARSET_PRIVATE_P (charset))
5182 Dynarr_add (dst, c & charmask);
5187 #ifdef ENABLE_COMPOSITE_CHARS
5188 if (EQ (charset, Vcharset_composite))
5192 /* #### Bother! We don't know how to
5194 Dynarr_add (dst, '~');
5198 Emchar emch = MAKE_CHAR (Vcharset_composite,
5199 ch & 0x7F, c & 0x7F);
5200 Lisp_Object lstr = composite_char_string (emch);
5204 src = XSTRING_DATA (lstr);
5205 n = XSTRING_LENGTH (lstr);
5206 Dynarr_add (dst, ISO_CODE_ESC);
5207 Dynarr_add (dst, '0'); /* start composing */
5211 #endif /* ENABLE_COMPOSITE_CHARS */
5213 Dynarr_add (dst, ch & charmask);
5214 Dynarr_add (dst, c & charmask);
5227 Dynarr_add (dst, ch & charmask);
5228 Dynarr_add (dst, c & charmask);
5244 #ifdef ENABLE_COMPOSITE_CHARS
5250 Dynarr_add (dst, ISO_CODE_ESC);
5251 Dynarr_add (dst, '1'); /* end composing */
5252 goto back_to_square_n; /* Wheeeeeeeee ..... */
5254 #endif /* ENABLE_COMPOSITE_CHARS */
5256 if (char_boundary && flags & CODING_STATE_END)
5258 restore_left_to_right_direction (codesys, dst, &flags, 0);
5259 ensure_normal_shift (str, dst);
5260 for (i = 0; i < 4; i++)
5262 Lisp_Object initial_charset =
5263 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5264 iso2022_designate (initial_charset, i, str, dst);
5270 str->iso2022.current_char_boundary = char_boundary;
5271 str->iso2022.current_charset = charset;
5272 str->iso2022.current_half = half;
5274 /* Verbum caro factum est! */
5278 /************************************************************************/
5279 /* No-conversion methods */
5280 /************************************************************************/
5282 /* This is used when reading in "binary" files -- i.e. files that may
5283 contain all 256 possible byte values and that are not to be
5284 interpreted as being in any particular decoding. */
5286 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5287 unsigned_char_dynarr *dst, unsigned int n)
5290 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5291 unsigned int flags = str->flags;
5292 unsigned int ch = str->ch;
5293 eol_type_t eol_type = str->eol_type;
5299 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5300 DECODE_ADD_BINARY_CHAR (c, dst);
5301 label_continue_loop:;
5304 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5311 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5312 unsigned_char_dynarr *dst, unsigned int n)
5315 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5316 unsigned int flags = str->flags;
5317 unsigned int ch = str->ch;
5318 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5325 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5326 Dynarr_add (dst, '\r');
5327 if (eol_type != EOL_CR)
5328 Dynarr_add (dst, '\n');
5331 else if (BYTE_ASCII_P (c))
5334 Dynarr_add (dst, c);
5336 else if (BUFBYTE_LEADING_BYTE_P (c))
5339 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5340 c == LEADING_BYTE_CONTROL_1)
5343 Dynarr_add (dst, '~'); /* untranslatable character */
5347 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5348 Dynarr_add (dst, c);
5349 else if (ch == LEADING_BYTE_CONTROL_1)
5352 Dynarr_add (dst, c - 0x20);
5354 /* else it should be the second or third byte of an
5355 untranslatable character, so ignore it */
5365 /************************************************************************/
5366 /* Simple internal/external functions */
5367 /************************************************************************/
5369 static Extbyte_dynarr *conversion_out_dynarr;
5370 static Bufbyte_dynarr *conversion_in_dynarr;
5372 /* Determine coding system from coding format */
5374 /* #### not correct for all values of `fmt'! */
5376 external_data_format_to_coding_system (enum external_data_format fmt)
5380 case FORMAT_FILENAME:
5381 case FORMAT_TERMINAL:
5382 if (EQ (Vfile_name_coding_system, Qnil) ||
5383 EQ (Vfile_name_coding_system, Qbinary))
5386 return Fget_coding_system (Vfile_name_coding_system);
5389 return Fget_coding_system (Qctext);
5397 convert_to_external_format (CONST Bufbyte *ptr,
5400 enum external_data_format fmt)
5402 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5404 if (!conversion_out_dynarr)
5405 conversion_out_dynarr = Dynarr_new (Extbyte);
5407 Dynarr_reset (conversion_out_dynarr);
5409 if (NILP (coding_system))
5411 CONST Bufbyte *end = ptr + len;
5416 (BYTE_ASCII_P (*ptr)) ? *ptr :
5417 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5418 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5421 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5425 #ifdef ERROR_CHECK_BUFPOS
5426 assert (ptr == end);
5431 Lisp_Object instream, outstream, da_outstream;
5432 Lstream *istr, *ostr;
5433 struct gcpro gcpro1, gcpro2, gcpro3;
5434 char tempbuf[1024]; /* some random amount */
5436 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5437 da_outstream = make_dynarr_output_stream
5438 ((unsigned_char_dynarr *) conversion_out_dynarr);
5440 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5441 istr = XLSTREAM (instream);
5442 ostr = XLSTREAM (outstream);
5443 GCPRO3 (instream, outstream, da_outstream);
5446 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5449 Lstream_write (ostr, tempbuf, size_in_bytes);
5451 Lstream_close (istr);
5452 Lstream_close (ostr);
5454 Lstream_delete (istr);
5455 Lstream_delete (ostr);
5456 Lstream_delete (XLSTREAM (da_outstream));
5459 *len_out = Dynarr_length (conversion_out_dynarr);
5460 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5461 return Dynarr_atp (conversion_out_dynarr, 0);
5465 convert_from_external_format (CONST Extbyte *ptr,
5468 enum external_data_format fmt)
5470 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5472 if (!conversion_in_dynarr)
5473 conversion_in_dynarr = Dynarr_new (Bufbyte);
5475 Dynarr_reset (conversion_in_dynarr);
5477 if (NILP (coding_system))
5479 CONST Extbyte *end = ptr + len;
5480 for (; ptr < end; ptr++)
5483 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5488 Lisp_Object instream, outstream, da_outstream;
5489 Lstream *istr, *ostr;
5490 struct gcpro gcpro1, gcpro2, gcpro3;
5491 char tempbuf[1024]; /* some random amount */
5493 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5494 da_outstream = make_dynarr_output_stream
5495 ((unsigned_char_dynarr *) conversion_in_dynarr);
5497 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5498 istr = XLSTREAM (instream);
5499 ostr = XLSTREAM (outstream);
5500 GCPRO3 (instream, outstream, da_outstream);
5503 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5506 Lstream_write (ostr, tempbuf, size_in_bytes);
5508 Lstream_close (istr);
5509 Lstream_close (ostr);
5511 Lstream_delete (istr);
5512 Lstream_delete (ostr);
5513 Lstream_delete (XLSTREAM (da_outstream));
5516 *len_out = Dynarr_length (conversion_in_dynarr);
5517 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5518 return Dynarr_atp (conversion_in_dynarr, 0);
5522 /************************************************************************/
5523 /* Initialization */
5524 /************************************************************************/
5527 syms_of_file_coding (void)
5529 deferror (&Qcoding_system_error, "coding-system-error",
5530 "Coding-system error", Qio_error);
5532 DEFSUBR (Fcoding_system_p);
5533 DEFSUBR (Ffind_coding_system);
5534 DEFSUBR (Fget_coding_system);
5535 DEFSUBR (Fcoding_system_list);
5536 DEFSUBR (Fcoding_system_name);
5537 DEFSUBR (Fmake_coding_system);
5538 DEFSUBR (Fcopy_coding_system);
5539 DEFSUBR (Fdefine_coding_system_alias);
5540 DEFSUBR (Fsubsidiary_coding_system);
5542 DEFSUBR (Fcoding_system_type);
5543 DEFSUBR (Fcoding_system_doc_string);
5545 DEFSUBR (Fcoding_system_charset);
5547 DEFSUBR (Fcoding_system_property);
5549 DEFSUBR (Fcoding_category_list);
5550 DEFSUBR (Fset_coding_priority_list);
5551 DEFSUBR (Fcoding_priority_list);
5552 DEFSUBR (Fset_coding_category_system);
5553 DEFSUBR (Fcoding_category_system);
5555 DEFSUBR (Fdetect_coding_region);
5556 DEFSUBR (Fdecode_coding_region);
5557 DEFSUBR (Fencode_coding_region);
5559 DEFSUBR (Fdecode_shift_jis_char);
5560 DEFSUBR (Fencode_shift_jis_char);
5561 DEFSUBR (Fdecode_big5_char);
5562 DEFSUBR (Fencode_big5_char);
5563 DEFSUBR (Fset_ucs_char);
5564 DEFSUBR (Fucs_char);
5565 DEFSUBR (Fset_char_ucs);
5566 DEFSUBR (Fchar_ucs);
5568 defsymbol (&Qcoding_systemp, "coding-system-p");
5569 defsymbol (&Qno_conversion, "no-conversion");
5570 defsymbol (&Qraw_text, "raw-text");
5572 defsymbol (&Qbig5, "big5");
5573 defsymbol (&Qshift_jis, "shift-jis");
5574 defsymbol (&Qucs4, "ucs-4");
5575 defsymbol (&Qutf8, "utf-8");
5576 defsymbol (&Qccl, "ccl");
5577 defsymbol (&Qiso2022, "iso2022");
5579 defsymbol (&Qmnemonic, "mnemonic");
5580 defsymbol (&Qeol_type, "eol-type");
5581 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5582 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5584 defsymbol (&Qcr, "cr");
5585 defsymbol (&Qlf, "lf");
5586 defsymbol (&Qcrlf, "crlf");
5587 defsymbol (&Qeol_cr, "eol-cr");
5588 defsymbol (&Qeol_lf, "eol-lf");
5589 defsymbol (&Qeol_crlf, "eol-crlf");
5591 defsymbol (&Qcharset_g0, "charset-g0");
5592 defsymbol (&Qcharset_g1, "charset-g1");
5593 defsymbol (&Qcharset_g2, "charset-g2");
5594 defsymbol (&Qcharset_g3, "charset-g3");
5595 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5596 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5597 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5598 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5599 defsymbol (&Qno_iso6429, "no-iso6429");
5600 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5601 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5603 defsymbol (&Qshort, "short");
5604 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5605 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5606 defsymbol (&Qseven, "seven");
5607 defsymbol (&Qlock_shift, "lock-shift");
5608 defsymbol (&Qescape_quoted, "escape-quoted");
5610 defsymbol (&Qencode, "encode");
5611 defsymbol (&Qdecode, "decode");
5614 defsymbol (&Qctext, "ctext");
5615 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5617 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5619 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5621 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5623 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5625 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5627 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5629 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5631 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5634 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5639 lstream_type_create_file_coding (void)
5641 LSTREAM_HAS_METHOD (decoding, reader);
5642 LSTREAM_HAS_METHOD (decoding, writer);
5643 LSTREAM_HAS_METHOD (decoding, rewinder);
5644 LSTREAM_HAS_METHOD (decoding, seekable_p);
5645 LSTREAM_HAS_METHOD (decoding, flusher);
5646 LSTREAM_HAS_METHOD (decoding, closer);
5647 LSTREAM_HAS_METHOD (decoding, marker);
5649 LSTREAM_HAS_METHOD (encoding, reader);
5650 LSTREAM_HAS_METHOD (encoding, writer);
5651 LSTREAM_HAS_METHOD (encoding, rewinder);
5652 LSTREAM_HAS_METHOD (encoding, seekable_p);
5653 LSTREAM_HAS_METHOD (encoding, flusher);
5654 LSTREAM_HAS_METHOD (encoding, closer);
5655 LSTREAM_HAS_METHOD (encoding, marker);
5659 vars_of_file_coding (void)
5663 fcd = xnew (struct file_coding_dump);
5664 dumpstruct (&fcd, &fcd_description);
5666 /* Initialize to something reasonable ... */
5667 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5669 fcd->coding_category_system[i] = Qnil;
5670 fcd->coding_category_by_priority[i] = i;
5673 Fprovide (intern ("file-coding"));
5675 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5676 Coding system used for TTY keyboard input.
5677 Not used under a windowing system.
5679 Vkeyboard_coding_system = Qnil;
5681 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5682 Coding system used for TTY display output.
5683 Not used under a windowing system.
5685 Vterminal_coding_system = Qnil;
5687 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5688 Overriding coding system used when writing a file or process.
5689 You should *bind* this, not set it. If this is non-nil, it specifies
5690 the coding system that will be used when a file or process is read
5691 in, and overrides `buffer-file-coding-system-for-read',
5692 `insert-file-contents-pre-hook', etc. Use those variables instead of
5693 this one for permanent changes to the environment.
5695 Vcoding_system_for_read = Qnil;
5697 DEFVAR_LISP ("coding-system-for-write",
5698 &Vcoding_system_for_write /*
5699 Overriding coding system used when writing a file or process.
5700 You should *bind* this, not set it. If this is non-nil, it specifies
5701 the coding system that will be used when a file or process is wrote
5702 in, and overrides `buffer-file-coding-system',
5703 `write-region-pre-hook', etc. Use those variables instead of this one
5704 for permanent changes to the environment.
5706 Vcoding_system_for_write = Qnil;
5708 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5709 Coding system used to convert pathnames when accessing files.
5711 Vfile_name_coding_system = Qnil;
5713 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5714 Non-nil means the buffer contents are regarded as multi-byte form
5715 of characters, not a binary code. This affects the display, file I/O,
5716 and behaviors of various editing commands.
5718 Setting this to nil does not do anything.
5720 enable_multibyte_characters = 1;
5724 complex_vars_of_file_coding (void)
5726 staticpro (&Vcoding_system_hash_table);
5727 Vcoding_system_hash_table =
5728 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5730 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5731 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5733 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5735 struct codesys_prop csp; \
5737 csp.prop_type = (Prop_Type); \
5738 Dynarr_add (the_codesys_prop_dynarr, csp); \
5741 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5742 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5743 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5744 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5745 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5746 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5747 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5749 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5750 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5751 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5752 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5753 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5754 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5755 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5756 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5757 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5758 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5759 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5760 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5761 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5762 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5763 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5764 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5765 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5767 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5768 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5770 /* Need to create this here or we're really screwed. */
5772 (Qraw_text, Qno_conversion,
5773 build_string ("Raw text, which means it converts only line-break-codes."),
5774 list2 (Qmnemonic, build_string ("Raw")));
5777 (Qbinary, Qno_conversion,
5778 build_string ("Binary, which means it does not convert anything."),
5779 list4 (Qeol_type, Qlf,
5780 Qmnemonic, build_string ("Binary")));
5782 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5784 /* Need this for bootstrapping */
5785 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5786 Fget_coding_system (Qraw_text);
5792 for (i = 0; i < 65536; i++)
5793 fcd->ucs_to_mule_table[i] = Qnil;
5795 staticpro (&mule_to_ucs_table);
5796 mule_to_ucs_table = Fmake_char_table(Qgeneric);