1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
36 #include "file-coding.h"
38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
40 Lisp_Object Vkeyboard_coding_system;
41 Lisp_Object Vterminal_coding_system;
42 Lisp_Object Vcoding_system_for_read;
43 Lisp_Object Vcoding_system_for_write;
44 Lisp_Object Vfile_name_coding_system;
46 /* Table of symbols identifying each coding category. */
47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
49 /* Coding system currently associated with each coding category. */
50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
52 /* Table of all coding categories in decreasing order of priority.
53 This describes a permutation of the possible coding categories. */
54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
56 Lisp_Object Qcoding_system_p;
58 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
59 /* Qinternal in general.c */
61 Lisp_Object Qmnemonic, Qeol_type;
62 Lisp_Object Qcr, Qcrlf, Qlf;
63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
64 Lisp_Object Qpost_read_conversion;
65 Lisp_Object Qpre_write_conversion;
68 Lisp_Object Qucs4, Qutf8;
69 Lisp_Object Qbig5, Qshift_jis;
70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
73 Lisp_Object Qno_iso6429;
74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
75 Lisp_Object Qctext, Qescape_quoted;
76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
78 Lisp_Object Qencode, Qdecode;
80 Lisp_Object Vcoding_system_hash_table;
82 int enable_multibyte_characters;
85 /* Additional information used by the ISO2022 decoder and detector. */
86 struct iso2022_decoder
88 /* CHARSET holds the character sets currently assigned to the G0
89 through G3 variables. It is initialized from the array
90 INITIAL_CHARSET in CODESYS. */
91 Lisp_Object charset[4];
93 /* Which registers are currently invoked into the left (GL) and
94 right (GR) halves of the 8-bit encoding space? */
95 int register_left, register_right;
97 /* ISO_ESC holds a value indicating part of an escape sequence
98 that has already been seen. */
99 enum iso_esc_flag esc;
101 /* This records the bytes we've seen so far in an escape sequence,
102 in case the sequence is invalid (we spit out the bytes unchanged). */
103 unsigned char esc_bytes[8];
105 /* Index for next byte to store in ISO escape sequence. */
108 #ifdef ENABLE_COMPOSITE_CHARS
109 /* Stuff seen so far when composing a string. */
110 unsigned_char_dynarr *composite_chars;
113 /* If we saw an invalid designation sequence for a particular
114 register, we flag it here and switch to ASCII. The next time we
115 see a valid designation for this register, we turn off the flag
116 and do the designation normally, but pretend the sequence was
117 invalid. The effect of all this is that (most of the time) the
118 escape sequences for both the switch to the unknown charset, and
119 the switch back to the known charset, get inserted literally into
120 the buffer and saved out as such. The hope is that we can
121 preserve the escape sequences so that the resulting written out
122 file makes sense. If we don't do any of this, the designation
123 to the invalid charset will be preserved but that switch back
124 to the known charset will probably get eaten because it was
125 the same charset that was already present in the register. */
126 unsigned char invalid_designated[4];
128 /* We try to do similar things as above for direction-switching
129 sequences. If we encountered a direction switch while an
130 invalid designation was present, or an invalid designation
131 just after a direction switch (i.e. no valid designation
132 encountered yet), we insert the direction-switch escape
133 sequence literally into the output stream, and later on
134 insert the corresponding direction-restoring escape sequence
136 unsigned int switched_dir_and_no_valid_charset_yet :1;
137 unsigned int invalid_switch_dir :1;
139 /* Tells the decoder to output the escape sequence literally
140 even though it was valid. Used in the games we play to
141 avoid lossage when we encounter invalid designations. */
142 unsigned int output_literally :1;
143 /* We encountered a direction switch followed by an invalid
144 designation. We didn't output the direction switch
145 literally because we didn't know about the invalid designation;
146 but we have to do so now. */
147 unsigned int output_direction_sequence :1;
150 EXFUN (Fcopy_coding_system, 2);
152 struct detection_state;
153 static int detect_coding_sjis (struct detection_state *st,
154 CONST unsigned char *src,
156 static void decode_coding_sjis (Lstream *decoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static void encode_coding_sjis (Lstream *encoding,
161 CONST unsigned char *src,
162 unsigned_char_dynarr *dst,
164 static int detect_coding_big5 (struct detection_state *st,
165 CONST unsigned char *src,
167 static void decode_coding_big5 (Lstream *decoding,
168 CONST unsigned char *src,
169 unsigned_char_dynarr *dst, unsigned int n);
170 static void encode_coding_big5 (Lstream *encoding,
171 CONST unsigned char *src,
172 unsigned_char_dynarr *dst, unsigned int n);
173 static int detect_coding_ucs4 (struct detection_state *st,
174 CONST unsigned char *src,
176 static void decode_coding_ucs4 (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static void encode_coding_ucs4 (Lstream *encoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
182 static int detect_coding_utf8 (struct detection_state *st,
183 CONST unsigned char *src,
185 static void decode_coding_utf8 (Lstream *decoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst, unsigned int n);
188 static void encode_coding_utf8 (Lstream *encoding,
189 CONST unsigned char *src,
190 unsigned_char_dynarr *dst, unsigned int n);
191 static int postprocess_iso2022_mask (int mask);
192 static void reset_iso2022 (Lisp_Object coding_system,
193 struct iso2022_decoder *iso);
194 static int detect_coding_iso2022 (struct detection_state *st,
195 CONST unsigned char *src,
197 static void decode_coding_iso2022 (Lstream *decoding,
198 CONST unsigned char *src,
199 unsigned_char_dynarr *dst, unsigned int n);
200 static void encode_coding_iso2022 (Lstream *encoding,
201 CONST unsigned char *src,
202 unsigned_char_dynarr *dst, unsigned int n);
204 static void decode_coding_no_conversion (Lstream *decoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst,
208 static void encode_coding_no_conversion (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst,
212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
213 unsigned_char_dynarr *dst, unsigned int n);
214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
217 typedef struct codesys_prop codesys_prop;
226 Dynarr_declare (codesys_prop);
227 } codesys_prop_dynarr;
229 codesys_prop_dynarr *the_codesys_prop_dynarr;
231 enum codesys_prop_enum
234 CODESYS_PROP_ISO2022,
239 /************************************************************************/
240 /* Coding system functions */
241 /************************************************************************/
243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
245 static void finalize_coding_system (void *header, int for_disksave);
247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
248 mark_coding_system, print_coding_system,
249 finalize_coding_system,
250 0, 0, struct Lisp_Coding_System);
253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
255 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
257 markobj (CODING_SYSTEM_NAME (codesys));
258 markobj (CODING_SYSTEM_DOC_STRING (codesys));
259 markobj (CODING_SYSTEM_MNEMONIC (codesys));
260 markobj (CODING_SYSTEM_EOL_LF (codesys));
261 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
262 markobj (CODING_SYSTEM_EOL_CR (codesys));
264 switch (CODING_SYSTEM_TYPE (codesys))
268 case CODESYS_ISO2022:
269 for (i = 0; i < 4; i++)
270 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
271 if (codesys->iso2022.input_conv)
273 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
275 struct charset_conversion_spec *ccs =
276 Dynarr_atp (codesys->iso2022.input_conv, i);
277 markobj (ccs->from_charset);
278 markobj (ccs->to_charset);
281 if (codesys->iso2022.output_conv)
283 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
285 struct charset_conversion_spec *ccs =
286 Dynarr_atp (codesys->iso2022.output_conv, i);
287 markobj (ccs->from_charset);
288 markobj (ccs->to_charset);
294 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
295 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
302 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
303 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
310 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
312 error ("printing unreadable object #<coding_system 0x%x>",
315 write_c_string ("#<coding_system ", printcharfun);
316 print_internal (c->name, printcharfun, 1);
317 write_c_string (">", printcharfun);
321 finalize_coding_system (void *header, int for_disksave)
323 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
324 /* Since coding systems never go away, this function is not
325 necessary. But it would be necessary if we changed things
326 so that coding systems could go away. */
327 if (!for_disksave) /* see comment in lstream.c */
329 switch (CODING_SYSTEM_TYPE (c))
332 case CODESYS_ISO2022:
333 if (c->iso2022.input_conv)
335 Dynarr_free (c->iso2022.input_conv);
336 c->iso2022.input_conv = 0;
338 if (c->iso2022.output_conv)
340 Dynarr_free (c->iso2022.output_conv);
341 c->iso2022.output_conv = 0;
352 symbol_to_eol_type (Lisp_Object symbol)
354 CHECK_SYMBOL (symbol);
355 if (NILP (symbol)) return EOL_AUTODETECT;
356 if (EQ (symbol, Qlf)) return EOL_LF;
357 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
358 if (EQ (symbol, Qcr)) return EOL_CR;
360 signal_simple_error ("Unrecognized eol type", symbol);
361 return EOL_AUTODETECT; /* not reached */
365 eol_type_to_symbol (enum eol_type type)
370 case EOL_LF: return Qlf;
371 case EOL_CRLF: return Qcrlf;
372 case EOL_CR: return Qcr;
373 case EOL_AUTODETECT: return Qnil;
378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
380 Lisp_Object codesys_obj;
381 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
382 char *codesys_name = (char *) alloca (len + 7);
384 char *codesys_mnemonic=0;
386 Lisp_Object codesys_name_sym, sub_codesys_obj;
390 XSETCODING_SYSTEM (codesys_obj, codesys);
392 memcpy (codesys_name,
393 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
395 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
397 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
398 codesys_mnemonic = (char *) alloca (mlen + 7);
399 memcpy (codesys_mnemonic,
400 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
404 strcpy (codesys_name + len, "-" op_sys); \
406 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
407 codesys_name_sym = intern (codesys_name); \
408 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
409 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
411 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
412 build_string (codesys_mnemonic); \
413 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
416 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
417 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
418 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
422 Return t if OBJECT is a coding system.
423 A coding system is an object that defines how text containing multiple
424 character sets is encoded into a stream of (typically 8-bit) bytes.
425 The coding system is used to decode the stream into a series of
426 characters (which may be from multiple charsets) when the text is read
427 from a file or process, and is used to encode the text back into the
428 same format when it is written out to a file or process.
430 For example, many ISO2022-compliant coding systems (such as Compound
431 Text, which is used for inter-client data under the X Window System)
432 use escape sequences to switch between different charsets -- Japanese
433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
435 `make-coding-system' for more information.
437 Coding systems are normally identified using a symbol, and the
438 symbol is accepted in place of the actual coding system object whenever
439 a coding system is called for. (This is similar to how faces work.)
443 return CODING_SYSTEMP (object) ? Qt : Qnil;
446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
447 Retrieve the coding system of the given name.
449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
450 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
451 If there is no such coding system, nil is returned. Otherwise the
452 associated coding system object is returned.
454 (coding_system_or_name))
456 if (CODING_SYSTEMP (coding_system_or_name))
457 return coding_system_or_name;
459 if (NILP (coding_system_or_name))
460 coding_system_or_name = Qbinary;
462 CHECK_SYMBOL (coding_system_or_name);
464 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
468 Retrieve the coding system of the given name.
469 Same as `find-coding-system' except that if there is no such
470 coding system, an error is signaled instead of returning nil.
474 Lisp_Object coding_system = Ffind_coding_system (name);
476 if (NILP (coding_system))
477 signal_simple_error ("No such coding system", name);
478 return coding_system;
481 /* We store the coding systems in hash tables with the names as the key and the
482 actual coding system object as the value. Occasionally we need to use them
483 in a list format. These routines provide us with that. */
484 struct coding_system_list_closure
486 Lisp_Object *coding_system_list;
490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
491 void *coding_system_list_closure)
493 /* This function can GC */
494 struct coding_system_list_closure *cscl =
495 (struct coding_system_list_closure *) coding_system_list_closure;
496 Lisp_Object *coding_system_list = cscl->coding_system_list;
498 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
499 *coding_system_list);
503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
504 Return a list of the names of all defined coding systems.
508 Lisp_Object coding_system_list = Qnil;
510 struct coding_system_list_closure coding_system_list_closure;
512 GCPRO1 (coding_system_list);
513 coding_system_list_closure.coding_system_list = &coding_system_list;
514 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
515 &coding_system_list_closure);
518 return coding_system_list;
521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
522 Return the name of the given coding system.
526 coding_system = Fget_coding_system (coding_system);
527 return XCODING_SYSTEM_NAME (coding_system);
530 static Lisp_Coding_System *
531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
533 Lisp_Coding_System *codesys =
534 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
536 zero_lcrecord (codesys);
537 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
538 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
539 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
540 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
541 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
542 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
543 CODING_SYSTEM_TYPE (codesys) = type;
544 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
546 if (type == CODESYS_ISO2022)
549 for (i = 0; i < 4; i++)
550 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
552 else if (type == CODESYS_CCL)
554 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
555 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
558 CODING_SYSTEM_NAME (codesys) = name;
564 /* Given a list of charset conversion specs as specified in a Lisp
565 program, parse it into STORE_HERE. */
568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
569 Lisp_Object spec_list)
573 EXTERNAL_LIST_LOOP (rest, spec_list)
575 Lisp_Object car = XCAR (rest);
576 Lisp_Object from, to;
577 struct charset_conversion_spec spec;
579 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
580 signal_simple_error ("Invalid charset conversion spec", car);
581 from = Fget_charset (XCAR (car));
582 to = Fget_charset (XCAR (XCDR (car)));
583 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
584 signal_simple_error_2
585 ("Attempted conversion between different charset types",
587 spec.from_charset = from;
588 spec.to_charset = to;
590 Dynarr_add (store_here, spec);
594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
595 specs, return the equivalent as the Lisp programmer would see it.
597 If LOAD_HERE is 0, return Qnil. */
600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
607 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
609 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
610 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
613 return Fnreverse (result);
618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
619 Register symbol NAME as a coding system.
621 TYPE describes the conversion method used and should be one of
624 Automatic conversion. XEmacs attempts to detect the coding system
627 No conversion. Use this for binary files and such. On output,
628 graphic characters that are not in ASCII or Latin-1 will be
629 replaced by a ?. (For a no-conversion-encoded buffer, these
630 characters will only be present if you explicitly insert them.)
632 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
634 ISO 10646 UCS-4 encoding.
636 ISO 10646 UTF-8 encoding.
638 Any ISO2022-compliant encoding. Among other things, this includes
639 JIS (the Japanese encoding commonly used for e-mail), EUC (the
640 standard Unix encoding for Japanese and other languages), and
641 Compound Text (the encoding used in X11). You can specify more
642 specific information about the conversion with the FLAGS argument.
644 Big5 (the encoding commonly used for Taiwanese).
646 The conversion is performed using a user-written pseudo-code
647 program. CCL (Code Conversion Language) is the name of this
650 Write out or read in the raw contents of the memory representing
651 the buffer's text. This is primarily useful for debugging
652 purposes, and is only enabled when XEmacs has been compiled with
653 DEBUG_XEMACS defined (via the --debug configure option).
654 WARNING: Reading in a file using 'internal conversion can result
655 in an internal inconsistency in the memory representing a
656 buffer's text, which will produce unpredictable results and may
657 cause XEmacs to crash. Under normal circumstances you should
658 never use 'internal conversion.
660 DOC-STRING is a string describing the coding system.
662 PROPS is a property list, describing the specific nature of the
663 character set. Recognized properties are:
666 String to be displayed in the modeline when this coding system is
670 End-of-line conversion to be used. It should be one of
673 Automatically detect the end-of-line type (LF, CRLF,
674 or CR). Also generate subsidiary coding systems named
675 `NAME-unix', `NAME-dos', and `NAME-mac', that are
676 identical to this coding system but have an EOL-TYPE
677 value of 'lf, 'crlf, and 'cr, respectively.
679 The end of a line is marked externally using ASCII LF.
680 Since this is also the way that XEmacs represents an
681 end-of-line internally, specifying this option results
682 in no end-of-line conversion. This is the standard
683 format for Unix text files.
685 The end of a line is marked externally using ASCII
686 CRLF. This is the standard format for MS-DOS text
689 The end of a line is marked externally using ASCII CR.
690 This is the standard format for Macintosh text files.
692 Automatically detect the end-of-line type but do not
693 generate subsidiary coding systems. (This value is
694 converted to nil when stored internally, and
695 `coding-system-property' will return nil.)
697 'post-read-conversion
698 Function called after a file has been read in, to perform the
699 decoding. Called with two arguments, BEG and END, denoting
700 a region of the current buffer to be decoded.
702 'pre-write-conversion
703 Function called before a file is written out, to perform the
704 encoding. Called with two arguments, BEG and END, denoting
705 a region of the current buffer to be encoded.
708 The following additional properties are recognized if TYPE is 'iso2022:
714 The character set initially designated to the G0 - G3 registers.
715 The value should be one of
717 -- A charset object (designate that character set)
718 -- nil (do not ever use this register)
719 -- t (no character set is initially designated to
720 the register, but may be later on; this automatically
721 sets the corresponding `force-g*-on-output' property)
727 If non-nil, send an explicit designation sequence on output before
728 using the specified register.
731 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
732 "ESC $ B" on output in place of the full designation sequences
733 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
736 If non-nil, don't designate ASCII to G0 at each end of line on output.
737 Setting this to non-nil also suppresses other state-resetting that
738 normally happens at the end of a line.
741 If non-nil, don't designate ASCII to G0 before control chars on output.
744 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
748 If non-nil, use locking-shift (SO/SI) instead of single-shift
749 or designation by escape sequence.
752 If non-nil, don't use ISO6429's direction specification.
755 If non-nil, literal control characters that are the same as
756 the beginning of a recognized ISO2022 or ISO6429 escape sequence
757 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
758 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
759 so that they can be properly distinguished from an escape sequence.
760 (Note that doing this results in a non-portable encoding.) This
761 encoding flag is used for byte-compiled files. Note that ESC
762 is a good choice for a quoting character because there are no
763 escape sequences whose second byte is a character from the Control-0
764 or Control-1 character sets; this is explicitly disallowed by the
767 'input-charset-conversion
768 A list of conversion specifications, specifying conversion of
769 characters in one charset to another when decoding is performed.
770 Each specification is a list of two elements: the source charset,
771 and the destination charset.
773 'output-charset-conversion
774 A list of conversion specifications, specifying conversion of
775 characters in one charset to another when encoding is performed.
776 The form of each specification is the same as for
777 'input-charset-conversion.
780 The following additional properties are recognized (and required)
784 CCL program used for decoding (converting to internal format).
787 CCL program used for encoding (converting to external format).
789 (name, type, doc_string, props))
791 Lisp_Coding_System *codesys;
792 Lisp_Object rest, key, value;
793 enum coding_system_type ty;
794 int need_to_setup_eol_systems = 1;
796 /* Convert type to constant */
797 if (NILP (type) || EQ (type, Qundecided))
798 { ty = CODESYS_AUTODETECT; }
800 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
801 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
802 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
803 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
804 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
805 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
807 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
809 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
812 signal_simple_error ("Invalid coding system type", type);
816 codesys = allocate_coding_system (ty, name);
818 if (NILP (doc_string))
819 doc_string = build_string ("");
821 CHECK_STRING (doc_string);
822 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
824 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
826 if (EQ (key, Qmnemonic))
829 CHECK_STRING (value);
830 CODING_SYSTEM_MNEMONIC (codesys) = value;
833 else if (EQ (key, Qeol_type))
835 need_to_setup_eol_systems = NILP (value);
838 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
841 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
842 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
844 else if (ty == CODESYS_ISO2022)
846 #define FROB_INITIAL_CHARSET(charset_num) \
847 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
848 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
850 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
851 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
852 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
853 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
855 #define FROB_FORCE_CHARSET(charset_num) \
856 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
858 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
859 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
860 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
861 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
863 #define FROB_BOOLEAN_PROPERTY(prop) \
864 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
866 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
867 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
868 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
869 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
870 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
871 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
872 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
874 else if (EQ (key, Qinput_charset_conversion))
876 codesys->iso2022.input_conv =
877 Dynarr_new (charset_conversion_spec);
878 parse_charset_conversion_specs (codesys->iso2022.input_conv,
881 else if (EQ (key, Qoutput_charset_conversion))
883 codesys->iso2022.output_conv =
884 Dynarr_new (charset_conversion_spec);
885 parse_charset_conversion_specs (codesys->iso2022.output_conv,
889 signal_simple_error ("Unrecognized property", key);
891 else if (EQ (type, Qccl))
893 if (EQ (key, Qdecode))
895 CHECK_VECTOR (value);
896 CODING_SYSTEM_CCL_DECODE (codesys) = value;
898 else if (EQ (key, Qencode))
900 CHECK_VECTOR (value);
901 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
904 signal_simple_error ("Unrecognized property", key);
908 signal_simple_error ("Unrecognized property", key);
911 if (need_to_setup_eol_systems)
912 setup_eol_coding_systems (codesys);
915 Lisp_Object codesys_obj;
916 XSETCODING_SYSTEM (codesys_obj, codesys);
917 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
923 Copy OLD-CODING-SYSTEM to NEW-NAME.
924 If NEW-NAME does not name an existing coding system, a new one will
927 (old_coding_system, new_name))
929 Lisp_Object new_coding_system;
930 old_coding_system = Fget_coding_system (old_coding_system);
931 new_coding_system = Ffind_coding_system (new_name);
932 if (NILP (new_coding_system))
934 XSETCODING_SYSTEM (new_coding_system,
935 allocate_coding_system
936 (XCODING_SYSTEM_TYPE (old_coding_system),
938 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
942 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
943 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
944 memcpy (((char *) to ) + sizeof (to->header),
945 ((char *) from) + sizeof (from->header),
946 sizeof (*from) - sizeof (from->header));
949 return new_coding_system;
952 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
953 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
955 (alias, coding_system))
957 CHECK_SYMBOL (alias);
958 if (!NILP (Ffind_coding_system (alias)))
959 signal_simple_error ("Symbol already names a coding system", alias);
960 coding_system = Fget_coding_system (coding_system);
961 Fputhash (alias, coding_system, Vcoding_system_hash_table);
963 /* Set up aliases for subsidiaries. */
964 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
967 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
968 #define FROB(type, name) \
970 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
971 if (!NILP (subsidiary)) \
972 Fdefine_coding_system_alias \
973 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
980 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
981 but it doesn't look intentional, so I'd rather return something
982 meaningful or nothing at all. */
987 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
989 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
990 Lisp_Object new_coding_system;
992 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
993 return coding_system;
997 case EOL_AUTODETECT: return coding_system;
998 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
999 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1000 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1004 return NILP (new_coding_system) ? coding_system : new_coding_system;
1007 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1008 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1010 (coding_system, eol_type))
1012 coding_system = Fget_coding_system (coding_system);
1014 return subsidiary_coding_system (coding_system,
1015 symbol_to_eol_type (eol_type));
1019 /************************************************************************/
1020 /* Coding system accessors */
1021 /************************************************************************/
1023 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1024 Return the doc string for CODING-SYSTEM.
1028 coding_system = Fget_coding_system (coding_system);
1029 return XCODING_SYSTEM_DOC_STRING (coding_system);
1032 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1033 Return the type of CODING-SYSTEM.
1037 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1040 case CODESYS_AUTODETECT: return Qundecided;
1042 case CODESYS_SHIFT_JIS: return Qshift_jis;
1043 case CODESYS_ISO2022: return Qiso2022;
1044 case CODESYS_BIG5: return Qbig5;
1045 case CODESYS_UCS4: return Qucs4;
1046 case CODESYS_UTF8: return Qutf8;
1047 case CODESYS_CCL: return Qccl;
1049 case CODESYS_NO_CONVERSION: return Qno_conversion;
1051 case CODESYS_INTERNAL: return Qinternal;
1058 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1061 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1063 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1066 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1067 Return initial charset of CODING-SYSTEM designated to GNUM.
1070 (coding_system, gnum))
1072 coding_system = Fget_coding_system (coding_system);
1075 return coding_system_charset (coding_system, XINT (gnum));
1079 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1080 Return the PROP property of CODING-SYSTEM.
1082 (coding_system, prop))
1085 enum coding_system_type type;
1087 coding_system = Fget_coding_system (coding_system);
1088 CHECK_SYMBOL (prop);
1089 type = XCODING_SYSTEM_TYPE (coding_system);
1091 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1092 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1095 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1097 case CODESYS_PROP_ALL_OK:
1100 case CODESYS_PROP_ISO2022:
1101 if (type != CODESYS_ISO2022)
1103 ("Property only valid in ISO2022 coding systems",
1107 case CODESYS_PROP_CCL:
1108 if (type != CODESYS_CCL)
1110 ("Property only valid in CCL coding systems",
1120 signal_simple_error ("Unrecognized property", prop);
1122 if (EQ (prop, Qname))
1123 return XCODING_SYSTEM_NAME (coding_system);
1124 else if (EQ (prop, Qtype))
1125 return Fcoding_system_type (coding_system);
1126 else if (EQ (prop, Qdoc_string))
1127 return XCODING_SYSTEM_DOC_STRING (coding_system);
1128 else if (EQ (prop, Qmnemonic))
1129 return XCODING_SYSTEM_MNEMONIC (coding_system);
1130 else if (EQ (prop, Qeol_type))
1131 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1132 else if (EQ (prop, Qeol_lf))
1133 return XCODING_SYSTEM_EOL_LF (coding_system);
1134 else if (EQ (prop, Qeol_crlf))
1135 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1136 else if (EQ (prop, Qeol_cr))
1137 return XCODING_SYSTEM_EOL_CR (coding_system);
1138 else if (EQ (prop, Qpost_read_conversion))
1139 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1140 else if (EQ (prop, Qpre_write_conversion))
1141 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1143 else if (type == CODESYS_ISO2022)
1145 if (EQ (prop, Qcharset_g0))
1146 return coding_system_charset (coding_system, 0);
1147 else if (EQ (prop, Qcharset_g1))
1148 return coding_system_charset (coding_system, 1);
1149 else if (EQ (prop, Qcharset_g2))
1150 return coding_system_charset (coding_system, 2);
1151 else if (EQ (prop, Qcharset_g3))
1152 return coding_system_charset (coding_system, 3);
1154 #define FORCE_CHARSET(charset_num) \
1155 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1156 (coding_system, charset_num) ? Qt : Qnil)
1158 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1159 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1160 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1161 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1163 #define LISP_BOOLEAN(prop) \
1164 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1166 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1167 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1168 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1169 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1170 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1171 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1172 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1174 else if (EQ (prop, Qinput_charset_conversion))
1176 unparse_charset_conversion_specs
1177 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1178 else if (EQ (prop, Qoutput_charset_conversion))
1180 unparse_charset_conversion_specs
1181 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1185 else if (type == CODESYS_CCL)
1187 if (EQ (prop, Qdecode))
1188 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1189 else if (EQ (prop, Qencode))
1190 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1198 return Qnil; /* not reached */
1202 /************************************************************************/
1203 /* Coding category functions */
1204 /************************************************************************/
1207 decode_coding_category (Lisp_Object symbol)
1211 CHECK_SYMBOL (symbol);
1212 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1213 if (EQ (coding_category_symbol[i], symbol))
1216 signal_simple_error ("Unrecognized coding category", symbol);
1217 return 0; /* not reached */
1220 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1221 Return a list of all recognized coding categories.
1226 Lisp_Object list = Qnil;
1228 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1229 list = Fcons (coding_category_symbol[i], list);
1233 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1234 Change the priority order of the coding categories.
1235 LIST should be list of coding categories, in descending order of
1236 priority. Unspecified coding categories will be lower in priority
1237 than all specified ones, in the same relative order they were in
1242 int category_to_priority[CODING_CATEGORY_LAST + 1];
1246 /* First generate a list that maps coding categories to priorities. */
1248 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1249 category_to_priority[i] = -1;
1251 /* Highest priority comes from the specified list. */
1253 EXTERNAL_LIST_LOOP (rest, list)
1255 int cat = decode_coding_category (XCAR (rest));
1257 if (category_to_priority[cat] >= 0)
1258 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1259 category_to_priority[cat] = i++;
1262 /* Now go through the existing categories by priority to retrieve
1263 the categories not yet specified and preserve their priority
1265 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1267 int cat = coding_category_by_priority[j];
1268 if (category_to_priority[cat] < 0)
1269 category_to_priority[cat] = i++;
1272 /* Now we need to construct the inverse of the mapping we just
1275 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1276 coding_category_by_priority[category_to_priority[i]] = i;
1278 /* Phew! That was confusing. */
1282 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1283 Return a list of coding categories in descending order of priority.
1288 Lisp_Object list = Qnil;
1290 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1291 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1296 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1297 Change the coding system associated with a coding category.
1299 (coding_category, coding_system))
1301 int cat = decode_coding_category (coding_category);
1303 coding_system = Fget_coding_system (coding_system);
1304 coding_category_system[cat] = coding_system;
1308 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1309 Return the coding system associated with a coding category.
1313 int cat = decode_coding_category (coding_category);
1314 Lisp_Object sys = coding_category_system[cat];
1317 return XCODING_SYSTEM_NAME (sys);
1322 /************************************************************************/
1323 /* Detecting the encoding of data */
1324 /************************************************************************/
1326 struct detection_state
1328 enum eol_type eol_type;
1364 struct iso2022_decoder iso;
1366 int high_byte_count;
1367 unsigned int saw_single_shift:1;
1380 acceptable_control_char_p (int c)
1384 /* Allow and ignore control characters that you might
1385 reasonably see in a text file */
1390 case 8: /* backspace */
1391 case 11: /* vertical tab */
1392 case 12: /* form feed */
1393 case 26: /* MS-DOS C-z junk */
1394 case 31: /* '^_' -- for info */
1402 mask_has_at_most_one_bit_p (int mask)
1404 /* Perhaps the only thing useful you learn from intensive Microsoft
1405 technical interviews */
1406 return (mask & (mask - 1)) == 0;
1409 static enum eol_type
1410 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1419 st->eol.just_saw_cr = 1;
1424 if (st->eol.just_saw_cr)
1426 else if (st->eol.seen_anything)
1429 else if (st->eol.just_saw_cr)
1431 st->eol.just_saw_cr = 0;
1433 st->eol.seen_anything = 1;
1436 return EOL_AUTODETECT;
1439 /* Attempt to determine the encoding and EOL type of the given text.
1440 Before calling this function for the first type, you must initialize
1441 st->eol_type as appropriate and initialize st->mask to ~0.
1443 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1446 st->mask holds the determined coding category mask, or ~0 if only
1447 ASCII has been seen so far.
1451 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1452 is present in st->mask
1453 1 == definitive answers are here for both st->eol_type and st->mask
1457 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1458 unsigned int n, int just_do_eol)
1462 if (st->eol_type == EOL_AUTODETECT)
1463 st->eol_type = detect_eol_type (st, src, n);
1466 return st->eol_type != EOL_AUTODETECT;
1468 if (!st->seen_non_ascii)
1470 for (; n; n--, src++)
1473 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1475 st->seen_non_ascii = 1;
1477 st->shift_jis.mask = ~0;
1481 st->iso2022.mask = ~0;
1491 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1492 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1493 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1494 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1495 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1496 st->big5.mask = detect_coding_big5 (st, src, n);
1497 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1498 st->utf8.mask = detect_coding_utf8 (st, src, n);
1499 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1500 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1503 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1504 | st->utf8.mask | st->ucs4.mask;
1507 int retval = mask_has_at_most_one_bit_p (st->mask);
1508 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1509 return retval && st->eol_type != EOL_AUTODETECT;
1514 coding_system_from_mask (int mask)
1518 /* If the file was entirely or basically ASCII, use the
1519 default value of `buffer-file-coding-system'. */
1520 Lisp_Object retval =
1521 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1524 retval = Ffind_coding_system (retval);
1528 (Qbad_variable, Qwarning,
1529 "Invalid `default-buffer-file-coding-system', set to nil");
1530 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1534 retval = Fget_coding_system (Qraw_text);
1542 mask = postprocess_iso2022_mask (mask);
1544 /* Look through the coding categories by priority and find
1545 the first one that is allowed. */
1546 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1548 cat = coding_category_by_priority[i];
1549 if ((mask & (1 << cat)) &&
1550 !NILP (coding_category_system[cat]))
1554 return coding_category_system[cat];
1556 return Fget_coding_system (Qraw_text);
1560 /* Given a seekable read stream and potential coding system and EOL type
1561 as specified, do any autodetection that is called for. If the
1562 coding system and/or EOL type are not autodetect, they will be left
1563 alone; but this function will never return an autodetect coding system
1566 This function does not automatically fetch subsidiary coding systems;
1567 that should be unnecessary with the explicit eol-type argument. */
1570 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1571 enum eol_type *eol_type_in_out)
1573 struct detection_state decst;
1575 if (*eol_type_in_out == EOL_AUTODETECT)
1576 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1579 decst.eol_type = *eol_type_in_out;
1582 /* If autodetection is called for, do it now. */
1583 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1584 *eol_type_in_out == EOL_AUTODETECT)
1589 unsigned char random_buffer[4096];
1592 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1595 if (detect_coding_type (&decst, random_buffer, nread,
1596 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1597 CODESYS_AUTODETECT))
1601 *eol_type_in_out = decst.eol_type;
1602 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1603 *codesys_in_out = coding_system_from_mask (decst.mask);
1606 /* If we absolutely can't determine the EOL type, just assume LF. */
1607 if (*eol_type_in_out == EOL_AUTODETECT)
1608 *eol_type_in_out = EOL_LF;
1610 Lstream_rewind (stream);
1613 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1614 Detect coding system of the text in the region between START and END.
1615 Returned a list of possible coding systems ordered by priority.
1616 If only ASCII characters are found, it returns 'undecided or one of
1617 its subsidiary coding systems according to a detected end-of-line
1618 type. Optional arg BUFFER defaults to the current buffer.
1620 (start, end, buffer))
1622 Lisp_Object val = Qnil;
1623 struct buffer *buf = decode_buffer (buffer, 0);
1625 Lisp_Object instream, lb_instream;
1626 Lstream *istr, *lb_istr;
1627 struct detection_state decst;
1628 struct gcpro gcpro1, gcpro2;
1630 get_buffer_range_char (buf, start, end, &b, &e, 0);
1631 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1632 lb_istr = XLSTREAM (lb_instream);
1633 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1634 istr = XLSTREAM (instream);
1635 GCPRO2 (instream, lb_instream);
1637 decst.eol_type = EOL_AUTODETECT;
1641 unsigned char random_buffer[4096];
1642 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1646 if (detect_coding_type (&decst, random_buffer, nread, 0))
1650 if (decst.mask == ~0)
1651 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1659 decst.mask = postprocess_iso2022_mask (decst.mask);
1661 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1663 int sys = coding_category_by_priority[i];
1664 if (decst.mask & (1 << sys))
1666 Lisp_Object codesys = coding_category_system[sys];
1667 if (!NILP (codesys))
1668 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1669 val = Fcons (codesys, val);
1673 Lstream_close (istr);
1675 Lstream_delete (istr);
1676 Lstream_delete (lb_istr);
1681 /************************************************************************/
1682 /* Converting to internal Mule format ("decoding") */
1683 /************************************************************************/
1685 /* A decoding stream is a stream used for decoding text (i.e.
1686 converting from some external format to internal format).
1687 The decoding-stream object keeps track of the actual coding
1688 stream, the stream that is at the other end, and data that
1689 needs to be persistent across the lifetime of the stream. */
1691 /* Handle the EOL stuff related to just-read-in character C.
1692 EOL_TYPE is the EOL type of the coding stream.
1693 FLAGS is the current value of FLAGS in the coding stream, and may
1694 be modified by this macro. (The macro only looks at the
1695 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1696 bytes are to be written. You need to also define a local goto
1697 label "label_continue_loop" that is at the end of the main
1698 character-reading loop.
1700 If C is a CR character, then this macro handles it entirely and
1701 jumps to label_continue_loop. Otherwise, this macro does not add
1702 anything to DST, and continues normally. You should continue
1703 processing C normally after this macro. */
1705 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1709 if (eol_type == EOL_CR) \
1710 Dynarr_add (dst, '\n'); \
1711 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1712 Dynarr_add (dst, c); \
1714 flags |= CODING_STATE_CR; \
1715 goto label_continue_loop; \
1717 else if (flags & CODING_STATE_CR) \
1718 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1720 Dynarr_add (dst, '\r'); \
1721 flags &= ~CODING_STATE_CR; \
1725 /* C should be a binary character in the range 0 - 255; convert
1726 to internal format and add to Dynarr DST. */
1729 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1731 if (BYTE_ASCII_P (c)) \
1732 Dynarr_add (dst, c); \
1735 Dynarr_add (dst, (c >> 6) | 0xc0); \
1736 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1741 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1745 Dynarr_add (dst, c);
1747 else if ( c <= 0x7ff )
1749 Dynarr_add (dst, (c >> 6) | 0xc0);
1750 Dynarr_add (dst, (c & 0x3f) | 0x80);
1752 else if ( c <= 0xffff )
1754 Dynarr_add (dst, (c >> 12) | 0xe0);
1755 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1756 Dynarr_add (dst, (c & 0x3f) | 0x80);
1758 else if ( c <= 0x1fffff )
1760 Dynarr_add (dst, (c >> 18) | 0xf0);
1761 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1762 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1763 Dynarr_add (dst, (c & 0x3f) | 0x80);
1765 else if ( c <= 0x3ffffff )
1767 Dynarr_add (dst, (c >> 24) | 0xf8);
1768 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1769 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1770 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1771 Dynarr_add (dst, (c & 0x3f) | 0x80);
1775 Dynarr_add (dst, (c >> 30) | 0xfc);
1776 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1777 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1778 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1779 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1780 Dynarr_add (dst, (c & 0x3f) | 0x80);
1784 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1786 if (BYTE_ASCII_P (c)) \
1787 Dynarr_add (dst, c); \
1788 else if (BYTE_C1_P (c)) \
1790 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1791 Dynarr_add (dst, c + 0x20); \
1795 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1796 Dynarr_add (dst, c); \
1801 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1805 DECODE_ADD_BINARY_CHAR (ch, dst); \
1810 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1812 if (flags & CODING_STATE_END) \
1814 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1815 if (flags & CODING_STATE_CR) \
1816 Dynarr_add (dst, '\r'); \
1820 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1822 struct decoding_stream
1824 /* Coding system that governs the conversion. */
1825 Lisp_Coding_System *codesys;
1827 /* Stream that we read the encoded data from or
1828 write the decoded data to. */
1831 /* If we are reading, then we can return only a fixed amount of
1832 data, so if the conversion resulted in too much data, we store it
1833 here for retrieval the next time around. */
1834 unsigned_char_dynarr *runoff;
1836 /* FLAGS holds flags indicating the current state of the decoding.
1837 Some of these flags are dependent on the coding system. */
1840 /* CH holds a partially built-up character. Since we only deal
1841 with one- and two-byte characters at the moment, we only use
1842 this to store the first byte of a two-byte character. */
1845 /* EOL_TYPE specifies the type of end-of-line conversion that
1846 currently applies. We need to keep this separate from the
1847 EOL type stored in CODESYS because the latter might indicate
1848 automatic EOL-type detection while the former will always
1849 indicate a particular EOL type. */
1850 enum eol_type eol_type;
1852 /* Additional ISO2022 information. We define the structure above
1853 because it's also needed by the detection routines. */
1854 struct iso2022_decoder iso2022;
1856 /* Additional information (the state of the running CCL program)
1857 used by the CCL decoder. */
1858 struct ccl_program ccl;
1860 struct detection_state decst;
1863 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1864 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1865 static int decoding_rewinder (Lstream *stream);
1866 static int decoding_seekable_p (Lstream *stream);
1867 static int decoding_flusher (Lstream *stream);
1868 static int decoding_closer (Lstream *stream);
1870 static Lisp_Object decoding_marker (Lisp_Object stream,
1871 void (*markobj) (Lisp_Object));
1873 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1874 sizeof (struct decoding_stream));
1877 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1879 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1880 Lisp_Object str_obj;
1882 /* We do not need to mark the coding systems or charsets stored
1883 within the stream because they are stored in a global list
1884 and automatically marked. */
1886 XSETLSTREAM (str_obj, str);
1888 if (str->imp->marker)
1889 return (str->imp->marker) (str_obj, markobj);
1894 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1895 so we read data from the other end, decode it, and store it into DATA. */
1898 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1900 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1901 unsigned char *orig_data = data;
1903 int error_occurred = 0;
1905 /* We need to interface to mule_decode(), which expects to take some
1906 amount of data and store the result into a Dynarr. We have
1907 mule_decode() store into str->runoff, and take data from there
1910 /* We loop until we have enough data, reading chunks from the other
1911 end and decoding it. */
1914 /* Take data from the runoff if we can. Make sure to take at
1915 most SIZE bytes, and delete the data from the runoff. */
1916 if (Dynarr_length (str->runoff) > 0)
1918 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1919 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1920 Dynarr_delete_many (str->runoff, 0, chunk);
1926 break; /* No more room for data */
1928 if (str->flags & CODING_STATE_END)
1929 /* This means that on the previous iteration, we hit the EOF on
1930 the other end. We loop once more so that mule_decode() can
1931 output any final stuff it may be holding, or any "go back
1932 to a sane state" escape sequences. (This latter makes sense
1933 during encoding.) */
1936 /* Exhausted the runoff, so get some more. DATA has at least
1937 SIZE bytes left of storage in it, so it's OK to read directly
1938 into it. (We'll be overwriting above, after we've decoded it
1939 into the runoff.) */
1940 read_size = Lstream_read (str->other_end, data, size);
1947 /* There might be some more end data produced in the translation.
1948 See the comment above. */
1949 str->flags |= CODING_STATE_END;
1950 mule_decode (stream, data, str->runoff, read_size);
1953 if (data - orig_data == 0)
1954 return error_occurred ? -1 : 0;
1956 return data - orig_data;
1960 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1962 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1965 /* Decode all our data into the runoff, and then attempt to write
1966 it all out to the other end. Remove whatever chunk we succeeded
1968 mule_decode (stream, data, str->runoff, size);
1969 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1970 Dynarr_length (str->runoff));
1972 Dynarr_delete_many (str->runoff, 0, retval);
1973 /* Do NOT return retval. The return value indicates how much
1974 of the incoming data was written, not how many bytes were
1980 reset_decoding_stream (struct decoding_stream *str)
1983 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1985 Lisp_Object coding_system;
1986 XSETCODING_SYSTEM (coding_system, str->codesys);
1987 reset_iso2022 (coding_system, &str->iso2022);
1989 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1991 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1994 str->flags = str->ch = 0;
1998 decoding_rewinder (Lstream *stream)
2000 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2001 reset_decoding_stream (str);
2002 Dynarr_reset (str->runoff);
2003 return Lstream_rewind (str->other_end);
2007 decoding_seekable_p (Lstream *stream)
2009 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2010 return Lstream_seekable_p (str->other_end);
2014 decoding_flusher (Lstream *stream)
2016 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2017 return Lstream_flush (str->other_end);
2021 decoding_closer (Lstream *stream)
2023 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2024 if (stream->flags & LSTREAM_FL_WRITE)
2026 str->flags |= CODING_STATE_END;
2027 decoding_writer (stream, 0, 0);
2029 Dynarr_free (str->runoff);
2031 #ifdef ENABLE_COMPOSITE_CHARS
2032 if (str->iso2022.composite_chars)
2033 Dynarr_free (str->iso2022.composite_chars);
2036 return Lstream_close (str->other_end);
2040 decoding_stream_coding_system (Lstream *stream)
2042 Lisp_Object coding_system;
2043 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2045 XSETCODING_SYSTEM (coding_system, str->codesys);
2046 return subsidiary_coding_system (coding_system, str->eol_type);
2050 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2052 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2053 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2055 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2056 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2057 reset_decoding_stream (str);
2060 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2061 stream for writing, no automatic code detection will be performed.
2062 The reason for this is that automatic code detection requires a
2063 seekable input. Things will also fail if you open a decoding
2064 stream for reading using a non-fully-specified coding system and
2065 a non-seekable input stream. */
2068 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2071 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2072 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2076 str->other_end = stream;
2077 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2078 str->eol_type = EOL_AUTODETECT;
2079 if (!strcmp (mode, "r")
2080 && Lstream_seekable_p (stream))
2081 /* We can determine the coding system now. */
2082 determine_real_coding_system (stream, &codesys, &str->eol_type);
2083 set_decoding_stream_coding_system (lstr, codesys);
2084 str->decst.eol_type = str->eol_type;
2085 str->decst.mask = ~0;
2086 XSETLSTREAM (obj, lstr);
2091 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2093 return make_decoding_stream_1 (stream, codesys, "r");
2097 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2099 return make_decoding_stream_1 (stream, codesys, "w");
2102 /* Note: the decode_coding_* functions all take the same
2103 arguments as mule_decode(), which is to say some SRC data of
2104 size N, which is to be stored into dynamic array DST.
2105 DECODING is the stream within which the decoding is
2106 taking place, but no data is actually read from or
2107 written to that stream; that is handled in decoding_reader()
2108 or decoding_writer(). This allows the same functions to
2109 be used for both reading and writing. */
2112 mule_decode (Lstream *decoding, CONST unsigned char *src,
2113 unsigned_char_dynarr *dst, unsigned int n)
2115 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2117 /* If necessary, do encoding-detection now. We do this when
2118 we're a writing stream or a non-seekable reading stream,
2119 meaning that we can't just process the whole input,
2120 rewind, and start over. */
2122 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2123 str->eol_type == EOL_AUTODETECT)
2125 Lisp_Object codesys;
2127 XSETCODING_SYSTEM (codesys, str->codesys);
2128 detect_coding_type (&str->decst, src, n,
2129 CODING_SYSTEM_TYPE (str->codesys) !=
2130 CODESYS_AUTODETECT);
2131 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2132 str->decst.mask != ~0)
2133 /* #### This is cheesy. What we really ought to do is
2134 buffer up a certain amount of data so as to get a
2135 less random result. */
2136 codesys = coding_system_from_mask (str->decst.mask);
2137 str->eol_type = str->decst.eol_type;
2138 if (XCODING_SYSTEM (codesys) != str->codesys)
2140 /* Preserve the CODING_STATE_END flag in case it was set.
2141 If we erase it, bad things might happen. */
2142 int was_end = str->flags & CODING_STATE_END;
2143 set_decoding_stream_coding_system (decoding, codesys);
2145 str->flags |= CODING_STATE_END;
2149 switch (CODING_SYSTEM_TYPE (str->codesys))
2152 case CODESYS_INTERNAL:
2153 Dynarr_add_many (dst, src, n);
2156 case CODESYS_AUTODETECT:
2157 /* If we got this far and still haven't decided on the coding
2158 system, then do no conversion. */
2159 case CODESYS_NO_CONVERSION:
2160 decode_coding_no_conversion (decoding, src, dst, n);
2163 case CODESYS_SHIFT_JIS:
2164 decode_coding_sjis (decoding, src, dst, n);
2167 decode_coding_big5 (decoding, src, dst, n);
2170 decode_coding_ucs4 (decoding, src, dst, n);
2173 decode_coding_utf8 (decoding, src, dst, n);
2176 ccl_driver (&str->ccl, src, dst, n, 0);
2178 case CODESYS_ISO2022:
2179 decode_coding_iso2022 (decoding, src, dst, n);
2187 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2188 Decode the text between START and END which is encoded in CODING-SYSTEM.
2189 This is useful if you've read in encoded text from a file without decoding
2190 it (e.g. you read in a JIS-formatted file but used the `binary' or
2191 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2192 Return length of decoded text.
2193 BUFFER defaults to the current buffer if unspecified.
2195 (start, end, coding_system, buffer))
2198 struct buffer *buf = decode_buffer (buffer, 0);
2199 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2200 Lstream *istr, *ostr;
2201 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2203 get_buffer_range_char (buf, start, end, &b, &e, 0);
2205 barf_if_buffer_read_only (buf, b, e);
2207 coding_system = Fget_coding_system (coding_system);
2208 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2209 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2210 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2212 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2213 Fget_coding_system (Qbinary));
2214 istr = XLSTREAM (instream);
2215 ostr = XLSTREAM (outstream);
2216 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2218 /* The chain of streams looks like this:
2220 [BUFFER] <----- send through
2221 ------> [ENCODE AS BINARY]
2222 ------> [DECODE AS SPECIFIED]
2228 char tempbuf[1024]; /* some random amount */
2229 Bufpos newpos, even_newer_pos;
2230 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2231 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2235 newpos = lisp_buffer_stream_startpos (istr);
2236 Lstream_write (ostr, tempbuf, size_in_bytes);
2237 even_newer_pos = lisp_buffer_stream_startpos (istr);
2238 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2241 Lstream_close (istr);
2242 Lstream_close (ostr);
2244 Lstream_delete (istr);
2245 Lstream_delete (ostr);
2246 Lstream_delete (XLSTREAM (de_outstream));
2247 Lstream_delete (XLSTREAM (lb_outstream));
2252 /************************************************************************/
2253 /* Converting to an external encoding ("encoding") */
2254 /************************************************************************/
2256 /* An encoding stream is an output stream. When you create the
2257 stream, you specify the coding system that governs the encoding
2258 and another stream that the resulting encoded data is to be
2259 sent to, and then start sending data to it. */
2261 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2263 struct encoding_stream
2265 /* Coding system that governs the conversion. */
2266 Lisp_Coding_System *codesys;
2268 /* Stream that we read the encoded data from or
2269 write the decoded data to. */
2272 /* If we are reading, then we can return only a fixed amount of
2273 data, so if the conversion resulted in too much data, we store it
2274 here for retrieval the next time around. */
2275 unsigned_char_dynarr *runoff;
2277 /* FLAGS holds flags indicating the current state of the encoding.
2278 Some of these flags are dependent on the coding system. */
2281 /* CH holds a partially built-up character. Since we only deal
2282 with one- and two-byte characters at the moment, we only use
2283 this to store the first byte of a two-byte character. */
2286 /* Additional information used by the ISO2022 encoder. */
2289 /* CHARSET holds the character sets currently assigned to the G0
2290 through G3 registers. It is initialized from the array
2291 INITIAL_CHARSET in CODESYS. */
2292 Lisp_Object charset[4];
2294 /* Which registers are currently invoked into the left (GL) and
2295 right (GR) halves of the 8-bit encoding space? */
2296 int register_left, register_right;
2298 /* Whether we need to explicitly designate the charset in the
2299 G? register before using it. It is initialized from the
2300 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2301 unsigned char force_charset_on_output[4];
2303 /* Other state variables that need to be preserved across
2305 Lisp_Object current_charset;
2307 int current_char_boundary;
2310 /* Additional information (the state of the running CCL program)
2311 used by the CCL encoder. */
2312 struct ccl_program ccl;
2316 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2317 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2319 static int encoding_rewinder (Lstream *stream);
2320 static int encoding_seekable_p (Lstream *stream);
2321 static int encoding_flusher (Lstream *stream);
2322 static int encoding_closer (Lstream *stream);
2324 static Lisp_Object encoding_marker (Lisp_Object stream,
2325 void (*markobj) (Lisp_Object));
2327 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2328 sizeof (struct encoding_stream));
2331 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2333 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2334 Lisp_Object str_obj;
2336 /* We do not need to mark the coding systems or charsets stored
2337 within the stream because they are stored in a global list
2338 and automatically marked. */
2340 XSETLSTREAM (str_obj, str);
2342 if (str->imp->marker)
2343 return (str->imp->marker) (str_obj, markobj);
2348 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2349 so we read data from the other end, encode it, and store it into DATA. */
2352 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2354 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2355 unsigned char *orig_data = data;
2357 int error_occurred = 0;
2359 /* We need to interface to mule_encode(), which expects to take some
2360 amount of data and store the result into a Dynarr. We have
2361 mule_encode() store into str->runoff, and take data from there
2364 /* We loop until we have enough data, reading chunks from the other
2365 end and encoding it. */
2368 /* Take data from the runoff if we can. Make sure to take at
2369 most SIZE bytes, and delete the data from the runoff. */
2370 if (Dynarr_length (str->runoff) > 0)
2372 int chunk = min ((int) size, Dynarr_length (str->runoff));
2373 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2374 Dynarr_delete_many (str->runoff, 0, chunk);
2380 break; /* No more room for data */
2382 if (str->flags & CODING_STATE_END)
2383 /* This means that on the previous iteration, we hit the EOF on
2384 the other end. We loop once more so that mule_encode() can
2385 output any final stuff it may be holding, or any "go back
2386 to a sane state" escape sequences. (This latter makes sense
2387 during encoding.) */
2390 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2391 left of storage in it, so it's OK to read directly into it.
2392 (We'll be overwriting above, after we've encoded it into the
2394 read_size = Lstream_read (str->other_end, data, size);
2401 /* There might be some more end data produced in the translation.
2402 See the comment above. */
2403 str->flags |= CODING_STATE_END;
2404 mule_encode (stream, data, str->runoff, read_size);
2407 if (data == orig_data)
2408 return error_occurred ? -1 : 0;
2410 return data - orig_data;
2414 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2416 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2419 /* Encode all our data into the runoff, and then attempt to write
2420 it all out to the other end. Remove whatever chunk we succeeded
2422 mule_encode (stream, data, str->runoff, size);
2423 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2424 Dynarr_length (str->runoff));
2426 Dynarr_delete_many (str->runoff, 0, retval);
2427 /* Do NOT return retval. The return value indicates how much
2428 of the incoming data was written, not how many bytes were
2434 reset_encoding_stream (struct encoding_stream *str)
2437 switch (CODING_SYSTEM_TYPE (str->codesys))
2439 case CODESYS_ISO2022:
2443 for (i = 0; i < 4; i++)
2445 str->iso2022.charset[i] =
2446 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2447 str->iso2022.force_charset_on_output[i] =
2448 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2450 str->iso2022.register_left = 0;
2451 str->iso2022.register_right = 1;
2452 str->iso2022.current_charset = Qnil;
2453 str->iso2022.current_half = 0;
2454 str->iso2022.current_char_boundary = 1;
2458 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2465 str->flags = str->ch = 0;
2469 encoding_rewinder (Lstream *stream)
2471 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2472 reset_encoding_stream (str);
2473 Dynarr_reset (str->runoff);
2474 return Lstream_rewind (str->other_end);
2478 encoding_seekable_p (Lstream *stream)
2480 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2481 return Lstream_seekable_p (str->other_end);
2485 encoding_flusher (Lstream *stream)
2487 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2488 return Lstream_flush (str->other_end);
2492 encoding_closer (Lstream *stream)
2494 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2495 if (stream->flags & LSTREAM_FL_WRITE)
2497 str->flags |= CODING_STATE_END;
2498 encoding_writer (stream, 0, 0);
2500 Dynarr_free (str->runoff);
2501 return Lstream_close (str->other_end);
2505 encoding_stream_coding_system (Lstream *stream)
2507 Lisp_Object coding_system;
2508 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2510 XSETCODING_SYSTEM (coding_system, str->codesys);
2511 return coding_system;
2515 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2517 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2518 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2520 reset_encoding_stream (str);
2524 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2527 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2528 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2532 str->runoff = Dynarr_new (unsigned_char);
2533 str->other_end = stream;
2534 set_encoding_stream_coding_system (lstr, codesys);
2535 XSETLSTREAM (obj, lstr);
2540 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2542 return make_encoding_stream_1 (stream, codesys, "r");
2546 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2548 return make_encoding_stream_1 (stream, codesys, "w");
2551 /* Convert N bytes of internally-formatted data stored in SRC to an
2552 external format, according to the encoding stream ENCODING.
2553 Store the encoded data into DST. */
2556 mule_encode (Lstream *encoding, CONST unsigned char *src,
2557 unsigned_char_dynarr *dst, unsigned int n)
2559 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2561 switch (CODING_SYSTEM_TYPE (str->codesys))
2564 case CODESYS_INTERNAL:
2565 Dynarr_add_many (dst, src, n);
2568 case CODESYS_AUTODETECT:
2569 /* If we got this far and still haven't decided on the coding
2570 system, then do no conversion. */
2571 case CODESYS_NO_CONVERSION:
2572 encode_coding_no_conversion (encoding, src, dst, n);
2575 case CODESYS_SHIFT_JIS:
2576 encode_coding_sjis (encoding, src, dst, n);
2579 encode_coding_big5 (encoding, src, dst, n);
2582 encode_coding_ucs4 (encoding, src, dst, n);
2585 encode_coding_utf8 (encoding, src, dst, n);
2588 ccl_driver (&str->ccl, src, dst, n, 0);
2590 case CODESYS_ISO2022:
2591 encode_coding_iso2022 (encoding, src, dst, n);
2599 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2600 Encode the text between START and END using CODING-SYSTEM.
2601 This will, for example, convert Japanese characters into stuff such as
2602 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2603 text. BUFFER defaults to the current buffer if unspecified.
2605 (start, end, coding_system, buffer))
2608 struct buffer *buf = decode_buffer (buffer, 0);
2609 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2610 Lstream *istr, *ostr;
2611 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2613 get_buffer_range_char (buf, start, end, &b, &e, 0);
2615 barf_if_buffer_read_only (buf, b, e);
2617 coding_system = Fget_coding_system (coding_system);
2618 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2619 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2620 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2621 Fget_coding_system (Qbinary));
2622 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2624 istr = XLSTREAM (instream);
2625 ostr = XLSTREAM (outstream);
2626 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2627 /* The chain of streams looks like this:
2629 [BUFFER] <----- send through
2630 ------> [ENCODE AS SPECIFIED]
2631 ------> [DECODE AS BINARY]
2636 char tempbuf[1024]; /* some random amount */
2637 Bufpos newpos, even_newer_pos;
2638 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2639 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2643 newpos = lisp_buffer_stream_startpos (istr);
2644 Lstream_write (ostr, tempbuf, size_in_bytes);
2645 even_newer_pos = lisp_buffer_stream_startpos (istr);
2646 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2652 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2653 Lstream_close (istr);
2654 Lstream_close (ostr);
2656 Lstream_delete (istr);
2657 Lstream_delete (ostr);
2658 Lstream_delete (XLSTREAM (de_outstream));
2659 Lstream_delete (XLSTREAM (lb_outstream));
2660 return make_int (retlen);
2666 /************************************************************************/
2667 /* Shift-JIS methods */
2668 /************************************************************************/
2670 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2671 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2672 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2673 encoded by "position-code + 0x80". A character of JISX0208
2674 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2675 position-codes are divided and shifted so that it fit in the range
2678 --- CODE RANGE of Shift-JIS ---
2679 (character set) (range)
2681 JISX0201-Kana 0xA0 .. 0xDF
2682 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2683 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2684 -------------------------------
2688 /* Is this the first byte of a Shift-JIS two-byte char? */
2690 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2691 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2693 /* Is this the second byte of a Shift-JIS two-byte char? */
2695 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2696 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2698 #define BYTE_SJIS_KATAKANA_P(c) \
2699 ((c) >= 0xA1 && (c) <= 0xDF)
2702 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2710 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2712 if (st->shift_jis.in_second_byte)
2714 st->shift_jis.in_second_byte = 0;
2718 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2719 st->shift_jis.in_second_byte = 1;
2721 return CODING_CATEGORY_SHIFT_JIS_MASK;
2724 /* Convert Shift-JIS data to internal format. */
2727 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2728 unsigned_char_dynarr *dst, unsigned int n)
2731 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2732 unsigned int flags = str->flags;
2733 unsigned int ch = str->ch;
2734 eol_type_t eol_type = str->eol_type;
2742 /* Previous character was first byte of Shift-JIS Kanji char. */
2743 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2745 unsigned char e1, e2;
2747 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2748 DECODE_SJIS (ch, c, e1, e2);
2749 Dynarr_add (dst, e1);
2750 Dynarr_add (dst, e2);
2754 DECODE_ADD_BINARY_CHAR (ch, dst);
2755 DECODE_ADD_BINARY_CHAR (c, dst);
2761 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2762 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2764 else if (BYTE_SJIS_KATAKANA_P (c))
2766 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2767 Dynarr_add (dst, c);
2770 DECODE_ADD_BINARY_CHAR (c, dst);
2772 label_continue_loop:;
2775 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2781 /* Convert internally-formatted data to Shift-JIS. */
2784 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2785 unsigned_char_dynarr *dst, unsigned int n)
2788 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2789 unsigned int flags = str->flags;
2790 unsigned int ch = str->ch;
2791 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2798 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2799 Dynarr_add (dst, '\r');
2800 if (eol_type != EOL_CR)
2801 Dynarr_add (dst, '\n');
2804 else if (BYTE_ASCII_P (c))
2806 Dynarr_add (dst, c);
2809 else if (BUFBYTE_LEADING_BYTE_P (c))
2810 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2811 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2812 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2815 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2817 Dynarr_add (dst, c);
2820 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2821 ch == LEADING_BYTE_JAPANESE_JISX0208)
2825 unsigned char j1, j2;
2826 ENCODE_SJIS (ch, c, j1, j2);
2827 Dynarr_add (dst, j1);
2828 Dynarr_add (dst, j2);
2838 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2839 Decode a JISX0208 character of Shift-JIS coding-system.
2840 CODE is the character code in Shift-JIS as a cons of type bytes.
2841 Return the corresponding character.
2845 unsigned char c1, c2, s1, s2;
2848 CHECK_INT (XCAR (code));
2849 CHECK_INT (XCDR (code));
2850 s1 = XINT (XCAR (code));
2851 s2 = XINT (XCDR (code));
2852 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2853 BYTE_SJIS_TWO_BYTE_2_P (s2))
2855 DECODE_SJIS (s1, s2, c1, c2);
2856 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2857 c1 & 0x7F, c2 & 0x7F));
2863 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2864 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2865 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2869 Lisp_Object charset;
2872 CHECK_CHAR_COERCE_INT (ch);
2873 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2874 if (EQ (charset, Vcharset_japanese_jisx0208))
2876 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2877 return Fcons (make_int (s1), make_int (s2));
2884 /************************************************************************/
2886 /************************************************************************/
2888 /* BIG5 is a coding system encoding two character sets: ASCII and
2889 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2890 character set and is encoded in two-byte.
2892 --- CODE RANGE of BIG5 ---
2893 (character set) (range)
2895 Big5 (1st byte) 0xA1 .. 0xFE
2896 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2897 --------------------------
2899 Since the number of characters in Big5 is larger than maximum
2900 characters in Emacs' charset (96x96), it can't be handled as one
2901 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2902 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2903 contains frequently used characters and the latter contains less
2904 frequently used characters. */
2906 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2907 ((c) >= 0xA1 && (c) <= 0xFE)
2909 /* Is this the second byte of a Shift-JIS two-byte char? */
2911 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2912 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2914 /* Number of Big5 characters which have the same code in 1st byte. */
2916 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2918 /* Code conversion macros. These are macros because they are used in
2919 inner loops during code conversion.
2921 Note that temporary variables in macros introduce the classic
2922 dynamic-scoping problems with variable names. We use capital-
2923 lettered variables in the assumption that XEmacs does not use
2924 capital letters in variables except in a very formalized way
2927 /* Convert Big5 code (b1, b2) into its internal string representation
2930 /* There is a much simpler way to split the Big5 charset into two.
2931 For the moment I'm going to leave the algorithm as-is because it
2932 claims to separate out the most-used characters into a single
2933 charset, which perhaps will lead to optimizations in various
2936 The way the algorithm works is something like this:
2938 Big5 can be viewed as a 94x157 charset, where the row is
2939 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2940 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2941 the split between low and high column numbers is apparently
2942 meaningless; ascending rows produce less and less frequent chars.
2943 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2944 the first charset, and the upper half (0xC9 .. 0xFE) to the
2945 second. To do the conversion, we convert the character into
2946 a single number where 0 .. 156 is the first row, 157 .. 313
2947 is the second, etc. That way, the characters are ordered by
2948 decreasing frequency. Then we just chop the space in two
2949 and coerce the result into a 94x94 space.
2952 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2954 int B1 = b1, B2 = b2; \
2956 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2960 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2964 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2965 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2967 c1 = I / (0xFF - 0xA1) + 0xA1; \
2968 c2 = I % (0xFF - 0xA1) + 0xA1; \
2971 /* Convert the internal string representation of a Big5 character
2972 (lb, c1, c2) into Big5 code (b1, b2). */
2974 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2976 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2978 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2980 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2982 b1 = I / BIG5_SAME_ROW + 0xA1; \
2983 b2 = I % BIG5_SAME_ROW; \
2984 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2988 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2996 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2997 (c >= 0x80 && c <= 0xA0))
2999 if (st->big5.in_second_byte)
3001 st->big5.in_second_byte = 0;
3002 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3006 st->big5.in_second_byte = 1;
3008 return CODING_CATEGORY_BIG5_MASK;
3011 /* Convert Big5 data to internal format. */
3014 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3015 unsigned_char_dynarr *dst, unsigned int n)
3018 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3019 unsigned int flags = str->flags;
3020 unsigned int ch = str->ch;
3021 eol_type_t eol_type = str->eol_type;
3028 /* Previous character was first byte of Big5 char. */
3029 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3031 unsigned char b1, b2, b3;
3032 DECODE_BIG5 (ch, c, b1, b2, b3);
3033 Dynarr_add (dst, b1);
3034 Dynarr_add (dst, b2);
3035 Dynarr_add (dst, b3);
3039 DECODE_ADD_BINARY_CHAR (ch, dst);
3040 DECODE_ADD_BINARY_CHAR (c, dst);
3046 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3047 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3050 DECODE_ADD_BINARY_CHAR (c, dst);
3052 label_continue_loop:;
3055 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3061 /* Convert internally-formatted data to Big5. */
3064 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3065 unsigned_char_dynarr *dst, unsigned int n)
3068 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3069 unsigned int flags = str->flags;
3070 unsigned int ch = str->ch;
3071 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3078 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3079 Dynarr_add (dst, '\r');
3080 if (eol_type != EOL_CR)
3081 Dynarr_add (dst, '\n');
3083 else if (BYTE_ASCII_P (c))
3086 Dynarr_add (dst, c);
3088 else if (BUFBYTE_LEADING_BYTE_P (c))
3090 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3091 c == LEADING_BYTE_CHINESE_BIG5_2)
3093 /* A recognized leading byte. */
3095 continue; /* not done with this character. */
3097 /* otherwise just ignore this character. */
3099 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3100 ch == LEADING_BYTE_CHINESE_BIG5_2)
3102 /* Previous char was a recognized leading byte. */
3104 continue; /* not done with this character. */
3108 /* Encountering second byte of a Big5 character. */
3109 unsigned char b1, b2;
3111 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3112 Dynarr_add (dst, b1);
3113 Dynarr_add (dst, b2);
3124 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3125 Decode a Big5 character CODE of BIG5 coding-system.
3126 CODE is the character code in BIG5, a cons of two integers.
3127 Return the corresponding character.
3131 unsigned char c1, c2, b1, b2;
3134 CHECK_INT (XCAR (code));
3135 CHECK_INT (XCDR (code));
3136 b1 = XINT (XCAR (code));
3137 b2 = XINT (XCDR (code));
3138 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3139 BYTE_BIG5_TWO_BYTE_2_P (b2))
3142 Lisp_Object charset;
3143 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3144 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3145 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3151 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3152 Encode the Big5 character CH to BIG5 coding-system.
3153 Return the corresponding character code in Big5.
3157 Lisp_Object charset;
3160 CHECK_CHAR_COERCE_INT (ch);
3161 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3162 if (EQ (charset, Vcharset_chinese_big5_1) ||
3163 EQ (charset, Vcharset_chinese_big5_2))
3165 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3167 return Fcons (make_int (b1), make_int (b2));
3174 /************************************************************************/
3177 /* UCS-4 character codes are implemented as nonnegative integers. */
3179 /************************************************************************/
3181 Lisp_Object ucs_to_mule_table[65536];
3182 Lisp_Object mule_to_ucs_table;
3184 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3185 Map UCS-4 code CODE to Mule character CHARACTER.
3187 Return T on success, NIL on failure.
3193 CHECK_CHAR (character);
3197 if (c < sizeof (ucs_to_mule_table))
3199 ucs_to_mule_table[c] = character;
3207 ucs_to_char (unsigned long code)
3209 if (code < sizeof (ucs_to_mule_table))
3211 return ucs_to_mule_table[code];
3213 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3218 c = code % (94 * 94);
3220 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3221 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3222 CHARSET_LEFT_TO_RIGHT),
3223 c / 94 + 33, c % 94 + 33));
3229 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3230 Return Mule character corresponding to UCS code CODE (a positive integer).
3234 CHECK_NATNUM (code);
3235 return ucs_to_char (XINT (code));
3238 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3239 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3243 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3244 Fset_char_ucs is more restrictive on index arg, but should
3245 check code arg in a char_table method. */
3246 CHECK_CHAR (character);
3247 CHECK_NATNUM (code);
3248 return Fput_char_table (character, code, mule_to_ucs_table);
3251 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3252 Return the UCS code (a positive integer) corresponding to CHARACTER.
3256 return Fget_char_table (character, mule_to_ucs_table);
3260 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3262 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3263 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3264 is not found, instead.
3265 #### do something more appropriate (use blob?)
3266 Danger, Will Robinson! Data loss. Should we signal user? */
3268 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3270 Lisp_Object chr = ucs_to_char (ch);
3274 Bufbyte work[MAX_EMCHAR_LEN];
3279 simple_set_charptr_emchar (work, ch) :
3280 non_ascii_set_charptr_emchar (work, ch);
3281 Dynarr_add_many (dst, work, len);
3285 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3286 Dynarr_add (dst, 34 + 128);
3287 Dynarr_add (dst, 46 + 128);
3292 static unsigned long
3293 mule_char_to_ucs4 (Lisp_Object charset,
3294 unsigned char h, unsigned char l)
3297 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3304 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3305 (XCHARSET_CHARS (charset) == 94) )
3307 unsigned char final = XCHARSET_FINAL (charset);
3309 if ( ('@' <= final) && (final < 0x7f) )
3311 return 0xe00000 + (final - '@') * 94 * 94
3312 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3326 encode_ucs4 (Lisp_Object charset,
3327 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3329 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3330 Dynarr_add (dst, code >> 24);
3331 Dynarr_add (dst, (code >> 16) & 255);
3332 Dynarr_add (dst, (code >> 8) & 255);
3333 Dynarr_add (dst, code & 255);
3337 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3343 switch (st->ucs4.in_byte)
3352 st->ucs4.in_byte = 0;
3358 return CODING_CATEGORY_UCS4_MASK;
3362 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3363 unsigned_char_dynarr *dst, unsigned int n)
3365 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3366 unsigned int flags = str->flags;
3367 unsigned int ch = str->ch;
3371 unsigned char c = *src++;
3379 decode_ucs4 ( ( ch << 8 ) | c, dst);
3384 ch = ( ch << 8 ) | c;
3388 if (flags & CODING_STATE_END)
3389 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3396 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3397 unsigned_char_dynarr *dst, unsigned int n)
3399 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3400 unsigned int flags = str->flags;
3401 unsigned int ch = str->ch;
3402 unsigned char char_boundary = str->iso2022.current_char_boundary;
3403 Lisp_Object charset = str->iso2022.current_charset;
3405 #ifdef ENABLE_COMPOSITE_CHARS
3406 /* flags for handling composite chars. We do a little switcharoo
3407 on the source while we're outputting the composite char. */
3408 unsigned int saved_n = 0;
3409 CONST unsigned char *saved_src = NULL;
3410 int in_composite = 0;
3417 unsigned char c = *src++;
3419 if (BYTE_ASCII_P (c))
3420 { /* Processing ASCII character */
3422 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3425 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3426 { /* Processing Leading Byte */
3428 charset = CHARSET_BY_LEADING_BYTE (c);
3429 if (LEADING_BYTE_PREFIX_P(c))
3434 { /* Processing Non-ASCII character */
3436 if (EQ (charset, Vcharset_control_1))
3438 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3442 switch (XCHARSET_REP_BYTES (charset))
3445 encode_ucs4 (charset, c, 0, dst);
3448 if (XCHARSET_PRIVATE_P (charset))
3450 encode_ucs4 (charset, c, 0, dst);
3455 #ifdef ENABLE_COMPOSITE_CHARS
3456 if (EQ (charset, Vcharset_composite))
3460 /* #### Bother! We don't know how to
3462 Dynarr_add (dst, 0);
3463 Dynarr_add (dst, 0);
3464 Dynarr_add (dst, 0);
3465 Dynarr_add (dst, '~');
3469 Emchar emch = MAKE_CHAR (Vcharset_composite,
3470 ch & 0x7F, c & 0x7F);
3471 Lisp_Object lstr = composite_char_string (emch);
3475 src = XSTRING_DATA (lstr);
3476 n = XSTRING_LENGTH (lstr);
3480 #endif /* ENABLE_COMPOSITE_CHARS */
3482 encode_ucs4(charset, ch, c, dst);
3495 encode_ucs4 (charset, ch, c, dst);
3511 #ifdef ENABLE_COMPOSITE_CHARS
3517 goto back_to_square_n; /* Wheeeeeeeee ..... */
3519 #endif /* ENABLE_COMPOSITE_CHARS */
3523 str->iso2022.current_char_boundary = char_boundary;
3524 str->iso2022.current_charset = charset;
3526 /* Verbum caro factum est! */
3530 /************************************************************************/
3532 /************************************************************************/
3535 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3540 unsigned char c = *src++;
3541 switch (st->utf8.in_byte)
3544 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3547 st->utf8.in_byte = 5;
3549 st->utf8.in_byte = 4;
3551 st->utf8.in_byte = 3;
3553 st->utf8.in_byte = 2;
3555 st->utf8.in_byte = 1;
3560 if ((c & 0xc0) != 0x80)
3566 return CODING_CATEGORY_UTF8_MASK;
3570 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3571 unsigned_char_dynarr *dst, unsigned int n)
3573 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3574 unsigned int flags = str->flags;
3575 unsigned int ch = str->ch;
3576 eol_type_t eol_type = str->eol_type;
3580 unsigned char c = *src++;
3589 else if ( c >= 0xf8 )
3594 else if ( c >= 0xf0 )
3599 else if ( c >= 0xe0 )
3604 else if ( c >= 0xc0 )
3611 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3612 decode_ucs4 (c, dst);
3616 ch = ( ch << 6 ) | ( c & 0x3f );
3617 decode_ucs4 (ch, dst);
3622 ch = ( ch << 6 ) | ( c & 0x3f );
3625 label_continue_loop:;
3628 if (flags & CODING_STATE_END)
3629 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3636 encode_utf8 (Lisp_Object charset,
3637 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3639 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3642 Dynarr_add (dst, code);
3644 else if ( code <= 0x7ff )
3646 Dynarr_add (dst, (code >> 6) | 0xc0);
3647 Dynarr_add (dst, (code & 0x3f) | 0x80);
3649 else if ( code <= 0xffff )
3651 Dynarr_add (dst, (code >> 12) | 0xe0);
3652 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3653 Dynarr_add (dst, (code & 0x3f) | 0x80);
3655 else if ( code <= 0x1fffff )
3657 Dynarr_add (dst, (code >> 18) | 0xf0);
3658 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3659 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3660 Dynarr_add (dst, (code & 0x3f) | 0x80);
3662 else if ( code <= 0x3ffffff )
3664 Dynarr_add (dst, (code >> 24) | 0xf8);
3665 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3666 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3667 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3668 Dynarr_add (dst, (code & 0x3f) | 0x80);
3672 Dynarr_add (dst, (code >> 30) | 0xfc);
3673 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3674 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3675 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3676 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3677 Dynarr_add (dst, (code & 0x3f) | 0x80);
3682 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3683 unsigned_char_dynarr *dst, unsigned int n)
3685 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3686 unsigned int flags = str->flags;
3687 unsigned int ch = str->ch;
3688 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3689 unsigned char char_boundary = str->iso2022.current_char_boundary;
3690 Lisp_Object charset = str->iso2022.current_charset;
3692 #ifdef ENABLE_COMPOSITE_CHARS
3693 /* flags for handling composite chars. We do a little switcharoo
3694 on the source while we're outputting the composite char. */
3695 unsigned int saved_n = 0;
3696 CONST unsigned char *saved_src = NULL;
3697 int in_composite = 0;
3700 #endif /* ENABLE_COMPOSITE_CHARS */
3704 unsigned char c = *src++;
3706 if (BYTE_ASCII_P (c))
3707 { /* Processing ASCII character */
3711 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3712 Dynarr_add (dst, '\r');
3713 if (eol_type != EOL_CR)
3714 Dynarr_add (dst, c);
3717 encode_utf8 (Vcharset_ascii, c, 0, dst);
3720 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3721 { /* Processing Leading Byte */
3723 charset = CHARSET_BY_LEADING_BYTE (c);
3724 if (LEADING_BYTE_PREFIX_P(c))
3729 { /* Processing Non-ASCII character */
3731 if (EQ (charset, Vcharset_control_1))
3733 encode_utf8 (Vcharset_control_1, c, 0, dst);
3737 switch (XCHARSET_REP_BYTES (charset))
3740 encode_utf8 (charset, c, 0, dst);
3743 if (XCHARSET_PRIVATE_P (charset))
3745 encode_utf8 (charset, c, 0, dst);
3750 #ifdef ENABLE_COMPOSITE_CHARS
3751 if (EQ (charset, Vcharset_composite))
3755 /* #### Bother! We don't know how to
3757 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3761 Emchar emch = MAKE_CHAR (Vcharset_composite,
3762 ch & 0x7F, c & 0x7F);
3763 Lisp_Object lstr = composite_char_string (emch);
3767 src = XSTRING_DATA (lstr);
3768 n = XSTRING_LENGTH (lstr);
3772 #endif /* ENABLE_COMPOSITE_CHARS */
3774 encode_utf8 (charset, ch, c, dst);
3787 encode_utf8 (charset, ch, c, dst);
3803 #ifdef ENABLE_COMPOSITE_CHARS
3809 goto back_to_square_n; /* Wheeeeeeeee ..... */
3815 str->iso2022.current_char_boundary = char_boundary;
3816 str->iso2022.current_charset = charset;
3818 /* Verbum caro factum est! */
3822 /************************************************************************/
3823 /* ISO2022 methods */
3824 /************************************************************************/
3826 /* The following note describes the coding system ISO2022 briefly.
3827 Since the intention of this note is to help understand the
3828 functions in this file, some parts are NOT ACCURATE or OVERLY
3829 SIMPLIFIED. For thorough understanding, please refer to the
3830 original document of ISO2022.
3832 ISO2022 provides many mechanisms to encode several character sets
3833 in 7-bit and 8-bit environments. For 7-bit environments, all text
3834 is encoded using bytes less than 128. This may make the encoded
3835 text a little bit longer, but the text passes more easily through
3836 several gateways, some of which strip off MSB (Most Signigant Bit).
3838 There are two kinds of character sets: control character set and
3839 graphic character set. The former contains control characters such
3840 as `newline' and `escape' to provide control functions (control
3841 functions are also provided by escape sequences). The latter
3842 contains graphic characters such as 'A' and '-'. Emacs recognizes
3843 two control character sets and many graphic character sets.
3845 Graphic character sets are classified into one of the following
3846 four classes, according to the number of bytes (DIMENSION) and
3847 number of characters in one dimension (CHARS) of the set:
3848 - DIMENSION1_CHARS94
3849 - DIMENSION1_CHARS96
3850 - DIMENSION2_CHARS94
3851 - DIMENSION2_CHARS96
3853 In addition, each character set is assigned an identification tag,
3854 unique for each set, called "final character" (denoted as <F>
3855 hereafter). The <F> of each character set is decided by ECMA(*)
3856 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3857 (0x30..0x3F are for private use only).
3859 Note (*): ECMA = European Computer Manufacturers Association
3861 Here are examples of graphic character set [NAME(<F>)]:
3862 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3863 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3864 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3865 o DIMENSION2_CHARS96 -- none for the moment
3867 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3868 C0 [0x00..0x1F] -- control character plane 0
3869 GL [0x20..0x7F] -- graphic character plane 0
3870 C1 [0x80..0x9F] -- control character plane 1
3871 GR [0xA0..0xFF] -- graphic character plane 1
3873 A control character set is directly designated and invoked to C0 or
3874 C1 by an escape sequence. The most common case is that:
3875 - ISO646's control character set is designated/invoked to C0, and
3876 - ISO6429's control character set is designated/invoked to C1,
3877 and usually these designations/invocations are omitted in encoded
3878 text. In a 7-bit environment, only C0 can be used, and a control
3879 character for C1 is encoded by an appropriate escape sequence to
3880 fit into the environment. All control characters for C1 are
3881 defined to have corresponding escape sequences.
3883 A graphic character set is at first designated to one of four
3884 graphic registers (G0 through G3), then these graphic registers are
3885 invoked to GL or GR. These designations and invocations can be
3886 done independently. The most common case is that G0 is invoked to
3887 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3888 these invocations and designations are omitted in encoded text.
3889 In a 7-bit environment, only GL can be used.
3891 When a graphic character set of CHARS94 is invoked to GL, codes
3892 0x20 and 0x7F of the GL area work as control characters SPACE and
3893 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3896 There are two ways of invocation: locking-shift and single-shift.
3897 With locking-shift, the invocation lasts until the next different
3898 invocation, whereas with single-shift, the invocation affects the
3899 following character only and doesn't affect the locking-shift
3900 state. Invocations are done by the following control characters or
3903 ----------------------------------------------------------------------
3904 abbrev function cntrl escape seq description
3905 ----------------------------------------------------------------------
3906 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3907 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3908 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3909 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3910 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3911 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3912 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3913 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3914 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3915 ----------------------------------------------------------------------
3916 (*) These are not used by any known coding system.
3918 Control characters for these functions are defined by macros
3919 ISO_CODE_XXX in `coding.h'.
3921 Designations are done by the following escape sequences:
3922 ----------------------------------------------------------------------
3923 escape sequence description
3924 ----------------------------------------------------------------------
3925 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3926 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3927 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3928 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3929 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3930 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3931 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3932 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3933 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3934 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3935 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3936 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3937 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3938 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3939 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3940 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3941 ----------------------------------------------------------------------
3943 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3944 of dimension 1, chars 94, and final character <F>, etc...
3946 Note (*): Although these designations are not allowed in ISO2022,
3947 Emacs accepts them on decoding, and produces them on encoding
3948 CHARS96 character sets in a coding system which is characterized as
3949 7-bit environment, non-locking-shift, and non-single-shift.
3951 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3952 '(' can be omitted. We refer to this as "short-form" hereafter.
3954 Now you may notice that there are a lot of ways for encoding the
3955 same multilingual text in ISO2022. Actually, there exist many
3956 coding systems such as Compound Text (used in X11's inter client
3957 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3958 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3959 localized platforms), and all of these are variants of ISO2022.
3961 In addition to the above, Emacs handles two more kinds of escape
3962 sequences: ISO6429's direction specification and Emacs' private
3963 sequence for specifying character composition.
3965 ISO6429's direction specification takes the following form:
3966 o CSI ']' -- end of the current direction
3967 o CSI '0' ']' -- end of the current direction
3968 o CSI '1' ']' -- start of left-to-right text
3969 o CSI '2' ']' -- start of right-to-left text
3970 The control character CSI (0x9B: control sequence introducer) is
3971 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3973 Character composition specification takes the following form:
3974 o ESC '0' -- start character composition
3975 o ESC '1' -- end character composition
3976 Since these are not standard escape sequences of any ISO standard,
3977 their use with these meanings is restricted to Emacs only. */
3980 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3984 for (i = 0; i < 4; i++)
3986 if (!NILP (coding_system))
3988 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3990 iso->charset[i] = Qt;
3991 iso->invalid_designated[i] = 0;
3993 iso->esc = ISO_ESC_NOTHING;
3994 iso->esc_bytes_index = 0;
3995 iso->register_left = 0;
3996 iso->register_right = 1;
3997 iso->switched_dir_and_no_valid_charset_yet = 0;
3998 iso->invalid_switch_dir = 0;
3999 iso->output_direction_sequence = 0;
4000 iso->output_literally = 0;
4001 #ifdef ENABLE_COMPOSITE_CHARS
4002 if (iso->composite_chars)
4003 Dynarr_reset (iso->composite_chars);
4008 fit_to_be_escape_quoted (unsigned char c)
4025 /* Parse one byte of an ISO2022 escape sequence.
4026 If the result is an invalid escape sequence, return 0 and
4027 do not change anything in STR. Otherwise, if the result is
4028 an incomplete escape sequence, update ISO2022.ESC and
4029 ISO2022.ESC_BYTES and return -1. Otherwise, update
4030 all the state variables (but not ISO2022.ESC_BYTES) and
4033 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4034 or invocation of an invalid character set and treat that as
4035 an unrecognized escape sequence. */
4038 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4039 unsigned char c, unsigned int *flags,
4040 int check_invalid_charsets)
4042 /* (1) If we're at the end of a designation sequence, CS is the
4043 charset being designated and REG is the register to designate
4046 (2) If we're at the end of a locking-shift sequence, REG is
4047 the register to invoke and HALF (0 == left, 1 == right) is
4048 the half to invoke it into.
4050 (3) If we're at the end of a single-shift sequence, REG is
4051 the register to invoke. */
4052 Lisp_Object cs = Qnil;
4055 /* NOTE: This code does goto's all over the fucking place.
4056 The reason for this is that we're basically implementing
4057 a state machine here, and hierarchical languages like C
4058 don't really provide a clean way of doing this. */
4060 if (! (*flags & CODING_STATE_ESCAPE))
4061 /* At beginning of escape sequence; we need to reset our
4062 escape-state variables. */
4063 iso->esc = ISO_ESC_NOTHING;
4065 iso->output_literally = 0;
4066 iso->output_direction_sequence = 0;
4070 case ISO_ESC_NOTHING:
4071 iso->esc_bytes_index = 0;
4074 case ISO_CODE_ESC: /* Start escape sequence */
4075 *flags |= CODING_STATE_ESCAPE;
4079 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4080 *flags |= CODING_STATE_ESCAPE;
4081 iso->esc = ISO_ESC_5_11;
4084 case ISO_CODE_SO: /* locking shift 1 */
4087 case ISO_CODE_SI: /* locking shift 0 */
4091 case ISO_CODE_SS2: /* single shift */
4094 case ISO_CODE_SS3: /* single shift */
4098 default: /* Other control characters */
4105 /**** single shift ****/
4107 case 'N': /* single shift 2 */
4110 case 'O': /* single shift 3 */
4114 /**** locking shift ****/
4116 case '~': /* locking shift 1 right */
4119 case 'n': /* locking shift 2 */
4122 case '}': /* locking shift 2 right */
4125 case 'o': /* locking shift 3 */
4128 case '|': /* locking shift 3 right */
4132 #ifdef ENABLE_COMPOSITE_CHARS
4133 /**** composite ****/
4136 iso->esc = ISO_ESC_START_COMPOSITE;
4137 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4138 CODING_STATE_COMPOSITE;
4142 iso->esc = ISO_ESC_END_COMPOSITE;
4143 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4144 ~CODING_STATE_COMPOSITE;
4146 #endif /* ENABLE_COMPOSITE_CHARS */
4148 /**** directionality ****/
4151 iso->esc = ISO_ESC_5_11;
4154 /**** designation ****/
4156 case '$': /* multibyte charset prefix */
4157 iso->esc = ISO_ESC_2_4;
4161 if (0x28 <= c && c <= 0x2F)
4163 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4167 /* This function is called with CODESYS equal to nil when
4168 doing coding-system detection. */
4170 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4171 && fit_to_be_escape_quoted (c))
4173 iso->esc = ISO_ESC_LITERAL;
4174 *flags &= CODING_STATE_ISO2022_LOCK;
4184 /**** directionality ****/
4186 case ISO_ESC_5_11: /* ISO6429 direction control */
4189 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4190 goto directionality;
4192 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4193 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4194 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4198 case ISO_ESC_5_11_0:
4201 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4202 goto directionality;
4206 case ISO_ESC_5_11_1:
4209 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4210 goto directionality;
4214 case ISO_ESC_5_11_2:
4217 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4218 goto directionality;
4223 iso->esc = ISO_ESC_DIRECTIONALITY;
4224 /* Various junk here to attempt to preserve the direction sequences
4225 literally in the text if they would otherwise be swallowed due
4226 to invalid designations that don't show up as actual charset
4227 changes in the text. */
4228 if (iso->invalid_switch_dir)
4230 /* We already inserted a direction switch literally into the
4231 text. We assume (#### this may not be right) that the
4232 next direction switch is the one going the other way,
4233 and we need to output that literally as well. */
4234 iso->output_literally = 1;
4235 iso->invalid_switch_dir = 0;
4241 /* If we are in the thrall of an invalid designation,
4242 then stick the directionality sequence literally into the
4243 output stream so it ends up in the original text again. */
4244 for (jj = 0; jj < 4; jj++)
4245 if (iso->invalid_designated[jj])
4249 iso->output_literally = 1;
4250 iso->invalid_switch_dir = 1;
4253 /* Indicate that we haven't yet seen a valid designation,
4254 so that if a switch-dir is directly followed by an
4255 invalid designation, both get inserted literally. */
4256 iso->switched_dir_and_no_valid_charset_yet = 1;
4261 /**** designation ****/
4264 if (0x28 <= c && c <= 0x2F)
4266 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4269 if (0x40 <= c && c <= 0x42)
4271 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4272 *flags & CODING_STATE_R2L ?
4273 CHARSET_RIGHT_TO_LEFT :
4274 CHARSET_LEFT_TO_RIGHT);
4284 if (c < '0' || c > '~')
4285 return 0; /* bad final byte */
4287 if (iso->esc >= ISO_ESC_2_8 &&
4288 iso->esc <= ISO_ESC_2_15)
4290 type = ((iso->esc >= ISO_ESC_2_12) ?
4291 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4292 reg = (iso->esc - ISO_ESC_2_8) & 3;
4294 else if (iso->esc >= ISO_ESC_2_4_8 &&
4295 iso->esc <= ISO_ESC_2_4_15)
4297 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4298 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4299 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4303 /* Can this ever be reached? -slb */
4307 cs = CHARSET_BY_ATTRIBUTES (type, c,
4308 *flags & CODING_STATE_R2L ?
4309 CHARSET_RIGHT_TO_LEFT :
4310 CHARSET_LEFT_TO_RIGHT);
4316 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4320 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4321 /* can't invoke something that ain't there. */
4323 iso->esc = ISO_ESC_SINGLE_SHIFT;
4324 *flags &= CODING_STATE_ISO2022_LOCK;
4326 *flags |= CODING_STATE_SS2;
4328 *flags |= CODING_STATE_SS3;
4332 if (check_invalid_charsets &&
4333 !CHARSETP (iso->charset[reg]))
4334 /* can't invoke something that ain't there. */
4337 iso->register_right = reg;
4339 iso->register_left = reg;
4340 *flags &= CODING_STATE_ISO2022_LOCK;
4341 iso->esc = ISO_ESC_LOCKING_SHIFT;
4345 if (NILP (cs) && check_invalid_charsets)
4347 iso->invalid_designated[reg] = 1;
4348 iso->charset[reg] = Vcharset_ascii;
4349 iso->esc = ISO_ESC_DESIGNATE;
4350 *flags &= CODING_STATE_ISO2022_LOCK;
4351 iso->output_literally = 1;
4352 if (iso->switched_dir_and_no_valid_charset_yet)
4354 /* We encountered a switch-direction followed by an
4355 invalid designation. Ensure that the switch-direction
4356 gets outputted; otherwise it will probably get eaten
4357 when the text is written out again. */
4358 iso->switched_dir_and_no_valid_charset_yet = 0;
4359 iso->output_direction_sequence = 1;
4360 /* And make sure that the switch-dir going the other
4361 way gets outputted, as well. */
4362 iso->invalid_switch_dir = 1;
4366 /* This function is called with CODESYS equal to nil when
4367 doing coding-system detection. */
4368 if (!NILP (codesys))
4370 charset_conversion_spec_dynarr *dyn =
4371 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4377 for (i = 0; i < Dynarr_length (dyn); i++)
4379 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4380 if (EQ (cs, spec->from_charset))
4381 cs = spec->to_charset;
4386 iso->charset[reg] = cs;
4387 iso->esc = ISO_ESC_DESIGNATE;
4388 *flags &= CODING_STATE_ISO2022_LOCK;
4389 if (iso->invalid_designated[reg])
4391 iso->invalid_designated[reg] = 0;
4392 iso->output_literally = 1;
4394 if (iso->switched_dir_and_no_valid_charset_yet)
4395 iso->switched_dir_and_no_valid_charset_yet = 0;
4400 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4405 /* #### There are serious deficiencies in the recognition mechanism
4406 here. This needs to be much smarter if it's going to cut it.
4407 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4408 it should be detected as Latin-1.
4409 All the ISO2022 stuff in this file should be synced up with the
4410 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4411 Perhaps we should wait till R2L works in FSF Emacs? */
4413 if (!st->iso2022.initted)
4415 reset_iso2022 (Qnil, &st->iso2022.iso);
4416 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4417 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4418 CODING_CATEGORY_ISO_8_1_MASK |
4419 CODING_CATEGORY_ISO_8_2_MASK |
4420 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4421 st->iso2022.flags = 0;
4422 st->iso2022.high_byte_count = 0;
4423 st->iso2022.saw_single_shift = 0;
4424 st->iso2022.initted = 1;
4427 mask = st->iso2022.mask;
4434 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4435 st->iso2022.high_byte_count++;
4439 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4441 if (st->iso2022.high_byte_count & 1)
4442 /* odd number of high bytes; assume not iso-8-2 */
4443 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4445 st->iso2022.high_byte_count = 0;
4446 st->iso2022.saw_single_shift = 0;
4448 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4450 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4451 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4452 { /* control chars */
4455 /* Allow and ignore control characters that you might
4456 reasonably see in a text file */
4461 case 8: /* backspace */
4462 case 11: /* vertical tab */
4463 case 12: /* form feed */
4464 case 26: /* MS-DOS C-z junk */
4465 case 31: /* '^_' -- for info */
4466 goto label_continue_loop;
4473 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4476 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4477 &st->iso2022.flags, 0))
4479 switch (st->iso2022.iso.esc)
4481 case ISO_ESC_DESIGNATE:
4482 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4483 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4485 case ISO_ESC_LOCKING_SHIFT:
4486 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4487 goto ran_out_of_chars;
4488 case ISO_ESC_SINGLE_SHIFT:
4489 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4490 st->iso2022.saw_single_shift = 1;
4499 goto ran_out_of_chars;
4502 label_continue_loop:;
4511 postprocess_iso2022_mask (int mask)
4513 /* #### kind of cheesy */
4514 /* If seven-bit ISO is allowed, then assume that the encoding is
4515 entirely seven-bit and turn off the eight-bit ones. */
4516 if (mask & CODING_CATEGORY_ISO_7_MASK)
4517 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4518 CODING_CATEGORY_ISO_8_1_MASK |
4519 CODING_CATEGORY_ISO_8_2_MASK);
4523 /* If FLAGS is a null pointer or specifies right-to-left motion,
4524 output a switch-dir-to-left-to-right sequence to DST.
4525 Also update FLAGS if it is not a null pointer.
4526 If INTERNAL_P is set, we are outputting in internal format and
4527 need to handle the CSI differently. */
4530 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4531 unsigned_char_dynarr *dst,
4532 unsigned int *flags,
4535 if (!flags || (*flags & CODING_STATE_R2L))
4537 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4539 Dynarr_add (dst, ISO_CODE_ESC);
4540 Dynarr_add (dst, '[');
4542 else if (internal_p)
4543 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4545 Dynarr_add (dst, ISO_CODE_CSI);
4546 Dynarr_add (dst, '0');
4547 Dynarr_add (dst, ']');
4549 *flags &= ~CODING_STATE_R2L;
4553 /* If FLAGS is a null pointer or specifies a direction different from
4554 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4555 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4556 sequence to DST. Also update FLAGS if it is not a null pointer.
4557 If INTERNAL_P is set, we are outputting in internal format and
4558 need to handle the CSI differently. */
4561 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4562 unsigned_char_dynarr *dst, unsigned int *flags,
4565 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4566 direction == CHARSET_LEFT_TO_RIGHT)
4567 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4568 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4569 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4570 direction == CHARSET_RIGHT_TO_LEFT)
4572 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4574 Dynarr_add (dst, ISO_CODE_ESC);
4575 Dynarr_add (dst, '[');
4577 else if (internal_p)
4578 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4580 Dynarr_add (dst, ISO_CODE_CSI);
4581 Dynarr_add (dst, '2');
4582 Dynarr_add (dst, ']');
4584 *flags |= CODING_STATE_R2L;
4588 /* Convert ISO2022-format data to internal format. */
4591 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4592 unsigned_char_dynarr *dst, unsigned int n)
4594 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4595 unsigned int flags = str->flags;
4596 unsigned int ch = str->ch;
4597 eol_type_t eol_type = str->eol_type;
4598 #ifdef ENABLE_COMPOSITE_CHARS
4599 unsigned_char_dynarr *real_dst = dst;
4601 Lisp_Object coding_system;
4603 XSETCODING_SYSTEM (coding_system, str->codesys);
4605 #ifdef ENABLE_COMPOSITE_CHARS
4606 if (flags & CODING_STATE_COMPOSITE)
4607 dst = str->iso2022.composite_chars;
4608 #endif /* ENABLE_COMPOSITE_CHARS */
4612 unsigned char c = *src++;
4613 if (flags & CODING_STATE_ESCAPE)
4614 { /* Within ESC sequence */
4615 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4620 switch (str->iso2022.esc)
4622 #ifdef ENABLE_COMPOSITE_CHARS
4623 case ISO_ESC_START_COMPOSITE:
4624 if (str->iso2022.composite_chars)
4625 Dynarr_reset (str->iso2022.composite_chars);
4627 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4628 dst = str->iso2022.composite_chars;
4630 case ISO_ESC_END_COMPOSITE:
4632 Bufbyte comstr[MAX_EMCHAR_LEN];
4634 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4635 Dynarr_length (dst));
4637 len = set_charptr_emchar (comstr, emch);
4638 Dynarr_add_many (dst, comstr, len);
4641 #endif /* ENABLE_COMPOSITE_CHARS */
4643 case ISO_ESC_LITERAL:
4644 DECODE_ADD_BINARY_CHAR (c, dst);
4648 /* Everything else handled already */
4653 /* Attempted error recovery. */
4654 if (str->iso2022.output_direction_sequence)
4655 ensure_correct_direction (flags & CODING_STATE_R2L ?
4656 CHARSET_RIGHT_TO_LEFT :
4657 CHARSET_LEFT_TO_RIGHT,
4658 str->codesys, dst, 0, 1);
4659 /* More error recovery. */
4660 if (!retval || str->iso2022.output_literally)
4662 /* Output the (possibly invalid) sequence */
4664 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4665 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4666 flags &= CODING_STATE_ISO2022_LOCK;
4668 n++, src--;/* Repeat the loop with the same character. */
4671 /* No sense in reprocessing the final byte of the
4672 escape sequence; it could mess things up anyway.
4674 DECODE_ADD_BINARY_CHAR (c, dst);
4679 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4680 { /* Control characters */
4682 /***** Error-handling *****/
4684 /* If we were in the middle of a character, dump out the
4685 partial character. */
4686 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4688 /* If we just saw a single-shift character, dump it out.
4689 This may dump out the wrong sort of single-shift character,
4690 but least it will give an indication that something went
4692 if (flags & CODING_STATE_SS2)
4694 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4695 flags &= ~CODING_STATE_SS2;
4697 if (flags & CODING_STATE_SS3)
4699 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4700 flags &= ~CODING_STATE_SS3;
4703 /***** Now handle the control characters. *****/
4706 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4708 flags &= CODING_STATE_ISO2022_LOCK;
4710 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4711 DECODE_ADD_BINARY_CHAR (c, dst);
4714 { /* Graphic characters */
4715 Lisp_Object charset;
4721 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4723 /* Now determine the charset. */
4724 reg = ((flags & CODING_STATE_SS2) ? 2
4725 : (flags & CODING_STATE_SS3) ? 3
4726 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4727 : str->iso2022.register_left);
4728 charset = str->iso2022.charset[reg];
4730 /* Error checking: */
4731 if (! CHARSETP (charset)
4732 || str->iso2022.invalid_designated[reg]
4733 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4734 && XCHARSET_CHARS (charset) == 94))
4735 /* Mrmph. We are trying to invoke a register that has no
4736 or an invalid charset in it, or trying to add a character
4737 outside the range of the charset. Insert that char literally
4738 to preserve it for the output. */
4740 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4741 DECODE_ADD_BINARY_CHAR (c, dst);
4746 /* Things are probably hunky-dorey. */
4748 /* Fetch reverse charset, maybe. */
4749 if (((flags & CODING_STATE_R2L) &&
4750 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4752 (!(flags & CODING_STATE_R2L) &&
4753 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4755 Lisp_Object new_charset =
4756 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4757 if (!NILP (new_charset))
4758 charset = new_charset;
4762 lb = XCHARSET_LEADING_BYTE (charset);
4764 switch (XCHARSET_REP_BYTES (charset))
4767 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4768 Dynarr_add (dst, c & 0x7F);
4771 case 2: /* one-byte official */
4772 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4774 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c, 0), dst);
4776 Dynarr_add (dst, lb);
4777 Dynarr_add (dst, c | 0x80);
4781 case 3: /* one-byte private or two-byte official */
4782 if (XCHARSET_PRIVATE_P (charset))
4784 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4786 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c, 0), dst);
4788 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4789 Dynarr_add (dst, lb);
4790 Dynarr_add (dst, c | 0x80);
4798 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, ch, c), dst);
4800 Dynarr_add (dst, lb);
4801 Dynarr_add (dst, ch | 0x80);
4802 Dynarr_add (dst, c | 0x80);
4811 default: /* two-byte private */
4815 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, ch, c), dst);
4817 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4818 Dynarr_add (dst, lb);
4819 Dynarr_add (dst, ch | 0x80);
4820 Dynarr_add (dst, c | 0x80);
4830 flags &= CODING_STATE_ISO2022_LOCK;
4833 label_continue_loop:;
4836 if (flags & CODING_STATE_END)
4837 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4844 /***** ISO2022 encoder *****/
4846 /* Designate CHARSET into register REG. */
4849 iso2022_designate (Lisp_Object charset, unsigned char reg,
4850 struct encoding_stream *str, unsigned_char_dynarr *dst)
4852 static CONST char inter94[] = "()*+";
4853 static CONST char inter96[] = ",-./";
4855 unsigned char final;
4856 Lisp_Object old_charset = str->iso2022.charset[reg];
4858 str->iso2022.charset[reg] = charset;
4859 if (!CHARSETP (charset))
4860 /* charset might be an initial nil or t. */
4862 type = XCHARSET_TYPE (charset);
4863 final = XCHARSET_FINAL (charset);
4864 if (!str->iso2022.force_charset_on_output[reg] &&
4865 CHARSETP (old_charset) &&
4866 XCHARSET_TYPE (old_charset) == type &&
4867 XCHARSET_FINAL (old_charset) == final)
4870 str->iso2022.force_charset_on_output[reg] = 0;
4873 charset_conversion_spec_dynarr *dyn =
4874 str->codesys->iso2022.output_conv;
4880 for (i = 0; i < Dynarr_length (dyn); i++)
4882 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4883 if (EQ (charset, spec->from_charset))
4884 charset = spec->to_charset;
4889 Dynarr_add (dst, ISO_CODE_ESC);
4892 case CHARSET_TYPE_94:
4893 Dynarr_add (dst, inter94[reg]);
4895 case CHARSET_TYPE_96:
4896 Dynarr_add (dst, inter96[reg]);
4898 case CHARSET_TYPE_94X94:
4899 Dynarr_add (dst, '$');
4901 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4904 Dynarr_add (dst, inter94[reg]);
4906 case CHARSET_TYPE_96X96:
4907 Dynarr_add (dst, '$');
4908 Dynarr_add (dst, inter96[reg]);
4911 Dynarr_add (dst, final);
4915 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4917 if (str->iso2022.register_left != 0)
4919 Dynarr_add (dst, ISO_CODE_SI);
4920 str->iso2022.register_left = 0;
4925 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4927 if (str->iso2022.register_left != 1)
4929 Dynarr_add (dst, ISO_CODE_SO);
4930 str->iso2022.register_left = 1;
4934 /* Convert internally-formatted data to ISO2022 format. */
4937 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4938 unsigned_char_dynarr *dst, unsigned int n)
4940 unsigned char charmask, c;
4941 unsigned char char_boundary;
4942 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4943 unsigned int flags = str->flags;
4944 unsigned int ch = str->ch;
4945 Lisp_Coding_System *codesys = str->codesys;
4946 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4948 Lisp_Object charset;
4951 #ifdef ENABLE_COMPOSITE_CHARS
4952 /* flags for handling composite chars. We do a little switcharoo
4953 on the source while we're outputting the composite char. */
4954 unsigned int saved_n = 0;
4955 CONST unsigned char *saved_src = NULL;
4956 int in_composite = 0;
4957 #endif /* ENABLE_COMPOSITE_CHARS */
4959 char_boundary = str->iso2022.current_char_boundary;
4960 charset = str->iso2022.current_charset;
4961 half = str->iso2022.current_half;
4963 #ifdef ENABLE_COMPOSITE_CHARS
4970 if (BYTE_ASCII_P (c))
4971 { /* Processing ASCII character */
4974 restore_left_to_right_direction (codesys, dst, &flags, 0);
4976 /* Make sure G0 contains ASCII */
4977 if ((c > ' ' && c < ISO_CODE_DEL) ||
4978 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4980 ensure_normal_shift (str, dst);
4981 iso2022_designate (Vcharset_ascii, 0, str, dst);
4984 /* If necessary, restore everything to the default state
4987 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4989 restore_left_to_right_direction (codesys, dst, &flags, 0);
4991 ensure_normal_shift (str, dst);
4993 for (i = 0; i < 4; i++)
4995 Lisp_Object initial_charset =
4996 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4997 iso2022_designate (initial_charset, i, str, dst);
5002 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5003 Dynarr_add (dst, '\r');
5004 if (eol_type != EOL_CR)
5005 Dynarr_add (dst, c);
5009 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5010 && fit_to_be_escape_quoted (c))
5011 Dynarr_add (dst, ISO_CODE_ESC);
5012 Dynarr_add (dst, c);
5018 else if (BUFBYTE_FIRST_BYTE_P (c))
5022 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5023 { /* Processing Leading Byte */
5025 charset = CHARSET_BY_LEADING_BYTE (c);
5026 if (LEADING_BYTE_PREFIX_P(c))
5028 else if (!EQ (charset, Vcharset_control_1)
5029 #ifdef ENABLE_COMPOSITE_CHARS
5030 && !EQ (charset, Vcharset_composite)
5036 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5037 codesys, dst, &flags, 0);
5039 /* Now determine which register to use. */
5041 for (i = 0; i < 4; i++)
5043 if (EQ (charset, str->iso2022.charset[i]) ||
5045 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5054 if (XCHARSET_GRAPHIC (charset) != 0)
5056 if (!NILP (str->iso2022.charset[1]) &&
5057 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5058 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5060 else if (!NILP (str->iso2022.charset[2]))
5062 else if (!NILP (str->iso2022.charset[3]))
5071 iso2022_designate (charset, reg, str, dst);
5073 /* Now invoke that register. */
5077 ensure_normal_shift (str, dst);
5082 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5084 ensure_shift_out (str, dst);
5092 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5094 Dynarr_add (dst, ISO_CODE_ESC);
5095 Dynarr_add (dst, 'N');
5100 Dynarr_add (dst, ISO_CODE_SS2);
5106 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5108 Dynarr_add (dst, ISO_CODE_ESC);
5109 Dynarr_add (dst, 'O');
5114 Dynarr_add (dst, ISO_CODE_SS3);
5125 #endif /* not UTF2000 */
5127 { /* Processing Non-ASCII character */
5128 charmask = (half == 0 ? 0x7F : 0xFF);
5130 if (EQ (charset, Vcharset_control_1))
5132 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5133 && fit_to_be_escape_quoted (c))
5134 Dynarr_add (dst, ISO_CODE_ESC);
5135 /* you asked for it ... */
5136 Dynarr_add (dst, c - 0x20);
5140 switch (XCHARSET_REP_BYTES (charset))
5143 Dynarr_add (dst, c & charmask);
5146 if (XCHARSET_PRIVATE_P (charset))
5148 Dynarr_add (dst, c & charmask);
5153 #ifdef ENABLE_COMPOSITE_CHARS
5154 if (EQ (charset, Vcharset_composite))
5158 /* #### Bother! We don't know how to
5160 Dynarr_add (dst, '~');
5164 Emchar emch = MAKE_CHAR (Vcharset_composite,
5165 ch & 0x7F, c & 0x7F);
5166 Lisp_Object lstr = composite_char_string (emch);
5170 src = XSTRING_DATA (lstr);
5171 n = XSTRING_LENGTH (lstr);
5172 Dynarr_add (dst, ISO_CODE_ESC);
5173 Dynarr_add (dst, '0'); /* start composing */
5177 #endif /* ENABLE_COMPOSITE_CHARS */
5179 Dynarr_add (dst, ch & charmask);
5180 Dynarr_add (dst, c & charmask);
5193 Dynarr_add (dst, ch & charmask);
5194 Dynarr_add (dst, c & charmask);
5210 #ifdef ENABLE_COMPOSITE_CHARS
5216 Dynarr_add (dst, ISO_CODE_ESC);
5217 Dynarr_add (dst, '1'); /* end composing */
5218 goto back_to_square_n; /* Wheeeeeeeee ..... */
5220 #endif /* ENABLE_COMPOSITE_CHARS */
5222 if (char_boundary && flags & CODING_STATE_END)
5224 restore_left_to_right_direction (codesys, dst, &flags, 0);
5225 ensure_normal_shift (str, dst);
5226 for (i = 0; i < 4; i++)
5228 Lisp_Object initial_charset =
5229 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5230 iso2022_designate (initial_charset, i, str, dst);
5236 str->iso2022.current_char_boundary = char_boundary;
5237 str->iso2022.current_charset = charset;
5238 str->iso2022.current_half = half;
5240 /* Verbum caro factum est! */
5244 /************************************************************************/
5245 /* No-conversion methods */
5246 /************************************************************************/
5248 /* This is used when reading in "binary" files -- i.e. files that may
5249 contain all 256 possible byte values and that are not to be
5250 interpreted as being in any particular decoding. */
5252 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5253 unsigned_char_dynarr *dst, unsigned int n)
5256 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5257 unsigned int flags = str->flags;
5258 unsigned int ch = str->ch;
5259 eol_type_t eol_type = str->eol_type;
5265 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5266 DECODE_ADD_BINARY_CHAR (c, dst);
5267 label_continue_loop:;
5270 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5277 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5278 unsigned_char_dynarr *dst, unsigned int n)
5281 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5282 unsigned int flags = str->flags;
5283 unsigned int ch = str->ch;
5284 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5291 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5292 Dynarr_add (dst, '\r');
5293 if (eol_type != EOL_CR)
5294 Dynarr_add (dst, '\n');
5297 else if (BYTE_ASCII_P (c))
5300 Dynarr_add (dst, c);
5302 else if (BUFBYTE_LEADING_BYTE_P (c))
5305 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5306 c == LEADING_BYTE_CONTROL_1)
5309 Dynarr_add (dst, '~'); /* untranslatable character */
5313 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5314 Dynarr_add (dst, c);
5315 else if (ch == LEADING_BYTE_CONTROL_1)
5318 Dynarr_add (dst, c - 0x20);
5320 /* else it should be the second or third byte of an
5321 untranslatable character, so ignore it */
5331 /************************************************************************/
5332 /* Simple internal/external functions */
5333 /************************************************************************/
5335 static Extbyte_dynarr *conversion_out_dynarr;
5336 static Bufbyte_dynarr *conversion_in_dynarr;
5338 /* Determine coding system from coding format */
5340 /* #### not correct for all values of `fmt'! */
5342 external_data_format_to_coding_system (enum external_data_format fmt)
5346 case FORMAT_FILENAME:
5347 case FORMAT_TERMINAL:
5348 if (EQ (Vfile_name_coding_system, Qnil) ||
5349 EQ (Vfile_name_coding_system, Qbinary))
5352 return Fget_coding_system (Vfile_name_coding_system);
5355 return Fget_coding_system (Qctext);
5363 convert_to_external_format (CONST Bufbyte *ptr,
5366 enum external_data_format fmt)
5368 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5370 if (!conversion_out_dynarr)
5371 conversion_out_dynarr = Dynarr_new (Extbyte);
5373 Dynarr_reset (conversion_out_dynarr);
5375 if (NILP (coding_system))
5377 CONST Bufbyte *end = ptr + len;
5383 (*ptr < 0xc0) ? *ptr :
5384 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5387 (BYTE_ASCII_P (*ptr)) ? *ptr :
5388 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5389 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5392 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5396 #ifdef ERROR_CHECK_BUFPOS
5397 assert (ptr == end);
5402 Lisp_Object instream, outstream, da_outstream;
5403 Lstream *istr, *ostr;
5404 struct gcpro gcpro1, gcpro2, gcpro3;
5405 char tempbuf[1024]; /* some random amount */
5407 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5408 da_outstream = make_dynarr_output_stream
5409 ((unsigned_char_dynarr *) conversion_out_dynarr);
5411 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5412 istr = XLSTREAM (instream);
5413 ostr = XLSTREAM (outstream);
5414 GCPRO3 (instream, outstream, da_outstream);
5417 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5420 Lstream_write (ostr, tempbuf, size_in_bytes);
5422 Lstream_close (istr);
5423 Lstream_close (ostr);
5425 Lstream_delete (istr);
5426 Lstream_delete (ostr);
5427 Lstream_delete (XLSTREAM (da_outstream));
5430 *len_out = Dynarr_length (conversion_out_dynarr);
5431 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5432 return Dynarr_atp (conversion_out_dynarr, 0);
5436 convert_from_external_format (CONST Extbyte *ptr,
5439 enum external_data_format fmt)
5441 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5443 if (!conversion_in_dynarr)
5444 conversion_in_dynarr = Dynarr_new (Bufbyte);
5446 Dynarr_reset (conversion_in_dynarr);
5448 if (NILP (coding_system))
5450 CONST Extbyte *end = ptr + len;
5451 for (; ptr < end; ptr++)
5454 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5459 Lisp_Object instream, outstream, da_outstream;
5460 Lstream *istr, *ostr;
5461 struct gcpro gcpro1, gcpro2, gcpro3;
5462 char tempbuf[1024]; /* some random amount */
5464 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5465 da_outstream = make_dynarr_output_stream
5466 ((unsigned_char_dynarr *) conversion_in_dynarr);
5468 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5469 istr = XLSTREAM (instream);
5470 ostr = XLSTREAM (outstream);
5471 GCPRO3 (instream, outstream, da_outstream);
5474 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5477 Lstream_write (ostr, tempbuf, size_in_bytes);
5479 Lstream_close (istr);
5480 Lstream_close (ostr);
5482 Lstream_delete (istr);
5483 Lstream_delete (ostr);
5484 Lstream_delete (XLSTREAM (da_outstream));
5487 *len_out = Dynarr_length (conversion_in_dynarr);
5488 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5489 return Dynarr_atp (conversion_in_dynarr, 0);
5493 /************************************************************************/
5494 /* Initialization */
5495 /************************************************************************/
5498 syms_of_file_coding (void)
5500 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5501 deferror (&Qcoding_system_error, "coding-system-error",
5502 "Coding-system error", Qio_error);
5504 DEFSUBR (Fcoding_system_p);
5505 DEFSUBR (Ffind_coding_system);
5506 DEFSUBR (Fget_coding_system);
5507 DEFSUBR (Fcoding_system_list);
5508 DEFSUBR (Fcoding_system_name);
5509 DEFSUBR (Fmake_coding_system);
5510 DEFSUBR (Fcopy_coding_system);
5511 DEFSUBR (Fdefine_coding_system_alias);
5512 DEFSUBR (Fsubsidiary_coding_system);
5514 DEFSUBR (Fcoding_system_type);
5515 DEFSUBR (Fcoding_system_doc_string);
5517 DEFSUBR (Fcoding_system_charset);
5519 DEFSUBR (Fcoding_system_property);
5521 DEFSUBR (Fcoding_category_list);
5522 DEFSUBR (Fset_coding_priority_list);
5523 DEFSUBR (Fcoding_priority_list);
5524 DEFSUBR (Fset_coding_category_system);
5525 DEFSUBR (Fcoding_category_system);
5527 DEFSUBR (Fdetect_coding_region);
5528 DEFSUBR (Fdecode_coding_region);
5529 DEFSUBR (Fencode_coding_region);
5531 DEFSUBR (Fdecode_shift_jis_char);
5532 DEFSUBR (Fencode_shift_jis_char);
5533 DEFSUBR (Fdecode_big5_char);
5534 DEFSUBR (Fencode_big5_char);
5535 DEFSUBR (Fset_ucs_char);
5536 DEFSUBR (Fucs_char);
5537 DEFSUBR (Fset_char_ucs);
5538 DEFSUBR (Fchar_ucs);
5540 defsymbol (&Qcoding_system_p, "coding-system-p");
5541 defsymbol (&Qno_conversion, "no-conversion");
5542 defsymbol (&Qraw_text, "raw-text");
5544 defsymbol (&Qbig5, "big5");
5545 defsymbol (&Qshift_jis, "shift-jis");
5546 defsymbol (&Qucs4, "ucs-4");
5547 defsymbol (&Qutf8, "utf-8");
5548 defsymbol (&Qccl, "ccl");
5549 defsymbol (&Qiso2022, "iso2022");
5551 defsymbol (&Qmnemonic, "mnemonic");
5552 defsymbol (&Qeol_type, "eol-type");
5553 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5554 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5556 defsymbol (&Qcr, "cr");
5557 defsymbol (&Qlf, "lf");
5558 defsymbol (&Qcrlf, "crlf");
5559 defsymbol (&Qeol_cr, "eol-cr");
5560 defsymbol (&Qeol_lf, "eol-lf");
5561 defsymbol (&Qeol_crlf, "eol-crlf");
5563 defsymbol (&Qcharset_g0, "charset-g0");
5564 defsymbol (&Qcharset_g1, "charset-g1");
5565 defsymbol (&Qcharset_g2, "charset-g2");
5566 defsymbol (&Qcharset_g3, "charset-g3");
5567 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5568 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5569 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5570 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5571 defsymbol (&Qno_iso6429, "no-iso6429");
5572 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5573 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5575 defsymbol (&Qshort, "short");
5576 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5577 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5578 defsymbol (&Qseven, "seven");
5579 defsymbol (&Qlock_shift, "lock-shift");
5580 defsymbol (&Qescape_quoted, "escape-quoted");
5582 defsymbol (&Qencode, "encode");
5583 defsymbol (&Qdecode, "decode");
5586 defsymbol (&Qctext, "ctext");
5587 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5589 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5591 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5593 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5595 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5597 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5599 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5601 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5603 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5606 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5611 lstream_type_create_file_coding (void)
5613 LSTREAM_HAS_METHOD (decoding, reader);
5614 LSTREAM_HAS_METHOD (decoding, writer);
5615 LSTREAM_HAS_METHOD (decoding, rewinder);
5616 LSTREAM_HAS_METHOD (decoding, seekable_p);
5617 LSTREAM_HAS_METHOD (decoding, flusher);
5618 LSTREAM_HAS_METHOD (decoding, closer);
5619 LSTREAM_HAS_METHOD (decoding, marker);
5621 LSTREAM_HAS_METHOD (encoding, reader);
5622 LSTREAM_HAS_METHOD (encoding, writer);
5623 LSTREAM_HAS_METHOD (encoding, rewinder);
5624 LSTREAM_HAS_METHOD (encoding, seekable_p);
5625 LSTREAM_HAS_METHOD (encoding, flusher);
5626 LSTREAM_HAS_METHOD (encoding, closer);
5627 LSTREAM_HAS_METHOD (encoding, marker);
5631 vars_of_file_coding (void)
5635 /* Initialize to something reasonable ... */
5636 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5638 coding_category_system[i] = Qnil;
5639 coding_category_by_priority[i] = i;
5642 Fprovide (intern ("file-coding"));
5644 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5645 Coding system used for TTY keyboard input.
5646 Not used under a windowing system.
5648 Vkeyboard_coding_system = Qnil;
5650 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5651 Coding system used for TTY display output.
5652 Not used under a windowing system.
5654 Vterminal_coding_system = Qnil;
5656 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5657 Overriding coding system used when writing a file or process.
5658 You should *bind* this, not set it. If this is non-nil, it specifies
5659 the coding system that will be used when a file or process is read
5660 in, and overrides `buffer-file-coding-system-for-read',
5661 `insert-file-contents-pre-hook', etc. Use those variables instead of
5662 this one for permanent changes to the environment.
5664 Vcoding_system_for_read = Qnil;
5666 DEFVAR_LISP ("coding-system-for-write",
5667 &Vcoding_system_for_write /*
5668 Overriding coding system used when writing a file or process.
5669 You should *bind* this, not set it. If this is non-nil, it specifies
5670 the coding system that will be used when a file or process is wrote
5671 in, and overrides `buffer-file-coding-system',
5672 `write-region-pre-hook', etc. Use those variables instead of this one
5673 for permanent changes to the environment.
5675 Vcoding_system_for_write = Qnil;
5677 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5678 Coding system used to convert pathnames when accessing files.
5680 Vfile_name_coding_system = Qnil;
5682 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5683 Non-nil means the buffer contents are regarded as multi-byte form
5684 of characters, not a binary code. This affects the display, file I/O,
5685 and behaviors of various editing commands.
5687 Setting this to nil does not do anything.
5689 enable_multibyte_characters = 1;
5693 complex_vars_of_file_coding (void)
5695 staticpro (&Vcoding_system_hash_table);
5696 Vcoding_system_hash_table =
5697 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5699 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5701 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5703 struct codesys_prop csp; \
5705 csp.prop_type = (Prop_Type); \
5706 Dynarr_add (the_codesys_prop_dynarr, csp); \
5709 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5710 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5711 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5712 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5713 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5714 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5715 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5717 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5718 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5719 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5720 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5721 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5722 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5723 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5724 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5725 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5726 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5727 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5728 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5729 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5730 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5731 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5732 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5733 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5735 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5736 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5738 /* Need to create this here or we're really screwed. */
5740 (Qraw_text, Qno_conversion,
5741 build_string ("Raw text, which means it converts only line-break-codes."),
5742 list2 (Qmnemonic, build_string ("Raw")));
5745 (Qbinary, Qno_conversion,
5746 build_string ("Binary, which means it does not convert anything."),
5747 list4 (Qeol_type, Qlf,
5748 Qmnemonic, build_string ("Binary")));
5750 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5752 /* Need this for bootstrapping */
5753 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5754 Fget_coding_system (Qraw_text);
5760 for (i = 0; i < 65536; i++)
5761 ucs_to_mule_table[i] = Qnil;
5763 staticpro (&mule_to_ucs_table);
5764 mule_to_ucs_table = Fmake_char_table(Qgeneric);