1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
36 #include "file-coding.h"
38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
40 Lisp_Object Vkeyboard_coding_system;
41 Lisp_Object Vterminal_coding_system;
42 Lisp_Object Vcoding_system_for_read;
43 Lisp_Object Vcoding_system_for_write;
44 Lisp_Object Vfile_name_coding_system;
46 /* Table of symbols identifying each coding category. */
47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
49 /* Coding system currently associated with each coding category. */
50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
52 /* Table of all coding categories in decreasing order of priority.
53 This describes a permutation of the possible coding categories. */
54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
56 Lisp_Object Qcoding_system_p;
58 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
59 /* Qinternal in general.c */
61 Lisp_Object Qmnemonic, Qeol_type;
62 Lisp_Object Qcr, Qcrlf, Qlf;
63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
64 Lisp_Object Qpost_read_conversion;
65 Lisp_Object Qpre_write_conversion;
68 Lisp_Object Qucs4, Qutf8;
69 Lisp_Object Qbig5, Qshift_jis;
70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
73 Lisp_Object Qno_iso6429;
74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
75 Lisp_Object Qctext, Qescape_quoted;
76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
78 Lisp_Object Qencode, Qdecode;
80 Lisp_Object Vcoding_system_hash_table;
82 int enable_multibyte_characters;
85 /* Additional information used by the ISO2022 decoder and detector. */
86 struct iso2022_decoder
88 /* CHARSET holds the character sets currently assigned to the G0
89 through G3 variables. It is initialized from the array
90 INITIAL_CHARSET in CODESYS. */
91 Lisp_Object charset[4];
93 /* Which registers are currently invoked into the left (GL) and
94 right (GR) halves of the 8-bit encoding space? */
95 int register_left, register_right;
97 /* ISO_ESC holds a value indicating part of an escape sequence
98 that has already been seen. */
99 enum iso_esc_flag esc;
101 /* This records the bytes we've seen so far in an escape sequence,
102 in case the sequence is invalid (we spit out the bytes unchanged). */
103 unsigned char esc_bytes[8];
105 /* Index for next byte to store in ISO escape sequence. */
108 #ifdef ENABLE_COMPOSITE_CHARS
109 /* Stuff seen so far when composing a string. */
110 unsigned_char_dynarr *composite_chars;
113 /* If we saw an invalid designation sequence for a particular
114 register, we flag it here and switch to ASCII. The next time we
115 see a valid designation for this register, we turn off the flag
116 and do the designation normally, but pretend the sequence was
117 invalid. The effect of all this is that (most of the time) the
118 escape sequences for both the switch to the unknown charset, and
119 the switch back to the known charset, get inserted literally into
120 the buffer and saved out as such. The hope is that we can
121 preserve the escape sequences so that the resulting written out
122 file makes sense. If we don't do any of this, the designation
123 to the invalid charset will be preserved but that switch back
124 to the known charset will probably get eaten because it was
125 the same charset that was already present in the register. */
126 unsigned char invalid_designated[4];
128 /* We try to do similar things as above for direction-switching
129 sequences. If we encountered a direction switch while an
130 invalid designation was present, or an invalid designation
131 just after a direction switch (i.e. no valid designation
132 encountered yet), we insert the direction-switch escape
133 sequence literally into the output stream, and later on
134 insert the corresponding direction-restoring escape sequence
136 unsigned int switched_dir_and_no_valid_charset_yet :1;
137 unsigned int invalid_switch_dir :1;
139 /* Tells the decoder to output the escape sequence literally
140 even though it was valid. Used in the games we play to
141 avoid lossage when we encounter invalid designations. */
142 unsigned int output_literally :1;
143 /* We encountered a direction switch followed by an invalid
144 designation. We didn't output the direction switch
145 literally because we didn't know about the invalid designation;
146 but we have to do so now. */
147 unsigned int output_direction_sequence :1;
150 EXFUN (Fcopy_coding_system, 2);
152 struct detection_state;
153 static int detect_coding_sjis (struct detection_state *st,
154 CONST unsigned char *src,
156 static void decode_coding_sjis (Lstream *decoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static void encode_coding_sjis (Lstream *encoding,
161 CONST unsigned char *src,
162 unsigned_char_dynarr *dst,
164 static int detect_coding_big5 (struct detection_state *st,
165 CONST unsigned char *src,
167 static void decode_coding_big5 (Lstream *decoding,
168 CONST unsigned char *src,
169 unsigned_char_dynarr *dst, unsigned int n);
170 static void encode_coding_big5 (Lstream *encoding,
171 CONST unsigned char *src,
172 unsigned_char_dynarr *dst, unsigned int n);
173 static int detect_coding_ucs4 (struct detection_state *st,
174 CONST unsigned char *src,
176 static void decode_coding_ucs4 (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static void encode_coding_ucs4 (Lstream *encoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
182 static int detect_coding_utf8 (struct detection_state *st,
183 CONST unsigned char *src,
185 static void decode_coding_utf8 (Lstream *decoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst, unsigned int n);
188 static void encode_coding_utf8 (Lstream *encoding,
189 CONST unsigned char *src,
190 unsigned_char_dynarr *dst, unsigned int n);
191 static int postprocess_iso2022_mask (int mask);
192 static void reset_iso2022 (Lisp_Object coding_system,
193 struct iso2022_decoder *iso);
194 static int detect_coding_iso2022 (struct detection_state *st,
195 CONST unsigned char *src,
197 static void decode_coding_iso2022 (Lstream *decoding,
198 CONST unsigned char *src,
199 unsigned_char_dynarr *dst, unsigned int n);
200 static void encode_coding_iso2022 (Lstream *encoding,
201 CONST unsigned char *src,
202 unsigned_char_dynarr *dst, unsigned int n);
204 static void decode_coding_no_conversion (Lstream *decoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst,
208 static void encode_coding_no_conversion (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst,
212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
213 unsigned_char_dynarr *dst, unsigned int n);
214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
217 typedef struct codesys_prop codesys_prop;
226 Dynarr_declare (codesys_prop);
227 } codesys_prop_dynarr;
229 codesys_prop_dynarr *the_codesys_prop_dynarr;
231 enum codesys_prop_enum
234 CODESYS_PROP_ISO2022,
239 /************************************************************************/
240 /* Coding system functions */
241 /************************************************************************/
243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
245 static void finalize_coding_system (void *header, int for_disksave);
247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
248 mark_coding_system, print_coding_system,
249 finalize_coding_system,
250 0, 0, struct Lisp_Coding_System);
253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
255 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
257 markobj (CODING_SYSTEM_NAME (codesys));
258 markobj (CODING_SYSTEM_DOC_STRING (codesys));
259 markobj (CODING_SYSTEM_MNEMONIC (codesys));
260 markobj (CODING_SYSTEM_EOL_LF (codesys));
261 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
262 markobj (CODING_SYSTEM_EOL_CR (codesys));
264 switch (CODING_SYSTEM_TYPE (codesys))
268 case CODESYS_ISO2022:
269 for (i = 0; i < 4; i++)
270 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
271 if (codesys->iso2022.input_conv)
273 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
275 struct charset_conversion_spec *ccs =
276 Dynarr_atp (codesys->iso2022.input_conv, i);
277 markobj (ccs->from_charset);
278 markobj (ccs->to_charset);
281 if (codesys->iso2022.output_conv)
283 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
285 struct charset_conversion_spec *ccs =
286 Dynarr_atp (codesys->iso2022.output_conv, i);
287 markobj (ccs->from_charset);
288 markobj (ccs->to_charset);
294 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
295 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
302 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
303 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
310 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
312 error ("printing unreadable object #<coding_system 0x%x>",
315 write_c_string ("#<coding_system ", printcharfun);
316 print_internal (c->name, printcharfun, 1);
317 write_c_string (">", printcharfun);
321 finalize_coding_system (void *header, int for_disksave)
323 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
324 /* Since coding systems never go away, this function is not
325 necessary. But it would be necessary if we changed things
326 so that coding systems could go away. */
327 if (!for_disksave) /* see comment in lstream.c */
329 switch (CODING_SYSTEM_TYPE (c))
332 case CODESYS_ISO2022:
333 if (c->iso2022.input_conv)
335 Dynarr_free (c->iso2022.input_conv);
336 c->iso2022.input_conv = 0;
338 if (c->iso2022.output_conv)
340 Dynarr_free (c->iso2022.output_conv);
341 c->iso2022.output_conv = 0;
352 symbol_to_eol_type (Lisp_Object symbol)
354 CHECK_SYMBOL (symbol);
355 if (NILP (symbol)) return EOL_AUTODETECT;
356 if (EQ (symbol, Qlf)) return EOL_LF;
357 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
358 if (EQ (symbol, Qcr)) return EOL_CR;
360 signal_simple_error ("Unrecognized eol type", symbol);
361 return EOL_AUTODETECT; /* not reached */
365 eol_type_to_symbol (enum eol_type type)
370 case EOL_LF: return Qlf;
371 case EOL_CRLF: return Qcrlf;
372 case EOL_CR: return Qcr;
373 case EOL_AUTODETECT: return Qnil;
378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
380 Lisp_Object codesys_obj;
381 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
382 char *codesys_name = (char *) alloca (len + 7);
384 char *codesys_mnemonic=0;
386 Lisp_Object codesys_name_sym, sub_codesys_obj;
390 XSETCODING_SYSTEM (codesys_obj, codesys);
392 memcpy (codesys_name,
393 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
395 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
397 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
398 codesys_mnemonic = (char *) alloca (mlen + 7);
399 memcpy (codesys_mnemonic,
400 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
404 strcpy (codesys_name + len, "-" op_sys); \
406 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
407 codesys_name_sym = intern (codesys_name); \
408 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
409 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
411 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
412 build_string (codesys_mnemonic); \
413 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
416 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
417 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
418 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
422 Return t if OBJECT is a coding system.
423 A coding system is an object that defines how text containing multiple
424 character sets is encoded into a stream of (typically 8-bit) bytes.
425 The coding system is used to decode the stream into a series of
426 characters (which may be from multiple charsets) when the text is read
427 from a file or process, and is used to encode the text back into the
428 same format when it is written out to a file or process.
430 For example, many ISO2022-compliant coding systems (such as Compound
431 Text, which is used for inter-client data under the X Window System)
432 use escape sequences to switch between different charsets -- Japanese
433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
435 `make-coding-system' for more information.
437 Coding systems are normally identified using a symbol, and the
438 symbol is accepted in place of the actual coding system object whenever
439 a coding system is called for. (This is similar to how faces work.)
443 return CODING_SYSTEMP (object) ? Qt : Qnil;
446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
447 Retrieve the coding system of the given name.
449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
450 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
451 If there is no such coding system, nil is returned. Otherwise the
452 associated coding system object is returned.
454 (coding_system_or_name))
456 if (CODING_SYSTEMP (coding_system_or_name))
457 return coding_system_or_name;
459 if (NILP (coding_system_or_name))
460 coding_system_or_name = Qbinary;
462 CHECK_SYMBOL (coding_system_or_name);
464 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
468 Retrieve the coding system of the given name.
469 Same as `find-coding-system' except that if there is no such
470 coding system, an error is signaled instead of returning nil.
474 Lisp_Object coding_system = Ffind_coding_system (name);
476 if (NILP (coding_system))
477 signal_simple_error ("No such coding system", name);
478 return coding_system;
481 /* We store the coding systems in hash tables with the names as the key and the
482 actual coding system object as the value. Occasionally we need to use them
483 in a list format. These routines provide us with that. */
484 struct coding_system_list_closure
486 Lisp_Object *coding_system_list;
490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
491 void *coding_system_list_closure)
493 /* This function can GC */
494 struct coding_system_list_closure *cscl =
495 (struct coding_system_list_closure *) coding_system_list_closure;
496 Lisp_Object *coding_system_list = cscl->coding_system_list;
498 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
499 *coding_system_list);
503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
504 Return a list of the names of all defined coding systems.
508 Lisp_Object coding_system_list = Qnil;
510 struct coding_system_list_closure coding_system_list_closure;
512 GCPRO1 (coding_system_list);
513 coding_system_list_closure.coding_system_list = &coding_system_list;
514 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
515 &coding_system_list_closure);
518 return coding_system_list;
521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
522 Return the name of the given coding system.
526 coding_system = Fget_coding_system (coding_system);
527 return XCODING_SYSTEM_NAME (coding_system);
530 static Lisp_Coding_System *
531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
533 Lisp_Coding_System *codesys =
534 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
536 zero_lcrecord (codesys);
537 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
538 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
539 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
540 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
541 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
542 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
543 CODING_SYSTEM_TYPE (codesys) = type;
544 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
546 if (type == CODESYS_ISO2022)
549 for (i = 0; i < 4; i++)
550 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
552 else if (type == CODESYS_CCL)
554 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
555 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
558 CODING_SYSTEM_NAME (codesys) = name;
564 /* Given a list of charset conversion specs as specified in a Lisp
565 program, parse it into STORE_HERE. */
568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
569 Lisp_Object spec_list)
573 EXTERNAL_LIST_LOOP (rest, spec_list)
575 Lisp_Object car = XCAR (rest);
576 Lisp_Object from, to;
577 struct charset_conversion_spec spec;
579 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
580 signal_simple_error ("Invalid charset conversion spec", car);
581 from = Fget_charset (XCAR (car));
582 to = Fget_charset (XCAR (XCDR (car)));
583 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
584 signal_simple_error_2
585 ("Attempted conversion between different charset types",
587 spec.from_charset = from;
588 spec.to_charset = to;
590 Dynarr_add (store_here, spec);
594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
595 specs, return the equivalent as the Lisp programmer would see it.
597 If LOAD_HERE is 0, return Qnil. */
600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
607 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
609 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
610 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
613 return Fnreverse (result);
618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
619 Register symbol NAME as a coding system.
621 TYPE describes the conversion method used and should be one of
624 Automatic conversion. XEmacs attempts to detect the coding system
627 No conversion. Use this for binary files and such. On output,
628 graphic characters that are not in ASCII or Latin-1 will be
629 replaced by a ?. (For a no-conversion-encoded buffer, these
630 characters will only be present if you explicitly insert them.)
632 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
634 ISO 10646 UCS-4 encoding.
636 ISO 10646 UTF-8 encoding.
638 Any ISO2022-compliant encoding. Among other things, this includes
639 JIS (the Japanese encoding commonly used for e-mail), EUC (the
640 standard Unix encoding for Japanese and other languages), and
641 Compound Text (the encoding used in X11). You can specify more
642 specific information about the conversion with the FLAGS argument.
644 Big5 (the encoding commonly used for Taiwanese).
646 The conversion is performed using a user-written pseudo-code
647 program. CCL (Code Conversion Language) is the name of this
650 Write out or read in the raw contents of the memory representing
651 the buffer's text. This is primarily useful for debugging
652 purposes, and is only enabled when XEmacs has been compiled with
653 DEBUG_XEMACS defined (via the --debug configure option).
654 WARNING: Reading in a file using 'internal conversion can result
655 in an internal inconsistency in the memory representing a
656 buffer's text, which will produce unpredictable results and may
657 cause XEmacs to crash. Under normal circumstances you should
658 never use 'internal conversion.
660 DOC-STRING is a string describing the coding system.
662 PROPS is a property list, describing the specific nature of the
663 character set. Recognized properties are:
666 String to be displayed in the modeline when this coding system is
670 End-of-line conversion to be used. It should be one of
673 Automatically detect the end-of-line type (LF, CRLF,
674 or CR). Also generate subsidiary coding systems named
675 `NAME-unix', `NAME-dos', and `NAME-mac', that are
676 identical to this coding system but have an EOL-TYPE
677 value of 'lf, 'crlf, and 'cr, respectively.
679 The end of a line is marked externally using ASCII LF.
680 Since this is also the way that XEmacs represents an
681 end-of-line internally, specifying this option results
682 in no end-of-line conversion. This is the standard
683 format for Unix text files.
685 The end of a line is marked externally using ASCII
686 CRLF. This is the standard format for MS-DOS text
689 The end of a line is marked externally using ASCII CR.
690 This is the standard format for Macintosh text files.
692 Automatically detect the end-of-line type but do not
693 generate subsidiary coding systems. (This value is
694 converted to nil when stored internally, and
695 `coding-system-property' will return nil.)
697 'post-read-conversion
698 Function called after a file has been read in, to perform the
699 decoding. Called with two arguments, BEG and END, denoting
700 a region of the current buffer to be decoded.
702 'pre-write-conversion
703 Function called before a file is written out, to perform the
704 encoding. Called with two arguments, BEG and END, denoting
705 a region of the current buffer to be encoded.
708 The following additional properties are recognized if TYPE is 'iso2022:
714 The character set initially designated to the G0 - G3 registers.
715 The value should be one of
717 -- A charset object (designate that character set)
718 -- nil (do not ever use this register)
719 -- t (no character set is initially designated to
720 the register, but may be later on; this automatically
721 sets the corresponding `force-g*-on-output' property)
727 If non-nil, send an explicit designation sequence on output before
728 using the specified register.
731 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
732 "ESC $ B" on output in place of the full designation sequences
733 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
736 If non-nil, don't designate ASCII to G0 at each end of line on output.
737 Setting this to non-nil also suppresses other state-resetting that
738 normally happens at the end of a line.
741 If non-nil, don't designate ASCII to G0 before control chars on output.
744 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
748 If non-nil, use locking-shift (SO/SI) instead of single-shift
749 or designation by escape sequence.
752 If non-nil, don't use ISO6429's direction specification.
755 If non-nil, literal control characters that are the same as
756 the beginning of a recognized ISO2022 or ISO6429 escape sequence
757 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
758 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
759 so that they can be properly distinguished from an escape sequence.
760 (Note that doing this results in a non-portable encoding.) This
761 encoding flag is used for byte-compiled files. Note that ESC
762 is a good choice for a quoting character because there are no
763 escape sequences whose second byte is a character from the Control-0
764 or Control-1 character sets; this is explicitly disallowed by the
767 'input-charset-conversion
768 A list of conversion specifications, specifying conversion of
769 characters in one charset to another when decoding is performed.
770 Each specification is a list of two elements: the source charset,
771 and the destination charset.
773 'output-charset-conversion
774 A list of conversion specifications, specifying conversion of
775 characters in one charset to another when encoding is performed.
776 The form of each specification is the same as for
777 'input-charset-conversion.
780 The following additional properties are recognized (and required)
784 CCL program used for decoding (converting to internal format).
787 CCL program used for encoding (converting to external format).
789 (name, type, doc_string, props))
791 Lisp_Coding_System *codesys;
792 Lisp_Object rest, key, value;
793 enum coding_system_type ty;
794 int need_to_setup_eol_systems = 1;
796 /* Convert type to constant */
797 if (NILP (type) || EQ (type, Qundecided))
798 { ty = CODESYS_AUTODETECT; }
800 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
801 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
802 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
803 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
804 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
805 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
807 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
809 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
812 signal_simple_error ("Invalid coding system type", type);
816 codesys = allocate_coding_system (ty, name);
818 if (NILP (doc_string))
819 doc_string = build_string ("");
821 CHECK_STRING (doc_string);
822 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
825 if (ty == CODESYS_NO_CONVERSION)
826 codesys->fixed.size = 1;
828 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
830 if (EQ (key, Qmnemonic))
833 CHECK_STRING (value);
834 CODING_SYSTEM_MNEMONIC (codesys) = value;
837 else if (EQ (key, Qeol_type))
839 need_to_setup_eol_systems = NILP (value);
842 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
845 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
846 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
848 else if (ty == CODESYS_ISO2022)
850 #define FROB_INITIAL_CHARSET(charset_num) \
851 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
852 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
854 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
855 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
856 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
857 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
859 #define FROB_FORCE_CHARSET(charset_num) \
860 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
862 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
863 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
864 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
865 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
867 #define FROB_BOOLEAN_PROPERTY(prop) \
868 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
870 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
871 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
872 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
873 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
874 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
875 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
876 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
878 else if (EQ (key, Qinput_charset_conversion))
880 codesys->iso2022.input_conv =
881 Dynarr_new (charset_conversion_spec);
882 parse_charset_conversion_specs (codesys->iso2022.input_conv,
885 else if (EQ (key, Qoutput_charset_conversion))
887 codesys->iso2022.output_conv =
888 Dynarr_new (charset_conversion_spec);
889 parse_charset_conversion_specs (codesys->iso2022.output_conv,
893 signal_simple_error ("Unrecognized property", key);
895 else if (EQ (type, Qccl))
897 if (EQ (key, Qdecode))
899 CHECK_VECTOR (value);
900 CODING_SYSTEM_CCL_DECODE (codesys) = value;
902 else if (EQ (key, Qencode))
904 CHECK_VECTOR (value);
905 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
908 signal_simple_error ("Unrecognized property", key);
912 signal_simple_error ("Unrecognized property", key);
915 if (need_to_setup_eol_systems)
916 setup_eol_coding_systems (codesys);
919 Lisp_Object codesys_obj;
920 XSETCODING_SYSTEM (codesys_obj, codesys);
921 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
926 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
927 Copy OLD-CODING-SYSTEM to NEW-NAME.
928 If NEW-NAME does not name an existing coding system, a new one will
931 (old_coding_system, new_name))
933 Lisp_Object new_coding_system;
934 old_coding_system = Fget_coding_system (old_coding_system);
935 new_coding_system = Ffind_coding_system (new_name);
936 if (NILP (new_coding_system))
938 XSETCODING_SYSTEM (new_coding_system,
939 allocate_coding_system
940 (XCODING_SYSTEM_TYPE (old_coding_system),
942 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
946 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
947 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
948 memcpy (((char *) to ) + sizeof (to->header),
949 ((char *) from) + sizeof (from->header),
950 sizeof (*from) - sizeof (from->header));
953 return new_coding_system;
956 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
957 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
959 (alias, coding_system))
961 CHECK_SYMBOL (alias);
962 if (!NILP (Ffind_coding_system (alias)))
963 signal_simple_error ("Symbol already names a coding system", alias);
964 coding_system = Fget_coding_system (coding_system);
965 Fputhash (alias, coding_system, Vcoding_system_hash_table);
967 /* Set up aliases for subsidiaries. */
968 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
971 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
972 #define FROB(type, name) \
974 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
975 if (!NILP (subsidiary)) \
976 Fdefine_coding_system_alias \
977 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
984 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
985 but it doesn't look intentional, so I'd rather return something
986 meaningful or nothing at all. */
991 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
993 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
994 Lisp_Object new_coding_system;
996 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
997 return coding_system;
1001 case EOL_AUTODETECT: return coding_system;
1002 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1003 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1004 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1008 return NILP (new_coding_system) ? coding_system : new_coding_system;
1011 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1012 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1014 (coding_system, eol_type))
1016 coding_system = Fget_coding_system (coding_system);
1018 return subsidiary_coding_system (coding_system,
1019 symbol_to_eol_type (eol_type));
1023 /************************************************************************/
1024 /* Coding system accessors */
1025 /************************************************************************/
1027 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1028 Return the doc string for CODING-SYSTEM.
1032 coding_system = Fget_coding_system (coding_system);
1033 return XCODING_SYSTEM_DOC_STRING (coding_system);
1036 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1037 Return the type of CODING-SYSTEM.
1041 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1044 case CODESYS_AUTODETECT: return Qundecided;
1046 case CODESYS_SHIFT_JIS: return Qshift_jis;
1047 case CODESYS_ISO2022: return Qiso2022;
1048 case CODESYS_BIG5: return Qbig5;
1049 case CODESYS_UCS4: return Qucs4;
1050 case CODESYS_UTF8: return Qutf8;
1051 case CODESYS_CCL: return Qccl;
1053 case CODESYS_NO_CONVERSION: return Qno_conversion;
1055 case CODESYS_INTERNAL: return Qinternal;
1062 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1065 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1067 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1070 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1071 Return initial charset of CODING-SYSTEM designated to GNUM.
1074 (coding_system, gnum))
1076 coding_system = Fget_coding_system (coding_system);
1079 return coding_system_charset (coding_system, XINT (gnum));
1083 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1084 Return the PROP property of CODING-SYSTEM.
1086 (coding_system, prop))
1089 enum coding_system_type type;
1091 coding_system = Fget_coding_system (coding_system);
1092 CHECK_SYMBOL (prop);
1093 type = XCODING_SYSTEM_TYPE (coding_system);
1095 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1096 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1099 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1101 case CODESYS_PROP_ALL_OK:
1104 case CODESYS_PROP_ISO2022:
1105 if (type != CODESYS_ISO2022)
1107 ("Property only valid in ISO2022 coding systems",
1111 case CODESYS_PROP_CCL:
1112 if (type != CODESYS_CCL)
1114 ("Property only valid in CCL coding systems",
1124 signal_simple_error ("Unrecognized property", prop);
1126 if (EQ (prop, Qname))
1127 return XCODING_SYSTEM_NAME (coding_system);
1128 else if (EQ (prop, Qtype))
1129 return Fcoding_system_type (coding_system);
1130 else if (EQ (prop, Qdoc_string))
1131 return XCODING_SYSTEM_DOC_STRING (coding_system);
1132 else if (EQ (prop, Qmnemonic))
1133 return XCODING_SYSTEM_MNEMONIC (coding_system);
1134 else if (EQ (prop, Qeol_type))
1135 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1136 else if (EQ (prop, Qeol_lf))
1137 return XCODING_SYSTEM_EOL_LF (coding_system);
1138 else if (EQ (prop, Qeol_crlf))
1139 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1140 else if (EQ (prop, Qeol_cr))
1141 return XCODING_SYSTEM_EOL_CR (coding_system);
1142 else if (EQ (prop, Qpost_read_conversion))
1143 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1144 else if (EQ (prop, Qpre_write_conversion))
1145 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1147 else if (type == CODESYS_ISO2022)
1149 if (EQ (prop, Qcharset_g0))
1150 return coding_system_charset (coding_system, 0);
1151 else if (EQ (prop, Qcharset_g1))
1152 return coding_system_charset (coding_system, 1);
1153 else if (EQ (prop, Qcharset_g2))
1154 return coding_system_charset (coding_system, 2);
1155 else if (EQ (prop, Qcharset_g3))
1156 return coding_system_charset (coding_system, 3);
1158 #define FORCE_CHARSET(charset_num) \
1159 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1160 (coding_system, charset_num) ? Qt : Qnil)
1162 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1163 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1164 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1165 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1167 #define LISP_BOOLEAN(prop) \
1168 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1170 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1171 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1172 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1173 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1174 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1175 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1176 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1178 else if (EQ (prop, Qinput_charset_conversion))
1180 unparse_charset_conversion_specs
1181 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1182 else if (EQ (prop, Qoutput_charset_conversion))
1184 unparse_charset_conversion_specs
1185 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1189 else if (type == CODESYS_CCL)
1191 if (EQ (prop, Qdecode))
1192 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1193 else if (EQ (prop, Qencode))
1194 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1202 return Qnil; /* not reached */
1206 /************************************************************************/
1207 /* Coding category functions */
1208 /************************************************************************/
1211 decode_coding_category (Lisp_Object symbol)
1215 CHECK_SYMBOL (symbol);
1216 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1217 if (EQ (coding_category_symbol[i], symbol))
1220 signal_simple_error ("Unrecognized coding category", symbol);
1221 return 0; /* not reached */
1224 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1225 Return a list of all recognized coding categories.
1230 Lisp_Object list = Qnil;
1232 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1233 list = Fcons (coding_category_symbol[i], list);
1237 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1238 Change the priority order of the coding categories.
1239 LIST should be list of coding categories, in descending order of
1240 priority. Unspecified coding categories will be lower in priority
1241 than all specified ones, in the same relative order they were in
1246 int category_to_priority[CODING_CATEGORY_LAST + 1];
1250 /* First generate a list that maps coding categories to priorities. */
1252 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1253 category_to_priority[i] = -1;
1255 /* Highest priority comes from the specified list. */
1257 EXTERNAL_LIST_LOOP (rest, list)
1259 int cat = decode_coding_category (XCAR (rest));
1261 if (category_to_priority[cat] >= 0)
1262 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1263 category_to_priority[cat] = i++;
1266 /* Now go through the existing categories by priority to retrieve
1267 the categories not yet specified and preserve their priority
1269 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1271 int cat = coding_category_by_priority[j];
1272 if (category_to_priority[cat] < 0)
1273 category_to_priority[cat] = i++;
1276 /* Now we need to construct the inverse of the mapping we just
1279 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1280 coding_category_by_priority[category_to_priority[i]] = i;
1282 /* Phew! That was confusing. */
1286 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1287 Return a list of coding categories in descending order of priority.
1292 Lisp_Object list = Qnil;
1294 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1295 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1300 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1301 Change the coding system associated with a coding category.
1303 (coding_category, coding_system))
1305 int cat = decode_coding_category (coding_category);
1307 coding_system = Fget_coding_system (coding_system);
1308 coding_category_system[cat] = coding_system;
1312 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1313 Return the coding system associated with a coding category.
1317 int cat = decode_coding_category (coding_category);
1318 Lisp_Object sys = coding_category_system[cat];
1321 return XCODING_SYSTEM_NAME (sys);
1326 /************************************************************************/
1327 /* Detecting the encoding of data */
1328 /************************************************************************/
1330 struct detection_state
1332 enum eol_type eol_type;
1368 struct iso2022_decoder iso;
1370 int high_byte_count;
1371 unsigned int saw_single_shift:1;
1384 acceptable_control_char_p (int c)
1388 /* Allow and ignore control characters that you might
1389 reasonably see in a text file */
1394 case 8: /* backspace */
1395 case 11: /* vertical tab */
1396 case 12: /* form feed */
1397 case 26: /* MS-DOS C-z junk */
1398 case 31: /* '^_' -- for info */
1406 mask_has_at_most_one_bit_p (int mask)
1408 /* Perhaps the only thing useful you learn from intensive Microsoft
1409 technical interviews */
1410 return (mask & (mask - 1)) == 0;
1413 static enum eol_type
1414 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1423 st->eol.just_saw_cr = 1;
1428 if (st->eol.just_saw_cr)
1430 else if (st->eol.seen_anything)
1433 else if (st->eol.just_saw_cr)
1435 st->eol.just_saw_cr = 0;
1437 st->eol.seen_anything = 1;
1440 return EOL_AUTODETECT;
1443 /* Attempt to determine the encoding and EOL type of the given text.
1444 Before calling this function for the first type, you must initialize
1445 st->eol_type as appropriate and initialize st->mask to ~0.
1447 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1450 st->mask holds the determined coding category mask, or ~0 if only
1451 ASCII has been seen so far.
1455 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1456 is present in st->mask
1457 1 == definitive answers are here for both st->eol_type and st->mask
1461 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1462 unsigned int n, int just_do_eol)
1466 if (st->eol_type == EOL_AUTODETECT)
1467 st->eol_type = detect_eol_type (st, src, n);
1470 return st->eol_type != EOL_AUTODETECT;
1472 if (!st->seen_non_ascii)
1474 for (; n; n--, src++)
1477 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1479 st->seen_non_ascii = 1;
1481 st->shift_jis.mask = ~0;
1485 st->iso2022.mask = ~0;
1495 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1496 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1497 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1498 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1499 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1500 st->big5.mask = detect_coding_big5 (st, src, n);
1501 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1502 st->utf8.mask = detect_coding_utf8 (st, src, n);
1503 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1504 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1507 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1508 | st->utf8.mask | st->ucs4.mask;
1511 int retval = mask_has_at_most_one_bit_p (st->mask);
1512 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1513 return retval && st->eol_type != EOL_AUTODETECT;
1518 coding_system_from_mask (int mask)
1522 /* If the file was entirely or basically ASCII, use the
1523 default value of `buffer-file-coding-system'. */
1524 Lisp_Object retval =
1525 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1528 retval = Ffind_coding_system (retval);
1532 (Qbad_variable, Qwarning,
1533 "Invalid `default-buffer-file-coding-system', set to nil");
1534 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1538 retval = Fget_coding_system (Qraw_text);
1546 mask = postprocess_iso2022_mask (mask);
1548 /* Look through the coding categories by priority and find
1549 the first one that is allowed. */
1550 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1552 cat = coding_category_by_priority[i];
1553 if ((mask & (1 << cat)) &&
1554 !NILP (coding_category_system[cat]))
1558 return coding_category_system[cat];
1560 return Fget_coding_system (Qraw_text);
1564 /* Given a seekable read stream and potential coding system and EOL type
1565 as specified, do any autodetection that is called for. If the
1566 coding system and/or EOL type are not autodetect, they will be left
1567 alone; but this function will never return an autodetect coding system
1570 This function does not automatically fetch subsidiary coding systems;
1571 that should be unnecessary with the explicit eol-type argument. */
1574 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1575 enum eol_type *eol_type_in_out)
1577 struct detection_state decst;
1579 if (*eol_type_in_out == EOL_AUTODETECT)
1580 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1583 decst.eol_type = *eol_type_in_out;
1586 /* If autodetection is called for, do it now. */
1587 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1588 *eol_type_in_out == EOL_AUTODETECT)
1593 unsigned char random_buffer[4096];
1596 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1599 if (detect_coding_type (&decst, random_buffer, nread,
1600 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1601 CODESYS_AUTODETECT))
1605 *eol_type_in_out = decst.eol_type;
1606 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1607 *codesys_in_out = coding_system_from_mask (decst.mask);
1610 /* If we absolutely can't determine the EOL type, just assume LF. */
1611 if (*eol_type_in_out == EOL_AUTODETECT)
1612 *eol_type_in_out = EOL_LF;
1614 Lstream_rewind (stream);
1617 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1618 Detect coding system of the text in the region between START and END.
1619 Returned a list of possible coding systems ordered by priority.
1620 If only ASCII characters are found, it returns 'undecided or one of
1621 its subsidiary coding systems according to a detected end-of-line
1622 type. Optional arg BUFFER defaults to the current buffer.
1624 (start, end, buffer))
1626 Lisp_Object val = Qnil;
1627 struct buffer *buf = decode_buffer (buffer, 0);
1629 Lisp_Object instream, lb_instream;
1630 Lstream *istr, *lb_istr;
1631 struct detection_state decst;
1632 struct gcpro gcpro1, gcpro2;
1634 get_buffer_range_char (buf, start, end, &b, &e, 0);
1635 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1636 lb_istr = XLSTREAM (lb_instream);
1637 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1638 istr = XLSTREAM (instream);
1639 GCPRO2 (instream, lb_instream);
1641 decst.eol_type = EOL_AUTODETECT;
1645 unsigned char random_buffer[4096];
1646 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1650 if (detect_coding_type (&decst, random_buffer, nread, 0))
1654 if (decst.mask == ~0)
1655 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1663 decst.mask = postprocess_iso2022_mask (decst.mask);
1665 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1667 int sys = coding_category_by_priority[i];
1668 if (decst.mask & (1 << sys))
1670 Lisp_Object codesys = coding_category_system[sys];
1671 if (!NILP (codesys))
1672 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1673 val = Fcons (codesys, val);
1677 Lstream_close (istr);
1679 Lstream_delete (istr);
1680 Lstream_delete (lb_istr);
1685 /************************************************************************/
1686 /* Converting to internal Mule format ("decoding") */
1687 /************************************************************************/
1689 /* A decoding stream is a stream used for decoding text (i.e.
1690 converting from some external format to internal format).
1691 The decoding-stream object keeps track of the actual coding
1692 stream, the stream that is at the other end, and data that
1693 needs to be persistent across the lifetime of the stream. */
1695 /* Handle the EOL stuff related to just-read-in character C.
1696 EOL_TYPE is the EOL type of the coding stream.
1697 FLAGS is the current value of FLAGS in the coding stream, and may
1698 be modified by this macro. (The macro only looks at the
1699 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1700 bytes are to be written. You need to also define a local goto
1701 label "label_continue_loop" that is at the end of the main
1702 character-reading loop.
1704 If C is a CR character, then this macro handles it entirely and
1705 jumps to label_continue_loop. Otherwise, this macro does not add
1706 anything to DST, and continues normally. You should continue
1707 processing C normally after this macro. */
1709 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1713 if (eol_type == EOL_CR) \
1714 Dynarr_add (dst, '\n'); \
1715 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1716 Dynarr_add (dst, c); \
1718 flags |= CODING_STATE_CR; \
1719 goto label_continue_loop; \
1721 else if (flags & CODING_STATE_CR) \
1722 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1724 Dynarr_add (dst, '\r'); \
1725 flags &= ~CODING_STATE_CR; \
1729 /* C should be a binary character in the range 0 - 255; convert
1730 to internal format and add to Dynarr DST. */
1733 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1735 if (BYTE_ASCII_P (c)) \
1736 Dynarr_add (dst, c); \
1739 Dynarr_add (dst, (c >> 6) | 0xc0); \
1740 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1745 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1749 Dynarr_add (dst, c);
1751 else if ( c <= 0x7ff )
1753 Dynarr_add (dst, (c >> 6) | 0xc0);
1754 Dynarr_add (dst, (c & 0x3f) | 0x80);
1756 else if ( c <= 0xffff )
1758 Dynarr_add (dst, (c >> 12) | 0xe0);
1759 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1760 Dynarr_add (dst, (c & 0x3f) | 0x80);
1762 else if ( c <= 0x1fffff )
1764 Dynarr_add (dst, (c >> 18) | 0xf0);
1765 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1766 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1767 Dynarr_add (dst, (c & 0x3f) | 0x80);
1769 else if ( c <= 0x3ffffff )
1771 Dynarr_add (dst, (c >> 24) | 0xf8);
1772 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1773 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1774 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1775 Dynarr_add (dst, (c & 0x3f) | 0x80);
1779 Dynarr_add (dst, (c >> 30) | 0xfc);
1780 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1781 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1782 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1783 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1784 Dynarr_add (dst, (c & 0x3f) | 0x80);
1788 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1790 if (BYTE_ASCII_P (c)) \
1791 Dynarr_add (dst, c); \
1792 else if (BYTE_C1_P (c)) \
1794 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1795 Dynarr_add (dst, c + 0x20); \
1799 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1800 Dynarr_add (dst, c); \
1805 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1809 DECODE_ADD_BINARY_CHAR (ch, dst); \
1814 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1816 if (flags & CODING_STATE_END) \
1818 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1819 if (flags & CODING_STATE_CR) \
1820 Dynarr_add (dst, '\r'); \
1824 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1826 struct decoding_stream
1828 /* Coding system that governs the conversion. */
1829 Lisp_Coding_System *codesys;
1831 /* Stream that we read the encoded data from or
1832 write the decoded data to. */
1835 /* If we are reading, then we can return only a fixed amount of
1836 data, so if the conversion resulted in too much data, we store it
1837 here for retrieval the next time around. */
1838 unsigned_char_dynarr *runoff;
1840 /* FLAGS holds flags indicating the current state of the decoding.
1841 Some of these flags are dependent on the coding system. */
1844 /* CH holds a partially built-up character. Since we only deal
1845 with one- and two-byte characters at the moment, we only use
1846 this to store the first byte of a two-byte character. */
1849 /* EOL_TYPE specifies the type of end-of-line conversion that
1850 currently applies. We need to keep this separate from the
1851 EOL type stored in CODESYS because the latter might indicate
1852 automatic EOL-type detection while the former will always
1853 indicate a particular EOL type. */
1854 enum eol_type eol_type;
1856 /* Additional ISO2022 information. We define the structure above
1857 because it's also needed by the detection routines. */
1858 struct iso2022_decoder iso2022;
1860 /* Additional information (the state of the running CCL program)
1861 used by the CCL decoder. */
1862 struct ccl_program ccl;
1864 struct detection_state decst;
1867 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1868 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1869 static int decoding_rewinder (Lstream *stream);
1870 static int decoding_seekable_p (Lstream *stream);
1871 static int decoding_flusher (Lstream *stream);
1872 static int decoding_closer (Lstream *stream);
1874 static Lisp_Object decoding_marker (Lisp_Object stream,
1875 void (*markobj) (Lisp_Object));
1877 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1878 sizeof (struct decoding_stream));
1881 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1883 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1884 Lisp_Object str_obj;
1886 /* We do not need to mark the coding systems or charsets stored
1887 within the stream because they are stored in a global list
1888 and automatically marked. */
1890 XSETLSTREAM (str_obj, str);
1892 if (str->imp->marker)
1893 return (str->imp->marker) (str_obj, markobj);
1898 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1899 so we read data from the other end, decode it, and store it into DATA. */
1902 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1904 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1905 unsigned char *orig_data = data;
1907 int error_occurred = 0;
1909 /* We need to interface to mule_decode(), which expects to take some
1910 amount of data and store the result into a Dynarr. We have
1911 mule_decode() store into str->runoff, and take data from there
1914 /* We loop until we have enough data, reading chunks from the other
1915 end and decoding it. */
1918 /* Take data from the runoff if we can. Make sure to take at
1919 most SIZE bytes, and delete the data from the runoff. */
1920 if (Dynarr_length (str->runoff) > 0)
1922 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1923 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1924 Dynarr_delete_many (str->runoff, 0, chunk);
1930 break; /* No more room for data */
1932 if (str->flags & CODING_STATE_END)
1933 /* This means that on the previous iteration, we hit the EOF on
1934 the other end. We loop once more so that mule_decode() can
1935 output any final stuff it may be holding, or any "go back
1936 to a sane state" escape sequences. (This latter makes sense
1937 during encoding.) */
1940 /* Exhausted the runoff, so get some more. DATA has at least
1941 SIZE bytes left of storage in it, so it's OK to read directly
1942 into it. (We'll be overwriting above, after we've decoded it
1943 into the runoff.) */
1944 read_size = Lstream_read (str->other_end, data, size);
1951 /* There might be some more end data produced in the translation.
1952 See the comment above. */
1953 str->flags |= CODING_STATE_END;
1954 mule_decode (stream, data, str->runoff, read_size);
1957 if (data - orig_data == 0)
1958 return error_occurred ? -1 : 0;
1960 return data - orig_data;
1964 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1966 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1969 /* Decode all our data into the runoff, and then attempt to write
1970 it all out to the other end. Remove whatever chunk we succeeded
1972 mule_decode (stream, data, str->runoff, size);
1973 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1974 Dynarr_length (str->runoff));
1976 Dynarr_delete_many (str->runoff, 0, retval);
1977 /* Do NOT return retval. The return value indicates how much
1978 of the incoming data was written, not how many bytes were
1984 reset_decoding_stream (struct decoding_stream *str)
1987 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1989 Lisp_Object coding_system;
1990 XSETCODING_SYSTEM (coding_system, str->codesys);
1991 reset_iso2022 (coding_system, &str->iso2022);
1993 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1995 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1998 str->flags = str->ch = 0;
2002 decoding_rewinder (Lstream *stream)
2004 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2005 reset_decoding_stream (str);
2006 Dynarr_reset (str->runoff);
2007 return Lstream_rewind (str->other_end);
2011 decoding_seekable_p (Lstream *stream)
2013 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2014 return Lstream_seekable_p (str->other_end);
2018 decoding_flusher (Lstream *stream)
2020 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2021 return Lstream_flush (str->other_end);
2025 decoding_closer (Lstream *stream)
2027 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2028 if (stream->flags & LSTREAM_FL_WRITE)
2030 str->flags |= CODING_STATE_END;
2031 decoding_writer (stream, 0, 0);
2033 Dynarr_free (str->runoff);
2035 #ifdef ENABLE_COMPOSITE_CHARS
2036 if (str->iso2022.composite_chars)
2037 Dynarr_free (str->iso2022.composite_chars);
2040 return Lstream_close (str->other_end);
2044 decoding_stream_coding_system (Lstream *stream)
2046 Lisp_Object coding_system;
2047 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2049 XSETCODING_SYSTEM (coding_system, str->codesys);
2050 return subsidiary_coding_system (coding_system, str->eol_type);
2054 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2056 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2057 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2059 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2060 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2061 reset_decoding_stream (str);
2064 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2065 stream for writing, no automatic code detection will be performed.
2066 The reason for this is that automatic code detection requires a
2067 seekable input. Things will also fail if you open a decoding
2068 stream for reading using a non-fully-specified coding system and
2069 a non-seekable input stream. */
2072 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2075 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2076 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2080 str->other_end = stream;
2081 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2082 str->eol_type = EOL_AUTODETECT;
2083 if (!strcmp (mode, "r")
2084 && Lstream_seekable_p (stream))
2085 /* We can determine the coding system now. */
2086 determine_real_coding_system (stream, &codesys, &str->eol_type);
2087 set_decoding_stream_coding_system (lstr, codesys);
2088 str->decst.eol_type = str->eol_type;
2089 str->decst.mask = ~0;
2090 XSETLSTREAM (obj, lstr);
2095 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2097 return make_decoding_stream_1 (stream, codesys, "r");
2101 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2103 return make_decoding_stream_1 (stream, codesys, "w");
2106 /* Note: the decode_coding_* functions all take the same
2107 arguments as mule_decode(), which is to say some SRC data of
2108 size N, which is to be stored into dynamic array DST.
2109 DECODING is the stream within which the decoding is
2110 taking place, but no data is actually read from or
2111 written to that stream; that is handled in decoding_reader()
2112 or decoding_writer(). This allows the same functions to
2113 be used for both reading and writing. */
2116 mule_decode (Lstream *decoding, CONST unsigned char *src,
2117 unsigned_char_dynarr *dst, unsigned int n)
2119 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2121 /* If necessary, do encoding-detection now. We do this when
2122 we're a writing stream or a non-seekable reading stream,
2123 meaning that we can't just process the whole input,
2124 rewind, and start over. */
2126 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2127 str->eol_type == EOL_AUTODETECT)
2129 Lisp_Object codesys;
2131 XSETCODING_SYSTEM (codesys, str->codesys);
2132 detect_coding_type (&str->decst, src, n,
2133 CODING_SYSTEM_TYPE (str->codesys) !=
2134 CODESYS_AUTODETECT);
2135 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2136 str->decst.mask != ~0)
2137 /* #### This is cheesy. What we really ought to do is
2138 buffer up a certain amount of data so as to get a
2139 less random result. */
2140 codesys = coding_system_from_mask (str->decst.mask);
2141 str->eol_type = str->decst.eol_type;
2142 if (XCODING_SYSTEM (codesys) != str->codesys)
2144 /* Preserve the CODING_STATE_END flag in case it was set.
2145 If we erase it, bad things might happen. */
2146 int was_end = str->flags & CODING_STATE_END;
2147 set_decoding_stream_coding_system (decoding, codesys);
2149 str->flags |= CODING_STATE_END;
2153 switch (CODING_SYSTEM_TYPE (str->codesys))
2156 case CODESYS_INTERNAL:
2157 Dynarr_add_many (dst, src, n);
2160 case CODESYS_AUTODETECT:
2161 /* If we got this far and still haven't decided on the coding
2162 system, then do no conversion. */
2163 case CODESYS_NO_CONVERSION:
2164 decode_coding_no_conversion (decoding, src, dst, n);
2167 case CODESYS_SHIFT_JIS:
2168 decode_coding_sjis (decoding, src, dst, n);
2171 decode_coding_big5 (decoding, src, dst, n);
2174 decode_coding_ucs4 (decoding, src, dst, n);
2177 decode_coding_utf8 (decoding, src, dst, n);
2180 ccl_driver (&str->ccl, src, dst, n, 0);
2182 case CODESYS_ISO2022:
2183 decode_coding_iso2022 (decoding, src, dst, n);
2191 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2192 Decode the text between START and END which is encoded in CODING-SYSTEM.
2193 This is useful if you've read in encoded text from a file without decoding
2194 it (e.g. you read in a JIS-formatted file but used the `binary' or
2195 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2196 Return length of decoded text.
2197 BUFFER defaults to the current buffer if unspecified.
2199 (start, end, coding_system, buffer))
2202 struct buffer *buf = decode_buffer (buffer, 0);
2203 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2204 Lstream *istr, *ostr;
2205 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2207 get_buffer_range_char (buf, start, end, &b, &e, 0);
2209 barf_if_buffer_read_only (buf, b, e);
2211 coding_system = Fget_coding_system (coding_system);
2212 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2213 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2214 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2216 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2217 Fget_coding_system (Qbinary));
2218 istr = XLSTREAM (instream);
2219 ostr = XLSTREAM (outstream);
2220 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2222 /* The chain of streams looks like this:
2224 [BUFFER] <----- send through
2225 ------> [ENCODE AS BINARY]
2226 ------> [DECODE AS SPECIFIED]
2232 char tempbuf[1024]; /* some random amount */
2233 Bufpos newpos, even_newer_pos;
2234 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2235 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2239 newpos = lisp_buffer_stream_startpos (istr);
2240 Lstream_write (ostr, tempbuf, size_in_bytes);
2241 even_newer_pos = lisp_buffer_stream_startpos (istr);
2242 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2245 Lstream_close (istr);
2246 Lstream_close (ostr);
2248 Lstream_delete (istr);
2249 Lstream_delete (ostr);
2250 Lstream_delete (XLSTREAM (de_outstream));
2251 Lstream_delete (XLSTREAM (lb_outstream));
2256 /************************************************************************/
2257 /* Converting to an external encoding ("encoding") */
2258 /************************************************************************/
2260 /* An encoding stream is an output stream. When you create the
2261 stream, you specify the coding system that governs the encoding
2262 and another stream that the resulting encoded data is to be
2263 sent to, and then start sending data to it. */
2265 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2267 struct encoding_stream
2269 /* Coding system that governs the conversion. */
2270 Lisp_Coding_System *codesys;
2272 /* Stream that we read the encoded data from or
2273 write the decoded data to. */
2276 /* If we are reading, then we can return only a fixed amount of
2277 data, so if the conversion resulted in too much data, we store it
2278 here for retrieval the next time around. */
2279 unsigned_char_dynarr *runoff;
2281 /* FLAGS holds flags indicating the current state of the encoding.
2282 Some of these flags are dependent on the coding system. */
2285 /* CH holds a partially built-up character. Since we only deal
2286 with one- and two-byte characters at the moment, we only use
2287 this to store the first byte of a two-byte character. */
2290 /* Additional information used by the ISO2022 encoder. */
2293 /* CHARSET holds the character sets currently assigned to the G0
2294 through G3 registers. It is initialized from the array
2295 INITIAL_CHARSET in CODESYS. */
2296 Lisp_Object charset[4];
2298 /* Which registers are currently invoked into the left (GL) and
2299 right (GR) halves of the 8-bit encoding space? */
2300 int register_left, register_right;
2302 /* Whether we need to explicitly designate the charset in the
2303 G? register before using it. It is initialized from the
2304 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2305 unsigned char force_charset_on_output[4];
2307 /* Other state variables that need to be preserved across
2309 Lisp_Object current_charset;
2311 int current_char_boundary;
2314 /* Additional information (the state of the running CCL program)
2315 used by the CCL encoder. */
2316 struct ccl_program ccl;
2320 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2321 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2323 static int encoding_rewinder (Lstream *stream);
2324 static int encoding_seekable_p (Lstream *stream);
2325 static int encoding_flusher (Lstream *stream);
2326 static int encoding_closer (Lstream *stream);
2328 static Lisp_Object encoding_marker (Lisp_Object stream,
2329 void (*markobj) (Lisp_Object));
2331 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2332 sizeof (struct encoding_stream));
2335 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2337 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2338 Lisp_Object str_obj;
2340 /* We do not need to mark the coding systems or charsets stored
2341 within the stream because they are stored in a global list
2342 and automatically marked. */
2344 XSETLSTREAM (str_obj, str);
2346 if (str->imp->marker)
2347 return (str->imp->marker) (str_obj, markobj);
2352 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2353 so we read data from the other end, encode it, and store it into DATA. */
2356 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2358 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2359 unsigned char *orig_data = data;
2361 int error_occurred = 0;
2363 /* We need to interface to mule_encode(), which expects to take some
2364 amount of data and store the result into a Dynarr. We have
2365 mule_encode() store into str->runoff, and take data from there
2368 /* We loop until we have enough data, reading chunks from the other
2369 end and encoding it. */
2372 /* Take data from the runoff if we can. Make sure to take at
2373 most SIZE bytes, and delete the data from the runoff. */
2374 if (Dynarr_length (str->runoff) > 0)
2376 int chunk = min ((int) size, Dynarr_length (str->runoff));
2377 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2378 Dynarr_delete_many (str->runoff, 0, chunk);
2384 break; /* No more room for data */
2386 if (str->flags & CODING_STATE_END)
2387 /* This means that on the previous iteration, we hit the EOF on
2388 the other end. We loop once more so that mule_encode() can
2389 output any final stuff it may be holding, or any "go back
2390 to a sane state" escape sequences. (This latter makes sense
2391 during encoding.) */
2394 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2395 left of storage in it, so it's OK to read directly into it.
2396 (We'll be overwriting above, after we've encoded it into the
2398 read_size = Lstream_read (str->other_end, data, size);
2405 /* There might be some more end data produced in the translation.
2406 See the comment above. */
2407 str->flags |= CODING_STATE_END;
2408 mule_encode (stream, data, str->runoff, read_size);
2411 if (data == orig_data)
2412 return error_occurred ? -1 : 0;
2414 return data - orig_data;
2418 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2420 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2423 /* Encode all our data into the runoff, and then attempt to write
2424 it all out to the other end. Remove whatever chunk we succeeded
2426 mule_encode (stream, data, str->runoff, size);
2427 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2428 Dynarr_length (str->runoff));
2430 Dynarr_delete_many (str->runoff, 0, retval);
2431 /* Do NOT return retval. The return value indicates how much
2432 of the incoming data was written, not how many bytes were
2438 reset_encoding_stream (struct encoding_stream *str)
2441 switch (CODING_SYSTEM_TYPE (str->codesys))
2443 case CODESYS_ISO2022:
2447 for (i = 0; i < 4; i++)
2449 str->iso2022.charset[i] =
2450 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2451 str->iso2022.force_charset_on_output[i] =
2452 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2454 str->iso2022.register_left = 0;
2455 str->iso2022.register_right = 1;
2456 str->iso2022.current_charset = Qnil;
2457 str->iso2022.current_half = 0;
2459 str->iso2022.current_char_boundary = 0;
2461 str->iso2022.current_char_boundary = 1;
2466 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2473 str->flags = str->ch = 0;
2477 encoding_rewinder (Lstream *stream)
2479 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2480 reset_encoding_stream (str);
2481 Dynarr_reset (str->runoff);
2482 return Lstream_rewind (str->other_end);
2486 encoding_seekable_p (Lstream *stream)
2488 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2489 return Lstream_seekable_p (str->other_end);
2493 encoding_flusher (Lstream *stream)
2495 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2496 return Lstream_flush (str->other_end);
2500 encoding_closer (Lstream *stream)
2502 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2503 if (stream->flags & LSTREAM_FL_WRITE)
2505 str->flags |= CODING_STATE_END;
2506 encoding_writer (stream, 0, 0);
2508 Dynarr_free (str->runoff);
2509 return Lstream_close (str->other_end);
2513 encoding_stream_coding_system (Lstream *stream)
2515 Lisp_Object coding_system;
2516 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2518 XSETCODING_SYSTEM (coding_system, str->codesys);
2519 return coding_system;
2523 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2525 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2526 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2528 reset_encoding_stream (str);
2532 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2535 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2536 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2540 str->runoff = Dynarr_new (unsigned_char);
2541 str->other_end = stream;
2542 set_encoding_stream_coding_system (lstr, codesys);
2543 XSETLSTREAM (obj, lstr);
2548 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2550 return make_encoding_stream_1 (stream, codesys, "r");
2554 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2556 return make_encoding_stream_1 (stream, codesys, "w");
2559 /* Convert N bytes of internally-formatted data stored in SRC to an
2560 external format, according to the encoding stream ENCODING.
2561 Store the encoded data into DST. */
2564 mule_encode (Lstream *encoding, CONST unsigned char *src,
2565 unsigned_char_dynarr *dst, unsigned int n)
2567 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2569 switch (CODING_SYSTEM_TYPE (str->codesys))
2572 case CODESYS_INTERNAL:
2573 Dynarr_add_many (dst, src, n);
2576 case CODESYS_AUTODETECT:
2577 /* If we got this far and still haven't decided on the coding
2578 system, then do no conversion. */
2579 case CODESYS_NO_CONVERSION:
2580 encode_coding_no_conversion (encoding, src, dst, n);
2583 case CODESYS_SHIFT_JIS:
2584 encode_coding_sjis (encoding, src, dst, n);
2587 encode_coding_big5 (encoding, src, dst, n);
2590 encode_coding_ucs4 (encoding, src, dst, n);
2593 encode_coding_utf8 (encoding, src, dst, n);
2596 ccl_driver (&str->ccl, src, dst, n, 0);
2598 case CODESYS_ISO2022:
2599 encode_coding_iso2022 (encoding, src, dst, n);
2607 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2608 Encode the text between START and END using CODING-SYSTEM.
2609 This will, for example, convert Japanese characters into stuff such as
2610 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2611 text. BUFFER defaults to the current buffer if unspecified.
2613 (start, end, coding_system, buffer))
2616 struct buffer *buf = decode_buffer (buffer, 0);
2617 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2618 Lstream *istr, *ostr;
2619 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2621 get_buffer_range_char (buf, start, end, &b, &e, 0);
2623 barf_if_buffer_read_only (buf, b, e);
2625 coding_system = Fget_coding_system (coding_system);
2626 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2627 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2628 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2629 Fget_coding_system (Qbinary));
2630 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2632 istr = XLSTREAM (instream);
2633 ostr = XLSTREAM (outstream);
2634 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2635 /* The chain of streams looks like this:
2637 [BUFFER] <----- send through
2638 ------> [ENCODE AS SPECIFIED]
2639 ------> [DECODE AS BINARY]
2644 char tempbuf[1024]; /* some random amount */
2645 Bufpos newpos, even_newer_pos;
2646 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2647 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2651 newpos = lisp_buffer_stream_startpos (istr);
2652 Lstream_write (ostr, tempbuf, size_in_bytes);
2653 even_newer_pos = lisp_buffer_stream_startpos (istr);
2654 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2660 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2661 Lstream_close (istr);
2662 Lstream_close (ostr);
2664 Lstream_delete (istr);
2665 Lstream_delete (ostr);
2666 Lstream_delete (XLSTREAM (de_outstream));
2667 Lstream_delete (XLSTREAM (lb_outstream));
2668 return make_int (retlen);
2674 /************************************************************************/
2675 /* Shift-JIS methods */
2676 /************************************************************************/
2678 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2679 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2680 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2681 encoded by "position-code + 0x80". A character of JISX0208
2682 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2683 position-codes are divided and shifted so that it fit in the range
2686 --- CODE RANGE of Shift-JIS ---
2687 (character set) (range)
2689 JISX0201-Kana 0xA0 .. 0xDF
2690 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2691 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2692 -------------------------------
2696 /* Is this the first byte of a Shift-JIS two-byte char? */
2698 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2699 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2701 /* Is this the second byte of a Shift-JIS two-byte char? */
2703 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2704 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2706 #define BYTE_SJIS_KATAKANA_P(c) \
2707 ((c) >= 0xA1 && (c) <= 0xDF)
2710 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2718 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2720 if (st->shift_jis.in_second_byte)
2722 st->shift_jis.in_second_byte = 0;
2726 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2727 st->shift_jis.in_second_byte = 1;
2729 return CODING_CATEGORY_SHIFT_JIS_MASK;
2732 /* Convert Shift-JIS data to internal format. */
2735 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2736 unsigned_char_dynarr *dst, unsigned int n)
2739 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2740 unsigned int flags = str->flags;
2741 unsigned int ch = str->ch;
2742 eol_type_t eol_type = str->eol_type;
2750 /* Previous character was first byte of Shift-JIS Kanji char. */
2751 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2753 unsigned char e1, e2;
2755 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2756 DECODE_SJIS (ch, c, e1, e2);
2757 Dynarr_add (dst, e1);
2758 Dynarr_add (dst, e2);
2762 DECODE_ADD_BINARY_CHAR (ch, dst);
2763 DECODE_ADD_BINARY_CHAR (c, dst);
2769 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2770 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2772 else if (BYTE_SJIS_KATAKANA_P (c))
2774 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2775 Dynarr_add (dst, c);
2778 DECODE_ADD_BINARY_CHAR (c, dst);
2780 label_continue_loop:;
2783 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2789 /* Convert internally-formatted data to Shift-JIS. */
2792 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2793 unsigned_char_dynarr *dst, unsigned int n)
2796 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2797 unsigned int flags = str->flags;
2798 unsigned int ch = str->ch;
2799 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2806 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2807 Dynarr_add (dst, '\r');
2808 if (eol_type != EOL_CR)
2809 Dynarr_add (dst, '\n');
2812 else if (BYTE_ASCII_P (c))
2814 Dynarr_add (dst, c);
2817 else if (BUFBYTE_LEADING_BYTE_P (c))
2818 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2819 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2820 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2823 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2825 Dynarr_add (dst, c);
2828 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2829 ch == LEADING_BYTE_JAPANESE_JISX0208)
2833 unsigned char j1, j2;
2834 ENCODE_SJIS (ch, c, j1, j2);
2835 Dynarr_add (dst, j1);
2836 Dynarr_add (dst, j2);
2846 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2847 Decode a JISX0208 character of Shift-JIS coding-system.
2848 CODE is the character code in Shift-JIS as a cons of type bytes.
2849 Return the corresponding character.
2853 unsigned char c1, c2, s1, s2;
2856 CHECK_INT (XCAR (code));
2857 CHECK_INT (XCDR (code));
2858 s1 = XINT (XCAR (code));
2859 s2 = XINT (XCDR (code));
2860 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2861 BYTE_SJIS_TWO_BYTE_2_P (s2))
2863 DECODE_SJIS (s1, s2, c1, c2);
2864 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2865 c1 & 0x7F, c2 & 0x7F));
2871 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2872 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2873 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2877 Lisp_Object charset;
2880 CHECK_CHAR_COERCE_INT (ch);
2881 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2882 if (EQ (charset, Vcharset_japanese_jisx0208))
2884 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2885 return Fcons (make_int (s1), make_int (s2));
2892 /************************************************************************/
2894 /************************************************************************/
2896 /* BIG5 is a coding system encoding two character sets: ASCII and
2897 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2898 character set and is encoded in two-byte.
2900 --- CODE RANGE of BIG5 ---
2901 (character set) (range)
2903 Big5 (1st byte) 0xA1 .. 0xFE
2904 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2905 --------------------------
2907 Since the number of characters in Big5 is larger than maximum
2908 characters in Emacs' charset (96x96), it can't be handled as one
2909 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2910 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2911 contains frequently used characters and the latter contains less
2912 frequently used characters. */
2914 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2915 ((c) >= 0xA1 && (c) <= 0xFE)
2917 /* Is this the second byte of a Shift-JIS two-byte char? */
2919 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2920 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2922 /* Number of Big5 characters which have the same code in 1st byte. */
2924 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2926 /* Code conversion macros. These are macros because they are used in
2927 inner loops during code conversion.
2929 Note that temporary variables in macros introduce the classic
2930 dynamic-scoping problems with variable names. We use capital-
2931 lettered variables in the assumption that XEmacs does not use
2932 capital letters in variables except in a very formalized way
2935 /* Convert Big5 code (b1, b2) into its internal string representation
2938 /* There is a much simpler way to split the Big5 charset into two.
2939 For the moment I'm going to leave the algorithm as-is because it
2940 claims to separate out the most-used characters into a single
2941 charset, which perhaps will lead to optimizations in various
2944 The way the algorithm works is something like this:
2946 Big5 can be viewed as a 94x157 charset, where the row is
2947 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2948 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2949 the split between low and high column numbers is apparently
2950 meaningless; ascending rows produce less and less frequent chars.
2951 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2952 the first charset, and the upper half (0xC9 .. 0xFE) to the
2953 second. To do the conversion, we convert the character into
2954 a single number where 0 .. 156 is the first row, 157 .. 313
2955 is the second, etc. That way, the characters are ordered by
2956 decreasing frequency. Then we just chop the space in two
2957 and coerce the result into a 94x94 space.
2960 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2962 int B1 = b1, B2 = b2; \
2964 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2968 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2972 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2973 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2975 c1 = I / (0xFF - 0xA1) + 0xA1; \
2976 c2 = I % (0xFF - 0xA1) + 0xA1; \
2979 /* Convert the internal string representation of a Big5 character
2980 (lb, c1, c2) into Big5 code (b1, b2). */
2982 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2984 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2986 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2988 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2990 b1 = I / BIG5_SAME_ROW + 0xA1; \
2991 b2 = I % BIG5_SAME_ROW; \
2992 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2996 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3004 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3005 (c >= 0x80 && c <= 0xA0))
3007 if (st->big5.in_second_byte)
3009 st->big5.in_second_byte = 0;
3010 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3014 st->big5.in_second_byte = 1;
3016 return CODING_CATEGORY_BIG5_MASK;
3019 /* Convert Big5 data to internal format. */
3022 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3023 unsigned_char_dynarr *dst, unsigned int n)
3026 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3027 unsigned int flags = str->flags;
3028 unsigned int ch = str->ch;
3029 eol_type_t eol_type = str->eol_type;
3036 /* Previous character was first byte of Big5 char. */
3037 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3039 unsigned char b1, b2, b3;
3040 DECODE_BIG5 (ch, c, b1, b2, b3);
3041 Dynarr_add (dst, b1);
3042 Dynarr_add (dst, b2);
3043 Dynarr_add (dst, b3);
3047 DECODE_ADD_BINARY_CHAR (ch, dst);
3048 DECODE_ADD_BINARY_CHAR (c, dst);
3054 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3055 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3058 DECODE_ADD_BINARY_CHAR (c, dst);
3060 label_continue_loop:;
3063 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3069 /* Convert internally-formatted data to Big5. */
3072 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3073 unsigned_char_dynarr *dst, unsigned int n)
3076 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3077 unsigned int flags = str->flags;
3078 unsigned int ch = str->ch;
3079 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3086 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3087 Dynarr_add (dst, '\r');
3088 if (eol_type != EOL_CR)
3089 Dynarr_add (dst, '\n');
3091 else if (BYTE_ASCII_P (c))
3094 Dynarr_add (dst, c);
3096 else if (BUFBYTE_LEADING_BYTE_P (c))
3098 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3099 c == LEADING_BYTE_CHINESE_BIG5_2)
3101 /* A recognized leading byte. */
3103 continue; /* not done with this character. */
3105 /* otherwise just ignore this character. */
3107 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3108 ch == LEADING_BYTE_CHINESE_BIG5_2)
3110 /* Previous char was a recognized leading byte. */
3112 continue; /* not done with this character. */
3116 /* Encountering second byte of a Big5 character. */
3117 unsigned char b1, b2;
3119 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3120 Dynarr_add (dst, b1);
3121 Dynarr_add (dst, b2);
3132 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3133 Decode a Big5 character CODE of BIG5 coding-system.
3134 CODE is the character code in BIG5, a cons of two integers.
3135 Return the corresponding character.
3139 unsigned char c1, c2, b1, b2;
3142 CHECK_INT (XCAR (code));
3143 CHECK_INT (XCDR (code));
3144 b1 = XINT (XCAR (code));
3145 b2 = XINT (XCDR (code));
3146 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3147 BYTE_BIG5_TWO_BYTE_2_P (b2))
3150 Lisp_Object charset;
3151 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3152 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3153 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3159 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3160 Encode the Big5 character CH to BIG5 coding-system.
3161 Return the corresponding character code in Big5.
3165 Lisp_Object charset;
3168 CHECK_CHAR_COERCE_INT (ch);
3169 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3170 if (EQ (charset, Vcharset_chinese_big5_1) ||
3171 EQ (charset, Vcharset_chinese_big5_2))
3173 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3175 return Fcons (make_int (b1), make_int (b2));
3182 /************************************************************************/
3185 /* UCS-4 character codes are implemented as nonnegative integers. */
3187 /************************************************************************/
3189 Lisp_Object ucs_to_mule_table[65536];
3190 Lisp_Object mule_to_ucs_table;
3192 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3193 Map UCS-4 code CODE to Mule character CHARACTER.
3195 Return T on success, NIL on failure.
3201 CHECK_CHAR (character);
3205 if (c < sizeof (ucs_to_mule_table))
3207 ucs_to_mule_table[c] = character;
3215 ucs_to_char (unsigned long code)
3217 if (code < sizeof (ucs_to_mule_table))
3219 return ucs_to_mule_table[code];
3221 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3226 c = code % (94 * 94);
3228 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3229 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3230 CHARSET_LEFT_TO_RIGHT),
3231 c / 94 + 33, c % 94 + 33));
3237 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3238 Return Mule character corresponding to UCS code CODE (a positive integer).
3242 CHECK_NATNUM (code);
3243 return ucs_to_char (XINT (code));
3246 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3247 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3251 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3252 Fset_char_ucs is more restrictive on index arg, but should
3253 check code arg in a char_table method. */
3254 CHECK_CHAR (character);
3255 CHECK_NATNUM (code);
3256 return Fput_char_table (character, code, mule_to_ucs_table);
3259 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3260 Return the UCS code (a positive integer) corresponding to CHARACTER.
3264 return Fget_char_table (character, mule_to_ucs_table);
3268 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3270 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3271 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3272 is not found, instead.
3273 #### do something more appropriate (use blob?)
3274 Danger, Will Robinson! Data loss. Should we signal user? */
3276 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3278 Lisp_Object chr = ucs_to_char (ch);
3282 Bufbyte work[MAX_EMCHAR_LEN];
3287 simple_set_charptr_emchar (work, ch) :
3288 non_ascii_set_charptr_emchar (work, ch);
3289 Dynarr_add_many (dst, work, len);
3293 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3294 Dynarr_add (dst, 34 + 128);
3295 Dynarr_add (dst, 46 + 128);
3300 static unsigned long
3301 mule_char_to_ucs4 (Lisp_Object charset,
3302 unsigned char h, unsigned char l)
3305 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3312 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3313 (XCHARSET_CHARS (charset) == 94) )
3315 unsigned char final = XCHARSET_FINAL (charset);
3317 if ( ('@' <= final) && (final < 0x7f) )
3319 return 0xe00000 + (final - '@') * 94 * 94
3320 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3334 encode_ucs4 (Lisp_Object charset,
3335 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3337 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3338 Dynarr_add (dst, code >> 24);
3339 Dynarr_add (dst, (code >> 16) & 255);
3340 Dynarr_add (dst, (code >> 8) & 255);
3341 Dynarr_add (dst, code & 255);
3345 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3351 switch (st->ucs4.in_byte)
3360 st->ucs4.in_byte = 0;
3366 return CODING_CATEGORY_UCS4_MASK;
3370 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3371 unsigned_char_dynarr *dst, unsigned int n)
3373 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3374 unsigned int flags = str->flags;
3375 unsigned int ch = str->ch;
3379 unsigned char c = *src++;
3387 decode_ucs4 ( ( ch << 8 ) | c, dst);
3392 ch = ( ch << 8 ) | c;
3396 if (flags & CODING_STATE_END)
3397 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3404 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3405 unsigned_char_dynarr *dst, unsigned int n)
3407 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3408 unsigned int flags = str->flags;
3409 unsigned int ch = str->ch;
3410 unsigned char char_boundary = str->iso2022.current_char_boundary;
3411 Lisp_Object charset = str->iso2022.current_charset;
3413 #ifdef ENABLE_COMPOSITE_CHARS
3414 /* flags for handling composite chars. We do a little switcharoo
3415 on the source while we're outputting the composite char. */
3416 unsigned int saved_n = 0;
3417 CONST unsigned char *saved_src = NULL;
3418 int in_composite = 0;
3425 unsigned char c = *src++;
3427 if (BYTE_ASCII_P (c))
3428 { /* Processing ASCII character */
3430 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3433 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3434 { /* Processing Leading Byte */
3436 charset = CHARSET_BY_LEADING_BYTE (c);
3437 if (LEADING_BYTE_PREFIX_P(c))
3442 { /* Processing Non-ASCII character */
3444 if (EQ (charset, Vcharset_control_1))
3446 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3450 switch (XCHARSET_REP_BYTES (charset))
3453 encode_ucs4 (charset, c, 0, dst);
3456 if (XCHARSET_PRIVATE_P (charset))
3458 encode_ucs4 (charset, c, 0, dst);
3463 #ifdef ENABLE_COMPOSITE_CHARS
3464 if (EQ (charset, Vcharset_composite))
3468 /* #### Bother! We don't know how to
3470 Dynarr_add (dst, 0);
3471 Dynarr_add (dst, 0);
3472 Dynarr_add (dst, 0);
3473 Dynarr_add (dst, '~');
3477 Emchar emch = MAKE_CHAR (Vcharset_composite,
3478 ch & 0x7F, c & 0x7F);
3479 Lisp_Object lstr = composite_char_string (emch);
3483 src = XSTRING_DATA (lstr);
3484 n = XSTRING_LENGTH (lstr);
3488 #endif /* ENABLE_COMPOSITE_CHARS */
3490 encode_ucs4(charset, ch, c, dst);
3503 encode_ucs4 (charset, ch, c, dst);
3519 #ifdef ENABLE_COMPOSITE_CHARS
3525 goto back_to_square_n; /* Wheeeeeeeee ..... */
3527 #endif /* ENABLE_COMPOSITE_CHARS */
3531 str->iso2022.current_char_boundary = char_boundary;
3532 str->iso2022.current_charset = charset;
3534 /* Verbum caro factum est! */
3538 /************************************************************************/
3540 /************************************************************************/
3543 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3548 unsigned char c = *src++;
3549 switch (st->utf8.in_byte)
3552 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3555 st->utf8.in_byte = 5;
3557 st->utf8.in_byte = 4;
3559 st->utf8.in_byte = 3;
3561 st->utf8.in_byte = 2;
3563 st->utf8.in_byte = 1;
3568 if ((c & 0xc0) != 0x80)
3574 return CODING_CATEGORY_UTF8_MASK;
3578 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3579 unsigned_char_dynarr *dst, unsigned int n)
3581 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3582 unsigned int flags = str->flags;
3583 unsigned int ch = str->ch;
3584 eol_type_t eol_type = str->eol_type;
3588 unsigned char c = *src++;
3597 else if ( c >= 0xf8 )
3602 else if ( c >= 0xf0 )
3607 else if ( c >= 0xe0 )
3612 else if ( c >= 0xc0 )
3619 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3620 decode_ucs4 (c, dst);
3624 ch = ( ch << 6 ) | ( c & 0x3f );
3625 decode_ucs4 (ch, dst);
3630 ch = ( ch << 6 ) | ( c & 0x3f );
3633 label_continue_loop:;
3636 if (flags & CODING_STATE_END)
3637 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3645 encode_utf8 (Lisp_Object charset,
3646 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3648 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3651 Dynarr_add (dst, code);
3653 else if ( code <= 0x7ff )
3655 Dynarr_add (dst, (code >> 6) | 0xc0);
3656 Dynarr_add (dst, (code & 0x3f) | 0x80);
3658 else if ( code <= 0xffff )
3660 Dynarr_add (dst, (code >> 12) | 0xe0);
3661 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3662 Dynarr_add (dst, (code & 0x3f) | 0x80);
3664 else if ( code <= 0x1fffff )
3666 Dynarr_add (dst, (code >> 18) | 0xf0);
3667 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3668 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3669 Dynarr_add (dst, (code & 0x3f) | 0x80);
3671 else if ( code <= 0x3ffffff )
3673 Dynarr_add (dst, (code >> 24) | 0xf8);
3674 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3675 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3676 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3677 Dynarr_add (dst, (code & 0x3f) | 0x80);
3681 Dynarr_add (dst, (code >> 30) | 0xfc);
3682 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3683 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3684 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3685 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3686 Dynarr_add (dst, (code & 0x3f) | 0x80);
3692 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3693 unsigned_char_dynarr *dst, unsigned int n)
3695 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3696 unsigned int flags = str->flags;
3697 unsigned int ch = str->ch;
3698 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3699 unsigned char char_boundary = str->iso2022.current_char_boundary;
3704 unsigned char c = *src++;
3705 switch (char_boundary)
3710 Dynarr_add (dst, c);
3713 else if ( c >= 0xf8 )
3715 Dynarr_add (dst, c);
3718 else if ( c >= 0xf0 )
3720 Dynarr_add (dst, c);
3723 else if ( c >= 0xe0 )
3725 Dynarr_add (dst, c);
3728 else if ( c >= 0xc0 )
3730 Dynarr_add (dst, c);
3737 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3738 Dynarr_add (dst, '\r');
3739 if (eol_type != EOL_CR)
3740 Dynarr_add (dst, c);
3743 Dynarr_add (dst, c);
3748 Dynarr_add (dst, c);
3752 Dynarr_add (dst, c);
3756 #else /* not UTF2000 */
3757 Lisp_Object charset = str->iso2022.current_charset;
3759 #ifdef ENABLE_COMPOSITE_CHARS
3760 /* flags for handling composite chars. We do a little switcharoo
3761 on the source while we're outputting the composite char. */
3762 unsigned int saved_n = 0;
3763 CONST unsigned char *saved_src = NULL;
3764 int in_composite = 0;
3767 #endif /* ENABLE_COMPOSITE_CHARS */
3771 unsigned char c = *src++;
3773 if (BYTE_ASCII_P (c))
3774 { /* Processing ASCII character */
3778 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3779 Dynarr_add (dst, '\r');
3780 if (eol_type != EOL_CR)
3781 Dynarr_add (dst, c);
3784 encode_utf8 (Vcharset_ascii, c, 0, dst);
3787 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3788 { /* Processing Leading Byte */
3790 charset = CHARSET_BY_LEADING_BYTE (c);
3791 if (LEADING_BYTE_PREFIX_P(c))
3796 { /* Processing Non-ASCII character */
3798 if (EQ (charset, Vcharset_control_1))
3800 encode_utf8 (Vcharset_control_1, c, 0, dst);
3804 switch (XCHARSET_REP_BYTES (charset))
3807 encode_utf8 (charset, c, 0, dst);
3810 if (XCHARSET_PRIVATE_P (charset))
3812 encode_utf8 (charset, c, 0, dst);
3817 #ifdef ENABLE_COMPOSITE_CHARS
3818 if (EQ (charset, Vcharset_composite))
3822 /* #### Bother! We don't know how to
3824 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3828 Emchar emch = MAKE_CHAR (Vcharset_composite,
3829 ch & 0x7F, c & 0x7F);
3830 Lisp_Object lstr = composite_char_string (emch);
3834 src = XSTRING_DATA (lstr);
3835 n = XSTRING_LENGTH (lstr);
3839 #endif /* ENABLE_COMPOSITE_CHARS */
3841 encode_utf8 (charset, ch, c, dst);
3854 encode_utf8 (charset, ch, c, dst);
3870 #ifdef ENABLE_COMPOSITE_CHARS
3876 goto back_to_square_n; /* Wheeeeeeeee ..... */
3880 #endif /* not UTF2000 */
3883 str->iso2022.current_char_boundary = char_boundary;
3885 str->iso2022.current_charset = charset;
3888 /* Verbum caro factum est! */
3892 /************************************************************************/
3893 /* ISO2022 methods */
3894 /************************************************************************/
3896 /* The following note describes the coding system ISO2022 briefly.
3897 Since the intention of this note is to help understand the
3898 functions in this file, some parts are NOT ACCURATE or OVERLY
3899 SIMPLIFIED. For thorough understanding, please refer to the
3900 original document of ISO2022.
3902 ISO2022 provides many mechanisms to encode several character sets
3903 in 7-bit and 8-bit environments. For 7-bit environments, all text
3904 is encoded using bytes less than 128. This may make the encoded
3905 text a little bit longer, but the text passes more easily through
3906 several gateways, some of which strip off MSB (Most Signigant Bit).
3908 There are two kinds of character sets: control character set and
3909 graphic character set. The former contains control characters such
3910 as `newline' and `escape' to provide control functions (control
3911 functions are also provided by escape sequences). The latter
3912 contains graphic characters such as 'A' and '-'. Emacs recognizes
3913 two control character sets and many graphic character sets.
3915 Graphic character sets are classified into one of the following
3916 four classes, according to the number of bytes (DIMENSION) and
3917 number of characters in one dimension (CHARS) of the set:
3918 - DIMENSION1_CHARS94
3919 - DIMENSION1_CHARS96
3920 - DIMENSION2_CHARS94
3921 - DIMENSION2_CHARS96
3923 In addition, each character set is assigned an identification tag,
3924 unique for each set, called "final character" (denoted as <F>
3925 hereafter). The <F> of each character set is decided by ECMA(*)
3926 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3927 (0x30..0x3F are for private use only).
3929 Note (*): ECMA = European Computer Manufacturers Association
3931 Here are examples of graphic character set [NAME(<F>)]:
3932 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3933 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3934 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3935 o DIMENSION2_CHARS96 -- none for the moment
3937 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3938 C0 [0x00..0x1F] -- control character plane 0
3939 GL [0x20..0x7F] -- graphic character plane 0
3940 C1 [0x80..0x9F] -- control character plane 1
3941 GR [0xA0..0xFF] -- graphic character plane 1
3943 A control character set is directly designated and invoked to C0 or
3944 C1 by an escape sequence. The most common case is that:
3945 - ISO646's control character set is designated/invoked to C0, and
3946 - ISO6429's control character set is designated/invoked to C1,
3947 and usually these designations/invocations are omitted in encoded
3948 text. In a 7-bit environment, only C0 can be used, and a control
3949 character for C1 is encoded by an appropriate escape sequence to
3950 fit into the environment. All control characters for C1 are
3951 defined to have corresponding escape sequences.
3953 A graphic character set is at first designated to one of four
3954 graphic registers (G0 through G3), then these graphic registers are
3955 invoked to GL or GR. These designations and invocations can be
3956 done independently. The most common case is that G0 is invoked to
3957 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3958 these invocations and designations are omitted in encoded text.
3959 In a 7-bit environment, only GL can be used.
3961 When a graphic character set of CHARS94 is invoked to GL, codes
3962 0x20 and 0x7F of the GL area work as control characters SPACE and
3963 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3966 There are two ways of invocation: locking-shift and single-shift.
3967 With locking-shift, the invocation lasts until the next different
3968 invocation, whereas with single-shift, the invocation affects the
3969 following character only and doesn't affect the locking-shift
3970 state. Invocations are done by the following control characters or
3973 ----------------------------------------------------------------------
3974 abbrev function cntrl escape seq description
3975 ----------------------------------------------------------------------
3976 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3977 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3978 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3979 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3980 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3981 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3982 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3983 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3984 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3985 ----------------------------------------------------------------------
3986 (*) These are not used by any known coding system.
3988 Control characters for these functions are defined by macros
3989 ISO_CODE_XXX in `coding.h'.
3991 Designations are done by the following escape sequences:
3992 ----------------------------------------------------------------------
3993 escape sequence description
3994 ----------------------------------------------------------------------
3995 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3996 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3997 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3998 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3999 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4000 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4001 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4002 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4003 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4004 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4005 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4006 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4007 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4008 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4009 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4010 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4011 ----------------------------------------------------------------------
4013 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4014 of dimension 1, chars 94, and final character <F>, etc...
4016 Note (*): Although these designations are not allowed in ISO2022,
4017 Emacs accepts them on decoding, and produces them on encoding
4018 CHARS96 character sets in a coding system which is characterized as
4019 7-bit environment, non-locking-shift, and non-single-shift.
4021 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4022 '(' can be omitted. We refer to this as "short-form" hereafter.
4024 Now you may notice that there are a lot of ways for encoding the
4025 same multilingual text in ISO2022. Actually, there exist many
4026 coding systems such as Compound Text (used in X11's inter client
4027 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4028 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4029 localized platforms), and all of these are variants of ISO2022.
4031 In addition to the above, Emacs handles two more kinds of escape
4032 sequences: ISO6429's direction specification and Emacs' private
4033 sequence for specifying character composition.
4035 ISO6429's direction specification takes the following form:
4036 o CSI ']' -- end of the current direction
4037 o CSI '0' ']' -- end of the current direction
4038 o CSI '1' ']' -- start of left-to-right text
4039 o CSI '2' ']' -- start of right-to-left text
4040 The control character CSI (0x9B: control sequence introducer) is
4041 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4043 Character composition specification takes the following form:
4044 o ESC '0' -- start character composition
4045 o ESC '1' -- end character composition
4046 Since these are not standard escape sequences of any ISO standard,
4047 their use with these meanings is restricted to Emacs only. */
4050 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4054 for (i = 0; i < 4; i++)
4056 if (!NILP (coding_system))
4058 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4060 iso->charset[i] = Qt;
4061 iso->invalid_designated[i] = 0;
4063 iso->esc = ISO_ESC_NOTHING;
4064 iso->esc_bytes_index = 0;
4065 iso->register_left = 0;
4066 iso->register_right = 1;
4067 iso->switched_dir_and_no_valid_charset_yet = 0;
4068 iso->invalid_switch_dir = 0;
4069 iso->output_direction_sequence = 0;
4070 iso->output_literally = 0;
4071 #ifdef ENABLE_COMPOSITE_CHARS
4072 if (iso->composite_chars)
4073 Dynarr_reset (iso->composite_chars);
4078 fit_to_be_escape_quoted (unsigned char c)
4095 /* Parse one byte of an ISO2022 escape sequence.
4096 If the result is an invalid escape sequence, return 0 and
4097 do not change anything in STR. Otherwise, if the result is
4098 an incomplete escape sequence, update ISO2022.ESC and
4099 ISO2022.ESC_BYTES and return -1. Otherwise, update
4100 all the state variables (but not ISO2022.ESC_BYTES) and
4103 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4104 or invocation of an invalid character set and treat that as
4105 an unrecognized escape sequence. */
4108 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4109 unsigned char c, unsigned int *flags,
4110 int check_invalid_charsets)
4112 /* (1) If we're at the end of a designation sequence, CS is the
4113 charset being designated and REG is the register to designate
4116 (2) If we're at the end of a locking-shift sequence, REG is
4117 the register to invoke and HALF (0 == left, 1 == right) is
4118 the half to invoke it into.
4120 (3) If we're at the end of a single-shift sequence, REG is
4121 the register to invoke. */
4122 Lisp_Object cs = Qnil;
4125 /* NOTE: This code does goto's all over the fucking place.
4126 The reason for this is that we're basically implementing
4127 a state machine here, and hierarchical languages like C
4128 don't really provide a clean way of doing this. */
4130 if (! (*flags & CODING_STATE_ESCAPE))
4131 /* At beginning of escape sequence; we need to reset our
4132 escape-state variables. */
4133 iso->esc = ISO_ESC_NOTHING;
4135 iso->output_literally = 0;
4136 iso->output_direction_sequence = 0;
4140 case ISO_ESC_NOTHING:
4141 iso->esc_bytes_index = 0;
4144 case ISO_CODE_ESC: /* Start escape sequence */
4145 *flags |= CODING_STATE_ESCAPE;
4149 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4150 *flags |= CODING_STATE_ESCAPE;
4151 iso->esc = ISO_ESC_5_11;
4154 case ISO_CODE_SO: /* locking shift 1 */
4157 case ISO_CODE_SI: /* locking shift 0 */
4161 case ISO_CODE_SS2: /* single shift */
4164 case ISO_CODE_SS3: /* single shift */
4168 default: /* Other control characters */
4175 /**** single shift ****/
4177 case 'N': /* single shift 2 */
4180 case 'O': /* single shift 3 */
4184 /**** locking shift ****/
4186 case '~': /* locking shift 1 right */
4189 case 'n': /* locking shift 2 */
4192 case '}': /* locking shift 2 right */
4195 case 'o': /* locking shift 3 */
4198 case '|': /* locking shift 3 right */
4202 #ifdef ENABLE_COMPOSITE_CHARS
4203 /**** composite ****/
4206 iso->esc = ISO_ESC_START_COMPOSITE;
4207 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4208 CODING_STATE_COMPOSITE;
4212 iso->esc = ISO_ESC_END_COMPOSITE;
4213 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4214 ~CODING_STATE_COMPOSITE;
4216 #endif /* ENABLE_COMPOSITE_CHARS */
4218 /**** directionality ****/
4221 iso->esc = ISO_ESC_5_11;
4224 /**** designation ****/
4226 case '$': /* multibyte charset prefix */
4227 iso->esc = ISO_ESC_2_4;
4231 if (0x28 <= c && c <= 0x2F)
4233 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4237 /* This function is called with CODESYS equal to nil when
4238 doing coding-system detection. */
4240 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4241 && fit_to_be_escape_quoted (c))
4243 iso->esc = ISO_ESC_LITERAL;
4244 *flags &= CODING_STATE_ISO2022_LOCK;
4254 /**** directionality ****/
4256 case ISO_ESC_5_11: /* ISO6429 direction control */
4259 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4260 goto directionality;
4262 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4263 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4264 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4268 case ISO_ESC_5_11_0:
4271 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4272 goto directionality;
4276 case ISO_ESC_5_11_1:
4279 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4280 goto directionality;
4284 case ISO_ESC_5_11_2:
4287 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4288 goto directionality;
4293 iso->esc = ISO_ESC_DIRECTIONALITY;
4294 /* Various junk here to attempt to preserve the direction sequences
4295 literally in the text if they would otherwise be swallowed due
4296 to invalid designations that don't show up as actual charset
4297 changes in the text. */
4298 if (iso->invalid_switch_dir)
4300 /* We already inserted a direction switch literally into the
4301 text. We assume (#### this may not be right) that the
4302 next direction switch is the one going the other way,
4303 and we need to output that literally as well. */
4304 iso->output_literally = 1;
4305 iso->invalid_switch_dir = 0;
4311 /* If we are in the thrall of an invalid designation,
4312 then stick the directionality sequence literally into the
4313 output stream so it ends up in the original text again. */
4314 for (jj = 0; jj < 4; jj++)
4315 if (iso->invalid_designated[jj])
4319 iso->output_literally = 1;
4320 iso->invalid_switch_dir = 1;
4323 /* Indicate that we haven't yet seen a valid designation,
4324 so that if a switch-dir is directly followed by an
4325 invalid designation, both get inserted literally. */
4326 iso->switched_dir_and_no_valid_charset_yet = 1;
4331 /**** designation ****/
4334 if (0x28 <= c && c <= 0x2F)
4336 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4339 if (0x40 <= c && c <= 0x42)
4341 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4342 *flags & CODING_STATE_R2L ?
4343 CHARSET_RIGHT_TO_LEFT :
4344 CHARSET_LEFT_TO_RIGHT);
4354 if (c < '0' || c > '~')
4355 return 0; /* bad final byte */
4357 if (iso->esc >= ISO_ESC_2_8 &&
4358 iso->esc <= ISO_ESC_2_15)
4360 type = ((iso->esc >= ISO_ESC_2_12) ?
4361 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4362 reg = (iso->esc - ISO_ESC_2_8) & 3;
4364 else if (iso->esc >= ISO_ESC_2_4_8 &&
4365 iso->esc <= ISO_ESC_2_4_15)
4367 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4368 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4369 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4373 /* Can this ever be reached? -slb */
4377 cs = CHARSET_BY_ATTRIBUTES (type, c,
4378 *flags & CODING_STATE_R2L ?
4379 CHARSET_RIGHT_TO_LEFT :
4380 CHARSET_LEFT_TO_RIGHT);
4386 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4390 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4391 /* can't invoke something that ain't there. */
4393 iso->esc = ISO_ESC_SINGLE_SHIFT;
4394 *flags &= CODING_STATE_ISO2022_LOCK;
4396 *flags |= CODING_STATE_SS2;
4398 *flags |= CODING_STATE_SS3;
4402 if (check_invalid_charsets &&
4403 !CHARSETP (iso->charset[reg]))
4404 /* can't invoke something that ain't there. */
4407 iso->register_right = reg;
4409 iso->register_left = reg;
4410 *flags &= CODING_STATE_ISO2022_LOCK;
4411 iso->esc = ISO_ESC_LOCKING_SHIFT;
4415 if (NILP (cs) && check_invalid_charsets)
4417 iso->invalid_designated[reg] = 1;
4418 iso->charset[reg] = Vcharset_ascii;
4419 iso->esc = ISO_ESC_DESIGNATE;
4420 *flags &= CODING_STATE_ISO2022_LOCK;
4421 iso->output_literally = 1;
4422 if (iso->switched_dir_and_no_valid_charset_yet)
4424 /* We encountered a switch-direction followed by an
4425 invalid designation. Ensure that the switch-direction
4426 gets outputted; otherwise it will probably get eaten
4427 when the text is written out again. */
4428 iso->switched_dir_and_no_valid_charset_yet = 0;
4429 iso->output_direction_sequence = 1;
4430 /* And make sure that the switch-dir going the other
4431 way gets outputted, as well. */
4432 iso->invalid_switch_dir = 1;
4436 /* This function is called with CODESYS equal to nil when
4437 doing coding-system detection. */
4438 if (!NILP (codesys))
4440 charset_conversion_spec_dynarr *dyn =
4441 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4447 for (i = 0; i < Dynarr_length (dyn); i++)
4449 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4450 if (EQ (cs, spec->from_charset))
4451 cs = spec->to_charset;
4456 iso->charset[reg] = cs;
4457 iso->esc = ISO_ESC_DESIGNATE;
4458 *flags &= CODING_STATE_ISO2022_LOCK;
4459 if (iso->invalid_designated[reg])
4461 iso->invalid_designated[reg] = 0;
4462 iso->output_literally = 1;
4464 if (iso->switched_dir_and_no_valid_charset_yet)
4465 iso->switched_dir_and_no_valid_charset_yet = 0;
4470 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4475 /* #### There are serious deficiencies in the recognition mechanism
4476 here. This needs to be much smarter if it's going to cut it.
4477 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4478 it should be detected as Latin-1.
4479 All the ISO2022 stuff in this file should be synced up with the
4480 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4481 Perhaps we should wait till R2L works in FSF Emacs? */
4483 if (!st->iso2022.initted)
4485 reset_iso2022 (Qnil, &st->iso2022.iso);
4486 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4487 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4488 CODING_CATEGORY_ISO_8_1_MASK |
4489 CODING_CATEGORY_ISO_8_2_MASK |
4490 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4491 st->iso2022.flags = 0;
4492 st->iso2022.high_byte_count = 0;
4493 st->iso2022.saw_single_shift = 0;
4494 st->iso2022.initted = 1;
4497 mask = st->iso2022.mask;
4504 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4505 st->iso2022.high_byte_count++;
4509 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4511 if (st->iso2022.high_byte_count & 1)
4512 /* odd number of high bytes; assume not iso-8-2 */
4513 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4515 st->iso2022.high_byte_count = 0;
4516 st->iso2022.saw_single_shift = 0;
4518 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4520 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4521 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4522 { /* control chars */
4525 /* Allow and ignore control characters that you might
4526 reasonably see in a text file */
4531 case 8: /* backspace */
4532 case 11: /* vertical tab */
4533 case 12: /* form feed */
4534 case 26: /* MS-DOS C-z junk */
4535 case 31: /* '^_' -- for info */
4536 goto label_continue_loop;
4543 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4546 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4547 &st->iso2022.flags, 0))
4549 switch (st->iso2022.iso.esc)
4551 case ISO_ESC_DESIGNATE:
4552 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4553 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4555 case ISO_ESC_LOCKING_SHIFT:
4556 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4557 goto ran_out_of_chars;
4558 case ISO_ESC_SINGLE_SHIFT:
4559 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4560 st->iso2022.saw_single_shift = 1;
4569 goto ran_out_of_chars;
4572 label_continue_loop:;
4581 postprocess_iso2022_mask (int mask)
4583 /* #### kind of cheesy */
4584 /* If seven-bit ISO is allowed, then assume that the encoding is
4585 entirely seven-bit and turn off the eight-bit ones. */
4586 if (mask & CODING_CATEGORY_ISO_7_MASK)
4587 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4588 CODING_CATEGORY_ISO_8_1_MASK |
4589 CODING_CATEGORY_ISO_8_2_MASK);
4593 /* If FLAGS is a null pointer or specifies right-to-left motion,
4594 output a switch-dir-to-left-to-right sequence to DST.
4595 Also update FLAGS if it is not a null pointer.
4596 If INTERNAL_P is set, we are outputting in internal format and
4597 need to handle the CSI differently. */
4600 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4601 unsigned_char_dynarr *dst,
4602 unsigned int *flags,
4605 if (!flags || (*flags & CODING_STATE_R2L))
4607 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4609 Dynarr_add (dst, ISO_CODE_ESC);
4610 Dynarr_add (dst, '[');
4612 else if (internal_p)
4613 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4615 Dynarr_add (dst, ISO_CODE_CSI);
4616 Dynarr_add (dst, '0');
4617 Dynarr_add (dst, ']');
4619 *flags &= ~CODING_STATE_R2L;
4623 /* If FLAGS is a null pointer or specifies a direction different from
4624 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4625 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4626 sequence to DST. Also update FLAGS if it is not a null pointer.
4627 If INTERNAL_P is set, we are outputting in internal format and
4628 need to handle the CSI differently. */
4631 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4632 unsigned_char_dynarr *dst, unsigned int *flags,
4635 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4636 direction == CHARSET_LEFT_TO_RIGHT)
4637 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4638 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4639 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4640 direction == CHARSET_RIGHT_TO_LEFT)
4642 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4644 Dynarr_add (dst, ISO_CODE_ESC);
4645 Dynarr_add (dst, '[');
4647 else if (internal_p)
4648 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4650 Dynarr_add (dst, ISO_CODE_CSI);
4651 Dynarr_add (dst, '2');
4652 Dynarr_add (dst, ']');
4654 *flags |= CODING_STATE_R2L;
4658 /* Convert ISO2022-format data to internal format. */
4661 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4662 unsigned_char_dynarr *dst, unsigned int n)
4664 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4665 unsigned int flags = str->flags;
4666 unsigned int ch = str->ch;
4667 eol_type_t eol_type = str->eol_type;
4668 #ifdef ENABLE_COMPOSITE_CHARS
4669 unsigned_char_dynarr *real_dst = dst;
4671 Lisp_Object coding_system;
4673 XSETCODING_SYSTEM (coding_system, str->codesys);
4675 #ifdef ENABLE_COMPOSITE_CHARS
4676 if (flags & CODING_STATE_COMPOSITE)
4677 dst = str->iso2022.composite_chars;
4678 #endif /* ENABLE_COMPOSITE_CHARS */
4682 unsigned char c = *src++;
4683 if (flags & CODING_STATE_ESCAPE)
4684 { /* Within ESC sequence */
4685 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4690 switch (str->iso2022.esc)
4692 #ifdef ENABLE_COMPOSITE_CHARS
4693 case ISO_ESC_START_COMPOSITE:
4694 if (str->iso2022.composite_chars)
4695 Dynarr_reset (str->iso2022.composite_chars);
4697 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4698 dst = str->iso2022.composite_chars;
4700 case ISO_ESC_END_COMPOSITE:
4702 Bufbyte comstr[MAX_EMCHAR_LEN];
4704 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4705 Dynarr_length (dst));
4707 len = set_charptr_emchar (comstr, emch);
4708 Dynarr_add_many (dst, comstr, len);
4711 #endif /* ENABLE_COMPOSITE_CHARS */
4713 case ISO_ESC_LITERAL:
4714 DECODE_ADD_BINARY_CHAR (c, dst);
4718 /* Everything else handled already */
4723 /* Attempted error recovery. */
4724 if (str->iso2022.output_direction_sequence)
4725 ensure_correct_direction (flags & CODING_STATE_R2L ?
4726 CHARSET_RIGHT_TO_LEFT :
4727 CHARSET_LEFT_TO_RIGHT,
4728 str->codesys, dst, 0, 1);
4729 /* More error recovery. */
4730 if (!retval || str->iso2022.output_literally)
4732 /* Output the (possibly invalid) sequence */
4734 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4735 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4736 flags &= CODING_STATE_ISO2022_LOCK;
4738 n++, src--;/* Repeat the loop with the same character. */
4741 /* No sense in reprocessing the final byte of the
4742 escape sequence; it could mess things up anyway.
4744 DECODE_ADD_BINARY_CHAR (c, dst);
4749 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4750 { /* Control characters */
4752 /***** Error-handling *****/
4754 /* If we were in the middle of a character, dump out the
4755 partial character. */
4756 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4758 /* If we just saw a single-shift character, dump it out.
4759 This may dump out the wrong sort of single-shift character,
4760 but least it will give an indication that something went
4762 if (flags & CODING_STATE_SS2)
4764 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4765 flags &= ~CODING_STATE_SS2;
4767 if (flags & CODING_STATE_SS3)
4769 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4770 flags &= ~CODING_STATE_SS3;
4773 /***** Now handle the control characters. *****/
4776 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4778 flags &= CODING_STATE_ISO2022_LOCK;
4780 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4781 DECODE_ADD_BINARY_CHAR (c, dst);
4784 { /* Graphic characters */
4785 Lisp_Object charset;
4791 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4793 /* Now determine the charset. */
4794 reg = ((flags & CODING_STATE_SS2) ? 2
4795 : (flags & CODING_STATE_SS3) ? 3
4796 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4797 : str->iso2022.register_left);
4798 charset = str->iso2022.charset[reg];
4800 /* Error checking: */
4801 if (! CHARSETP (charset)
4802 || str->iso2022.invalid_designated[reg]
4803 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4804 && XCHARSET_CHARS (charset) == 94))
4805 /* Mrmph. We are trying to invoke a register that has no
4806 or an invalid charset in it, or trying to add a character
4807 outside the range of the charset. Insert that char literally
4808 to preserve it for the output. */
4810 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4811 DECODE_ADD_BINARY_CHAR (c, dst);
4816 /* Things are probably hunky-dorey. */
4818 /* Fetch reverse charset, maybe. */
4819 if (((flags & CODING_STATE_R2L) &&
4820 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4822 (!(flags & CODING_STATE_R2L) &&
4823 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4825 Lisp_Object new_charset =
4826 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4827 if (!NILP (new_charset))
4828 charset = new_charset;
4832 lb = XCHARSET_LEADING_BYTE (charset);
4834 switch (XCHARSET_REP_BYTES (charset))
4837 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4838 Dynarr_add (dst, c & 0x7F);
4841 case 2: /* one-byte official */
4842 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4844 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0), dst);
4846 Dynarr_add (dst, lb);
4847 Dynarr_add (dst, c | 0x80);
4851 case 3: /* one-byte private or two-byte official */
4852 if (XCHARSET_PRIVATE_P (charset))
4854 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4856 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0),
4859 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4860 Dynarr_add (dst, lb);
4861 Dynarr_add (dst, c | 0x80);
4869 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
4873 Dynarr_add (dst, lb);
4874 Dynarr_add (dst, ch | 0x80);
4875 Dynarr_add (dst, c | 0x80);
4884 default: /* two-byte private */
4888 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
4892 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4893 Dynarr_add (dst, lb);
4894 Dynarr_add (dst, ch | 0x80);
4895 Dynarr_add (dst, c | 0x80);
4905 flags &= CODING_STATE_ISO2022_LOCK;
4908 label_continue_loop:;
4911 if (flags & CODING_STATE_END)
4912 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4919 /***** ISO2022 encoder *****/
4921 /* Designate CHARSET into register REG. */
4924 iso2022_designate (Lisp_Object charset, unsigned char reg,
4925 struct encoding_stream *str, unsigned_char_dynarr *dst)
4927 static CONST char inter94[] = "()*+";
4928 static CONST char inter96[] = ",-./";
4930 unsigned char final;
4931 Lisp_Object old_charset = str->iso2022.charset[reg];
4933 str->iso2022.charset[reg] = charset;
4934 if (!CHARSETP (charset))
4935 /* charset might be an initial nil or t. */
4937 type = XCHARSET_TYPE (charset);
4938 final = XCHARSET_FINAL (charset);
4939 if (!str->iso2022.force_charset_on_output[reg] &&
4940 CHARSETP (old_charset) &&
4941 XCHARSET_TYPE (old_charset) == type &&
4942 XCHARSET_FINAL (old_charset) == final)
4945 str->iso2022.force_charset_on_output[reg] = 0;
4948 charset_conversion_spec_dynarr *dyn =
4949 str->codesys->iso2022.output_conv;
4955 for (i = 0; i < Dynarr_length (dyn); i++)
4957 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4958 if (EQ (charset, spec->from_charset))
4959 charset = spec->to_charset;
4964 Dynarr_add (dst, ISO_CODE_ESC);
4967 case CHARSET_TYPE_94:
4968 Dynarr_add (dst, inter94[reg]);
4970 case CHARSET_TYPE_96:
4971 Dynarr_add (dst, inter96[reg]);
4973 case CHARSET_TYPE_94X94:
4974 Dynarr_add (dst, '$');
4976 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4979 Dynarr_add (dst, inter94[reg]);
4981 case CHARSET_TYPE_96X96:
4982 Dynarr_add (dst, '$');
4983 Dynarr_add (dst, inter96[reg]);
4986 Dynarr_add (dst, final);
4990 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4992 if (str->iso2022.register_left != 0)
4994 Dynarr_add (dst, ISO_CODE_SI);
4995 str->iso2022.register_left = 0;
5000 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5002 if (str->iso2022.register_left != 1)
5004 Dynarr_add (dst, ISO_CODE_SO);
5005 str->iso2022.register_left = 1;
5009 /* Convert internally-formatted data to ISO2022 format. */
5012 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5013 unsigned_char_dynarr *dst, unsigned int n)
5015 unsigned char charmask, c;
5016 unsigned char char_boundary;
5017 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5018 unsigned int flags = str->flags;
5019 Emchar ch = str->ch;
5020 Lisp_Coding_System *codesys = str->codesys;
5021 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5023 Lisp_Object charset;
5026 unsigned int byte1, byte2;
5029 #ifdef ENABLE_COMPOSITE_CHARS
5030 /* flags for handling composite chars. We do a little switcharoo
5031 on the source while we're outputting the composite char. */
5032 unsigned int saved_n = 0;
5033 CONST unsigned char *saved_src = NULL;
5034 int in_composite = 0;
5035 #endif /* ENABLE_COMPOSITE_CHARS */
5037 char_boundary = str->iso2022.current_char_boundary;
5038 charset = str->iso2022.current_charset;
5039 half = str->iso2022.current_half;
5041 #ifdef ENABLE_COMPOSITE_CHARS
5049 switch (char_boundary)
5057 else if ( c >= 0xf8 )
5062 else if ( c >= 0xf0 )
5067 else if ( c >= 0xe0 )
5072 else if ( c >= 0xc0 )
5081 restore_left_to_right_direction (codesys, dst, &flags, 0);
5083 /* Make sure G0 contains ASCII */
5084 if ((c > ' ' && c < ISO_CODE_DEL) ||
5085 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5087 ensure_normal_shift (str, dst);
5088 iso2022_designate (Vcharset_ascii, 0, str, dst);
5091 /* If necessary, restore everything to the default state
5094 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5096 restore_left_to_right_direction (codesys, dst, &flags, 0);
5098 ensure_normal_shift (str, dst);
5100 for (i = 0; i < 4; i++)
5102 Lisp_Object initial_charset =
5103 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5104 iso2022_designate (initial_charset, i, str, dst);
5109 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5110 Dynarr_add (dst, '\r');
5111 if (eol_type != EOL_CR)
5112 Dynarr_add (dst, c);
5116 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5117 && fit_to_be_escape_quoted (c))
5118 Dynarr_add (dst, ISO_CODE_ESC);
5119 Dynarr_add (dst, c);
5125 ch = ( ch << 6 ) | ( c & 0x3f );
5128 if ( (0x80 <= ch) && (ch <= 0x9f) )
5130 charmask = (half == 0 ? 0x00 : 0x80);
5132 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5133 && fit_to_be_escape_quoted (ch))
5134 Dynarr_add (dst, ISO_CODE_ESC);
5135 /* you asked for it ... */
5136 Dynarr_add (dst, ch);
5142 BREAKUP_CHAR (ch, charset, byte1, byte2);
5143 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5144 codesys, dst, &flags, 0);
5146 /* Now determine which register to use. */
5148 for (i = 0; i < 4; i++)
5150 if (EQ (charset, str->iso2022.charset[i]) ||
5152 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5161 if (XCHARSET_GRAPHIC (charset) != 0)
5163 if (!NILP (str->iso2022.charset[1]) &&
5164 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5165 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5167 else if (!NILP (str->iso2022.charset[2]))
5169 else if (!NILP (str->iso2022.charset[3]))
5178 iso2022_designate (charset, reg, str, dst);
5180 /* Now invoke that register. */
5184 ensure_normal_shift (str, dst);
5189 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5191 ensure_shift_out (str, dst);
5199 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5201 Dynarr_add (dst, ISO_CODE_ESC);
5202 Dynarr_add (dst, 'N');
5207 Dynarr_add (dst, ISO_CODE_SS2);
5213 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5215 Dynarr_add (dst, ISO_CODE_ESC);
5216 Dynarr_add (dst, 'O');
5221 Dynarr_add (dst, ISO_CODE_SS3);
5230 charmask = (half == 0 ? 0x00 : 0x80);
5232 switch (XCHARSET_DIMENSION (charset))
5235 Dynarr_add (dst, byte1 | charmask);
5238 Dynarr_add (dst, byte1 | charmask);
5239 Dynarr_add (dst, byte2 | charmask);
5248 ch = ( ch << 6 ) | ( c & 0x3f );
5252 #else /* not UTF2000 */
5258 if (BYTE_ASCII_P (c))
5259 { /* Processing ASCII character */
5262 restore_left_to_right_direction (codesys, dst, &flags, 0);
5264 /* Make sure G0 contains ASCII */
5265 if ((c > ' ' && c < ISO_CODE_DEL) ||
5266 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5268 ensure_normal_shift (str, dst);
5269 iso2022_designate (Vcharset_ascii, 0, str, dst);
5272 /* If necessary, restore everything to the default state
5275 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5277 restore_left_to_right_direction (codesys, dst, &flags, 0);
5279 ensure_normal_shift (str, dst);
5281 for (i = 0; i < 4; i++)
5283 Lisp_Object initial_charset =
5284 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5285 iso2022_designate (initial_charset, i, str, dst);
5290 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5291 Dynarr_add (dst, '\r');
5292 if (eol_type != EOL_CR)
5293 Dynarr_add (dst, c);
5297 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5298 && fit_to_be_escape_quoted (c))
5299 Dynarr_add (dst, ISO_CODE_ESC);
5300 Dynarr_add (dst, c);
5305 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5306 { /* Processing Leading Byte */
5308 charset = CHARSET_BY_LEADING_BYTE (c);
5309 if (LEADING_BYTE_PREFIX_P(c))
5311 else if (!EQ (charset, Vcharset_control_1)
5312 #ifdef ENABLE_COMPOSITE_CHARS
5313 && !EQ (charset, Vcharset_composite)
5319 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5320 codesys, dst, &flags, 0);
5322 /* Now determine which register to use. */
5324 for (i = 0; i < 4; i++)
5326 if (EQ (charset, str->iso2022.charset[i]) ||
5328 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5337 if (XCHARSET_GRAPHIC (charset) != 0)
5339 if (!NILP (str->iso2022.charset[1]) &&
5340 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5341 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5343 else if (!NILP (str->iso2022.charset[2]))
5345 else if (!NILP (str->iso2022.charset[3]))
5354 iso2022_designate (charset, reg, str, dst);
5356 /* Now invoke that register. */
5360 ensure_normal_shift (str, dst);
5365 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5367 ensure_shift_out (str, dst);
5375 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5377 Dynarr_add (dst, ISO_CODE_ESC);
5378 Dynarr_add (dst, 'N');
5383 Dynarr_add (dst, ISO_CODE_SS2);
5389 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5391 Dynarr_add (dst, ISO_CODE_ESC);
5392 Dynarr_add (dst, 'O');
5397 Dynarr_add (dst, ISO_CODE_SS3);
5409 { /* Processing Non-ASCII character */
5410 charmask = (half == 0 ? 0x7F : 0xFF);
5412 if (EQ (charset, Vcharset_control_1))
5414 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5415 && fit_to_be_escape_quoted (c))
5416 Dynarr_add (dst, ISO_CODE_ESC);
5417 /* you asked for it ... */
5418 Dynarr_add (dst, c - 0x20);
5422 switch (XCHARSET_REP_BYTES (charset))
5425 Dynarr_add (dst, c & charmask);
5428 if (XCHARSET_PRIVATE_P (charset))
5430 Dynarr_add (dst, c & charmask);
5435 #ifdef ENABLE_COMPOSITE_CHARS
5436 if (EQ (charset, Vcharset_composite))
5440 /* #### Bother! We don't know how to
5442 Dynarr_add (dst, '~');
5446 Emchar emch = MAKE_CHAR (Vcharset_composite,
5447 ch & 0x7F, c & 0x7F);
5448 Lisp_Object lstr = composite_char_string (emch);
5452 src = XSTRING_DATA (lstr);
5453 n = XSTRING_LENGTH (lstr);
5454 Dynarr_add (dst, ISO_CODE_ESC);
5455 Dynarr_add (dst, '0'); /* start composing */
5459 #endif /* ENABLE_COMPOSITE_CHARS */
5461 Dynarr_add (dst, ch & charmask);
5462 Dynarr_add (dst, c & charmask);
5475 Dynarr_add (dst, ch & charmask);
5476 Dynarr_add (dst, c & charmask);
5491 #endif /* not UTF2000 */
5493 #ifdef ENABLE_COMPOSITE_CHARS
5499 Dynarr_add (dst, ISO_CODE_ESC);
5500 Dynarr_add (dst, '1'); /* end composing */
5501 goto back_to_square_n; /* Wheeeeeeeee ..... */
5503 #endif /* ENABLE_COMPOSITE_CHARS */
5506 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5508 if (char_boundary && flags & CODING_STATE_END)
5511 restore_left_to_right_direction (codesys, dst, &flags, 0);
5512 ensure_normal_shift (str, dst);
5513 for (i = 0; i < 4; i++)
5515 Lisp_Object initial_charset =
5516 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5517 iso2022_designate (initial_charset, i, str, dst);
5523 str->iso2022.current_char_boundary = char_boundary;
5524 str->iso2022.current_charset = charset;
5525 str->iso2022.current_half = half;
5527 /* Verbum caro factum est! */
5531 /************************************************************************/
5532 /* No-conversion methods */
5533 /************************************************************************/
5535 /* This is used when reading in "binary" files -- i.e. files that may
5536 contain all 256 possible byte values and that are not to be
5537 interpreted as being in any particular decoding. */
5539 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5540 unsigned_char_dynarr *dst, unsigned int n)
5543 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5544 unsigned int flags = str->flags;
5545 unsigned int ch = str->ch;
5546 eol_type_t eol_type = str->eol_type;
5552 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5553 DECODE_ADD_BINARY_CHAR (c, dst);
5554 label_continue_loop:;
5557 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5564 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5565 unsigned_char_dynarr *dst, unsigned int n)
5568 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5569 unsigned int flags = str->flags;
5570 unsigned int ch = str->ch;
5571 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5573 unsigned char char_boundary = str->iso2022.current_char_boundary;
5580 switch (char_boundary)
5588 else if ( c >= 0xf8 )
5593 else if ( c >= 0xf0 )
5598 else if ( c >= 0xe0 )
5603 else if ( c >= 0xc0 )
5614 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5615 Dynarr_add (dst, '\r');
5616 if (eol_type != EOL_CR)
5617 Dynarr_add (dst, c);
5620 Dynarr_add (dst, c);
5625 ch = ( ch << 6 ) | ( c & 0x3f );
5626 switch ( str->codesys->fixed.size )
5629 Dynarr_add (dst, ch & 0xff);
5632 Dynarr_add (dst, (ch >> 8) & 0xff);
5633 Dynarr_add (dst, ch & 0xff);
5636 Dynarr_add (dst, (ch >> 16) & 0xff);
5637 Dynarr_add (dst, (ch >> 8) & 0xff);
5638 Dynarr_add (dst, ch & 0xff);
5641 Dynarr_add (dst, (ch >> 24) & 0xff);
5642 Dynarr_add (dst, (ch >> 16) & 0xff);
5643 Dynarr_add (dst, (ch >> 8) & 0xff);
5644 Dynarr_add (dst, ch & 0xff);
5647 fprintf(stderr, "It seems %d bytes stream.\n",
5648 str->codesys->fixed.size);
5654 ch = ( ch << 6 ) | ( c & 0x3f );
5657 #else /* not UTF2000 */
5660 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5661 Dynarr_add (dst, '\r');
5662 if (eol_type != EOL_CR)
5663 Dynarr_add (dst, '\n');
5666 else if (BYTE_ASCII_P (c))
5669 Dynarr_add (dst, c);
5671 else if (BUFBYTE_LEADING_BYTE_P (c))
5674 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5675 c == LEADING_BYTE_CONTROL_1)
5678 Dynarr_add (dst, '~'); /* untranslatable character */
5682 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5683 Dynarr_add (dst, c);
5684 else if (ch == LEADING_BYTE_CONTROL_1)
5687 Dynarr_add (dst, c - 0x20);
5689 /* else it should be the second or third byte of an
5690 untranslatable character, so ignore it */
5693 #endif /* not UTF2000 */
5699 str->iso2022.current_char_boundary = char_boundary;
5704 /************************************************************************/
5705 /* Simple internal/external functions */
5706 /************************************************************************/
5708 static Extbyte_dynarr *conversion_out_dynarr;
5709 static Bufbyte_dynarr *conversion_in_dynarr;
5711 /* Determine coding system from coding format */
5713 /* #### not correct for all values of `fmt'! */
5715 external_data_format_to_coding_system (enum external_data_format fmt)
5719 case FORMAT_FILENAME:
5720 case FORMAT_TERMINAL:
5721 if (EQ (Vfile_name_coding_system, Qnil) ||
5722 EQ (Vfile_name_coding_system, Qbinary))
5725 return Fget_coding_system (Vfile_name_coding_system);
5728 return Fget_coding_system (Qctext);
5736 convert_to_external_format (CONST Bufbyte *ptr,
5739 enum external_data_format fmt)
5741 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5743 if (!conversion_out_dynarr)
5744 conversion_out_dynarr = Dynarr_new (Extbyte);
5746 Dynarr_reset (conversion_out_dynarr);
5748 if (NILP (coding_system))
5750 CONST Bufbyte *end = ptr + len;
5756 (*ptr < 0xc0) ? *ptr :
5757 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5760 (BYTE_ASCII_P (*ptr)) ? *ptr :
5761 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5762 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5765 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5769 #ifdef ERROR_CHECK_BUFPOS
5770 assert (ptr == end);
5775 Lisp_Object instream, outstream, da_outstream;
5776 Lstream *istr, *ostr;
5777 struct gcpro gcpro1, gcpro2, gcpro3;
5778 char tempbuf[1024]; /* some random amount */
5780 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5781 da_outstream = make_dynarr_output_stream
5782 ((unsigned_char_dynarr *) conversion_out_dynarr);
5784 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5785 istr = XLSTREAM (instream);
5786 ostr = XLSTREAM (outstream);
5787 GCPRO3 (instream, outstream, da_outstream);
5790 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5793 Lstream_write (ostr, tempbuf, size_in_bytes);
5795 Lstream_close (istr);
5796 Lstream_close (ostr);
5798 Lstream_delete (istr);
5799 Lstream_delete (ostr);
5800 Lstream_delete (XLSTREAM (da_outstream));
5803 *len_out = Dynarr_length (conversion_out_dynarr);
5804 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5805 return Dynarr_atp (conversion_out_dynarr, 0);
5809 convert_from_external_format (CONST Extbyte *ptr,
5812 enum external_data_format fmt)
5814 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5816 if (!conversion_in_dynarr)
5817 conversion_in_dynarr = Dynarr_new (Bufbyte);
5819 Dynarr_reset (conversion_in_dynarr);
5821 if (NILP (coding_system))
5823 CONST Extbyte *end = ptr + len;
5824 for (; ptr < end; ptr++)
5827 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5832 Lisp_Object instream, outstream, da_outstream;
5833 Lstream *istr, *ostr;
5834 struct gcpro gcpro1, gcpro2, gcpro3;
5835 char tempbuf[1024]; /* some random amount */
5837 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5838 da_outstream = make_dynarr_output_stream
5839 ((unsigned_char_dynarr *) conversion_in_dynarr);
5841 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5842 istr = XLSTREAM (instream);
5843 ostr = XLSTREAM (outstream);
5844 GCPRO3 (instream, outstream, da_outstream);
5847 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5850 Lstream_write (ostr, tempbuf, size_in_bytes);
5852 Lstream_close (istr);
5853 Lstream_close (ostr);
5855 Lstream_delete (istr);
5856 Lstream_delete (ostr);
5857 Lstream_delete (XLSTREAM (da_outstream));
5860 *len_out = Dynarr_length (conversion_in_dynarr);
5861 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5862 return Dynarr_atp (conversion_in_dynarr, 0);
5866 /************************************************************************/
5867 /* Initialization */
5868 /************************************************************************/
5871 syms_of_file_coding (void)
5873 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5874 deferror (&Qcoding_system_error, "coding-system-error",
5875 "Coding-system error", Qio_error);
5877 DEFSUBR (Fcoding_system_p);
5878 DEFSUBR (Ffind_coding_system);
5879 DEFSUBR (Fget_coding_system);
5880 DEFSUBR (Fcoding_system_list);
5881 DEFSUBR (Fcoding_system_name);
5882 DEFSUBR (Fmake_coding_system);
5883 DEFSUBR (Fcopy_coding_system);
5884 DEFSUBR (Fdefine_coding_system_alias);
5885 DEFSUBR (Fsubsidiary_coding_system);
5887 DEFSUBR (Fcoding_system_type);
5888 DEFSUBR (Fcoding_system_doc_string);
5890 DEFSUBR (Fcoding_system_charset);
5892 DEFSUBR (Fcoding_system_property);
5894 DEFSUBR (Fcoding_category_list);
5895 DEFSUBR (Fset_coding_priority_list);
5896 DEFSUBR (Fcoding_priority_list);
5897 DEFSUBR (Fset_coding_category_system);
5898 DEFSUBR (Fcoding_category_system);
5900 DEFSUBR (Fdetect_coding_region);
5901 DEFSUBR (Fdecode_coding_region);
5902 DEFSUBR (Fencode_coding_region);
5904 DEFSUBR (Fdecode_shift_jis_char);
5905 DEFSUBR (Fencode_shift_jis_char);
5906 DEFSUBR (Fdecode_big5_char);
5907 DEFSUBR (Fencode_big5_char);
5908 DEFSUBR (Fset_ucs_char);
5909 DEFSUBR (Fucs_char);
5910 DEFSUBR (Fset_char_ucs);
5911 DEFSUBR (Fchar_ucs);
5913 defsymbol (&Qcoding_system_p, "coding-system-p");
5914 defsymbol (&Qno_conversion, "no-conversion");
5915 defsymbol (&Qraw_text, "raw-text");
5917 defsymbol (&Qbig5, "big5");
5918 defsymbol (&Qshift_jis, "shift-jis");
5919 defsymbol (&Qucs4, "ucs-4");
5920 defsymbol (&Qutf8, "utf-8");
5921 defsymbol (&Qccl, "ccl");
5922 defsymbol (&Qiso2022, "iso2022");
5924 defsymbol (&Qmnemonic, "mnemonic");
5925 defsymbol (&Qeol_type, "eol-type");
5926 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5927 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5929 defsymbol (&Qcr, "cr");
5930 defsymbol (&Qlf, "lf");
5931 defsymbol (&Qcrlf, "crlf");
5932 defsymbol (&Qeol_cr, "eol-cr");
5933 defsymbol (&Qeol_lf, "eol-lf");
5934 defsymbol (&Qeol_crlf, "eol-crlf");
5936 defsymbol (&Qcharset_g0, "charset-g0");
5937 defsymbol (&Qcharset_g1, "charset-g1");
5938 defsymbol (&Qcharset_g2, "charset-g2");
5939 defsymbol (&Qcharset_g3, "charset-g3");
5940 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5941 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5942 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5943 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5944 defsymbol (&Qno_iso6429, "no-iso6429");
5945 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5946 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5948 defsymbol (&Qshort, "short");
5949 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5950 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5951 defsymbol (&Qseven, "seven");
5952 defsymbol (&Qlock_shift, "lock-shift");
5953 defsymbol (&Qescape_quoted, "escape-quoted");
5955 defsymbol (&Qencode, "encode");
5956 defsymbol (&Qdecode, "decode");
5959 defsymbol (&Qctext, "ctext");
5960 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5962 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5964 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5966 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5968 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5970 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5972 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5974 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5976 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5979 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5984 lstream_type_create_file_coding (void)
5986 LSTREAM_HAS_METHOD (decoding, reader);
5987 LSTREAM_HAS_METHOD (decoding, writer);
5988 LSTREAM_HAS_METHOD (decoding, rewinder);
5989 LSTREAM_HAS_METHOD (decoding, seekable_p);
5990 LSTREAM_HAS_METHOD (decoding, flusher);
5991 LSTREAM_HAS_METHOD (decoding, closer);
5992 LSTREAM_HAS_METHOD (decoding, marker);
5994 LSTREAM_HAS_METHOD (encoding, reader);
5995 LSTREAM_HAS_METHOD (encoding, writer);
5996 LSTREAM_HAS_METHOD (encoding, rewinder);
5997 LSTREAM_HAS_METHOD (encoding, seekable_p);
5998 LSTREAM_HAS_METHOD (encoding, flusher);
5999 LSTREAM_HAS_METHOD (encoding, closer);
6000 LSTREAM_HAS_METHOD (encoding, marker);
6004 vars_of_file_coding (void)
6008 /* Initialize to something reasonable ... */
6009 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6011 coding_category_system[i] = Qnil;
6012 coding_category_by_priority[i] = i;
6015 Fprovide (intern ("file-coding"));
6017 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6018 Coding system used for TTY keyboard input.
6019 Not used under a windowing system.
6021 Vkeyboard_coding_system = Qnil;
6023 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6024 Coding system used for TTY display output.
6025 Not used under a windowing system.
6027 Vterminal_coding_system = Qnil;
6029 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6030 Overriding coding system used when writing a file or process.
6031 You should *bind* this, not set it. If this is non-nil, it specifies
6032 the coding system that will be used when a file or process is read
6033 in, and overrides `buffer-file-coding-system-for-read',
6034 `insert-file-contents-pre-hook', etc. Use those variables instead of
6035 this one for permanent changes to the environment.
6037 Vcoding_system_for_read = Qnil;
6039 DEFVAR_LISP ("coding-system-for-write",
6040 &Vcoding_system_for_write /*
6041 Overriding coding system used when writing a file or process.
6042 You should *bind* this, not set it. If this is non-nil, it specifies
6043 the coding system that will be used when a file or process is wrote
6044 in, and overrides `buffer-file-coding-system',
6045 `write-region-pre-hook', etc. Use those variables instead of this one
6046 for permanent changes to the environment.
6048 Vcoding_system_for_write = Qnil;
6050 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6051 Coding system used to convert pathnames when accessing files.
6053 Vfile_name_coding_system = Qnil;
6055 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6056 Non-nil means the buffer contents are regarded as multi-byte form
6057 of characters, not a binary code. This affects the display, file I/O,
6058 and behaviors of various editing commands.
6060 Setting this to nil does not do anything.
6062 enable_multibyte_characters = 1;
6066 complex_vars_of_file_coding (void)
6068 staticpro (&Vcoding_system_hash_table);
6069 Vcoding_system_hash_table =
6070 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6072 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6074 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6076 struct codesys_prop csp; \
6078 csp.prop_type = (Prop_Type); \
6079 Dynarr_add (the_codesys_prop_dynarr, csp); \
6082 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6083 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6084 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6085 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6086 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6087 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6088 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6090 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6091 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6092 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6093 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6094 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6095 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6096 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6097 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6098 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6099 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6100 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6101 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6102 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6103 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6104 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6105 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6106 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6108 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6109 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6111 /* Need to create this here or we're really screwed. */
6113 (Qraw_text, Qno_conversion,
6114 build_string ("Raw text, which means it converts only line-break-codes."),
6115 list2 (Qmnemonic, build_string ("Raw")));
6118 (Qbinary, Qno_conversion,
6119 build_string ("Binary, which means it does not convert anything."),
6120 list4 (Qeol_type, Qlf,
6121 Qmnemonic, build_string ("Binary")));
6126 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6127 list2 (Qmnemonic, build_string ("UTF8")));
6130 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6132 /* Need this for bootstrapping */
6133 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6134 Fget_coding_system (Qraw_text);
6137 coding_category_system[CODING_CATEGORY_UTF8]
6138 = Fget_coding_system (Qutf8);
6145 for (i = 0; i < 65536; i++)
6146 ucs_to_mule_table[i] = Qnil;
6148 staticpro (&mule_to_ucs_table);
6149 mule_to_ucs_table = Fmake_char_table(Qgeneric);