1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
36 #include "file-coding.h"
38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
40 Lisp_Object Vkeyboard_coding_system;
41 Lisp_Object Vterminal_coding_system;
42 Lisp_Object Vcoding_system_for_read;
43 Lisp_Object Vcoding_system_for_write;
44 Lisp_Object Vfile_name_coding_system;
46 /* Table of symbols identifying each coding category. */
47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
49 /* Coding system currently associated with each coding category. */
50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
52 /* Table of all coding categories in decreasing order of priority.
53 This describes a permutation of the possible coding categories. */
54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
56 Lisp_Object Qcoding_system_p;
58 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
59 /* Qinternal in general.c */
61 Lisp_Object Qmnemonic, Qeol_type;
62 Lisp_Object Qcr, Qcrlf, Qlf;
63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
64 Lisp_Object Qpost_read_conversion;
65 Lisp_Object Qpre_write_conversion;
68 Lisp_Object Qucs4, Qutf8;
69 Lisp_Object Qbig5, Qshift_jis;
70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
73 Lisp_Object Qno_iso6429;
74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
75 Lisp_Object Qctext, Qescape_quoted;
76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
78 Lisp_Object Qencode, Qdecode;
80 Lisp_Object Vcoding_system_hash_table;
82 int enable_multibyte_characters;
85 /* Additional information used by the ISO2022 decoder and detector. */
86 struct iso2022_decoder
88 /* CHARSET holds the character sets currently assigned to the G0
89 through G3 variables. It is initialized from the array
90 INITIAL_CHARSET in CODESYS. */
91 Lisp_Object charset[4];
93 /* Which registers are currently invoked into the left (GL) and
94 right (GR) halves of the 8-bit encoding space? */
95 int register_left, register_right;
97 /* ISO_ESC holds a value indicating part of an escape sequence
98 that has already been seen. */
99 enum iso_esc_flag esc;
101 /* This records the bytes we've seen so far in an escape sequence,
102 in case the sequence is invalid (we spit out the bytes unchanged). */
103 unsigned char esc_bytes[8];
105 /* Index for next byte to store in ISO escape sequence. */
108 #ifdef ENABLE_COMPOSITE_CHARS
109 /* Stuff seen so far when composing a string. */
110 unsigned_char_dynarr *composite_chars;
113 /* If we saw an invalid designation sequence for a particular
114 register, we flag it here and switch to ASCII. The next time we
115 see a valid designation for this register, we turn off the flag
116 and do the designation normally, but pretend the sequence was
117 invalid. The effect of all this is that (most of the time) the
118 escape sequences for both the switch to the unknown charset, and
119 the switch back to the known charset, get inserted literally into
120 the buffer and saved out as such. The hope is that we can
121 preserve the escape sequences so that the resulting written out
122 file makes sense. If we don't do any of this, the designation
123 to the invalid charset will be preserved but that switch back
124 to the known charset will probably get eaten because it was
125 the same charset that was already present in the register. */
126 unsigned char invalid_designated[4];
128 /* We try to do similar things as above for direction-switching
129 sequences. If we encountered a direction switch while an
130 invalid designation was present, or an invalid designation
131 just after a direction switch (i.e. no valid designation
132 encountered yet), we insert the direction-switch escape
133 sequence literally into the output stream, and later on
134 insert the corresponding direction-restoring escape sequence
136 unsigned int switched_dir_and_no_valid_charset_yet :1;
137 unsigned int invalid_switch_dir :1;
139 /* Tells the decoder to output the escape sequence literally
140 even though it was valid. Used in the games we play to
141 avoid lossage when we encounter invalid designations. */
142 unsigned int output_literally :1;
143 /* We encountered a direction switch followed by an invalid
144 designation. We didn't output the direction switch
145 literally because we didn't know about the invalid designation;
146 but we have to do so now. */
147 unsigned int output_direction_sequence :1;
150 EXFUN (Fcopy_coding_system, 2);
152 struct detection_state;
153 static int detect_coding_sjis (struct detection_state *st,
154 CONST unsigned char *src,
156 static void decode_coding_sjis (Lstream *decoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static void encode_coding_sjis (Lstream *encoding,
161 CONST unsigned char *src,
162 unsigned_char_dynarr *dst,
164 static int detect_coding_big5 (struct detection_state *st,
165 CONST unsigned char *src,
167 static void decode_coding_big5 (Lstream *decoding,
168 CONST unsigned char *src,
169 unsigned_char_dynarr *dst, unsigned int n);
170 static void encode_coding_big5 (Lstream *encoding,
171 CONST unsigned char *src,
172 unsigned_char_dynarr *dst, unsigned int n);
173 static int detect_coding_ucs4 (struct detection_state *st,
174 CONST unsigned char *src,
176 static void decode_coding_ucs4 (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static void encode_coding_ucs4 (Lstream *encoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
182 static int detect_coding_utf8 (struct detection_state *st,
183 CONST unsigned char *src,
185 static void decode_coding_utf8 (Lstream *decoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst, unsigned int n);
188 static void encode_coding_utf8 (Lstream *encoding,
189 CONST unsigned char *src,
190 unsigned_char_dynarr *dst, unsigned int n);
191 static int postprocess_iso2022_mask (int mask);
192 static void reset_iso2022 (Lisp_Object coding_system,
193 struct iso2022_decoder *iso);
194 static int detect_coding_iso2022 (struct detection_state *st,
195 CONST unsigned char *src,
197 static void decode_coding_iso2022 (Lstream *decoding,
198 CONST unsigned char *src,
199 unsigned_char_dynarr *dst, unsigned int n);
200 static void encode_coding_iso2022 (Lstream *encoding,
201 CONST unsigned char *src,
202 unsigned_char_dynarr *dst, unsigned int n);
204 static void decode_coding_no_conversion (Lstream *decoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst,
208 static void encode_coding_no_conversion (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst,
212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
213 unsigned_char_dynarr *dst, unsigned int n);
214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
217 typedef struct codesys_prop codesys_prop;
226 Dynarr_declare (codesys_prop);
227 } codesys_prop_dynarr;
229 codesys_prop_dynarr *the_codesys_prop_dynarr;
231 enum codesys_prop_enum
234 CODESYS_PROP_ISO2022,
239 /************************************************************************/
240 /* Coding system functions */
241 /************************************************************************/
243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
245 static void finalize_coding_system (void *header, int for_disksave);
247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
248 mark_coding_system, print_coding_system,
249 finalize_coding_system,
250 0, 0, struct Lisp_Coding_System);
253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
255 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
257 markobj (CODING_SYSTEM_NAME (codesys));
258 markobj (CODING_SYSTEM_DOC_STRING (codesys));
259 markobj (CODING_SYSTEM_MNEMONIC (codesys));
260 markobj (CODING_SYSTEM_EOL_LF (codesys));
261 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
262 markobj (CODING_SYSTEM_EOL_CR (codesys));
264 switch (CODING_SYSTEM_TYPE (codesys))
268 case CODESYS_ISO2022:
269 for (i = 0; i < 4; i++)
270 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
271 if (codesys->iso2022.input_conv)
273 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
275 struct charset_conversion_spec *ccs =
276 Dynarr_atp (codesys->iso2022.input_conv, i);
277 markobj (ccs->from_charset);
278 markobj (ccs->to_charset);
281 if (codesys->iso2022.output_conv)
283 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
285 struct charset_conversion_spec *ccs =
286 Dynarr_atp (codesys->iso2022.output_conv, i);
287 markobj (ccs->from_charset);
288 markobj (ccs->to_charset);
294 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
295 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
302 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
303 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
310 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
312 error ("printing unreadable object #<coding_system 0x%x>",
315 write_c_string ("#<coding_system ", printcharfun);
316 print_internal (c->name, printcharfun, 1);
317 write_c_string (">", printcharfun);
321 finalize_coding_system (void *header, int for_disksave)
323 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
324 /* Since coding systems never go away, this function is not
325 necessary. But it would be necessary if we changed things
326 so that coding systems could go away. */
327 if (!for_disksave) /* see comment in lstream.c */
329 switch (CODING_SYSTEM_TYPE (c))
332 case CODESYS_ISO2022:
333 if (c->iso2022.input_conv)
335 Dynarr_free (c->iso2022.input_conv);
336 c->iso2022.input_conv = 0;
338 if (c->iso2022.output_conv)
340 Dynarr_free (c->iso2022.output_conv);
341 c->iso2022.output_conv = 0;
352 symbol_to_eol_type (Lisp_Object symbol)
354 CHECK_SYMBOL (symbol);
355 if (NILP (symbol)) return EOL_AUTODETECT;
356 if (EQ (symbol, Qlf)) return EOL_LF;
357 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
358 if (EQ (symbol, Qcr)) return EOL_CR;
360 signal_simple_error ("Unrecognized eol type", symbol);
361 return EOL_AUTODETECT; /* not reached */
365 eol_type_to_symbol (enum eol_type type)
370 case EOL_LF: return Qlf;
371 case EOL_CRLF: return Qcrlf;
372 case EOL_CR: return Qcr;
373 case EOL_AUTODETECT: return Qnil;
378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
380 Lisp_Object codesys_obj;
381 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
382 char *codesys_name = (char *) alloca (len + 7);
384 char *codesys_mnemonic=0;
386 Lisp_Object codesys_name_sym, sub_codesys_obj;
390 XSETCODING_SYSTEM (codesys_obj, codesys);
392 memcpy (codesys_name,
393 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
395 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
397 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
398 codesys_mnemonic = (char *) alloca (mlen + 7);
399 memcpy (codesys_mnemonic,
400 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
404 strcpy (codesys_name + len, "-" op_sys); \
406 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
407 codesys_name_sym = intern (codesys_name); \
408 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
409 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
411 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
412 build_string (codesys_mnemonic); \
413 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
416 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
417 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
418 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
422 Return t if OBJECT is a coding system.
423 A coding system is an object that defines how text containing multiple
424 character sets is encoded into a stream of (typically 8-bit) bytes.
425 The coding system is used to decode the stream into a series of
426 characters (which may be from multiple charsets) when the text is read
427 from a file or process, and is used to encode the text back into the
428 same format when it is written out to a file or process.
430 For example, many ISO2022-compliant coding systems (such as Compound
431 Text, which is used for inter-client data under the X Window System)
432 use escape sequences to switch between different charsets -- Japanese
433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
435 `make-coding-system' for more information.
437 Coding systems are normally identified using a symbol, and the
438 symbol is accepted in place of the actual coding system object whenever
439 a coding system is called for. (This is similar to how faces work.)
443 return CODING_SYSTEMP (object) ? Qt : Qnil;
446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
447 Retrieve the coding system of the given name.
449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
450 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
451 If there is no such coding system, nil is returned. Otherwise the
452 associated coding system object is returned.
454 (coding_system_or_name))
456 if (CODING_SYSTEMP (coding_system_or_name))
457 return coding_system_or_name;
459 if (NILP (coding_system_or_name))
460 coding_system_or_name = Qbinary;
462 CHECK_SYMBOL (coding_system_or_name);
464 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
468 Retrieve the coding system of the given name.
469 Same as `find-coding-system' except that if there is no such
470 coding system, an error is signaled instead of returning nil.
474 Lisp_Object coding_system = Ffind_coding_system (name);
476 if (NILP (coding_system))
477 signal_simple_error ("No such coding system", name);
478 return coding_system;
481 /* We store the coding systems in hash tables with the names as the key and the
482 actual coding system object as the value. Occasionally we need to use them
483 in a list format. These routines provide us with that. */
484 struct coding_system_list_closure
486 Lisp_Object *coding_system_list;
490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
491 void *coding_system_list_closure)
493 /* This function can GC */
494 struct coding_system_list_closure *cscl =
495 (struct coding_system_list_closure *) coding_system_list_closure;
496 Lisp_Object *coding_system_list = cscl->coding_system_list;
498 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
499 *coding_system_list);
503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
504 Return a list of the names of all defined coding systems.
508 Lisp_Object coding_system_list = Qnil;
510 struct coding_system_list_closure coding_system_list_closure;
512 GCPRO1 (coding_system_list);
513 coding_system_list_closure.coding_system_list = &coding_system_list;
514 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
515 &coding_system_list_closure);
518 return coding_system_list;
521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
522 Return the name of the given coding system.
526 coding_system = Fget_coding_system (coding_system);
527 return XCODING_SYSTEM_NAME (coding_system);
530 static Lisp_Coding_System *
531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
533 Lisp_Coding_System *codesys =
534 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
536 zero_lcrecord (codesys);
537 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
538 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
539 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
540 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
541 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
542 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
543 CODING_SYSTEM_TYPE (codesys) = type;
544 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
546 if (type == CODESYS_ISO2022)
549 for (i = 0; i < 4; i++)
550 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
552 else if (type == CODESYS_CCL)
554 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
555 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
558 CODING_SYSTEM_NAME (codesys) = name;
564 /* Given a list of charset conversion specs as specified in a Lisp
565 program, parse it into STORE_HERE. */
568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
569 Lisp_Object spec_list)
573 EXTERNAL_LIST_LOOP (rest, spec_list)
575 Lisp_Object car = XCAR (rest);
576 Lisp_Object from, to;
577 struct charset_conversion_spec spec;
579 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
580 signal_simple_error ("Invalid charset conversion spec", car);
581 from = Fget_charset (XCAR (car));
582 to = Fget_charset (XCAR (XCDR (car)));
583 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
584 signal_simple_error_2
585 ("Attempted conversion between different charset types",
587 spec.from_charset = from;
588 spec.to_charset = to;
590 Dynarr_add (store_here, spec);
594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
595 specs, return the equivalent as the Lisp programmer would see it.
597 If LOAD_HERE is 0, return Qnil. */
600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
607 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
609 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
610 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
613 return Fnreverse (result);
618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
619 Register symbol NAME as a coding system.
621 TYPE describes the conversion method used and should be one of
624 Automatic conversion. XEmacs attempts to detect the coding system
627 No conversion. Use this for binary files and such. On output,
628 graphic characters that are not in ASCII or Latin-1 will be
629 replaced by a ?. (For a no-conversion-encoded buffer, these
630 characters will only be present if you explicitly insert them.)
632 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
634 ISO 10646 UCS-4 encoding.
636 ISO 10646 UTF-8 encoding.
638 Any ISO2022-compliant encoding. Among other things, this includes
639 JIS (the Japanese encoding commonly used for e-mail), EUC (the
640 standard Unix encoding for Japanese and other languages), and
641 Compound Text (the encoding used in X11). You can specify more
642 specific information about the conversion with the FLAGS argument.
644 Big5 (the encoding commonly used for Taiwanese).
646 The conversion is performed using a user-written pseudo-code
647 program. CCL (Code Conversion Language) is the name of this
650 Write out or read in the raw contents of the memory representing
651 the buffer's text. This is primarily useful for debugging
652 purposes, and is only enabled when XEmacs has been compiled with
653 DEBUG_XEMACS defined (via the --debug configure option).
654 WARNING: Reading in a file using 'internal conversion can result
655 in an internal inconsistency in the memory representing a
656 buffer's text, which will produce unpredictable results and may
657 cause XEmacs to crash. Under normal circumstances you should
658 never use 'internal conversion.
660 DOC-STRING is a string describing the coding system.
662 PROPS is a property list, describing the specific nature of the
663 character set. Recognized properties are:
666 String to be displayed in the modeline when this coding system is
670 End-of-line conversion to be used. It should be one of
673 Automatically detect the end-of-line type (LF, CRLF,
674 or CR). Also generate subsidiary coding systems named
675 `NAME-unix', `NAME-dos', and `NAME-mac', that are
676 identical to this coding system but have an EOL-TYPE
677 value of 'lf, 'crlf, and 'cr, respectively.
679 The end of a line is marked externally using ASCII LF.
680 Since this is also the way that XEmacs represents an
681 end-of-line internally, specifying this option results
682 in no end-of-line conversion. This is the standard
683 format for Unix text files.
685 The end of a line is marked externally using ASCII
686 CRLF. This is the standard format for MS-DOS text
689 The end of a line is marked externally using ASCII CR.
690 This is the standard format for Macintosh text files.
692 Automatically detect the end-of-line type but do not
693 generate subsidiary coding systems. (This value is
694 converted to nil when stored internally, and
695 `coding-system-property' will return nil.)
697 'post-read-conversion
698 Function called after a file has been read in, to perform the
699 decoding. Called with two arguments, BEG and END, denoting
700 a region of the current buffer to be decoded.
702 'pre-write-conversion
703 Function called before a file is written out, to perform the
704 encoding. Called with two arguments, BEG and END, denoting
705 a region of the current buffer to be encoded.
708 The following additional properties are recognized if TYPE is 'iso2022:
714 The character set initially designated to the G0 - G3 registers.
715 The value should be one of
717 -- A charset object (designate that character set)
718 -- nil (do not ever use this register)
719 -- t (no character set is initially designated to
720 the register, but may be later on; this automatically
721 sets the corresponding `force-g*-on-output' property)
727 If non-nil, send an explicit designation sequence on output before
728 using the specified register.
731 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
732 "ESC $ B" on output in place of the full designation sequences
733 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
736 If non-nil, don't designate ASCII to G0 at each end of line on output.
737 Setting this to non-nil also suppresses other state-resetting that
738 normally happens at the end of a line.
741 If non-nil, don't designate ASCII to G0 before control chars on output.
744 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
748 If non-nil, use locking-shift (SO/SI) instead of single-shift
749 or designation by escape sequence.
752 If non-nil, don't use ISO6429's direction specification.
755 If non-nil, literal control characters that are the same as
756 the beginning of a recognized ISO2022 or ISO6429 escape sequence
757 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
758 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
759 so that they can be properly distinguished from an escape sequence.
760 (Note that doing this results in a non-portable encoding.) This
761 encoding flag is used for byte-compiled files. Note that ESC
762 is a good choice for a quoting character because there are no
763 escape sequences whose second byte is a character from the Control-0
764 or Control-1 character sets; this is explicitly disallowed by the
767 'input-charset-conversion
768 A list of conversion specifications, specifying conversion of
769 characters in one charset to another when decoding is performed.
770 Each specification is a list of two elements: the source charset,
771 and the destination charset.
773 'output-charset-conversion
774 A list of conversion specifications, specifying conversion of
775 characters in one charset to another when encoding is performed.
776 The form of each specification is the same as for
777 'input-charset-conversion.
780 The following additional properties are recognized (and required)
784 CCL program used for decoding (converting to internal format).
787 CCL program used for encoding (converting to external format).
789 (name, type, doc_string, props))
791 Lisp_Coding_System *codesys;
792 Lisp_Object rest, key, value;
793 enum coding_system_type ty;
794 int need_to_setup_eol_systems = 1;
796 /* Convert type to constant */
797 if (NILP (type) || EQ (type, Qundecided))
798 { ty = CODESYS_AUTODETECT; }
800 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
801 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
802 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
803 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
804 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
805 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
807 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
809 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
812 signal_simple_error ("Invalid coding system type", type);
816 codesys = allocate_coding_system (ty, name);
818 if (NILP (doc_string))
819 doc_string = build_string ("");
821 CHECK_STRING (doc_string);
822 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
824 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
826 if (EQ (key, Qmnemonic))
829 CHECK_STRING (value);
830 CODING_SYSTEM_MNEMONIC (codesys) = value;
833 else if (EQ (key, Qeol_type))
835 need_to_setup_eol_systems = NILP (value);
838 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
841 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
842 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
844 else if (ty == CODESYS_ISO2022)
846 #define FROB_INITIAL_CHARSET(charset_num) \
847 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
848 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
850 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
851 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
852 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
853 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
855 #define FROB_FORCE_CHARSET(charset_num) \
856 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
858 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
859 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
860 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
861 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
863 #define FROB_BOOLEAN_PROPERTY(prop) \
864 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
866 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
867 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
868 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
869 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
870 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
871 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
872 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
874 else if (EQ (key, Qinput_charset_conversion))
876 codesys->iso2022.input_conv =
877 Dynarr_new (charset_conversion_spec);
878 parse_charset_conversion_specs (codesys->iso2022.input_conv,
881 else if (EQ (key, Qoutput_charset_conversion))
883 codesys->iso2022.output_conv =
884 Dynarr_new (charset_conversion_spec);
885 parse_charset_conversion_specs (codesys->iso2022.output_conv,
889 signal_simple_error ("Unrecognized property", key);
891 else if (EQ (type, Qccl))
893 if (EQ (key, Qdecode))
895 CHECK_VECTOR (value);
896 CODING_SYSTEM_CCL_DECODE (codesys) = value;
898 else if (EQ (key, Qencode))
900 CHECK_VECTOR (value);
901 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
904 signal_simple_error ("Unrecognized property", key);
908 signal_simple_error ("Unrecognized property", key);
911 if (need_to_setup_eol_systems)
912 setup_eol_coding_systems (codesys);
915 Lisp_Object codesys_obj;
916 XSETCODING_SYSTEM (codesys_obj, codesys);
917 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
923 Copy OLD-CODING-SYSTEM to NEW-NAME.
924 If NEW-NAME does not name an existing coding system, a new one will
927 (old_coding_system, new_name))
929 Lisp_Object new_coding_system;
930 old_coding_system = Fget_coding_system (old_coding_system);
931 new_coding_system = Ffind_coding_system (new_name);
932 if (NILP (new_coding_system))
934 XSETCODING_SYSTEM (new_coding_system,
935 allocate_coding_system
936 (XCODING_SYSTEM_TYPE (old_coding_system),
938 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
942 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
943 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
944 memcpy (((char *) to ) + sizeof (to->header),
945 ((char *) from) + sizeof (from->header),
946 sizeof (*from) - sizeof (from->header));
949 return new_coding_system;
952 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
953 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
955 (alias, coding_system))
957 CHECK_SYMBOL (alias);
958 if (!NILP (Ffind_coding_system (alias)))
959 signal_simple_error ("Symbol already names a coding system", alias);
960 coding_system = Fget_coding_system (coding_system);
961 Fputhash (alias, coding_system, Vcoding_system_hash_table);
963 /* Set up aliases for subsidiaries. */
964 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
967 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
968 #define FROB(type, name) \
970 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
971 if (!NILP (subsidiary)) \
972 Fdefine_coding_system_alias \
973 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
980 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
981 but it doesn't look intentional, so I'd rather return something
982 meaningful or nothing at all. */
987 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
989 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
990 Lisp_Object new_coding_system;
992 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
993 return coding_system;
997 case EOL_AUTODETECT: return coding_system;
998 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
999 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1000 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1004 return NILP (new_coding_system) ? coding_system : new_coding_system;
1007 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1008 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1010 (coding_system, eol_type))
1012 coding_system = Fget_coding_system (coding_system);
1014 return subsidiary_coding_system (coding_system,
1015 symbol_to_eol_type (eol_type));
1019 /************************************************************************/
1020 /* Coding system accessors */
1021 /************************************************************************/
1023 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1024 Return the doc string for CODING-SYSTEM.
1028 coding_system = Fget_coding_system (coding_system);
1029 return XCODING_SYSTEM_DOC_STRING (coding_system);
1032 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1033 Return the type of CODING-SYSTEM.
1037 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1040 case CODESYS_AUTODETECT: return Qundecided;
1042 case CODESYS_SHIFT_JIS: return Qshift_jis;
1043 case CODESYS_ISO2022: return Qiso2022;
1044 case CODESYS_BIG5: return Qbig5;
1045 case CODESYS_UCS4: return Qucs4;
1046 case CODESYS_UTF8: return Qutf8;
1047 case CODESYS_CCL: return Qccl;
1049 case CODESYS_NO_CONVERSION: return Qno_conversion;
1051 case CODESYS_INTERNAL: return Qinternal;
1058 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1061 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1063 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1066 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1067 Return initial charset of CODING-SYSTEM designated to GNUM.
1070 (coding_system, gnum))
1072 coding_system = Fget_coding_system (coding_system);
1075 return coding_system_charset (coding_system, XINT (gnum));
1079 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1080 Return the PROP property of CODING-SYSTEM.
1082 (coding_system, prop))
1085 enum coding_system_type type;
1087 coding_system = Fget_coding_system (coding_system);
1088 CHECK_SYMBOL (prop);
1089 type = XCODING_SYSTEM_TYPE (coding_system);
1091 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1092 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1095 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1097 case CODESYS_PROP_ALL_OK:
1100 case CODESYS_PROP_ISO2022:
1101 if (type != CODESYS_ISO2022)
1103 ("Property only valid in ISO2022 coding systems",
1107 case CODESYS_PROP_CCL:
1108 if (type != CODESYS_CCL)
1110 ("Property only valid in CCL coding systems",
1120 signal_simple_error ("Unrecognized property", prop);
1122 if (EQ (prop, Qname))
1123 return XCODING_SYSTEM_NAME (coding_system);
1124 else if (EQ (prop, Qtype))
1125 return Fcoding_system_type (coding_system);
1126 else if (EQ (prop, Qdoc_string))
1127 return XCODING_SYSTEM_DOC_STRING (coding_system);
1128 else if (EQ (prop, Qmnemonic))
1129 return XCODING_SYSTEM_MNEMONIC (coding_system);
1130 else if (EQ (prop, Qeol_type))
1131 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1132 else if (EQ (prop, Qeol_lf))
1133 return XCODING_SYSTEM_EOL_LF (coding_system);
1134 else if (EQ (prop, Qeol_crlf))
1135 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1136 else if (EQ (prop, Qeol_cr))
1137 return XCODING_SYSTEM_EOL_CR (coding_system);
1138 else if (EQ (prop, Qpost_read_conversion))
1139 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1140 else if (EQ (prop, Qpre_write_conversion))
1141 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1143 else if (type == CODESYS_ISO2022)
1145 if (EQ (prop, Qcharset_g0))
1146 return coding_system_charset (coding_system, 0);
1147 else if (EQ (prop, Qcharset_g1))
1148 return coding_system_charset (coding_system, 1);
1149 else if (EQ (prop, Qcharset_g2))
1150 return coding_system_charset (coding_system, 2);
1151 else if (EQ (prop, Qcharset_g3))
1152 return coding_system_charset (coding_system, 3);
1154 #define FORCE_CHARSET(charset_num) \
1155 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1156 (coding_system, charset_num) ? Qt : Qnil)
1158 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1159 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1160 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1161 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1163 #define LISP_BOOLEAN(prop) \
1164 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1166 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1167 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1168 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1169 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1170 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1171 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1172 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1174 else if (EQ (prop, Qinput_charset_conversion))
1176 unparse_charset_conversion_specs
1177 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1178 else if (EQ (prop, Qoutput_charset_conversion))
1180 unparse_charset_conversion_specs
1181 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1185 else if (type == CODESYS_CCL)
1187 if (EQ (prop, Qdecode))
1188 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1189 else if (EQ (prop, Qencode))
1190 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1198 return Qnil; /* not reached */
1202 /************************************************************************/
1203 /* Coding category functions */
1204 /************************************************************************/
1207 decode_coding_category (Lisp_Object symbol)
1211 CHECK_SYMBOL (symbol);
1212 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1213 if (EQ (coding_category_symbol[i], symbol))
1216 signal_simple_error ("Unrecognized coding category", symbol);
1217 return 0; /* not reached */
1220 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1221 Return a list of all recognized coding categories.
1226 Lisp_Object list = Qnil;
1228 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1229 list = Fcons (coding_category_symbol[i], list);
1233 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1234 Change the priority order of the coding categories.
1235 LIST should be list of coding categories, in descending order of
1236 priority. Unspecified coding categories will be lower in priority
1237 than all specified ones, in the same relative order they were in
1242 int category_to_priority[CODING_CATEGORY_LAST + 1];
1246 /* First generate a list that maps coding categories to priorities. */
1248 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1249 category_to_priority[i] = -1;
1251 /* Highest priority comes from the specified list. */
1253 EXTERNAL_LIST_LOOP (rest, list)
1255 int cat = decode_coding_category (XCAR (rest));
1257 if (category_to_priority[cat] >= 0)
1258 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1259 category_to_priority[cat] = i++;
1262 /* Now go through the existing categories by priority to retrieve
1263 the categories not yet specified and preserve their priority
1265 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1267 int cat = coding_category_by_priority[j];
1268 if (category_to_priority[cat] < 0)
1269 category_to_priority[cat] = i++;
1272 /* Now we need to construct the inverse of the mapping we just
1275 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1276 coding_category_by_priority[category_to_priority[i]] = i;
1278 /* Phew! That was confusing. */
1282 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1283 Return a list of coding categories in descending order of priority.
1288 Lisp_Object list = Qnil;
1290 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1291 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1296 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1297 Change the coding system associated with a coding category.
1299 (coding_category, coding_system))
1301 int cat = decode_coding_category (coding_category);
1303 coding_system = Fget_coding_system (coding_system);
1304 coding_category_system[cat] = coding_system;
1308 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1309 Return the coding system associated with a coding category.
1313 int cat = decode_coding_category (coding_category);
1314 Lisp_Object sys = coding_category_system[cat];
1317 return XCODING_SYSTEM_NAME (sys);
1322 /************************************************************************/
1323 /* Detecting the encoding of data */
1324 /************************************************************************/
1326 struct detection_state
1328 enum eol_type eol_type;
1364 struct iso2022_decoder iso;
1366 int high_byte_count;
1367 unsigned int saw_single_shift:1;
1380 acceptable_control_char_p (int c)
1384 /* Allow and ignore control characters that you might
1385 reasonably see in a text file */
1390 case 8: /* backspace */
1391 case 11: /* vertical tab */
1392 case 12: /* form feed */
1393 case 26: /* MS-DOS C-z junk */
1394 case 31: /* '^_' -- for info */
1402 mask_has_at_most_one_bit_p (int mask)
1404 /* Perhaps the only thing useful you learn from intensive Microsoft
1405 technical interviews */
1406 return (mask & (mask - 1)) == 0;
1409 static enum eol_type
1410 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1419 st->eol.just_saw_cr = 1;
1424 if (st->eol.just_saw_cr)
1426 else if (st->eol.seen_anything)
1429 else if (st->eol.just_saw_cr)
1431 st->eol.just_saw_cr = 0;
1433 st->eol.seen_anything = 1;
1436 return EOL_AUTODETECT;
1439 /* Attempt to determine the encoding and EOL type of the given text.
1440 Before calling this function for the first type, you must initialize
1441 st->eol_type as appropriate and initialize st->mask to ~0.
1443 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1446 st->mask holds the determined coding category mask, or ~0 if only
1447 ASCII has been seen so far.
1451 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1452 is present in st->mask
1453 1 == definitive answers are here for both st->eol_type and st->mask
1457 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1458 unsigned int n, int just_do_eol)
1462 if (st->eol_type == EOL_AUTODETECT)
1463 st->eol_type = detect_eol_type (st, src, n);
1466 return st->eol_type != EOL_AUTODETECT;
1468 if (!st->seen_non_ascii)
1470 for (; n; n--, src++)
1473 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1475 st->seen_non_ascii = 1;
1477 st->shift_jis.mask = ~0;
1481 st->iso2022.mask = ~0;
1491 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1492 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1493 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1494 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1495 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1496 st->big5.mask = detect_coding_big5 (st, src, n);
1497 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1498 st->utf8.mask = detect_coding_utf8 (st, src, n);
1499 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1500 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1503 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1504 | st->utf8.mask | st->ucs4.mask;
1507 int retval = mask_has_at_most_one_bit_p (st->mask);
1508 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1509 return retval && st->eol_type != EOL_AUTODETECT;
1514 coding_system_from_mask (int mask)
1518 /* If the file was entirely or basically ASCII, use the
1519 default value of `buffer-file-coding-system'. */
1520 Lisp_Object retval =
1521 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1524 retval = Ffind_coding_system (retval);
1528 (Qbad_variable, Qwarning,
1529 "Invalid `default-buffer-file-coding-system', set to nil");
1530 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1534 retval = Fget_coding_system (Qraw_text);
1542 mask = postprocess_iso2022_mask (mask);
1544 /* Look through the coding categories by priority and find
1545 the first one that is allowed. */
1546 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1548 cat = coding_category_by_priority[i];
1549 if ((mask & (1 << cat)) &&
1550 !NILP (coding_category_system[cat]))
1554 return coding_category_system[cat];
1556 return Fget_coding_system (Qraw_text);
1560 /* Given a seekable read stream and potential coding system and EOL type
1561 as specified, do any autodetection that is called for. If the
1562 coding system and/or EOL type are not autodetect, they will be left
1563 alone; but this function will never return an autodetect coding system
1566 This function does not automatically fetch subsidiary coding systems;
1567 that should be unnecessary with the explicit eol-type argument. */
1570 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1571 enum eol_type *eol_type_in_out)
1573 struct detection_state decst;
1575 if (*eol_type_in_out == EOL_AUTODETECT)
1576 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1579 decst.eol_type = *eol_type_in_out;
1582 /* If autodetection is called for, do it now. */
1583 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1584 *eol_type_in_out == EOL_AUTODETECT)
1589 unsigned char random_buffer[4096];
1592 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1595 if (detect_coding_type (&decst, random_buffer, nread,
1596 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1597 CODESYS_AUTODETECT))
1601 *eol_type_in_out = decst.eol_type;
1602 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1603 *codesys_in_out = coding_system_from_mask (decst.mask);
1606 /* If we absolutely can't determine the EOL type, just assume LF. */
1607 if (*eol_type_in_out == EOL_AUTODETECT)
1608 *eol_type_in_out = EOL_LF;
1610 Lstream_rewind (stream);
1613 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1614 Detect coding system of the text in the region between START and END.
1615 Returned a list of possible coding systems ordered by priority.
1616 If only ASCII characters are found, it returns 'undecided or one of
1617 its subsidiary coding systems according to a detected end-of-line
1618 type. Optional arg BUFFER defaults to the current buffer.
1620 (start, end, buffer))
1622 Lisp_Object val = Qnil;
1623 struct buffer *buf = decode_buffer (buffer, 0);
1625 Lisp_Object instream, lb_instream;
1626 Lstream *istr, *lb_istr;
1627 struct detection_state decst;
1628 struct gcpro gcpro1, gcpro2;
1630 get_buffer_range_char (buf, start, end, &b, &e, 0);
1631 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1632 lb_istr = XLSTREAM (lb_instream);
1633 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1634 istr = XLSTREAM (instream);
1635 GCPRO2 (instream, lb_instream);
1637 decst.eol_type = EOL_AUTODETECT;
1641 unsigned char random_buffer[4096];
1642 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1646 if (detect_coding_type (&decst, random_buffer, nread, 0))
1650 if (decst.mask == ~0)
1651 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1659 decst.mask = postprocess_iso2022_mask (decst.mask);
1661 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1663 int sys = coding_category_by_priority[i];
1664 if (decst.mask & (1 << sys))
1666 Lisp_Object codesys = coding_category_system[sys];
1667 if (!NILP (codesys))
1668 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1669 val = Fcons (codesys, val);
1673 Lstream_close (istr);
1675 Lstream_delete (istr);
1676 Lstream_delete (lb_istr);
1681 /************************************************************************/
1682 /* Converting to internal Mule format ("decoding") */
1683 /************************************************************************/
1685 /* A decoding stream is a stream used for decoding text (i.e.
1686 converting from some external format to internal format).
1687 The decoding-stream object keeps track of the actual coding
1688 stream, the stream that is at the other end, and data that
1689 needs to be persistent across the lifetime of the stream. */
1691 /* Handle the EOL stuff related to just-read-in character C.
1692 EOL_TYPE is the EOL type of the coding stream.
1693 FLAGS is the current value of FLAGS in the coding stream, and may
1694 be modified by this macro. (The macro only looks at the
1695 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1696 bytes are to be written. You need to also define a local goto
1697 label "label_continue_loop" that is at the end of the main
1698 character-reading loop.
1700 If C is a CR character, then this macro handles it entirely and
1701 jumps to label_continue_loop. Otherwise, this macro does not add
1702 anything to DST, and continues normally. You should continue
1703 processing C normally after this macro. */
1705 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1709 if (eol_type == EOL_CR) \
1710 Dynarr_add (dst, '\n'); \
1711 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1712 Dynarr_add (dst, c); \
1714 flags |= CODING_STATE_CR; \
1715 goto label_continue_loop; \
1717 else if (flags & CODING_STATE_CR) \
1718 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1720 Dynarr_add (dst, '\r'); \
1721 flags &= ~CODING_STATE_CR; \
1725 /* C should be a binary character in the range 0 - 255; convert
1726 to internal format and add to Dynarr DST. */
1729 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1731 if (BYTE_ASCII_P (c)) \
1732 Dynarr_add (dst, c); \
1735 Dynarr_add (dst, (c >> 6) | 0xc0); \
1736 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1741 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1745 Dynarr_add (dst, c);
1747 else if ( c <= 0x7ff )
1749 Dynarr_add (dst, (c >> 6) | 0xc0);
1750 Dynarr_add (dst, (c & 0x3f) | 0x80);
1752 else if ( c <= 0xffff )
1754 Dynarr_add (dst, (c >> 12) | 0xe0);
1755 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1756 Dynarr_add (dst, (c & 0x3f) | 0x80);
1758 else if ( c <= 0x1fffff )
1760 Dynarr_add (dst, (c >> 18) | 0xf0);
1761 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1762 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1763 Dynarr_add (dst, (c & 0x3f) | 0x80);
1765 else if ( c <= 0x3ffffff )
1767 Dynarr_add (dst, (c >> 24) | 0xf8);
1768 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1769 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1770 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1771 Dynarr_add (dst, (c & 0x3f) | 0x80);
1775 Dynarr_add (dst, (c >> 30) | 0xfc);
1776 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1777 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1778 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1779 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1780 Dynarr_add (dst, (c & 0x3f) | 0x80);
1784 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1786 if (BYTE_ASCII_P (c)) \
1787 Dynarr_add (dst, c); \
1788 else if (BYTE_C1_P (c)) \
1790 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1791 Dynarr_add (dst, c + 0x20); \
1795 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1796 Dynarr_add (dst, c); \
1801 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1805 DECODE_ADD_BINARY_CHAR (ch, dst); \
1810 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1812 if (flags & CODING_STATE_END) \
1814 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1815 if (flags & CODING_STATE_CR) \
1816 Dynarr_add (dst, '\r'); \
1820 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1822 struct decoding_stream
1824 /* Coding system that governs the conversion. */
1825 Lisp_Coding_System *codesys;
1827 /* Stream that we read the encoded data from or
1828 write the decoded data to. */
1831 /* If we are reading, then we can return only a fixed amount of
1832 data, so if the conversion resulted in too much data, we store it
1833 here for retrieval the next time around. */
1834 unsigned_char_dynarr *runoff;
1836 /* FLAGS holds flags indicating the current state of the decoding.
1837 Some of these flags are dependent on the coding system. */
1840 /* CH holds a partially built-up character. Since we only deal
1841 with one- and two-byte characters at the moment, we only use
1842 this to store the first byte of a two-byte character. */
1845 /* EOL_TYPE specifies the type of end-of-line conversion that
1846 currently applies. We need to keep this separate from the
1847 EOL type stored in CODESYS because the latter might indicate
1848 automatic EOL-type detection while the former will always
1849 indicate a particular EOL type. */
1850 enum eol_type eol_type;
1852 /* Additional ISO2022 information. We define the structure above
1853 because it's also needed by the detection routines. */
1854 struct iso2022_decoder iso2022;
1856 /* Additional information (the state of the running CCL program)
1857 used by the CCL decoder. */
1858 struct ccl_program ccl;
1860 struct detection_state decst;
1863 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1864 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1865 static int decoding_rewinder (Lstream *stream);
1866 static int decoding_seekable_p (Lstream *stream);
1867 static int decoding_flusher (Lstream *stream);
1868 static int decoding_closer (Lstream *stream);
1870 static Lisp_Object decoding_marker (Lisp_Object stream,
1871 void (*markobj) (Lisp_Object));
1873 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1874 sizeof (struct decoding_stream));
1877 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1879 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1880 Lisp_Object str_obj;
1882 /* We do not need to mark the coding systems or charsets stored
1883 within the stream because they are stored in a global list
1884 and automatically marked. */
1886 XSETLSTREAM (str_obj, str);
1888 if (str->imp->marker)
1889 return (str->imp->marker) (str_obj, markobj);
1894 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1895 so we read data from the other end, decode it, and store it into DATA. */
1898 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1900 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1901 unsigned char *orig_data = data;
1903 int error_occurred = 0;
1905 /* We need to interface to mule_decode(), which expects to take some
1906 amount of data and store the result into a Dynarr. We have
1907 mule_decode() store into str->runoff, and take data from there
1910 /* We loop until we have enough data, reading chunks from the other
1911 end and decoding it. */
1914 /* Take data from the runoff if we can. Make sure to take at
1915 most SIZE bytes, and delete the data from the runoff. */
1916 if (Dynarr_length (str->runoff) > 0)
1918 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1919 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1920 Dynarr_delete_many (str->runoff, 0, chunk);
1926 break; /* No more room for data */
1928 if (str->flags & CODING_STATE_END)
1929 /* This means that on the previous iteration, we hit the EOF on
1930 the other end. We loop once more so that mule_decode() can
1931 output any final stuff it may be holding, or any "go back
1932 to a sane state" escape sequences. (This latter makes sense
1933 during encoding.) */
1936 /* Exhausted the runoff, so get some more. DATA has at least
1937 SIZE bytes left of storage in it, so it's OK to read directly
1938 into it. (We'll be overwriting above, after we've decoded it
1939 into the runoff.) */
1940 read_size = Lstream_read (str->other_end, data, size);
1947 /* There might be some more end data produced in the translation.
1948 See the comment above. */
1949 str->flags |= CODING_STATE_END;
1950 mule_decode (stream, data, str->runoff, read_size);
1953 if (data - orig_data == 0)
1954 return error_occurred ? -1 : 0;
1956 return data - orig_data;
1960 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1962 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1965 /* Decode all our data into the runoff, and then attempt to write
1966 it all out to the other end. Remove whatever chunk we succeeded
1968 mule_decode (stream, data, str->runoff, size);
1969 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1970 Dynarr_length (str->runoff));
1972 Dynarr_delete_many (str->runoff, 0, retval);
1973 /* Do NOT return retval. The return value indicates how much
1974 of the incoming data was written, not how many bytes were
1980 reset_decoding_stream (struct decoding_stream *str)
1983 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1985 Lisp_Object coding_system;
1986 XSETCODING_SYSTEM (coding_system, str->codesys);
1987 reset_iso2022 (coding_system, &str->iso2022);
1989 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1991 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1994 str->flags = str->ch = 0;
1998 decoding_rewinder (Lstream *stream)
2000 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2001 reset_decoding_stream (str);
2002 Dynarr_reset (str->runoff);
2003 return Lstream_rewind (str->other_end);
2007 decoding_seekable_p (Lstream *stream)
2009 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2010 return Lstream_seekable_p (str->other_end);
2014 decoding_flusher (Lstream *stream)
2016 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2017 return Lstream_flush (str->other_end);
2021 decoding_closer (Lstream *stream)
2023 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2024 if (stream->flags & LSTREAM_FL_WRITE)
2026 str->flags |= CODING_STATE_END;
2027 decoding_writer (stream, 0, 0);
2029 Dynarr_free (str->runoff);
2031 #ifdef ENABLE_COMPOSITE_CHARS
2032 if (str->iso2022.composite_chars)
2033 Dynarr_free (str->iso2022.composite_chars);
2036 return Lstream_close (str->other_end);
2040 decoding_stream_coding_system (Lstream *stream)
2042 Lisp_Object coding_system;
2043 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2045 XSETCODING_SYSTEM (coding_system, str->codesys);
2046 return subsidiary_coding_system (coding_system, str->eol_type);
2050 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2052 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2053 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2055 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2056 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2057 reset_decoding_stream (str);
2060 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2061 stream for writing, no automatic code detection will be performed.
2062 The reason for this is that automatic code detection requires a
2063 seekable input. Things will also fail if you open a decoding
2064 stream for reading using a non-fully-specified coding system and
2065 a non-seekable input stream. */
2068 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2071 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2072 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2076 str->other_end = stream;
2077 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2078 str->eol_type = EOL_AUTODETECT;
2079 if (!strcmp (mode, "r")
2080 && Lstream_seekable_p (stream))
2081 /* We can determine the coding system now. */
2082 determine_real_coding_system (stream, &codesys, &str->eol_type);
2083 set_decoding_stream_coding_system (lstr, codesys);
2084 str->decst.eol_type = str->eol_type;
2085 str->decst.mask = ~0;
2086 XSETLSTREAM (obj, lstr);
2091 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2093 return make_decoding_stream_1 (stream, codesys, "r");
2097 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2099 return make_decoding_stream_1 (stream, codesys, "w");
2102 /* Note: the decode_coding_* functions all take the same
2103 arguments as mule_decode(), which is to say some SRC data of
2104 size N, which is to be stored into dynamic array DST.
2105 DECODING is the stream within which the decoding is
2106 taking place, but no data is actually read from or
2107 written to that stream; that is handled in decoding_reader()
2108 or decoding_writer(). This allows the same functions to
2109 be used for both reading and writing. */
2112 mule_decode (Lstream *decoding, CONST unsigned char *src,
2113 unsigned_char_dynarr *dst, unsigned int n)
2115 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2117 /* If necessary, do encoding-detection now. We do this when
2118 we're a writing stream or a non-seekable reading stream,
2119 meaning that we can't just process the whole input,
2120 rewind, and start over. */
2122 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2123 str->eol_type == EOL_AUTODETECT)
2125 Lisp_Object codesys;
2127 XSETCODING_SYSTEM (codesys, str->codesys);
2128 detect_coding_type (&str->decst, src, n,
2129 CODING_SYSTEM_TYPE (str->codesys) !=
2130 CODESYS_AUTODETECT);
2131 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2132 str->decst.mask != ~0)
2133 /* #### This is cheesy. What we really ought to do is
2134 buffer up a certain amount of data so as to get a
2135 less random result. */
2136 codesys = coding_system_from_mask (str->decst.mask);
2137 str->eol_type = str->decst.eol_type;
2138 if (XCODING_SYSTEM (codesys) != str->codesys)
2140 /* Preserve the CODING_STATE_END flag in case it was set.
2141 If we erase it, bad things might happen. */
2142 int was_end = str->flags & CODING_STATE_END;
2143 set_decoding_stream_coding_system (decoding, codesys);
2145 str->flags |= CODING_STATE_END;
2149 switch (CODING_SYSTEM_TYPE (str->codesys))
2152 case CODESYS_INTERNAL:
2153 Dynarr_add_many (dst, src, n);
2156 case CODESYS_AUTODETECT:
2157 /* If we got this far and still haven't decided on the coding
2158 system, then do no conversion. */
2159 case CODESYS_NO_CONVERSION:
2160 decode_coding_no_conversion (decoding, src, dst, n);
2163 case CODESYS_SHIFT_JIS:
2164 decode_coding_sjis (decoding, src, dst, n);
2167 decode_coding_big5 (decoding, src, dst, n);
2170 decode_coding_ucs4 (decoding, src, dst, n);
2173 decode_coding_utf8 (decoding, src, dst, n);
2176 ccl_driver (&str->ccl, src, dst, n, 0);
2178 case CODESYS_ISO2022:
2179 decode_coding_iso2022 (decoding, src, dst, n);
2187 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2188 Decode the text between START and END which is encoded in CODING-SYSTEM.
2189 This is useful if you've read in encoded text from a file without decoding
2190 it (e.g. you read in a JIS-formatted file but used the `binary' or
2191 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2192 Return length of decoded text.
2193 BUFFER defaults to the current buffer if unspecified.
2195 (start, end, coding_system, buffer))
2198 struct buffer *buf = decode_buffer (buffer, 0);
2199 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2200 Lstream *istr, *ostr;
2201 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2203 get_buffer_range_char (buf, start, end, &b, &e, 0);
2205 barf_if_buffer_read_only (buf, b, e);
2207 coding_system = Fget_coding_system (coding_system);
2208 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2209 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2210 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2212 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2213 Fget_coding_system (Qbinary));
2214 istr = XLSTREAM (instream);
2215 ostr = XLSTREAM (outstream);
2216 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2218 /* The chain of streams looks like this:
2220 [BUFFER] <----- send through
2221 ------> [ENCODE AS BINARY]
2222 ------> [DECODE AS SPECIFIED]
2228 char tempbuf[1024]; /* some random amount */
2229 Bufpos newpos, even_newer_pos;
2230 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2231 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2235 newpos = lisp_buffer_stream_startpos (istr);
2236 Lstream_write (ostr, tempbuf, size_in_bytes);
2237 even_newer_pos = lisp_buffer_stream_startpos (istr);
2238 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2241 Lstream_close (istr);
2242 Lstream_close (ostr);
2244 Lstream_delete (istr);
2245 Lstream_delete (ostr);
2246 Lstream_delete (XLSTREAM (de_outstream));
2247 Lstream_delete (XLSTREAM (lb_outstream));
2252 /************************************************************************/
2253 /* Converting to an external encoding ("encoding") */
2254 /************************************************************************/
2256 /* An encoding stream is an output stream. When you create the
2257 stream, you specify the coding system that governs the encoding
2258 and another stream that the resulting encoded data is to be
2259 sent to, and then start sending data to it. */
2261 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2263 struct encoding_stream
2265 /* Coding system that governs the conversion. */
2266 Lisp_Coding_System *codesys;
2268 /* Stream that we read the encoded data from or
2269 write the decoded data to. */
2272 /* If we are reading, then we can return only a fixed amount of
2273 data, so if the conversion resulted in too much data, we store it
2274 here for retrieval the next time around. */
2275 unsigned_char_dynarr *runoff;
2277 /* FLAGS holds flags indicating the current state of the encoding.
2278 Some of these flags are dependent on the coding system. */
2281 /* CH holds a partially built-up character. Since we only deal
2282 with one- and two-byte characters at the moment, we only use
2283 this to store the first byte of a two-byte character. */
2286 /* Additional information used by the ISO2022 encoder. */
2289 /* CHARSET holds the character sets currently assigned to the G0
2290 through G3 registers. It is initialized from the array
2291 INITIAL_CHARSET in CODESYS. */
2292 Lisp_Object charset[4];
2294 /* Which registers are currently invoked into the left (GL) and
2295 right (GR) halves of the 8-bit encoding space? */
2296 int register_left, register_right;
2298 /* Whether we need to explicitly designate the charset in the
2299 G? register before using it. It is initialized from the
2300 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2301 unsigned char force_charset_on_output[4];
2303 /* Other state variables that need to be preserved across
2305 Lisp_Object current_charset;
2307 int current_char_boundary;
2310 /* Additional information (the state of the running CCL program)
2311 used by the CCL encoder. */
2312 struct ccl_program ccl;
2316 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2317 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2319 static int encoding_rewinder (Lstream *stream);
2320 static int encoding_seekable_p (Lstream *stream);
2321 static int encoding_flusher (Lstream *stream);
2322 static int encoding_closer (Lstream *stream);
2324 static Lisp_Object encoding_marker (Lisp_Object stream,
2325 void (*markobj) (Lisp_Object));
2327 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2328 sizeof (struct encoding_stream));
2331 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2333 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2334 Lisp_Object str_obj;
2336 /* We do not need to mark the coding systems or charsets stored
2337 within the stream because they are stored in a global list
2338 and automatically marked. */
2340 XSETLSTREAM (str_obj, str);
2342 if (str->imp->marker)
2343 return (str->imp->marker) (str_obj, markobj);
2348 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2349 so we read data from the other end, encode it, and store it into DATA. */
2352 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2354 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2355 unsigned char *orig_data = data;
2357 int error_occurred = 0;
2359 /* We need to interface to mule_encode(), which expects to take some
2360 amount of data and store the result into a Dynarr. We have
2361 mule_encode() store into str->runoff, and take data from there
2364 /* We loop until we have enough data, reading chunks from the other
2365 end and encoding it. */
2368 /* Take data from the runoff if we can. Make sure to take at
2369 most SIZE bytes, and delete the data from the runoff. */
2370 if (Dynarr_length (str->runoff) > 0)
2372 int chunk = min ((int) size, Dynarr_length (str->runoff));
2373 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2374 Dynarr_delete_many (str->runoff, 0, chunk);
2380 break; /* No more room for data */
2382 if (str->flags & CODING_STATE_END)
2383 /* This means that on the previous iteration, we hit the EOF on
2384 the other end. We loop once more so that mule_encode() can
2385 output any final stuff it may be holding, or any "go back
2386 to a sane state" escape sequences. (This latter makes sense
2387 during encoding.) */
2390 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2391 left of storage in it, so it's OK to read directly into it.
2392 (We'll be overwriting above, after we've encoded it into the
2394 read_size = Lstream_read (str->other_end, data, size);
2401 /* There might be some more end data produced in the translation.
2402 See the comment above. */
2403 str->flags |= CODING_STATE_END;
2404 mule_encode (stream, data, str->runoff, read_size);
2407 if (data == orig_data)
2408 return error_occurred ? -1 : 0;
2410 return data - orig_data;
2414 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2416 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2419 /* Encode all our data into the runoff, and then attempt to write
2420 it all out to the other end. Remove whatever chunk we succeeded
2422 mule_encode (stream, data, str->runoff, size);
2423 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2424 Dynarr_length (str->runoff));
2426 Dynarr_delete_many (str->runoff, 0, retval);
2427 /* Do NOT return retval. The return value indicates how much
2428 of the incoming data was written, not how many bytes were
2434 reset_encoding_stream (struct encoding_stream *str)
2437 switch (CODING_SYSTEM_TYPE (str->codesys))
2439 case CODESYS_ISO2022:
2443 for (i = 0; i < 4; i++)
2445 str->iso2022.charset[i] =
2446 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2447 str->iso2022.force_charset_on_output[i] =
2448 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2450 str->iso2022.register_left = 0;
2451 str->iso2022.register_right = 1;
2452 str->iso2022.current_charset = Qnil;
2453 str->iso2022.current_half = 0;
2455 str->iso2022.current_char_boundary = 0;
2457 str->iso2022.current_char_boundary = 1;
2462 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2469 str->flags = str->ch = 0;
2473 encoding_rewinder (Lstream *stream)
2475 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2476 reset_encoding_stream (str);
2477 Dynarr_reset (str->runoff);
2478 return Lstream_rewind (str->other_end);
2482 encoding_seekable_p (Lstream *stream)
2484 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2485 return Lstream_seekable_p (str->other_end);
2489 encoding_flusher (Lstream *stream)
2491 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2492 return Lstream_flush (str->other_end);
2496 encoding_closer (Lstream *stream)
2498 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2499 if (stream->flags & LSTREAM_FL_WRITE)
2501 str->flags |= CODING_STATE_END;
2502 encoding_writer (stream, 0, 0);
2504 Dynarr_free (str->runoff);
2505 return Lstream_close (str->other_end);
2509 encoding_stream_coding_system (Lstream *stream)
2511 Lisp_Object coding_system;
2512 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2514 XSETCODING_SYSTEM (coding_system, str->codesys);
2515 return coding_system;
2519 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2521 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2522 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2524 reset_encoding_stream (str);
2528 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2531 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2532 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2536 str->runoff = Dynarr_new (unsigned_char);
2537 str->other_end = stream;
2538 set_encoding_stream_coding_system (lstr, codesys);
2539 XSETLSTREAM (obj, lstr);
2544 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2546 return make_encoding_stream_1 (stream, codesys, "r");
2550 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2552 return make_encoding_stream_1 (stream, codesys, "w");
2555 /* Convert N bytes of internally-formatted data stored in SRC to an
2556 external format, according to the encoding stream ENCODING.
2557 Store the encoded data into DST. */
2560 mule_encode (Lstream *encoding, CONST unsigned char *src,
2561 unsigned_char_dynarr *dst, unsigned int n)
2563 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2565 switch (CODING_SYSTEM_TYPE (str->codesys))
2568 case CODESYS_INTERNAL:
2569 Dynarr_add_many (dst, src, n);
2572 case CODESYS_AUTODETECT:
2573 /* If we got this far and still haven't decided on the coding
2574 system, then do no conversion. */
2575 case CODESYS_NO_CONVERSION:
2576 encode_coding_no_conversion (encoding, src, dst, n);
2579 case CODESYS_SHIFT_JIS:
2580 encode_coding_sjis (encoding, src, dst, n);
2583 encode_coding_big5 (encoding, src, dst, n);
2586 encode_coding_ucs4 (encoding, src, dst, n);
2589 encode_coding_utf8 (encoding, src, dst, n);
2592 ccl_driver (&str->ccl, src, dst, n, 0);
2594 case CODESYS_ISO2022:
2595 encode_coding_iso2022 (encoding, src, dst, n);
2603 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2604 Encode the text between START and END using CODING-SYSTEM.
2605 This will, for example, convert Japanese characters into stuff such as
2606 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2607 text. BUFFER defaults to the current buffer if unspecified.
2609 (start, end, coding_system, buffer))
2612 struct buffer *buf = decode_buffer (buffer, 0);
2613 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2614 Lstream *istr, *ostr;
2615 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2617 get_buffer_range_char (buf, start, end, &b, &e, 0);
2619 barf_if_buffer_read_only (buf, b, e);
2621 coding_system = Fget_coding_system (coding_system);
2622 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2623 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2624 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2625 Fget_coding_system (Qbinary));
2626 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2628 istr = XLSTREAM (instream);
2629 ostr = XLSTREAM (outstream);
2630 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2631 /* The chain of streams looks like this:
2633 [BUFFER] <----- send through
2634 ------> [ENCODE AS SPECIFIED]
2635 ------> [DECODE AS BINARY]
2640 char tempbuf[1024]; /* some random amount */
2641 Bufpos newpos, even_newer_pos;
2642 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2643 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2647 newpos = lisp_buffer_stream_startpos (istr);
2648 Lstream_write (ostr, tempbuf, size_in_bytes);
2649 even_newer_pos = lisp_buffer_stream_startpos (istr);
2650 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2656 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2657 Lstream_close (istr);
2658 Lstream_close (ostr);
2660 Lstream_delete (istr);
2661 Lstream_delete (ostr);
2662 Lstream_delete (XLSTREAM (de_outstream));
2663 Lstream_delete (XLSTREAM (lb_outstream));
2664 return make_int (retlen);
2670 /************************************************************************/
2671 /* Shift-JIS methods */
2672 /************************************************************************/
2674 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2675 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2676 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2677 encoded by "position-code + 0x80". A character of JISX0208
2678 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2679 position-codes are divided and shifted so that it fit in the range
2682 --- CODE RANGE of Shift-JIS ---
2683 (character set) (range)
2685 JISX0201-Kana 0xA0 .. 0xDF
2686 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2687 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2688 -------------------------------
2692 /* Is this the first byte of a Shift-JIS two-byte char? */
2694 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2695 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2697 /* Is this the second byte of a Shift-JIS two-byte char? */
2699 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2700 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2702 #define BYTE_SJIS_KATAKANA_P(c) \
2703 ((c) >= 0xA1 && (c) <= 0xDF)
2706 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2714 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2716 if (st->shift_jis.in_second_byte)
2718 st->shift_jis.in_second_byte = 0;
2722 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2723 st->shift_jis.in_second_byte = 1;
2725 return CODING_CATEGORY_SHIFT_JIS_MASK;
2728 /* Convert Shift-JIS data to internal format. */
2731 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2732 unsigned_char_dynarr *dst, unsigned int n)
2735 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2736 unsigned int flags = str->flags;
2737 unsigned int ch = str->ch;
2738 eol_type_t eol_type = str->eol_type;
2746 /* Previous character was first byte of Shift-JIS Kanji char. */
2747 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2749 unsigned char e1, e2;
2751 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2752 DECODE_SJIS (ch, c, e1, e2);
2753 Dynarr_add (dst, e1);
2754 Dynarr_add (dst, e2);
2758 DECODE_ADD_BINARY_CHAR (ch, dst);
2759 DECODE_ADD_BINARY_CHAR (c, dst);
2765 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2766 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2768 else if (BYTE_SJIS_KATAKANA_P (c))
2770 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2771 Dynarr_add (dst, c);
2774 DECODE_ADD_BINARY_CHAR (c, dst);
2776 label_continue_loop:;
2779 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2785 /* Convert internally-formatted data to Shift-JIS. */
2788 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2789 unsigned_char_dynarr *dst, unsigned int n)
2792 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2793 unsigned int flags = str->flags;
2794 unsigned int ch = str->ch;
2795 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2802 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2803 Dynarr_add (dst, '\r');
2804 if (eol_type != EOL_CR)
2805 Dynarr_add (dst, '\n');
2808 else if (BYTE_ASCII_P (c))
2810 Dynarr_add (dst, c);
2813 else if (BUFBYTE_LEADING_BYTE_P (c))
2814 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2815 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2816 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2819 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2821 Dynarr_add (dst, c);
2824 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2825 ch == LEADING_BYTE_JAPANESE_JISX0208)
2829 unsigned char j1, j2;
2830 ENCODE_SJIS (ch, c, j1, j2);
2831 Dynarr_add (dst, j1);
2832 Dynarr_add (dst, j2);
2842 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2843 Decode a JISX0208 character of Shift-JIS coding-system.
2844 CODE is the character code in Shift-JIS as a cons of type bytes.
2845 Return the corresponding character.
2849 unsigned char c1, c2, s1, s2;
2852 CHECK_INT (XCAR (code));
2853 CHECK_INT (XCDR (code));
2854 s1 = XINT (XCAR (code));
2855 s2 = XINT (XCDR (code));
2856 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2857 BYTE_SJIS_TWO_BYTE_2_P (s2))
2859 DECODE_SJIS (s1, s2, c1, c2);
2860 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2861 c1 & 0x7F, c2 & 0x7F));
2867 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2868 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2869 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2873 Lisp_Object charset;
2876 CHECK_CHAR_COERCE_INT (ch);
2877 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2878 if (EQ (charset, Vcharset_japanese_jisx0208))
2880 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2881 return Fcons (make_int (s1), make_int (s2));
2888 /************************************************************************/
2890 /************************************************************************/
2892 /* BIG5 is a coding system encoding two character sets: ASCII and
2893 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2894 character set and is encoded in two-byte.
2896 --- CODE RANGE of BIG5 ---
2897 (character set) (range)
2899 Big5 (1st byte) 0xA1 .. 0xFE
2900 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2901 --------------------------
2903 Since the number of characters in Big5 is larger than maximum
2904 characters in Emacs' charset (96x96), it can't be handled as one
2905 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2906 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2907 contains frequently used characters and the latter contains less
2908 frequently used characters. */
2910 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2911 ((c) >= 0xA1 && (c) <= 0xFE)
2913 /* Is this the second byte of a Shift-JIS two-byte char? */
2915 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2916 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2918 /* Number of Big5 characters which have the same code in 1st byte. */
2920 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2922 /* Code conversion macros. These are macros because they are used in
2923 inner loops during code conversion.
2925 Note that temporary variables in macros introduce the classic
2926 dynamic-scoping problems with variable names. We use capital-
2927 lettered variables in the assumption that XEmacs does not use
2928 capital letters in variables except in a very formalized way
2931 /* Convert Big5 code (b1, b2) into its internal string representation
2934 /* There is a much simpler way to split the Big5 charset into two.
2935 For the moment I'm going to leave the algorithm as-is because it
2936 claims to separate out the most-used characters into a single
2937 charset, which perhaps will lead to optimizations in various
2940 The way the algorithm works is something like this:
2942 Big5 can be viewed as a 94x157 charset, where the row is
2943 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2944 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2945 the split between low and high column numbers is apparently
2946 meaningless; ascending rows produce less and less frequent chars.
2947 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2948 the first charset, and the upper half (0xC9 .. 0xFE) to the
2949 second. To do the conversion, we convert the character into
2950 a single number where 0 .. 156 is the first row, 157 .. 313
2951 is the second, etc. That way, the characters are ordered by
2952 decreasing frequency. Then we just chop the space in two
2953 and coerce the result into a 94x94 space.
2956 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2958 int B1 = b1, B2 = b2; \
2960 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2964 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2968 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2969 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2971 c1 = I / (0xFF - 0xA1) + 0xA1; \
2972 c2 = I % (0xFF - 0xA1) + 0xA1; \
2975 /* Convert the internal string representation of a Big5 character
2976 (lb, c1, c2) into Big5 code (b1, b2). */
2978 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2980 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2982 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2984 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2986 b1 = I / BIG5_SAME_ROW + 0xA1; \
2987 b2 = I % BIG5_SAME_ROW; \
2988 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2992 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3000 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3001 (c >= 0x80 && c <= 0xA0))
3003 if (st->big5.in_second_byte)
3005 st->big5.in_second_byte = 0;
3006 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3010 st->big5.in_second_byte = 1;
3012 return CODING_CATEGORY_BIG5_MASK;
3015 /* Convert Big5 data to internal format. */
3018 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3019 unsigned_char_dynarr *dst, unsigned int n)
3022 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3023 unsigned int flags = str->flags;
3024 unsigned int ch = str->ch;
3025 eol_type_t eol_type = str->eol_type;
3032 /* Previous character was first byte of Big5 char. */
3033 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3035 unsigned char b1, b2, b3;
3036 DECODE_BIG5 (ch, c, b1, b2, b3);
3037 Dynarr_add (dst, b1);
3038 Dynarr_add (dst, b2);
3039 Dynarr_add (dst, b3);
3043 DECODE_ADD_BINARY_CHAR (ch, dst);
3044 DECODE_ADD_BINARY_CHAR (c, dst);
3050 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3051 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3054 DECODE_ADD_BINARY_CHAR (c, dst);
3056 label_continue_loop:;
3059 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3065 /* Convert internally-formatted data to Big5. */
3068 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3069 unsigned_char_dynarr *dst, unsigned int n)
3072 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3073 unsigned int flags = str->flags;
3074 unsigned int ch = str->ch;
3075 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3082 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3083 Dynarr_add (dst, '\r');
3084 if (eol_type != EOL_CR)
3085 Dynarr_add (dst, '\n');
3087 else if (BYTE_ASCII_P (c))
3090 Dynarr_add (dst, c);
3092 else if (BUFBYTE_LEADING_BYTE_P (c))
3094 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3095 c == LEADING_BYTE_CHINESE_BIG5_2)
3097 /* A recognized leading byte. */
3099 continue; /* not done with this character. */
3101 /* otherwise just ignore this character. */
3103 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3104 ch == LEADING_BYTE_CHINESE_BIG5_2)
3106 /* Previous char was a recognized leading byte. */
3108 continue; /* not done with this character. */
3112 /* Encountering second byte of a Big5 character. */
3113 unsigned char b1, b2;
3115 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3116 Dynarr_add (dst, b1);
3117 Dynarr_add (dst, b2);
3128 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3129 Decode a Big5 character CODE of BIG5 coding-system.
3130 CODE is the character code in BIG5, a cons of two integers.
3131 Return the corresponding character.
3135 unsigned char c1, c2, b1, b2;
3138 CHECK_INT (XCAR (code));
3139 CHECK_INT (XCDR (code));
3140 b1 = XINT (XCAR (code));
3141 b2 = XINT (XCDR (code));
3142 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3143 BYTE_BIG5_TWO_BYTE_2_P (b2))
3146 Lisp_Object charset;
3147 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3148 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3149 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3155 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3156 Encode the Big5 character CH to BIG5 coding-system.
3157 Return the corresponding character code in Big5.
3161 Lisp_Object charset;
3164 CHECK_CHAR_COERCE_INT (ch);
3165 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3166 if (EQ (charset, Vcharset_chinese_big5_1) ||
3167 EQ (charset, Vcharset_chinese_big5_2))
3169 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3171 return Fcons (make_int (b1), make_int (b2));
3178 /************************************************************************/
3181 /* UCS-4 character codes are implemented as nonnegative integers. */
3183 /************************************************************************/
3185 Lisp_Object ucs_to_mule_table[65536];
3186 Lisp_Object mule_to_ucs_table;
3188 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3189 Map UCS-4 code CODE to Mule character CHARACTER.
3191 Return T on success, NIL on failure.
3197 CHECK_CHAR (character);
3201 if (c < sizeof (ucs_to_mule_table))
3203 ucs_to_mule_table[c] = character;
3211 ucs_to_char (unsigned long code)
3213 if (code < sizeof (ucs_to_mule_table))
3215 return ucs_to_mule_table[code];
3217 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3222 c = code % (94 * 94);
3224 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3225 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3226 CHARSET_LEFT_TO_RIGHT),
3227 c / 94 + 33, c % 94 + 33));
3233 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3234 Return Mule character corresponding to UCS code CODE (a positive integer).
3238 CHECK_NATNUM (code);
3239 return ucs_to_char (XINT (code));
3242 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3243 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3247 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3248 Fset_char_ucs is more restrictive on index arg, but should
3249 check code arg in a char_table method. */
3250 CHECK_CHAR (character);
3251 CHECK_NATNUM (code);
3252 return Fput_char_table (character, code, mule_to_ucs_table);
3255 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3256 Return the UCS code (a positive integer) corresponding to CHARACTER.
3260 return Fget_char_table (character, mule_to_ucs_table);
3264 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3266 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3267 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3268 is not found, instead.
3269 #### do something more appropriate (use blob?)
3270 Danger, Will Robinson! Data loss. Should we signal user? */
3272 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3274 Lisp_Object chr = ucs_to_char (ch);
3278 Bufbyte work[MAX_EMCHAR_LEN];
3283 simple_set_charptr_emchar (work, ch) :
3284 non_ascii_set_charptr_emchar (work, ch);
3285 Dynarr_add_many (dst, work, len);
3289 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3290 Dynarr_add (dst, 34 + 128);
3291 Dynarr_add (dst, 46 + 128);
3296 static unsigned long
3297 mule_char_to_ucs4 (Lisp_Object charset,
3298 unsigned char h, unsigned char l)
3301 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3308 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3309 (XCHARSET_CHARS (charset) == 94) )
3311 unsigned char final = XCHARSET_FINAL (charset);
3313 if ( ('@' <= final) && (final < 0x7f) )
3315 return 0xe00000 + (final - '@') * 94 * 94
3316 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3330 encode_ucs4 (Lisp_Object charset,
3331 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3333 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3334 Dynarr_add (dst, code >> 24);
3335 Dynarr_add (dst, (code >> 16) & 255);
3336 Dynarr_add (dst, (code >> 8) & 255);
3337 Dynarr_add (dst, code & 255);
3341 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3347 switch (st->ucs4.in_byte)
3356 st->ucs4.in_byte = 0;
3362 return CODING_CATEGORY_UCS4_MASK;
3366 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3367 unsigned_char_dynarr *dst, unsigned int n)
3369 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3370 unsigned int flags = str->flags;
3371 unsigned int ch = str->ch;
3375 unsigned char c = *src++;
3383 decode_ucs4 ( ( ch << 8 ) | c, dst);
3388 ch = ( ch << 8 ) | c;
3392 if (flags & CODING_STATE_END)
3393 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3400 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3401 unsigned_char_dynarr *dst, unsigned int n)
3403 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3404 unsigned int flags = str->flags;
3405 unsigned int ch = str->ch;
3406 unsigned char char_boundary = str->iso2022.current_char_boundary;
3407 Lisp_Object charset = str->iso2022.current_charset;
3409 #ifdef ENABLE_COMPOSITE_CHARS
3410 /* flags for handling composite chars. We do a little switcharoo
3411 on the source while we're outputting the composite char. */
3412 unsigned int saved_n = 0;
3413 CONST unsigned char *saved_src = NULL;
3414 int in_composite = 0;
3421 unsigned char c = *src++;
3423 if (BYTE_ASCII_P (c))
3424 { /* Processing ASCII character */
3426 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3429 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3430 { /* Processing Leading Byte */
3432 charset = CHARSET_BY_LEADING_BYTE (c);
3433 if (LEADING_BYTE_PREFIX_P(c))
3438 { /* Processing Non-ASCII character */
3440 if (EQ (charset, Vcharset_control_1))
3442 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3446 switch (XCHARSET_REP_BYTES (charset))
3449 encode_ucs4 (charset, c, 0, dst);
3452 if (XCHARSET_PRIVATE_P (charset))
3454 encode_ucs4 (charset, c, 0, dst);
3459 #ifdef ENABLE_COMPOSITE_CHARS
3460 if (EQ (charset, Vcharset_composite))
3464 /* #### Bother! We don't know how to
3466 Dynarr_add (dst, 0);
3467 Dynarr_add (dst, 0);
3468 Dynarr_add (dst, 0);
3469 Dynarr_add (dst, '~');
3473 Emchar emch = MAKE_CHAR (Vcharset_composite,
3474 ch & 0x7F, c & 0x7F);
3475 Lisp_Object lstr = composite_char_string (emch);
3479 src = XSTRING_DATA (lstr);
3480 n = XSTRING_LENGTH (lstr);
3484 #endif /* ENABLE_COMPOSITE_CHARS */
3486 encode_ucs4(charset, ch, c, dst);
3499 encode_ucs4 (charset, ch, c, dst);
3515 #ifdef ENABLE_COMPOSITE_CHARS
3521 goto back_to_square_n; /* Wheeeeeeeee ..... */
3523 #endif /* ENABLE_COMPOSITE_CHARS */
3527 str->iso2022.current_char_boundary = char_boundary;
3528 str->iso2022.current_charset = charset;
3530 /* Verbum caro factum est! */
3534 /************************************************************************/
3536 /************************************************************************/
3539 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3544 unsigned char c = *src++;
3545 switch (st->utf8.in_byte)
3548 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3551 st->utf8.in_byte = 5;
3553 st->utf8.in_byte = 4;
3555 st->utf8.in_byte = 3;
3557 st->utf8.in_byte = 2;
3559 st->utf8.in_byte = 1;
3564 if ((c & 0xc0) != 0x80)
3570 return CODING_CATEGORY_UTF8_MASK;
3574 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3575 unsigned_char_dynarr *dst, unsigned int n)
3577 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3578 unsigned int flags = str->flags;
3579 unsigned int ch = str->ch;
3580 eol_type_t eol_type = str->eol_type;
3584 unsigned char c = *src++;
3593 else if ( c >= 0xf8 )
3598 else if ( c >= 0xf0 )
3603 else if ( c >= 0xe0 )
3608 else if ( c >= 0xc0 )
3615 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3616 decode_ucs4 (c, dst);
3620 ch = ( ch << 6 ) | ( c & 0x3f );
3621 decode_ucs4 (ch, dst);
3626 ch = ( ch << 6 ) | ( c & 0x3f );
3629 label_continue_loop:;
3632 if (flags & CODING_STATE_END)
3633 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3641 encode_utf8 (Lisp_Object charset,
3642 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3644 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3647 Dynarr_add (dst, code);
3649 else if ( code <= 0x7ff )
3651 Dynarr_add (dst, (code >> 6) | 0xc0);
3652 Dynarr_add (dst, (code & 0x3f) | 0x80);
3654 else if ( code <= 0xffff )
3656 Dynarr_add (dst, (code >> 12) | 0xe0);
3657 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3658 Dynarr_add (dst, (code & 0x3f) | 0x80);
3660 else if ( code <= 0x1fffff )
3662 Dynarr_add (dst, (code >> 18) | 0xf0);
3663 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3664 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3665 Dynarr_add (dst, (code & 0x3f) | 0x80);
3667 else if ( code <= 0x3ffffff )
3669 Dynarr_add (dst, (code >> 24) | 0xf8);
3670 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3671 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3672 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3673 Dynarr_add (dst, (code & 0x3f) | 0x80);
3677 Dynarr_add (dst, (code >> 30) | 0xfc);
3678 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3679 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3680 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3681 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3682 Dynarr_add (dst, (code & 0x3f) | 0x80);
3688 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3689 unsigned_char_dynarr *dst, unsigned int n)
3691 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3692 unsigned int flags = str->flags;
3693 unsigned int ch = str->ch;
3694 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3695 unsigned char char_boundary = str->iso2022.current_char_boundary;
3700 unsigned char c = *src++;
3701 switch (char_boundary)
3706 Dynarr_add (dst, c);
3709 else if ( c >= 0xf8 )
3711 Dynarr_add (dst, c);
3714 else if ( c >= 0xf0 )
3716 Dynarr_add (dst, c);
3719 else if ( c >= 0xe0 )
3721 Dynarr_add (dst, c);
3724 else if ( c >= 0xc0 )
3726 Dynarr_add (dst, c);
3733 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3734 Dynarr_add (dst, '\r');
3735 if (eol_type != EOL_CR)
3736 Dynarr_add (dst, c);
3739 Dynarr_add (dst, c);
3744 Dynarr_add (dst, c);
3748 Dynarr_add (dst, c);
3752 #else /* not UTF2000 */
3753 Lisp_Object charset = str->iso2022.current_charset;
3755 #ifdef ENABLE_COMPOSITE_CHARS
3756 /* flags for handling composite chars. We do a little switcharoo
3757 on the source while we're outputting the composite char. */
3758 unsigned int saved_n = 0;
3759 CONST unsigned char *saved_src = NULL;
3760 int in_composite = 0;
3763 #endif /* ENABLE_COMPOSITE_CHARS */
3767 unsigned char c = *src++;
3769 if (BYTE_ASCII_P (c))
3770 { /* Processing ASCII character */
3774 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3775 Dynarr_add (dst, '\r');
3776 if (eol_type != EOL_CR)
3777 Dynarr_add (dst, c);
3780 encode_utf8 (Vcharset_ascii, c, 0, dst);
3783 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3784 { /* Processing Leading Byte */
3786 charset = CHARSET_BY_LEADING_BYTE (c);
3787 if (LEADING_BYTE_PREFIX_P(c))
3792 { /* Processing Non-ASCII character */
3794 if (EQ (charset, Vcharset_control_1))
3796 encode_utf8 (Vcharset_control_1, c, 0, dst);
3800 switch (XCHARSET_REP_BYTES (charset))
3803 encode_utf8 (charset, c, 0, dst);
3806 if (XCHARSET_PRIVATE_P (charset))
3808 encode_utf8 (charset, c, 0, dst);
3813 #ifdef ENABLE_COMPOSITE_CHARS
3814 if (EQ (charset, Vcharset_composite))
3818 /* #### Bother! We don't know how to
3820 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3824 Emchar emch = MAKE_CHAR (Vcharset_composite,
3825 ch & 0x7F, c & 0x7F);
3826 Lisp_Object lstr = composite_char_string (emch);
3830 src = XSTRING_DATA (lstr);
3831 n = XSTRING_LENGTH (lstr);
3835 #endif /* ENABLE_COMPOSITE_CHARS */
3837 encode_utf8 (charset, ch, c, dst);
3850 encode_utf8 (charset, ch, c, dst);
3866 #ifdef ENABLE_COMPOSITE_CHARS
3872 goto back_to_square_n; /* Wheeeeeeeee ..... */
3876 #endif /* not UTF2000 */
3879 str->iso2022.current_char_boundary = char_boundary;
3881 str->iso2022.current_charset = charset;
3884 /* Verbum caro factum est! */
3888 /************************************************************************/
3889 /* ISO2022 methods */
3890 /************************************************************************/
3892 /* The following note describes the coding system ISO2022 briefly.
3893 Since the intention of this note is to help understand the
3894 functions in this file, some parts are NOT ACCURATE or OVERLY
3895 SIMPLIFIED. For thorough understanding, please refer to the
3896 original document of ISO2022.
3898 ISO2022 provides many mechanisms to encode several character sets
3899 in 7-bit and 8-bit environments. For 7-bit environments, all text
3900 is encoded using bytes less than 128. This may make the encoded
3901 text a little bit longer, but the text passes more easily through
3902 several gateways, some of which strip off MSB (Most Signigant Bit).
3904 There are two kinds of character sets: control character set and
3905 graphic character set. The former contains control characters such
3906 as `newline' and `escape' to provide control functions (control
3907 functions are also provided by escape sequences). The latter
3908 contains graphic characters such as 'A' and '-'. Emacs recognizes
3909 two control character sets and many graphic character sets.
3911 Graphic character sets are classified into one of the following
3912 four classes, according to the number of bytes (DIMENSION) and
3913 number of characters in one dimension (CHARS) of the set:
3914 - DIMENSION1_CHARS94
3915 - DIMENSION1_CHARS96
3916 - DIMENSION2_CHARS94
3917 - DIMENSION2_CHARS96
3919 In addition, each character set is assigned an identification tag,
3920 unique for each set, called "final character" (denoted as <F>
3921 hereafter). The <F> of each character set is decided by ECMA(*)
3922 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3923 (0x30..0x3F are for private use only).
3925 Note (*): ECMA = European Computer Manufacturers Association
3927 Here are examples of graphic character set [NAME(<F>)]:
3928 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3929 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3930 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3931 o DIMENSION2_CHARS96 -- none for the moment
3933 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3934 C0 [0x00..0x1F] -- control character plane 0
3935 GL [0x20..0x7F] -- graphic character plane 0
3936 C1 [0x80..0x9F] -- control character plane 1
3937 GR [0xA0..0xFF] -- graphic character plane 1
3939 A control character set is directly designated and invoked to C0 or
3940 C1 by an escape sequence. The most common case is that:
3941 - ISO646's control character set is designated/invoked to C0, and
3942 - ISO6429's control character set is designated/invoked to C1,
3943 and usually these designations/invocations are omitted in encoded
3944 text. In a 7-bit environment, only C0 can be used, and a control
3945 character for C1 is encoded by an appropriate escape sequence to
3946 fit into the environment. All control characters for C1 are
3947 defined to have corresponding escape sequences.
3949 A graphic character set is at first designated to one of four
3950 graphic registers (G0 through G3), then these graphic registers are
3951 invoked to GL or GR. These designations and invocations can be
3952 done independently. The most common case is that G0 is invoked to
3953 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3954 these invocations and designations are omitted in encoded text.
3955 In a 7-bit environment, only GL can be used.
3957 When a graphic character set of CHARS94 is invoked to GL, codes
3958 0x20 and 0x7F of the GL area work as control characters SPACE and
3959 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3962 There are two ways of invocation: locking-shift and single-shift.
3963 With locking-shift, the invocation lasts until the next different
3964 invocation, whereas with single-shift, the invocation affects the
3965 following character only and doesn't affect the locking-shift
3966 state. Invocations are done by the following control characters or
3969 ----------------------------------------------------------------------
3970 abbrev function cntrl escape seq description
3971 ----------------------------------------------------------------------
3972 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3973 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3974 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3975 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3976 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3977 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3978 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3979 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3980 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3981 ----------------------------------------------------------------------
3982 (*) These are not used by any known coding system.
3984 Control characters for these functions are defined by macros
3985 ISO_CODE_XXX in `coding.h'.
3987 Designations are done by the following escape sequences:
3988 ----------------------------------------------------------------------
3989 escape sequence description
3990 ----------------------------------------------------------------------
3991 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3992 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3993 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3994 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3995 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3996 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3997 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3998 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3999 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4000 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4001 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4002 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4003 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4004 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4005 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4006 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4007 ----------------------------------------------------------------------
4009 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4010 of dimension 1, chars 94, and final character <F>, etc...
4012 Note (*): Although these designations are not allowed in ISO2022,
4013 Emacs accepts them on decoding, and produces them on encoding
4014 CHARS96 character sets in a coding system which is characterized as
4015 7-bit environment, non-locking-shift, and non-single-shift.
4017 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4018 '(' can be omitted. We refer to this as "short-form" hereafter.
4020 Now you may notice that there are a lot of ways for encoding the
4021 same multilingual text in ISO2022. Actually, there exist many
4022 coding systems such as Compound Text (used in X11's inter client
4023 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4024 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4025 localized platforms), and all of these are variants of ISO2022.
4027 In addition to the above, Emacs handles two more kinds of escape
4028 sequences: ISO6429's direction specification and Emacs' private
4029 sequence for specifying character composition.
4031 ISO6429's direction specification takes the following form:
4032 o CSI ']' -- end of the current direction
4033 o CSI '0' ']' -- end of the current direction
4034 o CSI '1' ']' -- start of left-to-right text
4035 o CSI '2' ']' -- start of right-to-left text
4036 The control character CSI (0x9B: control sequence introducer) is
4037 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4039 Character composition specification takes the following form:
4040 o ESC '0' -- start character composition
4041 o ESC '1' -- end character composition
4042 Since these are not standard escape sequences of any ISO standard,
4043 their use with these meanings is restricted to Emacs only. */
4046 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4050 for (i = 0; i < 4; i++)
4052 if (!NILP (coding_system))
4054 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4056 iso->charset[i] = Qt;
4057 iso->invalid_designated[i] = 0;
4059 iso->esc = ISO_ESC_NOTHING;
4060 iso->esc_bytes_index = 0;
4061 iso->register_left = 0;
4062 iso->register_right = 1;
4063 iso->switched_dir_and_no_valid_charset_yet = 0;
4064 iso->invalid_switch_dir = 0;
4065 iso->output_direction_sequence = 0;
4066 iso->output_literally = 0;
4067 #ifdef ENABLE_COMPOSITE_CHARS
4068 if (iso->composite_chars)
4069 Dynarr_reset (iso->composite_chars);
4074 fit_to_be_escape_quoted (unsigned char c)
4091 /* Parse one byte of an ISO2022 escape sequence.
4092 If the result is an invalid escape sequence, return 0 and
4093 do not change anything in STR. Otherwise, if the result is
4094 an incomplete escape sequence, update ISO2022.ESC and
4095 ISO2022.ESC_BYTES and return -1. Otherwise, update
4096 all the state variables (but not ISO2022.ESC_BYTES) and
4099 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4100 or invocation of an invalid character set and treat that as
4101 an unrecognized escape sequence. */
4104 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4105 unsigned char c, unsigned int *flags,
4106 int check_invalid_charsets)
4108 /* (1) If we're at the end of a designation sequence, CS is the
4109 charset being designated and REG is the register to designate
4112 (2) If we're at the end of a locking-shift sequence, REG is
4113 the register to invoke and HALF (0 == left, 1 == right) is
4114 the half to invoke it into.
4116 (3) If we're at the end of a single-shift sequence, REG is
4117 the register to invoke. */
4118 Lisp_Object cs = Qnil;
4121 /* NOTE: This code does goto's all over the fucking place.
4122 The reason for this is that we're basically implementing
4123 a state machine here, and hierarchical languages like C
4124 don't really provide a clean way of doing this. */
4126 if (! (*flags & CODING_STATE_ESCAPE))
4127 /* At beginning of escape sequence; we need to reset our
4128 escape-state variables. */
4129 iso->esc = ISO_ESC_NOTHING;
4131 iso->output_literally = 0;
4132 iso->output_direction_sequence = 0;
4136 case ISO_ESC_NOTHING:
4137 iso->esc_bytes_index = 0;
4140 case ISO_CODE_ESC: /* Start escape sequence */
4141 *flags |= CODING_STATE_ESCAPE;
4145 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4146 *flags |= CODING_STATE_ESCAPE;
4147 iso->esc = ISO_ESC_5_11;
4150 case ISO_CODE_SO: /* locking shift 1 */
4153 case ISO_CODE_SI: /* locking shift 0 */
4157 case ISO_CODE_SS2: /* single shift */
4160 case ISO_CODE_SS3: /* single shift */
4164 default: /* Other control characters */
4171 /**** single shift ****/
4173 case 'N': /* single shift 2 */
4176 case 'O': /* single shift 3 */
4180 /**** locking shift ****/
4182 case '~': /* locking shift 1 right */
4185 case 'n': /* locking shift 2 */
4188 case '}': /* locking shift 2 right */
4191 case 'o': /* locking shift 3 */
4194 case '|': /* locking shift 3 right */
4198 #ifdef ENABLE_COMPOSITE_CHARS
4199 /**** composite ****/
4202 iso->esc = ISO_ESC_START_COMPOSITE;
4203 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4204 CODING_STATE_COMPOSITE;
4208 iso->esc = ISO_ESC_END_COMPOSITE;
4209 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4210 ~CODING_STATE_COMPOSITE;
4212 #endif /* ENABLE_COMPOSITE_CHARS */
4214 /**** directionality ****/
4217 iso->esc = ISO_ESC_5_11;
4220 /**** designation ****/
4222 case '$': /* multibyte charset prefix */
4223 iso->esc = ISO_ESC_2_4;
4227 if (0x28 <= c && c <= 0x2F)
4229 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4233 /* This function is called with CODESYS equal to nil when
4234 doing coding-system detection. */
4236 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4237 && fit_to_be_escape_quoted (c))
4239 iso->esc = ISO_ESC_LITERAL;
4240 *flags &= CODING_STATE_ISO2022_LOCK;
4250 /**** directionality ****/
4252 case ISO_ESC_5_11: /* ISO6429 direction control */
4255 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4256 goto directionality;
4258 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4259 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4260 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4264 case ISO_ESC_5_11_0:
4267 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4268 goto directionality;
4272 case ISO_ESC_5_11_1:
4275 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4276 goto directionality;
4280 case ISO_ESC_5_11_2:
4283 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4284 goto directionality;
4289 iso->esc = ISO_ESC_DIRECTIONALITY;
4290 /* Various junk here to attempt to preserve the direction sequences
4291 literally in the text if they would otherwise be swallowed due
4292 to invalid designations that don't show up as actual charset
4293 changes in the text. */
4294 if (iso->invalid_switch_dir)
4296 /* We already inserted a direction switch literally into the
4297 text. We assume (#### this may not be right) that the
4298 next direction switch is the one going the other way,
4299 and we need to output that literally as well. */
4300 iso->output_literally = 1;
4301 iso->invalid_switch_dir = 0;
4307 /* If we are in the thrall of an invalid designation,
4308 then stick the directionality sequence literally into the
4309 output stream so it ends up in the original text again. */
4310 for (jj = 0; jj < 4; jj++)
4311 if (iso->invalid_designated[jj])
4315 iso->output_literally = 1;
4316 iso->invalid_switch_dir = 1;
4319 /* Indicate that we haven't yet seen a valid designation,
4320 so that if a switch-dir is directly followed by an
4321 invalid designation, both get inserted literally. */
4322 iso->switched_dir_and_no_valid_charset_yet = 1;
4327 /**** designation ****/
4330 if (0x28 <= c && c <= 0x2F)
4332 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4335 if (0x40 <= c && c <= 0x42)
4337 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4338 *flags & CODING_STATE_R2L ?
4339 CHARSET_RIGHT_TO_LEFT :
4340 CHARSET_LEFT_TO_RIGHT);
4350 if (c < '0' || c > '~')
4351 return 0; /* bad final byte */
4353 if (iso->esc >= ISO_ESC_2_8 &&
4354 iso->esc <= ISO_ESC_2_15)
4356 type = ((iso->esc >= ISO_ESC_2_12) ?
4357 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4358 reg = (iso->esc - ISO_ESC_2_8) & 3;
4360 else if (iso->esc >= ISO_ESC_2_4_8 &&
4361 iso->esc <= ISO_ESC_2_4_15)
4363 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4364 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4365 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4369 /* Can this ever be reached? -slb */
4373 cs = CHARSET_BY_ATTRIBUTES (type, c,
4374 *flags & CODING_STATE_R2L ?
4375 CHARSET_RIGHT_TO_LEFT :
4376 CHARSET_LEFT_TO_RIGHT);
4382 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4386 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4387 /* can't invoke something that ain't there. */
4389 iso->esc = ISO_ESC_SINGLE_SHIFT;
4390 *flags &= CODING_STATE_ISO2022_LOCK;
4392 *flags |= CODING_STATE_SS2;
4394 *flags |= CODING_STATE_SS3;
4398 if (check_invalid_charsets &&
4399 !CHARSETP (iso->charset[reg]))
4400 /* can't invoke something that ain't there. */
4403 iso->register_right = reg;
4405 iso->register_left = reg;
4406 *flags &= CODING_STATE_ISO2022_LOCK;
4407 iso->esc = ISO_ESC_LOCKING_SHIFT;
4411 if (NILP (cs) && check_invalid_charsets)
4413 iso->invalid_designated[reg] = 1;
4414 iso->charset[reg] = Vcharset_ascii;
4415 iso->esc = ISO_ESC_DESIGNATE;
4416 *flags &= CODING_STATE_ISO2022_LOCK;
4417 iso->output_literally = 1;
4418 if (iso->switched_dir_and_no_valid_charset_yet)
4420 /* We encountered a switch-direction followed by an
4421 invalid designation. Ensure that the switch-direction
4422 gets outputted; otherwise it will probably get eaten
4423 when the text is written out again. */
4424 iso->switched_dir_and_no_valid_charset_yet = 0;
4425 iso->output_direction_sequence = 1;
4426 /* And make sure that the switch-dir going the other
4427 way gets outputted, as well. */
4428 iso->invalid_switch_dir = 1;
4432 /* This function is called with CODESYS equal to nil when
4433 doing coding-system detection. */
4434 if (!NILP (codesys))
4436 charset_conversion_spec_dynarr *dyn =
4437 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4443 for (i = 0; i < Dynarr_length (dyn); i++)
4445 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4446 if (EQ (cs, spec->from_charset))
4447 cs = spec->to_charset;
4452 iso->charset[reg] = cs;
4453 iso->esc = ISO_ESC_DESIGNATE;
4454 *flags &= CODING_STATE_ISO2022_LOCK;
4455 if (iso->invalid_designated[reg])
4457 iso->invalid_designated[reg] = 0;
4458 iso->output_literally = 1;
4460 if (iso->switched_dir_and_no_valid_charset_yet)
4461 iso->switched_dir_and_no_valid_charset_yet = 0;
4466 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4471 /* #### There are serious deficiencies in the recognition mechanism
4472 here. This needs to be much smarter if it's going to cut it.
4473 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4474 it should be detected as Latin-1.
4475 All the ISO2022 stuff in this file should be synced up with the
4476 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4477 Perhaps we should wait till R2L works in FSF Emacs? */
4479 if (!st->iso2022.initted)
4481 reset_iso2022 (Qnil, &st->iso2022.iso);
4482 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4483 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4484 CODING_CATEGORY_ISO_8_1_MASK |
4485 CODING_CATEGORY_ISO_8_2_MASK |
4486 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4487 st->iso2022.flags = 0;
4488 st->iso2022.high_byte_count = 0;
4489 st->iso2022.saw_single_shift = 0;
4490 st->iso2022.initted = 1;
4493 mask = st->iso2022.mask;
4500 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4501 st->iso2022.high_byte_count++;
4505 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4507 if (st->iso2022.high_byte_count & 1)
4508 /* odd number of high bytes; assume not iso-8-2 */
4509 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4511 st->iso2022.high_byte_count = 0;
4512 st->iso2022.saw_single_shift = 0;
4514 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4516 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4517 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4518 { /* control chars */
4521 /* Allow and ignore control characters that you might
4522 reasonably see in a text file */
4527 case 8: /* backspace */
4528 case 11: /* vertical tab */
4529 case 12: /* form feed */
4530 case 26: /* MS-DOS C-z junk */
4531 case 31: /* '^_' -- for info */
4532 goto label_continue_loop;
4539 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4542 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4543 &st->iso2022.flags, 0))
4545 switch (st->iso2022.iso.esc)
4547 case ISO_ESC_DESIGNATE:
4548 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4549 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4551 case ISO_ESC_LOCKING_SHIFT:
4552 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4553 goto ran_out_of_chars;
4554 case ISO_ESC_SINGLE_SHIFT:
4555 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4556 st->iso2022.saw_single_shift = 1;
4565 goto ran_out_of_chars;
4568 label_continue_loop:;
4577 postprocess_iso2022_mask (int mask)
4579 /* #### kind of cheesy */
4580 /* If seven-bit ISO is allowed, then assume that the encoding is
4581 entirely seven-bit and turn off the eight-bit ones. */
4582 if (mask & CODING_CATEGORY_ISO_7_MASK)
4583 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4584 CODING_CATEGORY_ISO_8_1_MASK |
4585 CODING_CATEGORY_ISO_8_2_MASK);
4589 /* If FLAGS is a null pointer or specifies right-to-left motion,
4590 output a switch-dir-to-left-to-right sequence to DST.
4591 Also update FLAGS if it is not a null pointer.
4592 If INTERNAL_P is set, we are outputting in internal format and
4593 need to handle the CSI differently. */
4596 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4597 unsigned_char_dynarr *dst,
4598 unsigned int *flags,
4601 if (!flags || (*flags & CODING_STATE_R2L))
4603 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4605 Dynarr_add (dst, ISO_CODE_ESC);
4606 Dynarr_add (dst, '[');
4608 else if (internal_p)
4609 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4611 Dynarr_add (dst, ISO_CODE_CSI);
4612 Dynarr_add (dst, '0');
4613 Dynarr_add (dst, ']');
4615 *flags &= ~CODING_STATE_R2L;
4619 /* If FLAGS is a null pointer or specifies a direction different from
4620 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4621 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4622 sequence to DST. Also update FLAGS if it is not a null pointer.
4623 If INTERNAL_P is set, we are outputting in internal format and
4624 need to handle the CSI differently. */
4627 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4628 unsigned_char_dynarr *dst, unsigned int *flags,
4631 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4632 direction == CHARSET_LEFT_TO_RIGHT)
4633 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4634 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4635 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4636 direction == CHARSET_RIGHT_TO_LEFT)
4638 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4640 Dynarr_add (dst, ISO_CODE_ESC);
4641 Dynarr_add (dst, '[');
4643 else if (internal_p)
4644 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4646 Dynarr_add (dst, ISO_CODE_CSI);
4647 Dynarr_add (dst, '2');
4648 Dynarr_add (dst, ']');
4650 *flags |= CODING_STATE_R2L;
4654 /* Convert ISO2022-format data to internal format. */
4657 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4658 unsigned_char_dynarr *dst, unsigned int n)
4660 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4661 unsigned int flags = str->flags;
4662 unsigned int ch = str->ch;
4663 eol_type_t eol_type = str->eol_type;
4664 #ifdef ENABLE_COMPOSITE_CHARS
4665 unsigned_char_dynarr *real_dst = dst;
4667 Lisp_Object coding_system;
4669 XSETCODING_SYSTEM (coding_system, str->codesys);
4671 #ifdef ENABLE_COMPOSITE_CHARS
4672 if (flags & CODING_STATE_COMPOSITE)
4673 dst = str->iso2022.composite_chars;
4674 #endif /* ENABLE_COMPOSITE_CHARS */
4678 unsigned char c = *src++;
4679 if (flags & CODING_STATE_ESCAPE)
4680 { /* Within ESC sequence */
4681 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4686 switch (str->iso2022.esc)
4688 #ifdef ENABLE_COMPOSITE_CHARS
4689 case ISO_ESC_START_COMPOSITE:
4690 if (str->iso2022.composite_chars)
4691 Dynarr_reset (str->iso2022.composite_chars);
4693 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4694 dst = str->iso2022.composite_chars;
4696 case ISO_ESC_END_COMPOSITE:
4698 Bufbyte comstr[MAX_EMCHAR_LEN];
4700 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4701 Dynarr_length (dst));
4703 len = set_charptr_emchar (comstr, emch);
4704 Dynarr_add_many (dst, comstr, len);
4707 #endif /* ENABLE_COMPOSITE_CHARS */
4709 case ISO_ESC_LITERAL:
4710 DECODE_ADD_BINARY_CHAR (c, dst);
4714 /* Everything else handled already */
4719 /* Attempted error recovery. */
4720 if (str->iso2022.output_direction_sequence)
4721 ensure_correct_direction (flags & CODING_STATE_R2L ?
4722 CHARSET_RIGHT_TO_LEFT :
4723 CHARSET_LEFT_TO_RIGHT,
4724 str->codesys, dst, 0, 1);
4725 /* More error recovery. */
4726 if (!retval || str->iso2022.output_literally)
4728 /* Output the (possibly invalid) sequence */
4730 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4731 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4732 flags &= CODING_STATE_ISO2022_LOCK;
4734 n++, src--;/* Repeat the loop with the same character. */
4737 /* No sense in reprocessing the final byte of the
4738 escape sequence; it could mess things up anyway.
4740 DECODE_ADD_BINARY_CHAR (c, dst);
4745 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4746 { /* Control characters */
4748 /***** Error-handling *****/
4750 /* If we were in the middle of a character, dump out the
4751 partial character. */
4752 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4754 /* If we just saw a single-shift character, dump it out.
4755 This may dump out the wrong sort of single-shift character,
4756 but least it will give an indication that something went
4758 if (flags & CODING_STATE_SS2)
4760 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4761 flags &= ~CODING_STATE_SS2;
4763 if (flags & CODING_STATE_SS3)
4765 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4766 flags &= ~CODING_STATE_SS3;
4769 /***** Now handle the control characters. *****/
4772 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4774 flags &= CODING_STATE_ISO2022_LOCK;
4776 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4777 DECODE_ADD_BINARY_CHAR (c, dst);
4780 { /* Graphic characters */
4781 Lisp_Object charset;
4787 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4789 /* Now determine the charset. */
4790 reg = ((flags & CODING_STATE_SS2) ? 2
4791 : (flags & CODING_STATE_SS3) ? 3
4792 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4793 : str->iso2022.register_left);
4794 charset = str->iso2022.charset[reg];
4796 /* Error checking: */
4797 if (! CHARSETP (charset)
4798 || str->iso2022.invalid_designated[reg]
4799 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4800 && XCHARSET_CHARS (charset) == 94))
4801 /* Mrmph. We are trying to invoke a register that has no
4802 or an invalid charset in it, or trying to add a character
4803 outside the range of the charset. Insert that char literally
4804 to preserve it for the output. */
4806 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4807 DECODE_ADD_BINARY_CHAR (c, dst);
4812 /* Things are probably hunky-dorey. */
4814 /* Fetch reverse charset, maybe. */
4815 if (((flags & CODING_STATE_R2L) &&
4816 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4818 (!(flags & CODING_STATE_R2L) &&
4819 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4821 Lisp_Object new_charset =
4822 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4823 if (!NILP (new_charset))
4824 charset = new_charset;
4828 lb = XCHARSET_LEADING_BYTE (charset);
4830 switch (XCHARSET_REP_BYTES (charset))
4833 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4834 Dynarr_add (dst, c & 0x7F);
4837 case 2: /* one-byte official */
4838 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4840 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0), dst);
4842 Dynarr_add (dst, lb);
4843 Dynarr_add (dst, c | 0x80);
4847 case 3: /* one-byte private or two-byte official */
4848 if (XCHARSET_PRIVATE_P (charset))
4850 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4852 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0),
4855 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4856 Dynarr_add (dst, lb);
4857 Dynarr_add (dst, c | 0x80);
4865 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
4869 Dynarr_add (dst, lb);
4870 Dynarr_add (dst, ch | 0x80);
4871 Dynarr_add (dst, c | 0x80);
4880 default: /* two-byte private */
4884 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
4888 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4889 Dynarr_add (dst, lb);
4890 Dynarr_add (dst, ch | 0x80);
4891 Dynarr_add (dst, c | 0x80);
4901 flags &= CODING_STATE_ISO2022_LOCK;
4904 label_continue_loop:;
4907 if (flags & CODING_STATE_END)
4908 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4915 /***** ISO2022 encoder *****/
4917 /* Designate CHARSET into register REG. */
4920 iso2022_designate (Lisp_Object charset, unsigned char reg,
4921 struct encoding_stream *str, unsigned_char_dynarr *dst)
4923 static CONST char inter94[] = "()*+";
4924 static CONST char inter96[] = ",-./";
4926 unsigned char final;
4927 Lisp_Object old_charset = str->iso2022.charset[reg];
4929 str->iso2022.charset[reg] = charset;
4930 if (!CHARSETP (charset))
4931 /* charset might be an initial nil or t. */
4933 type = XCHARSET_TYPE (charset);
4934 final = XCHARSET_FINAL (charset);
4935 if (!str->iso2022.force_charset_on_output[reg] &&
4936 CHARSETP (old_charset) &&
4937 XCHARSET_TYPE (old_charset) == type &&
4938 XCHARSET_FINAL (old_charset) == final)
4941 str->iso2022.force_charset_on_output[reg] = 0;
4944 charset_conversion_spec_dynarr *dyn =
4945 str->codesys->iso2022.output_conv;
4951 for (i = 0; i < Dynarr_length (dyn); i++)
4953 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4954 if (EQ (charset, spec->from_charset))
4955 charset = spec->to_charset;
4960 Dynarr_add (dst, ISO_CODE_ESC);
4963 case CHARSET_TYPE_94:
4964 Dynarr_add (dst, inter94[reg]);
4966 case CHARSET_TYPE_96:
4967 Dynarr_add (dst, inter96[reg]);
4969 case CHARSET_TYPE_94X94:
4970 Dynarr_add (dst, '$');
4972 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4975 Dynarr_add (dst, inter94[reg]);
4977 case CHARSET_TYPE_96X96:
4978 Dynarr_add (dst, '$');
4979 Dynarr_add (dst, inter96[reg]);
4982 Dynarr_add (dst, final);
4986 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4988 if (str->iso2022.register_left != 0)
4990 Dynarr_add (dst, ISO_CODE_SI);
4991 str->iso2022.register_left = 0;
4996 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4998 if (str->iso2022.register_left != 1)
5000 Dynarr_add (dst, ISO_CODE_SO);
5001 str->iso2022.register_left = 1;
5005 /* Convert internally-formatted data to ISO2022 format. */
5008 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5009 unsigned_char_dynarr *dst, unsigned int n)
5011 unsigned char charmask, c;
5012 unsigned char char_boundary;
5013 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5014 unsigned int flags = str->flags;
5015 Emchar ch = str->ch;
5016 Lisp_Coding_System *codesys = str->codesys;
5017 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5019 Lisp_Object charset;
5022 unsigned int byte1, byte2;
5025 #ifdef ENABLE_COMPOSITE_CHARS
5026 /* flags for handling composite chars. We do a little switcharoo
5027 on the source while we're outputting the composite char. */
5028 unsigned int saved_n = 0;
5029 CONST unsigned char *saved_src = NULL;
5030 int in_composite = 0;
5031 #endif /* ENABLE_COMPOSITE_CHARS */
5033 char_boundary = str->iso2022.current_char_boundary;
5034 charset = str->iso2022.current_charset;
5035 half = str->iso2022.current_half;
5037 #ifdef ENABLE_COMPOSITE_CHARS
5045 switch (char_boundary)
5053 else if ( c >= 0xf8 )
5058 else if ( c >= 0xf0 )
5063 else if ( c >= 0xe0 )
5068 else if ( c >= 0xc0 )
5077 restore_left_to_right_direction (codesys, dst, &flags, 0);
5079 /* Make sure G0 contains ASCII */
5080 if ((c > ' ' && c < ISO_CODE_DEL) ||
5081 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5083 ensure_normal_shift (str, dst);
5084 iso2022_designate (Vcharset_ascii, 0, str, dst);
5087 /* If necessary, restore everything to the default state
5090 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5092 restore_left_to_right_direction (codesys, dst, &flags, 0);
5094 ensure_normal_shift (str, dst);
5096 for (i = 0; i < 4; i++)
5098 Lisp_Object initial_charset =
5099 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5100 iso2022_designate (initial_charset, i, str, dst);
5105 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5106 Dynarr_add (dst, '\r');
5107 if (eol_type != EOL_CR)
5108 Dynarr_add (dst, c);
5112 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5113 && fit_to_be_escape_quoted (c))
5114 Dynarr_add (dst, ISO_CODE_ESC);
5115 Dynarr_add (dst, c);
5121 ch = ( ch << 6 ) | ( c & 0x3f );
5124 if ( (0x80 <= ch) && (ch <= 0x9f) )
5126 charmask = (half == 0 ? 0x00 : 0x80);
5128 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5129 && fit_to_be_escape_quoted (ch))
5130 Dynarr_add (dst, ISO_CODE_ESC);
5131 /* you asked for it ... */
5132 Dynarr_add (dst, ch);
5138 BREAKUP_CHAR (ch, charset, byte1, byte2);
5139 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5140 codesys, dst, &flags, 0);
5142 /* Now determine which register to use. */
5144 for (i = 0; i < 4; i++)
5146 if (EQ (charset, str->iso2022.charset[i]) ||
5148 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5157 if (XCHARSET_GRAPHIC (charset) != 0)
5159 if (!NILP (str->iso2022.charset[1]) &&
5160 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5161 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5163 else if (!NILP (str->iso2022.charset[2]))
5165 else if (!NILP (str->iso2022.charset[3]))
5174 iso2022_designate (charset, reg, str, dst);
5176 /* Now invoke that register. */
5180 ensure_normal_shift (str, dst);
5185 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5187 ensure_shift_out (str, dst);
5195 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5197 Dynarr_add (dst, ISO_CODE_ESC);
5198 Dynarr_add (dst, 'N');
5203 Dynarr_add (dst, ISO_CODE_SS2);
5209 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5211 Dynarr_add (dst, ISO_CODE_ESC);
5212 Dynarr_add (dst, 'O');
5217 Dynarr_add (dst, ISO_CODE_SS3);
5226 charmask = (half == 0 ? 0x00 : 0x80);
5228 switch (XCHARSET_DIMENSION (charset))
5231 Dynarr_add (dst, byte1 | charmask);
5234 Dynarr_add (dst, byte1 | charmask);
5235 Dynarr_add (dst, byte2 | charmask);
5244 ch = ( ch << 6 ) | ( c & 0x3f );
5248 #else /* not UTF2000 */
5254 if (BYTE_ASCII_P (c))
5255 { /* Processing ASCII character */
5258 restore_left_to_right_direction (codesys, dst, &flags, 0);
5260 /* Make sure G0 contains ASCII */
5261 if ((c > ' ' && c < ISO_CODE_DEL) ||
5262 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5264 ensure_normal_shift (str, dst);
5265 iso2022_designate (Vcharset_ascii, 0, str, dst);
5268 /* If necessary, restore everything to the default state
5271 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5273 restore_left_to_right_direction (codesys, dst, &flags, 0);
5275 ensure_normal_shift (str, dst);
5277 for (i = 0; i < 4; i++)
5279 Lisp_Object initial_charset =
5280 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5281 iso2022_designate (initial_charset, i, str, dst);
5286 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5287 Dynarr_add (dst, '\r');
5288 if (eol_type != EOL_CR)
5289 Dynarr_add (dst, c);
5293 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5294 && fit_to_be_escape_quoted (c))
5295 Dynarr_add (dst, ISO_CODE_ESC);
5296 Dynarr_add (dst, c);
5301 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5302 { /* Processing Leading Byte */
5304 charset = CHARSET_BY_LEADING_BYTE (c);
5305 if (LEADING_BYTE_PREFIX_P(c))
5307 else if (!EQ (charset, Vcharset_control_1)
5308 #ifdef ENABLE_COMPOSITE_CHARS
5309 && !EQ (charset, Vcharset_composite)
5315 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5316 codesys, dst, &flags, 0);
5318 /* Now determine which register to use. */
5320 for (i = 0; i < 4; i++)
5322 if (EQ (charset, str->iso2022.charset[i]) ||
5324 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5333 if (XCHARSET_GRAPHIC (charset) != 0)
5335 if (!NILP (str->iso2022.charset[1]) &&
5336 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5337 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5339 else if (!NILP (str->iso2022.charset[2]))
5341 else if (!NILP (str->iso2022.charset[3]))
5350 iso2022_designate (charset, reg, str, dst);
5352 /* Now invoke that register. */
5356 ensure_normal_shift (str, dst);
5361 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5363 ensure_shift_out (str, dst);
5371 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5373 Dynarr_add (dst, ISO_CODE_ESC);
5374 Dynarr_add (dst, 'N');
5379 Dynarr_add (dst, ISO_CODE_SS2);
5385 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5387 Dynarr_add (dst, ISO_CODE_ESC);
5388 Dynarr_add (dst, 'O');
5393 Dynarr_add (dst, ISO_CODE_SS3);
5405 { /* Processing Non-ASCII character */
5406 charmask = (half == 0 ? 0x7F : 0xFF);
5408 if (EQ (charset, Vcharset_control_1))
5410 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5411 && fit_to_be_escape_quoted (c))
5412 Dynarr_add (dst, ISO_CODE_ESC);
5413 /* you asked for it ... */
5414 Dynarr_add (dst, c - 0x20);
5418 switch (XCHARSET_REP_BYTES (charset))
5421 Dynarr_add (dst, c & charmask);
5424 if (XCHARSET_PRIVATE_P (charset))
5426 Dynarr_add (dst, c & charmask);
5431 #ifdef ENABLE_COMPOSITE_CHARS
5432 if (EQ (charset, Vcharset_composite))
5436 /* #### Bother! We don't know how to
5438 Dynarr_add (dst, '~');
5442 Emchar emch = MAKE_CHAR (Vcharset_composite,
5443 ch & 0x7F, c & 0x7F);
5444 Lisp_Object lstr = composite_char_string (emch);
5448 src = XSTRING_DATA (lstr);
5449 n = XSTRING_LENGTH (lstr);
5450 Dynarr_add (dst, ISO_CODE_ESC);
5451 Dynarr_add (dst, '0'); /* start composing */
5455 #endif /* ENABLE_COMPOSITE_CHARS */
5457 Dynarr_add (dst, ch & charmask);
5458 Dynarr_add (dst, c & charmask);
5471 Dynarr_add (dst, ch & charmask);
5472 Dynarr_add (dst, c & charmask);
5487 #endif /* not UTF2000 */
5489 #ifdef ENABLE_COMPOSITE_CHARS
5495 Dynarr_add (dst, ISO_CODE_ESC);
5496 Dynarr_add (dst, '1'); /* end composing */
5497 goto back_to_square_n; /* Wheeeeeeeee ..... */
5499 #endif /* ENABLE_COMPOSITE_CHARS */
5502 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5504 if (char_boundary && flags & CODING_STATE_END)
5507 restore_left_to_right_direction (codesys, dst, &flags, 0);
5508 ensure_normal_shift (str, dst);
5509 for (i = 0; i < 4; i++)
5511 Lisp_Object initial_charset =
5512 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5513 iso2022_designate (initial_charset, i, str, dst);
5519 str->iso2022.current_char_boundary = char_boundary;
5520 str->iso2022.current_charset = charset;
5521 str->iso2022.current_half = half;
5523 /* Verbum caro factum est! */
5527 /************************************************************************/
5528 /* No-conversion methods */
5529 /************************************************************************/
5531 /* This is used when reading in "binary" files -- i.e. files that may
5532 contain all 256 possible byte values and that are not to be
5533 interpreted as being in any particular decoding. */
5535 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5536 unsigned_char_dynarr *dst, unsigned int n)
5539 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5540 unsigned int flags = str->flags;
5541 unsigned int ch = str->ch;
5542 eol_type_t eol_type = str->eol_type;
5548 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5549 DECODE_ADD_BINARY_CHAR (c, dst);
5550 label_continue_loop:;
5553 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5560 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5561 unsigned_char_dynarr *dst, unsigned int n)
5564 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5565 unsigned int flags = str->flags;
5566 unsigned int ch = str->ch;
5567 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5574 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5575 Dynarr_add (dst, '\r');
5576 if (eol_type != EOL_CR)
5577 Dynarr_add (dst, '\n');
5580 else if (BYTE_ASCII_P (c))
5587 Dynarr_add (dst, c);
5590 else if ( (0xc0 <= c) && (c < 0xe0) )
5594 c = ((ch & 0x1f) << 6) | (c & 0x3f);
5596 Dynarr_add (dst, c);
5598 Dynarr_add (dst, '~'); /* untranslatable character */
5600 #else /* not UTF2000 */
5601 else if (BUFBYTE_LEADING_BYTE_P (c))
5604 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5605 c == LEADING_BYTE_CONTROL_1)
5608 Dynarr_add (dst, '~'); /* untranslatable character */
5612 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5613 Dynarr_add (dst, c);
5614 else if (ch == LEADING_BYTE_CONTROL_1)
5617 Dynarr_add (dst, c - 0x20);
5619 /* else it should be the second or third byte of an
5620 untranslatable character, so ignore it */
5623 #endif /* not UTF2000 */
5631 /************************************************************************/
5632 /* Simple internal/external functions */
5633 /************************************************************************/
5635 static Extbyte_dynarr *conversion_out_dynarr;
5636 static Bufbyte_dynarr *conversion_in_dynarr;
5638 /* Determine coding system from coding format */
5640 /* #### not correct for all values of `fmt'! */
5642 external_data_format_to_coding_system (enum external_data_format fmt)
5646 case FORMAT_FILENAME:
5647 case FORMAT_TERMINAL:
5648 if (EQ (Vfile_name_coding_system, Qnil) ||
5649 EQ (Vfile_name_coding_system, Qbinary))
5652 return Fget_coding_system (Vfile_name_coding_system);
5655 return Fget_coding_system (Qctext);
5663 convert_to_external_format (CONST Bufbyte *ptr,
5666 enum external_data_format fmt)
5668 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5670 if (!conversion_out_dynarr)
5671 conversion_out_dynarr = Dynarr_new (Extbyte);
5673 Dynarr_reset (conversion_out_dynarr);
5675 if (NILP (coding_system))
5677 CONST Bufbyte *end = ptr + len;
5683 (*ptr < 0xc0) ? *ptr :
5684 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5687 (BYTE_ASCII_P (*ptr)) ? *ptr :
5688 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5689 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5692 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5696 #ifdef ERROR_CHECK_BUFPOS
5697 assert (ptr == end);
5702 Lisp_Object instream, outstream, da_outstream;
5703 Lstream *istr, *ostr;
5704 struct gcpro gcpro1, gcpro2, gcpro3;
5705 char tempbuf[1024]; /* some random amount */
5707 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5708 da_outstream = make_dynarr_output_stream
5709 ((unsigned_char_dynarr *) conversion_out_dynarr);
5711 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5712 istr = XLSTREAM (instream);
5713 ostr = XLSTREAM (outstream);
5714 GCPRO3 (instream, outstream, da_outstream);
5717 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5720 Lstream_write (ostr, tempbuf, size_in_bytes);
5722 Lstream_close (istr);
5723 Lstream_close (ostr);
5725 Lstream_delete (istr);
5726 Lstream_delete (ostr);
5727 Lstream_delete (XLSTREAM (da_outstream));
5730 *len_out = Dynarr_length (conversion_out_dynarr);
5731 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5732 return Dynarr_atp (conversion_out_dynarr, 0);
5736 convert_from_external_format (CONST Extbyte *ptr,
5739 enum external_data_format fmt)
5741 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5743 if (!conversion_in_dynarr)
5744 conversion_in_dynarr = Dynarr_new (Bufbyte);
5746 Dynarr_reset (conversion_in_dynarr);
5748 if (NILP (coding_system))
5750 CONST Extbyte *end = ptr + len;
5751 for (; ptr < end; ptr++)
5754 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5759 Lisp_Object instream, outstream, da_outstream;
5760 Lstream *istr, *ostr;
5761 struct gcpro gcpro1, gcpro2, gcpro3;
5762 char tempbuf[1024]; /* some random amount */
5764 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5765 da_outstream = make_dynarr_output_stream
5766 ((unsigned_char_dynarr *) conversion_in_dynarr);
5768 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5769 istr = XLSTREAM (instream);
5770 ostr = XLSTREAM (outstream);
5771 GCPRO3 (instream, outstream, da_outstream);
5774 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5777 Lstream_write (ostr, tempbuf, size_in_bytes);
5779 Lstream_close (istr);
5780 Lstream_close (ostr);
5782 Lstream_delete (istr);
5783 Lstream_delete (ostr);
5784 Lstream_delete (XLSTREAM (da_outstream));
5787 *len_out = Dynarr_length (conversion_in_dynarr);
5788 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5789 return Dynarr_atp (conversion_in_dynarr, 0);
5793 /************************************************************************/
5794 /* Initialization */
5795 /************************************************************************/
5798 syms_of_file_coding (void)
5800 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5801 deferror (&Qcoding_system_error, "coding-system-error",
5802 "Coding-system error", Qio_error);
5804 DEFSUBR (Fcoding_system_p);
5805 DEFSUBR (Ffind_coding_system);
5806 DEFSUBR (Fget_coding_system);
5807 DEFSUBR (Fcoding_system_list);
5808 DEFSUBR (Fcoding_system_name);
5809 DEFSUBR (Fmake_coding_system);
5810 DEFSUBR (Fcopy_coding_system);
5811 DEFSUBR (Fdefine_coding_system_alias);
5812 DEFSUBR (Fsubsidiary_coding_system);
5814 DEFSUBR (Fcoding_system_type);
5815 DEFSUBR (Fcoding_system_doc_string);
5817 DEFSUBR (Fcoding_system_charset);
5819 DEFSUBR (Fcoding_system_property);
5821 DEFSUBR (Fcoding_category_list);
5822 DEFSUBR (Fset_coding_priority_list);
5823 DEFSUBR (Fcoding_priority_list);
5824 DEFSUBR (Fset_coding_category_system);
5825 DEFSUBR (Fcoding_category_system);
5827 DEFSUBR (Fdetect_coding_region);
5828 DEFSUBR (Fdecode_coding_region);
5829 DEFSUBR (Fencode_coding_region);
5831 DEFSUBR (Fdecode_shift_jis_char);
5832 DEFSUBR (Fencode_shift_jis_char);
5833 DEFSUBR (Fdecode_big5_char);
5834 DEFSUBR (Fencode_big5_char);
5835 DEFSUBR (Fset_ucs_char);
5836 DEFSUBR (Fucs_char);
5837 DEFSUBR (Fset_char_ucs);
5838 DEFSUBR (Fchar_ucs);
5840 defsymbol (&Qcoding_system_p, "coding-system-p");
5841 defsymbol (&Qno_conversion, "no-conversion");
5842 defsymbol (&Qraw_text, "raw-text");
5844 defsymbol (&Qbig5, "big5");
5845 defsymbol (&Qshift_jis, "shift-jis");
5846 defsymbol (&Qucs4, "ucs-4");
5847 defsymbol (&Qutf8, "utf-8");
5848 defsymbol (&Qccl, "ccl");
5849 defsymbol (&Qiso2022, "iso2022");
5851 defsymbol (&Qmnemonic, "mnemonic");
5852 defsymbol (&Qeol_type, "eol-type");
5853 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5854 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5856 defsymbol (&Qcr, "cr");
5857 defsymbol (&Qlf, "lf");
5858 defsymbol (&Qcrlf, "crlf");
5859 defsymbol (&Qeol_cr, "eol-cr");
5860 defsymbol (&Qeol_lf, "eol-lf");
5861 defsymbol (&Qeol_crlf, "eol-crlf");
5863 defsymbol (&Qcharset_g0, "charset-g0");
5864 defsymbol (&Qcharset_g1, "charset-g1");
5865 defsymbol (&Qcharset_g2, "charset-g2");
5866 defsymbol (&Qcharset_g3, "charset-g3");
5867 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5868 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5869 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5870 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5871 defsymbol (&Qno_iso6429, "no-iso6429");
5872 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5873 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5875 defsymbol (&Qshort, "short");
5876 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5877 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5878 defsymbol (&Qseven, "seven");
5879 defsymbol (&Qlock_shift, "lock-shift");
5880 defsymbol (&Qescape_quoted, "escape-quoted");
5882 defsymbol (&Qencode, "encode");
5883 defsymbol (&Qdecode, "decode");
5886 defsymbol (&Qctext, "ctext");
5887 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5889 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5891 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5893 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5895 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5897 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5899 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5901 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5903 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5906 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5911 lstream_type_create_file_coding (void)
5913 LSTREAM_HAS_METHOD (decoding, reader);
5914 LSTREAM_HAS_METHOD (decoding, writer);
5915 LSTREAM_HAS_METHOD (decoding, rewinder);
5916 LSTREAM_HAS_METHOD (decoding, seekable_p);
5917 LSTREAM_HAS_METHOD (decoding, flusher);
5918 LSTREAM_HAS_METHOD (decoding, closer);
5919 LSTREAM_HAS_METHOD (decoding, marker);
5921 LSTREAM_HAS_METHOD (encoding, reader);
5922 LSTREAM_HAS_METHOD (encoding, writer);
5923 LSTREAM_HAS_METHOD (encoding, rewinder);
5924 LSTREAM_HAS_METHOD (encoding, seekable_p);
5925 LSTREAM_HAS_METHOD (encoding, flusher);
5926 LSTREAM_HAS_METHOD (encoding, closer);
5927 LSTREAM_HAS_METHOD (encoding, marker);
5931 vars_of_file_coding (void)
5935 /* Initialize to something reasonable ... */
5936 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5938 coding_category_system[i] = Qnil;
5939 coding_category_by_priority[i] = i;
5942 Fprovide (intern ("file-coding"));
5944 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5945 Coding system used for TTY keyboard input.
5946 Not used under a windowing system.
5948 Vkeyboard_coding_system = Qnil;
5950 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5951 Coding system used for TTY display output.
5952 Not used under a windowing system.
5954 Vterminal_coding_system = Qnil;
5956 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5957 Overriding coding system used when writing a file or process.
5958 You should *bind* this, not set it. If this is non-nil, it specifies
5959 the coding system that will be used when a file or process is read
5960 in, and overrides `buffer-file-coding-system-for-read',
5961 `insert-file-contents-pre-hook', etc. Use those variables instead of
5962 this one for permanent changes to the environment.
5964 Vcoding_system_for_read = Qnil;
5966 DEFVAR_LISP ("coding-system-for-write",
5967 &Vcoding_system_for_write /*
5968 Overriding coding system used when writing a file or process.
5969 You should *bind* this, not set it. If this is non-nil, it specifies
5970 the coding system that will be used when a file or process is wrote
5971 in, and overrides `buffer-file-coding-system',
5972 `write-region-pre-hook', etc. Use those variables instead of this one
5973 for permanent changes to the environment.
5975 Vcoding_system_for_write = Qnil;
5977 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5978 Coding system used to convert pathnames when accessing files.
5980 Vfile_name_coding_system = Qnil;
5982 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5983 Non-nil means the buffer contents are regarded as multi-byte form
5984 of characters, not a binary code. This affects the display, file I/O,
5985 and behaviors of various editing commands.
5987 Setting this to nil does not do anything.
5989 enable_multibyte_characters = 1;
5993 complex_vars_of_file_coding (void)
5995 staticpro (&Vcoding_system_hash_table);
5996 Vcoding_system_hash_table =
5997 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5999 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6001 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6003 struct codesys_prop csp; \
6005 csp.prop_type = (Prop_Type); \
6006 Dynarr_add (the_codesys_prop_dynarr, csp); \
6009 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6010 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6011 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6012 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6013 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6014 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6015 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6017 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6018 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6019 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6020 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6021 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6022 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6023 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6024 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6025 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6026 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6027 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6028 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6029 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6030 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6031 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6032 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6033 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6035 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6036 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6038 /* Need to create this here or we're really screwed. */
6040 (Qraw_text, Qno_conversion,
6041 build_string ("Raw text, which means it converts only line-break-codes."),
6042 list2 (Qmnemonic, build_string ("Raw")));
6045 (Qbinary, Qno_conversion,
6046 build_string ("Binary, which means it does not convert anything."),
6047 list4 (Qeol_type, Qlf,
6048 Qmnemonic, build_string ("Binary")));
6053 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6054 list2 (Qmnemonic, build_string ("UTF8")));
6057 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6059 /* Need this for bootstrapping */
6060 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6061 Fget_coding_system (Qraw_text);
6064 coding_category_system[CODING_CATEGORY_UTF8]
6065 = Fget_coding_system (Qutf8);
6072 for (i = 0; i < 65536; i++)
6073 ucs_to_mule_table[i] = Qnil;
6075 staticpro (&mule_to_ucs_table);
6076 mule_to_ucs_table = Fmake_char_table(Qgeneric);