1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
36 #include "file-coding.h"
38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
40 Lisp_Object Vkeyboard_coding_system;
41 Lisp_Object Vterminal_coding_system;
42 Lisp_Object Vcoding_system_for_read;
43 Lisp_Object Vcoding_system_for_write;
44 Lisp_Object Vfile_name_coding_system;
46 /* Table of symbols identifying each coding category. */
47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
49 /* Coding system currently associated with each coding category. */
50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
52 /* Table of all coding categories in decreasing order of priority.
53 This describes a permutation of the possible coding categories. */
54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
56 Lisp_Object Qcoding_system_p;
58 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
59 /* Qinternal in general.c */
61 Lisp_Object Qmnemonic, Qeol_type;
62 Lisp_Object Qcr, Qcrlf, Qlf;
63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
64 Lisp_Object Qpost_read_conversion;
65 Lisp_Object Qpre_write_conversion;
68 Lisp_Object Qucs4, Qutf8;
69 Lisp_Object Qbig5, Qshift_jis;
70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
73 Lisp_Object Qno_iso6429;
74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
75 Lisp_Object Qctext, Qescape_quoted;
76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
78 Lisp_Object Qencode, Qdecode;
80 Lisp_Object Vcoding_system_hash_table;
82 int enable_multibyte_characters;
85 /* Additional information used by the ISO2022 decoder and detector. */
86 struct iso2022_decoder
88 /* CHARSET holds the character sets currently assigned to the G0
89 through G3 variables. It is initialized from the array
90 INITIAL_CHARSET in CODESYS. */
91 Lisp_Object charset[4];
93 /* Which registers are currently invoked into the left (GL) and
94 right (GR) halves of the 8-bit encoding space? */
95 int register_left, register_right;
97 /* ISO_ESC holds a value indicating part of an escape sequence
98 that has already been seen. */
99 enum iso_esc_flag esc;
101 /* This records the bytes we've seen so far in an escape sequence,
102 in case the sequence is invalid (we spit out the bytes unchanged). */
103 unsigned char esc_bytes[8];
105 /* Index for next byte to store in ISO escape sequence. */
108 #ifdef ENABLE_COMPOSITE_CHARS
109 /* Stuff seen so far when composing a string. */
110 unsigned_char_dynarr *composite_chars;
113 /* If we saw an invalid designation sequence for a particular
114 register, we flag it here and switch to ASCII. The next time we
115 see a valid designation for this register, we turn off the flag
116 and do the designation normally, but pretend the sequence was
117 invalid. The effect of all this is that (most of the time) the
118 escape sequences for both the switch to the unknown charset, and
119 the switch back to the known charset, get inserted literally into
120 the buffer and saved out as such. The hope is that we can
121 preserve the escape sequences so that the resulting written out
122 file makes sense. If we don't do any of this, the designation
123 to the invalid charset will be preserved but that switch back
124 to the known charset will probably get eaten because it was
125 the same charset that was already present in the register. */
126 unsigned char invalid_designated[4];
128 /* We try to do similar things as above for direction-switching
129 sequences. If we encountered a direction switch while an
130 invalid designation was present, or an invalid designation
131 just after a direction switch (i.e. no valid designation
132 encountered yet), we insert the direction-switch escape
133 sequence literally into the output stream, and later on
134 insert the corresponding direction-restoring escape sequence
136 unsigned int switched_dir_and_no_valid_charset_yet :1;
137 unsigned int invalid_switch_dir :1;
139 /* Tells the decoder to output the escape sequence literally
140 even though it was valid. Used in the games we play to
141 avoid lossage when we encounter invalid designations. */
142 unsigned int output_literally :1;
143 /* We encountered a direction switch followed by an invalid
144 designation. We didn't output the direction switch
145 literally because we didn't know about the invalid designation;
146 but we have to do so now. */
147 unsigned int output_direction_sequence :1;
150 EXFUN (Fcopy_coding_system, 2);
152 struct detection_state;
153 static int detect_coding_sjis (struct detection_state *st,
154 CONST unsigned char *src,
156 static void decode_coding_sjis (Lstream *decoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static void encode_coding_sjis (Lstream *encoding,
161 CONST unsigned char *src,
162 unsigned_char_dynarr *dst,
164 static int detect_coding_big5 (struct detection_state *st,
165 CONST unsigned char *src,
167 static void decode_coding_big5 (Lstream *decoding,
168 CONST unsigned char *src,
169 unsigned_char_dynarr *dst, unsigned int n);
170 static void encode_coding_big5 (Lstream *encoding,
171 CONST unsigned char *src,
172 unsigned_char_dynarr *dst, unsigned int n);
173 static int detect_coding_ucs4 (struct detection_state *st,
174 CONST unsigned char *src,
176 static void decode_coding_ucs4 (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static void encode_coding_ucs4 (Lstream *encoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
182 static int detect_coding_utf8 (struct detection_state *st,
183 CONST unsigned char *src,
185 static void decode_coding_utf8 (Lstream *decoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst, unsigned int n);
188 static void encode_coding_utf8 (Lstream *encoding,
189 CONST unsigned char *src,
190 unsigned_char_dynarr *dst, unsigned int n);
191 static int postprocess_iso2022_mask (int mask);
192 static void reset_iso2022 (Lisp_Object coding_system,
193 struct iso2022_decoder *iso);
194 static int detect_coding_iso2022 (struct detection_state *st,
195 CONST unsigned char *src,
197 static void decode_coding_iso2022 (Lstream *decoding,
198 CONST unsigned char *src,
199 unsigned_char_dynarr *dst, unsigned int n);
200 static void encode_coding_iso2022 (Lstream *encoding,
201 CONST unsigned char *src,
202 unsigned_char_dynarr *dst, unsigned int n);
204 static void decode_coding_no_conversion (Lstream *decoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst,
208 static void encode_coding_no_conversion (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst,
212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
213 unsigned_char_dynarr *dst, unsigned int n);
214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
217 typedef struct codesys_prop codesys_prop;
226 Dynarr_declare (codesys_prop);
227 } codesys_prop_dynarr;
229 codesys_prop_dynarr *the_codesys_prop_dynarr;
231 enum codesys_prop_enum
234 CODESYS_PROP_ISO2022,
239 /************************************************************************/
240 /* Coding system functions */
241 /************************************************************************/
243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
245 static void finalize_coding_system (void *header, int for_disksave);
247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
248 mark_coding_system, print_coding_system,
249 finalize_coding_system,
250 0, 0, struct Lisp_Coding_System);
253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
255 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
257 markobj (CODING_SYSTEM_NAME (codesys));
258 markobj (CODING_SYSTEM_DOC_STRING (codesys));
259 markobj (CODING_SYSTEM_MNEMONIC (codesys));
260 markobj (CODING_SYSTEM_EOL_LF (codesys));
261 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
262 markobj (CODING_SYSTEM_EOL_CR (codesys));
264 switch (CODING_SYSTEM_TYPE (codesys))
268 case CODESYS_ISO2022:
269 for (i = 0; i < 4; i++)
270 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
271 if (codesys->iso2022.input_conv)
273 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
275 struct charset_conversion_spec *ccs =
276 Dynarr_atp (codesys->iso2022.input_conv, i);
277 markobj (ccs->from_charset);
278 markobj (ccs->to_charset);
281 if (codesys->iso2022.output_conv)
283 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
285 struct charset_conversion_spec *ccs =
286 Dynarr_atp (codesys->iso2022.output_conv, i);
287 markobj (ccs->from_charset);
288 markobj (ccs->to_charset);
294 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
295 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
302 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
303 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
310 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
312 error ("printing unreadable object #<coding_system 0x%x>",
315 write_c_string ("#<coding_system ", printcharfun);
316 print_internal (c->name, printcharfun, 1);
317 write_c_string (">", printcharfun);
321 finalize_coding_system (void *header, int for_disksave)
323 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
324 /* Since coding systems never go away, this function is not
325 necessary. But it would be necessary if we changed things
326 so that coding systems could go away. */
327 if (!for_disksave) /* see comment in lstream.c */
329 switch (CODING_SYSTEM_TYPE (c))
332 case CODESYS_ISO2022:
333 if (c->iso2022.input_conv)
335 Dynarr_free (c->iso2022.input_conv);
336 c->iso2022.input_conv = 0;
338 if (c->iso2022.output_conv)
340 Dynarr_free (c->iso2022.output_conv);
341 c->iso2022.output_conv = 0;
352 symbol_to_eol_type (Lisp_Object symbol)
354 CHECK_SYMBOL (symbol);
355 if (NILP (symbol)) return EOL_AUTODETECT;
356 if (EQ (symbol, Qlf)) return EOL_LF;
357 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
358 if (EQ (symbol, Qcr)) return EOL_CR;
360 signal_simple_error ("Unrecognized eol type", symbol);
361 return EOL_AUTODETECT; /* not reached */
365 eol_type_to_symbol (enum eol_type type)
370 case EOL_LF: return Qlf;
371 case EOL_CRLF: return Qcrlf;
372 case EOL_CR: return Qcr;
373 case EOL_AUTODETECT: return Qnil;
378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
380 Lisp_Object codesys_obj;
381 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
382 char *codesys_name = (char *) alloca (len + 7);
384 char *codesys_mnemonic=0;
386 Lisp_Object codesys_name_sym, sub_codesys_obj;
390 XSETCODING_SYSTEM (codesys_obj, codesys);
392 memcpy (codesys_name,
393 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
395 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
397 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
398 codesys_mnemonic = (char *) alloca (mlen + 7);
399 memcpy (codesys_mnemonic,
400 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
404 strcpy (codesys_name + len, "-" op_sys); \
406 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
407 codesys_name_sym = intern (codesys_name); \
408 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
409 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
411 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
412 build_string (codesys_mnemonic); \
413 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
416 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
417 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
418 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
422 Return t if OBJECT is a coding system.
423 A coding system is an object that defines how text containing multiple
424 character sets is encoded into a stream of (typically 8-bit) bytes.
425 The coding system is used to decode the stream into a series of
426 characters (which may be from multiple charsets) when the text is read
427 from a file or process, and is used to encode the text back into the
428 same format when it is written out to a file or process.
430 For example, many ISO2022-compliant coding systems (such as Compound
431 Text, which is used for inter-client data under the X Window System)
432 use escape sequences to switch between different charsets -- Japanese
433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
435 `make-coding-system' for more information.
437 Coding systems are normally identified using a symbol, and the
438 symbol is accepted in place of the actual coding system object whenever
439 a coding system is called for. (This is similar to how faces work.)
443 return CODING_SYSTEMP (object) ? Qt : Qnil;
446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
447 Retrieve the coding system of the given name.
449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
450 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
451 If there is no such coding system, nil is returned. Otherwise the
452 associated coding system object is returned.
454 (coding_system_or_name))
456 if (CODING_SYSTEMP (coding_system_or_name))
457 return coding_system_or_name;
459 if (NILP (coding_system_or_name))
460 coding_system_or_name = Qbinary;
462 CHECK_SYMBOL (coding_system_or_name);
464 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
468 Retrieve the coding system of the given name.
469 Same as `find-coding-system' except that if there is no such
470 coding system, an error is signaled instead of returning nil.
474 Lisp_Object coding_system = Ffind_coding_system (name);
476 if (NILP (coding_system))
477 signal_simple_error ("No such coding system", name);
478 return coding_system;
481 /* We store the coding systems in hash tables with the names as the key and the
482 actual coding system object as the value. Occasionally we need to use them
483 in a list format. These routines provide us with that. */
484 struct coding_system_list_closure
486 Lisp_Object *coding_system_list;
490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
491 void *coding_system_list_closure)
493 /* This function can GC */
494 struct coding_system_list_closure *cscl =
495 (struct coding_system_list_closure *) coding_system_list_closure;
496 Lisp_Object *coding_system_list = cscl->coding_system_list;
498 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
499 *coding_system_list);
503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
504 Return a list of the names of all defined coding systems.
508 Lisp_Object coding_system_list = Qnil;
510 struct coding_system_list_closure coding_system_list_closure;
512 GCPRO1 (coding_system_list);
513 coding_system_list_closure.coding_system_list = &coding_system_list;
514 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
515 &coding_system_list_closure);
518 return coding_system_list;
521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
522 Return the name of the given coding system.
526 coding_system = Fget_coding_system (coding_system);
527 return XCODING_SYSTEM_NAME (coding_system);
530 static Lisp_Coding_System *
531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
533 Lisp_Coding_System *codesys =
534 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
536 zero_lcrecord (codesys);
537 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
538 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
539 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
540 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
541 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
542 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
543 CODING_SYSTEM_TYPE (codesys) = type;
544 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
546 if (type == CODESYS_ISO2022)
549 for (i = 0; i < 4; i++)
550 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
552 else if (type == CODESYS_CCL)
554 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
555 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
558 CODING_SYSTEM_NAME (codesys) = name;
564 /* Given a list of charset conversion specs as specified in a Lisp
565 program, parse it into STORE_HERE. */
568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
569 Lisp_Object spec_list)
573 EXTERNAL_LIST_LOOP (rest, spec_list)
575 Lisp_Object car = XCAR (rest);
576 Lisp_Object from, to;
577 struct charset_conversion_spec spec;
579 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
580 signal_simple_error ("Invalid charset conversion spec", car);
581 from = Fget_charset (XCAR (car));
582 to = Fget_charset (XCAR (XCDR (car)));
583 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
584 signal_simple_error_2
585 ("Attempted conversion between different charset types",
587 spec.from_charset = from;
588 spec.to_charset = to;
590 Dynarr_add (store_here, spec);
594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
595 specs, return the equivalent as the Lisp programmer would see it.
597 If LOAD_HERE is 0, return Qnil. */
600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
607 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
609 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
610 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
613 return Fnreverse (result);
618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
619 Register symbol NAME as a coding system.
621 TYPE describes the conversion method used and should be one of
624 Automatic conversion. XEmacs attempts to detect the coding system
627 No conversion. Use this for binary files and such. On output,
628 graphic characters that are not in ASCII or Latin-1 will be
629 replaced by a ?. (For a no-conversion-encoded buffer, these
630 characters will only be present if you explicitly insert them.)
632 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
634 ISO 10646 UCS-4 encoding.
636 ISO 10646 UTF-8 encoding.
638 Any ISO2022-compliant encoding. Among other things, this includes
639 JIS (the Japanese encoding commonly used for e-mail), EUC (the
640 standard Unix encoding for Japanese and other languages), and
641 Compound Text (the encoding used in X11). You can specify more
642 specific information about the conversion with the FLAGS argument.
644 Big5 (the encoding commonly used for Taiwanese).
646 The conversion is performed using a user-written pseudo-code
647 program. CCL (Code Conversion Language) is the name of this
650 Write out or read in the raw contents of the memory representing
651 the buffer's text. This is primarily useful for debugging
652 purposes, and is only enabled when XEmacs has been compiled with
653 DEBUG_XEMACS defined (via the --debug configure option).
654 WARNING: Reading in a file using 'internal conversion can result
655 in an internal inconsistency in the memory representing a
656 buffer's text, which will produce unpredictable results and may
657 cause XEmacs to crash. Under normal circumstances you should
658 never use 'internal conversion.
660 DOC-STRING is a string describing the coding system.
662 PROPS is a property list, describing the specific nature of the
663 character set. Recognized properties are:
666 String to be displayed in the modeline when this coding system is
670 End-of-line conversion to be used. It should be one of
673 Automatically detect the end-of-line type (LF, CRLF,
674 or CR). Also generate subsidiary coding systems named
675 `NAME-unix', `NAME-dos', and `NAME-mac', that are
676 identical to this coding system but have an EOL-TYPE
677 value of 'lf, 'crlf, and 'cr, respectively.
679 The end of a line is marked externally using ASCII LF.
680 Since this is also the way that XEmacs represents an
681 end-of-line internally, specifying this option results
682 in no end-of-line conversion. This is the standard
683 format for Unix text files.
685 The end of a line is marked externally using ASCII
686 CRLF. This is the standard format for MS-DOS text
689 The end of a line is marked externally using ASCII CR.
690 This is the standard format for Macintosh text files.
692 Automatically detect the end-of-line type but do not
693 generate subsidiary coding systems. (This value is
694 converted to nil when stored internally, and
695 `coding-system-property' will return nil.)
697 'post-read-conversion
698 Function called after a file has been read in, to perform the
699 decoding. Called with two arguments, BEG and END, denoting
700 a region of the current buffer to be decoded.
702 'pre-write-conversion
703 Function called before a file is written out, to perform the
704 encoding. Called with two arguments, BEG and END, denoting
705 a region of the current buffer to be encoded.
708 The following additional properties are recognized if TYPE is 'iso2022:
714 The character set initially designated to the G0 - G3 registers.
715 The value should be one of
717 -- A charset object (designate that character set)
718 -- nil (do not ever use this register)
719 -- t (no character set is initially designated to
720 the register, but may be later on; this automatically
721 sets the corresponding `force-g*-on-output' property)
727 If non-nil, send an explicit designation sequence on output before
728 using the specified register.
731 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
732 "ESC $ B" on output in place of the full designation sequences
733 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
736 If non-nil, don't designate ASCII to G0 at each end of line on output.
737 Setting this to non-nil also suppresses other state-resetting that
738 normally happens at the end of a line.
741 If non-nil, don't designate ASCII to G0 before control chars on output.
744 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
748 If non-nil, use locking-shift (SO/SI) instead of single-shift
749 or designation by escape sequence.
752 If non-nil, don't use ISO6429's direction specification.
755 If non-nil, literal control characters that are the same as
756 the beginning of a recognized ISO2022 or ISO6429 escape sequence
757 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
758 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
759 so that they can be properly distinguished from an escape sequence.
760 (Note that doing this results in a non-portable encoding.) This
761 encoding flag is used for byte-compiled files. Note that ESC
762 is a good choice for a quoting character because there are no
763 escape sequences whose second byte is a character from the Control-0
764 or Control-1 character sets; this is explicitly disallowed by the
767 'input-charset-conversion
768 A list of conversion specifications, specifying conversion of
769 characters in one charset to another when decoding is performed.
770 Each specification is a list of two elements: the source charset,
771 and the destination charset.
773 'output-charset-conversion
774 A list of conversion specifications, specifying conversion of
775 characters in one charset to another when encoding is performed.
776 The form of each specification is the same as for
777 'input-charset-conversion.
780 The following additional properties are recognized (and required)
784 CCL program used for decoding (converting to internal format).
787 CCL program used for encoding (converting to external format).
789 (name, type, doc_string, props))
791 Lisp_Coding_System *codesys;
792 Lisp_Object rest, key, value;
793 enum coding_system_type ty;
794 int need_to_setup_eol_systems = 1;
796 /* Convert type to constant */
797 if (NILP (type) || EQ (type, Qundecided))
798 { ty = CODESYS_AUTODETECT; }
800 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
801 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
802 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
803 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
804 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
805 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
807 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
809 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
812 signal_simple_error ("Invalid coding system type", type);
816 codesys = allocate_coding_system (ty, name);
818 if (NILP (doc_string))
819 doc_string = build_string ("");
821 CHECK_STRING (doc_string);
822 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
824 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
826 if (EQ (key, Qmnemonic))
829 CHECK_STRING (value);
830 CODING_SYSTEM_MNEMONIC (codesys) = value;
833 else if (EQ (key, Qeol_type))
835 need_to_setup_eol_systems = NILP (value);
838 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
841 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
842 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
844 else if (ty == CODESYS_ISO2022)
846 #define FROB_INITIAL_CHARSET(charset_num) \
847 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
848 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
850 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
851 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
852 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
853 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
855 #define FROB_FORCE_CHARSET(charset_num) \
856 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
858 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
859 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
860 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
861 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
863 #define FROB_BOOLEAN_PROPERTY(prop) \
864 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
866 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
867 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
868 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
869 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
870 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
871 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
872 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
874 else if (EQ (key, Qinput_charset_conversion))
876 codesys->iso2022.input_conv =
877 Dynarr_new (charset_conversion_spec);
878 parse_charset_conversion_specs (codesys->iso2022.input_conv,
881 else if (EQ (key, Qoutput_charset_conversion))
883 codesys->iso2022.output_conv =
884 Dynarr_new (charset_conversion_spec);
885 parse_charset_conversion_specs (codesys->iso2022.output_conv,
889 signal_simple_error ("Unrecognized property", key);
891 else if (EQ (type, Qccl))
893 if (EQ (key, Qdecode))
895 CHECK_VECTOR (value);
896 CODING_SYSTEM_CCL_DECODE (codesys) = value;
898 else if (EQ (key, Qencode))
900 CHECK_VECTOR (value);
901 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
904 signal_simple_error ("Unrecognized property", key);
908 signal_simple_error ("Unrecognized property", key);
911 if (need_to_setup_eol_systems)
912 setup_eol_coding_systems (codesys);
915 Lisp_Object codesys_obj;
916 XSETCODING_SYSTEM (codesys_obj, codesys);
917 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
923 Copy OLD-CODING-SYSTEM to NEW-NAME.
924 If NEW-NAME does not name an existing coding system, a new one will
927 (old_coding_system, new_name))
929 Lisp_Object new_coding_system;
930 old_coding_system = Fget_coding_system (old_coding_system);
931 new_coding_system = Ffind_coding_system (new_name);
932 if (NILP (new_coding_system))
934 XSETCODING_SYSTEM (new_coding_system,
935 allocate_coding_system
936 (XCODING_SYSTEM_TYPE (old_coding_system),
938 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
942 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
943 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
944 memcpy (((char *) to ) + sizeof (to->header),
945 ((char *) from) + sizeof (from->header),
946 sizeof (*from) - sizeof (from->header));
949 return new_coding_system;
952 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
953 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
955 (alias, coding_system))
957 CHECK_SYMBOL (alias);
958 if (!NILP (Ffind_coding_system (alias)))
959 signal_simple_error ("Symbol already names a coding system", alias);
960 coding_system = Fget_coding_system (coding_system);
961 Fputhash (alias, coding_system, Vcoding_system_hash_table);
963 /* Set up aliases for subsidiaries. */
964 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
967 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
968 #define FROB(type, name) \
970 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
971 if (!NILP (subsidiary)) \
972 Fdefine_coding_system_alias \
973 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
980 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
981 but it doesn't look intentional, so I'd rather return something
982 meaningful or nothing at all. */
987 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
989 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
990 Lisp_Object new_coding_system;
992 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
993 return coding_system;
997 case EOL_AUTODETECT: return coding_system;
998 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
999 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1000 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1004 return NILP (new_coding_system) ? coding_system : new_coding_system;
1007 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1008 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1010 (coding_system, eol_type))
1012 coding_system = Fget_coding_system (coding_system);
1014 return subsidiary_coding_system (coding_system,
1015 symbol_to_eol_type (eol_type));
1019 /************************************************************************/
1020 /* Coding system accessors */
1021 /************************************************************************/
1023 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1024 Return the doc string for CODING-SYSTEM.
1028 coding_system = Fget_coding_system (coding_system);
1029 return XCODING_SYSTEM_DOC_STRING (coding_system);
1032 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1033 Return the type of CODING-SYSTEM.
1037 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1040 case CODESYS_AUTODETECT: return Qundecided;
1042 case CODESYS_SHIFT_JIS: return Qshift_jis;
1043 case CODESYS_ISO2022: return Qiso2022;
1044 case CODESYS_BIG5: return Qbig5;
1045 case CODESYS_UCS4: return Qucs4;
1046 case CODESYS_UTF8: return Qutf8;
1047 case CODESYS_CCL: return Qccl;
1049 case CODESYS_NO_CONVERSION: return Qno_conversion;
1051 case CODESYS_INTERNAL: return Qinternal;
1058 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1061 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1063 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1066 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1067 Return initial charset of CODING-SYSTEM designated to GNUM.
1070 (coding_system, gnum))
1072 coding_system = Fget_coding_system (coding_system);
1075 return coding_system_charset (coding_system, XINT (gnum));
1079 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1080 Return the PROP property of CODING-SYSTEM.
1082 (coding_system, prop))
1085 enum coding_system_type type;
1087 coding_system = Fget_coding_system (coding_system);
1088 CHECK_SYMBOL (prop);
1089 type = XCODING_SYSTEM_TYPE (coding_system);
1091 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1092 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1095 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1097 case CODESYS_PROP_ALL_OK:
1100 case CODESYS_PROP_ISO2022:
1101 if (type != CODESYS_ISO2022)
1103 ("Property only valid in ISO2022 coding systems",
1107 case CODESYS_PROP_CCL:
1108 if (type != CODESYS_CCL)
1110 ("Property only valid in CCL coding systems",
1120 signal_simple_error ("Unrecognized property", prop);
1122 if (EQ (prop, Qname))
1123 return XCODING_SYSTEM_NAME (coding_system);
1124 else if (EQ (prop, Qtype))
1125 return Fcoding_system_type (coding_system);
1126 else if (EQ (prop, Qdoc_string))
1127 return XCODING_SYSTEM_DOC_STRING (coding_system);
1128 else if (EQ (prop, Qmnemonic))
1129 return XCODING_SYSTEM_MNEMONIC (coding_system);
1130 else if (EQ (prop, Qeol_type))
1131 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1132 else if (EQ (prop, Qeol_lf))
1133 return XCODING_SYSTEM_EOL_LF (coding_system);
1134 else if (EQ (prop, Qeol_crlf))
1135 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1136 else if (EQ (prop, Qeol_cr))
1137 return XCODING_SYSTEM_EOL_CR (coding_system);
1138 else if (EQ (prop, Qpost_read_conversion))
1139 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1140 else if (EQ (prop, Qpre_write_conversion))
1141 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1143 else if (type == CODESYS_ISO2022)
1145 if (EQ (prop, Qcharset_g0))
1146 return coding_system_charset (coding_system, 0);
1147 else if (EQ (prop, Qcharset_g1))
1148 return coding_system_charset (coding_system, 1);
1149 else if (EQ (prop, Qcharset_g2))
1150 return coding_system_charset (coding_system, 2);
1151 else if (EQ (prop, Qcharset_g3))
1152 return coding_system_charset (coding_system, 3);
1154 #define FORCE_CHARSET(charset_num) \
1155 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1156 (coding_system, charset_num) ? Qt : Qnil)
1158 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1159 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1160 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1161 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1163 #define LISP_BOOLEAN(prop) \
1164 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1166 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1167 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1168 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1169 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1170 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1171 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1172 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1174 else if (EQ (prop, Qinput_charset_conversion))
1176 unparse_charset_conversion_specs
1177 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1178 else if (EQ (prop, Qoutput_charset_conversion))
1180 unparse_charset_conversion_specs
1181 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1185 else if (type == CODESYS_CCL)
1187 if (EQ (prop, Qdecode))
1188 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1189 else if (EQ (prop, Qencode))
1190 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1198 return Qnil; /* not reached */
1202 /************************************************************************/
1203 /* Coding category functions */
1204 /************************************************************************/
1207 decode_coding_category (Lisp_Object symbol)
1211 CHECK_SYMBOL (symbol);
1212 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1213 if (EQ (coding_category_symbol[i], symbol))
1216 signal_simple_error ("Unrecognized coding category", symbol);
1217 return 0; /* not reached */
1220 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1221 Return a list of all recognized coding categories.
1226 Lisp_Object list = Qnil;
1228 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1229 list = Fcons (coding_category_symbol[i], list);
1233 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1234 Change the priority order of the coding categories.
1235 LIST should be list of coding categories, in descending order of
1236 priority. Unspecified coding categories will be lower in priority
1237 than all specified ones, in the same relative order they were in
1242 int category_to_priority[CODING_CATEGORY_LAST + 1];
1246 /* First generate a list that maps coding categories to priorities. */
1248 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1249 category_to_priority[i] = -1;
1251 /* Highest priority comes from the specified list. */
1253 EXTERNAL_LIST_LOOP (rest, list)
1255 int cat = decode_coding_category (XCAR (rest));
1257 if (category_to_priority[cat] >= 0)
1258 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1259 category_to_priority[cat] = i++;
1262 /* Now go through the existing categories by priority to retrieve
1263 the categories not yet specified and preserve their priority
1265 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1267 int cat = coding_category_by_priority[j];
1268 if (category_to_priority[cat] < 0)
1269 category_to_priority[cat] = i++;
1272 /* Now we need to construct the inverse of the mapping we just
1275 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1276 coding_category_by_priority[category_to_priority[i]] = i;
1278 /* Phew! That was confusing. */
1282 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1283 Return a list of coding categories in descending order of priority.
1288 Lisp_Object list = Qnil;
1290 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1291 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1296 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1297 Change the coding system associated with a coding category.
1299 (coding_category, coding_system))
1301 int cat = decode_coding_category (coding_category);
1303 coding_system = Fget_coding_system (coding_system);
1304 coding_category_system[cat] = coding_system;
1308 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1309 Return the coding system associated with a coding category.
1313 int cat = decode_coding_category (coding_category);
1314 Lisp_Object sys = coding_category_system[cat];
1317 return XCODING_SYSTEM_NAME (sys);
1322 /************************************************************************/
1323 /* Detecting the encoding of data */
1324 /************************************************************************/
1326 struct detection_state
1328 enum eol_type eol_type;
1364 struct iso2022_decoder iso;
1366 int high_byte_count;
1367 unsigned int saw_single_shift:1;
1380 acceptable_control_char_p (int c)
1384 /* Allow and ignore control characters that you might
1385 reasonably see in a text file */
1390 case 8: /* backspace */
1391 case 11: /* vertical tab */
1392 case 12: /* form feed */
1393 case 26: /* MS-DOS C-z junk */
1394 case 31: /* '^_' -- for info */
1402 mask_has_at_most_one_bit_p (int mask)
1404 /* Perhaps the only thing useful you learn from intensive Microsoft
1405 technical interviews */
1406 return (mask & (mask - 1)) == 0;
1409 static enum eol_type
1410 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1419 st->eol.just_saw_cr = 1;
1424 if (st->eol.just_saw_cr)
1426 else if (st->eol.seen_anything)
1429 else if (st->eol.just_saw_cr)
1431 st->eol.just_saw_cr = 0;
1433 st->eol.seen_anything = 1;
1436 return EOL_AUTODETECT;
1439 /* Attempt to determine the encoding and EOL type of the given text.
1440 Before calling this function for the first type, you must initialize
1441 st->eol_type as appropriate and initialize st->mask to ~0.
1443 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1446 st->mask holds the determined coding category mask, or ~0 if only
1447 ASCII has been seen so far.
1451 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1452 is present in st->mask
1453 1 == definitive answers are here for both st->eol_type and st->mask
1457 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1458 unsigned int n, int just_do_eol)
1462 if (st->eol_type == EOL_AUTODETECT)
1463 st->eol_type = detect_eol_type (st, src, n);
1466 return st->eol_type != EOL_AUTODETECT;
1468 if (!st->seen_non_ascii)
1470 for (; n; n--, src++)
1473 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1475 st->seen_non_ascii = 1;
1477 st->shift_jis.mask = ~0;
1481 st->iso2022.mask = ~0;
1491 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1492 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1493 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1494 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1495 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1496 st->big5.mask = detect_coding_big5 (st, src, n);
1497 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1498 st->utf8.mask = detect_coding_utf8 (st, src, n);
1499 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1500 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1503 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1504 | st->utf8.mask | st->ucs4.mask;
1507 int retval = mask_has_at_most_one_bit_p (st->mask);
1508 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1509 return retval && st->eol_type != EOL_AUTODETECT;
1514 coding_system_from_mask (int mask)
1518 /* If the file was entirely or basically ASCII, use the
1519 default value of `buffer-file-coding-system'. */
1520 Lisp_Object retval =
1521 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1524 retval = Ffind_coding_system (retval);
1528 (Qbad_variable, Qwarning,
1529 "Invalid `default-buffer-file-coding-system', set to nil");
1530 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1534 retval = Fget_coding_system (Qraw_text);
1542 mask = postprocess_iso2022_mask (mask);
1544 /* Look through the coding categories by priority and find
1545 the first one that is allowed. */
1546 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1548 cat = coding_category_by_priority[i];
1549 if ((mask & (1 << cat)) &&
1550 !NILP (coding_category_system[cat]))
1554 return coding_category_system[cat];
1556 return Fget_coding_system (Qraw_text);
1560 /* Given a seekable read stream and potential coding system and EOL type
1561 as specified, do any autodetection that is called for. If the
1562 coding system and/or EOL type are not autodetect, they will be left
1563 alone; but this function will never return an autodetect coding system
1566 This function does not automatically fetch subsidiary coding systems;
1567 that should be unnecessary with the explicit eol-type argument. */
1570 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1571 enum eol_type *eol_type_in_out)
1573 struct detection_state decst;
1575 if (*eol_type_in_out == EOL_AUTODETECT)
1576 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1579 decst.eol_type = *eol_type_in_out;
1582 /* If autodetection is called for, do it now. */
1583 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1584 *eol_type_in_out == EOL_AUTODETECT)
1589 unsigned char random_buffer[4096];
1592 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1595 if (detect_coding_type (&decst, random_buffer, nread,
1596 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1597 CODESYS_AUTODETECT))
1601 *eol_type_in_out = decst.eol_type;
1602 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1603 *codesys_in_out = coding_system_from_mask (decst.mask);
1606 /* If we absolutely can't determine the EOL type, just assume LF. */
1607 if (*eol_type_in_out == EOL_AUTODETECT)
1608 *eol_type_in_out = EOL_LF;
1610 Lstream_rewind (stream);
1613 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1614 Detect coding system of the text in the region between START and END.
1615 Returned a list of possible coding systems ordered by priority.
1616 If only ASCII characters are found, it returns 'undecided or one of
1617 its subsidiary coding systems according to a detected end-of-line
1618 type. Optional arg BUFFER defaults to the current buffer.
1620 (start, end, buffer))
1622 Lisp_Object val = Qnil;
1623 struct buffer *buf = decode_buffer (buffer, 0);
1625 Lisp_Object instream, lb_instream;
1626 Lstream *istr, *lb_istr;
1627 struct detection_state decst;
1628 struct gcpro gcpro1, gcpro2;
1630 get_buffer_range_char (buf, start, end, &b, &e, 0);
1631 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1632 lb_istr = XLSTREAM (lb_instream);
1633 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1634 istr = XLSTREAM (instream);
1635 GCPRO2 (instream, lb_instream);
1637 decst.eol_type = EOL_AUTODETECT;
1641 unsigned char random_buffer[4096];
1642 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1646 if (detect_coding_type (&decst, random_buffer, nread, 0))
1650 if (decst.mask == ~0)
1651 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1659 decst.mask = postprocess_iso2022_mask (decst.mask);
1661 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1663 int sys = coding_category_by_priority[i];
1664 if (decst.mask & (1 << sys))
1666 Lisp_Object codesys = coding_category_system[sys];
1667 if (!NILP (codesys))
1668 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1669 val = Fcons (codesys, val);
1673 Lstream_close (istr);
1675 Lstream_delete (istr);
1676 Lstream_delete (lb_istr);
1681 /************************************************************************/
1682 /* Converting to internal Mule format ("decoding") */
1683 /************************************************************************/
1685 /* A decoding stream is a stream used for decoding text (i.e.
1686 converting from some external format to internal format).
1687 The decoding-stream object keeps track of the actual coding
1688 stream, the stream that is at the other end, and data that
1689 needs to be persistent across the lifetime of the stream. */
1691 /* Handle the EOL stuff related to just-read-in character C.
1692 EOL_TYPE is the EOL type of the coding stream.
1693 FLAGS is the current value of FLAGS in the coding stream, and may
1694 be modified by this macro. (The macro only looks at the
1695 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1696 bytes are to be written. You need to also define a local goto
1697 label "label_continue_loop" that is at the end of the main
1698 character-reading loop.
1700 If C is a CR character, then this macro handles it entirely and
1701 jumps to label_continue_loop. Otherwise, this macro does not add
1702 anything to DST, and continues normally. You should continue
1703 processing C normally after this macro. */
1705 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1709 if (eol_type == EOL_CR) \
1710 Dynarr_add (dst, '\n'); \
1711 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1712 Dynarr_add (dst, c); \
1714 flags |= CODING_STATE_CR; \
1715 goto label_continue_loop; \
1717 else if (flags & CODING_STATE_CR) \
1718 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1720 Dynarr_add (dst, '\r'); \
1721 flags &= ~CODING_STATE_CR; \
1725 /* C should be a binary character in the range 0 - 255; convert
1726 to internal format and add to Dynarr DST. */
1728 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1730 if (BYTE_ASCII_P (c)) \
1731 Dynarr_add (dst, c); \
1732 else if (BYTE_C1_P (c)) \
1734 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1735 Dynarr_add (dst, c + 0x20); \
1739 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1740 Dynarr_add (dst, c); \
1744 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1748 DECODE_ADD_BINARY_CHAR (ch, dst); \
1753 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1755 if (flags & CODING_STATE_END) \
1757 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1758 if (flags & CODING_STATE_CR) \
1759 Dynarr_add (dst, '\r'); \
1763 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1765 struct decoding_stream
1767 /* Coding system that governs the conversion. */
1768 Lisp_Coding_System *codesys;
1770 /* Stream that we read the encoded data from or
1771 write the decoded data to. */
1774 /* If we are reading, then we can return only a fixed amount of
1775 data, so if the conversion resulted in too much data, we store it
1776 here for retrieval the next time around. */
1777 unsigned_char_dynarr *runoff;
1779 /* FLAGS holds flags indicating the current state of the decoding.
1780 Some of these flags are dependent on the coding system. */
1783 /* CH holds a partially built-up character. Since we only deal
1784 with one- and two-byte characters at the moment, we only use
1785 this to store the first byte of a two-byte character. */
1788 /* EOL_TYPE specifies the type of end-of-line conversion that
1789 currently applies. We need to keep this separate from the
1790 EOL type stored in CODESYS because the latter might indicate
1791 automatic EOL-type detection while the former will always
1792 indicate a particular EOL type. */
1793 enum eol_type eol_type;
1795 /* Additional ISO2022 information. We define the structure above
1796 because it's also needed by the detection routines. */
1797 struct iso2022_decoder iso2022;
1799 /* Additional information (the state of the running CCL program)
1800 used by the CCL decoder. */
1801 struct ccl_program ccl;
1803 struct detection_state decst;
1806 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1807 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1808 static int decoding_rewinder (Lstream *stream);
1809 static int decoding_seekable_p (Lstream *stream);
1810 static int decoding_flusher (Lstream *stream);
1811 static int decoding_closer (Lstream *stream);
1813 static Lisp_Object decoding_marker (Lisp_Object stream,
1814 void (*markobj) (Lisp_Object));
1816 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1817 sizeof (struct decoding_stream));
1820 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1822 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1823 Lisp_Object str_obj;
1825 /* We do not need to mark the coding systems or charsets stored
1826 within the stream because they are stored in a global list
1827 and automatically marked. */
1829 XSETLSTREAM (str_obj, str);
1831 if (str->imp->marker)
1832 return (str->imp->marker) (str_obj, markobj);
1837 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1838 so we read data from the other end, decode it, and store it into DATA. */
1841 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1843 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1844 unsigned char *orig_data = data;
1846 int error_occurred = 0;
1848 /* We need to interface to mule_decode(), which expects to take some
1849 amount of data and store the result into a Dynarr. We have
1850 mule_decode() store into str->runoff, and take data from there
1853 /* We loop until we have enough data, reading chunks from the other
1854 end and decoding it. */
1857 /* Take data from the runoff if we can. Make sure to take at
1858 most SIZE bytes, and delete the data from the runoff. */
1859 if (Dynarr_length (str->runoff) > 0)
1861 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1862 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1863 Dynarr_delete_many (str->runoff, 0, chunk);
1869 break; /* No more room for data */
1871 if (str->flags & CODING_STATE_END)
1872 /* This means that on the previous iteration, we hit the EOF on
1873 the other end. We loop once more so that mule_decode() can
1874 output any final stuff it may be holding, or any "go back
1875 to a sane state" escape sequences. (This latter makes sense
1876 during encoding.) */
1879 /* Exhausted the runoff, so get some more. DATA has at least
1880 SIZE bytes left of storage in it, so it's OK to read directly
1881 into it. (We'll be overwriting above, after we've decoded it
1882 into the runoff.) */
1883 read_size = Lstream_read (str->other_end, data, size);
1890 /* There might be some more end data produced in the translation.
1891 See the comment above. */
1892 str->flags |= CODING_STATE_END;
1893 mule_decode (stream, data, str->runoff, read_size);
1896 if (data - orig_data == 0)
1897 return error_occurred ? -1 : 0;
1899 return data - orig_data;
1903 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1905 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1908 /* Decode all our data into the runoff, and then attempt to write
1909 it all out to the other end. Remove whatever chunk we succeeded
1911 mule_decode (stream, data, str->runoff, size);
1912 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1913 Dynarr_length (str->runoff));
1915 Dynarr_delete_many (str->runoff, 0, retval);
1916 /* Do NOT return retval. The return value indicates how much
1917 of the incoming data was written, not how many bytes were
1923 reset_decoding_stream (struct decoding_stream *str)
1926 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1928 Lisp_Object coding_system;
1929 XSETCODING_SYSTEM (coding_system, str->codesys);
1930 reset_iso2022 (coding_system, &str->iso2022);
1932 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1934 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1937 str->flags = str->ch = 0;
1941 decoding_rewinder (Lstream *stream)
1943 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1944 reset_decoding_stream (str);
1945 Dynarr_reset (str->runoff);
1946 return Lstream_rewind (str->other_end);
1950 decoding_seekable_p (Lstream *stream)
1952 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1953 return Lstream_seekable_p (str->other_end);
1957 decoding_flusher (Lstream *stream)
1959 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1960 return Lstream_flush (str->other_end);
1964 decoding_closer (Lstream *stream)
1966 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1967 if (stream->flags & LSTREAM_FL_WRITE)
1969 str->flags |= CODING_STATE_END;
1970 decoding_writer (stream, 0, 0);
1972 Dynarr_free (str->runoff);
1974 #ifdef ENABLE_COMPOSITE_CHARS
1975 if (str->iso2022.composite_chars)
1976 Dynarr_free (str->iso2022.composite_chars);
1979 return Lstream_close (str->other_end);
1983 decoding_stream_coding_system (Lstream *stream)
1985 Lisp_Object coding_system;
1986 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1988 XSETCODING_SYSTEM (coding_system, str->codesys);
1989 return subsidiary_coding_system (coding_system, str->eol_type);
1993 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1995 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1996 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1998 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1999 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2000 reset_decoding_stream (str);
2003 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2004 stream for writing, no automatic code detection will be performed.
2005 The reason for this is that automatic code detection requires a
2006 seekable input. Things will also fail if you open a decoding
2007 stream for reading using a non-fully-specified coding system and
2008 a non-seekable input stream. */
2011 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2014 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2015 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2019 str->other_end = stream;
2020 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2021 str->eol_type = EOL_AUTODETECT;
2022 if (!strcmp (mode, "r")
2023 && Lstream_seekable_p (stream))
2024 /* We can determine the coding system now. */
2025 determine_real_coding_system (stream, &codesys, &str->eol_type);
2026 set_decoding_stream_coding_system (lstr, codesys);
2027 str->decst.eol_type = str->eol_type;
2028 str->decst.mask = ~0;
2029 XSETLSTREAM (obj, lstr);
2034 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2036 return make_decoding_stream_1 (stream, codesys, "r");
2040 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2042 return make_decoding_stream_1 (stream, codesys, "w");
2045 /* Note: the decode_coding_* functions all take the same
2046 arguments as mule_decode(), which is to say some SRC data of
2047 size N, which is to be stored into dynamic array DST.
2048 DECODING is the stream within which the decoding is
2049 taking place, but no data is actually read from or
2050 written to that stream; that is handled in decoding_reader()
2051 or decoding_writer(). This allows the same functions to
2052 be used for both reading and writing. */
2055 mule_decode (Lstream *decoding, CONST unsigned char *src,
2056 unsigned_char_dynarr *dst, unsigned int n)
2058 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2060 /* If necessary, do encoding-detection now. We do this when
2061 we're a writing stream or a non-seekable reading stream,
2062 meaning that we can't just process the whole input,
2063 rewind, and start over. */
2065 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2066 str->eol_type == EOL_AUTODETECT)
2068 Lisp_Object codesys;
2070 XSETCODING_SYSTEM (codesys, str->codesys);
2071 detect_coding_type (&str->decst, src, n,
2072 CODING_SYSTEM_TYPE (str->codesys) !=
2073 CODESYS_AUTODETECT);
2074 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2075 str->decst.mask != ~0)
2076 /* #### This is cheesy. What we really ought to do is
2077 buffer up a certain amount of data so as to get a
2078 less random result. */
2079 codesys = coding_system_from_mask (str->decst.mask);
2080 str->eol_type = str->decst.eol_type;
2081 if (XCODING_SYSTEM (codesys) != str->codesys)
2083 /* Preserve the CODING_STATE_END flag in case it was set.
2084 If we erase it, bad things might happen. */
2085 int was_end = str->flags & CODING_STATE_END;
2086 set_decoding_stream_coding_system (decoding, codesys);
2088 str->flags |= CODING_STATE_END;
2092 switch (CODING_SYSTEM_TYPE (str->codesys))
2095 case CODESYS_INTERNAL:
2096 Dynarr_add_many (dst, src, n);
2099 case CODESYS_AUTODETECT:
2100 /* If we got this far and still haven't decided on the coding
2101 system, then do no conversion. */
2102 case CODESYS_NO_CONVERSION:
2103 decode_coding_no_conversion (decoding, src, dst, n);
2106 case CODESYS_SHIFT_JIS:
2107 decode_coding_sjis (decoding, src, dst, n);
2110 decode_coding_big5 (decoding, src, dst, n);
2113 decode_coding_ucs4 (decoding, src, dst, n);
2116 decode_coding_utf8 (decoding, src, dst, n);
2119 str->ccl.last_block = str->flags & CODING_STATE_END;
2120 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2122 case CODESYS_ISO2022:
2123 decode_coding_iso2022 (decoding, src, dst, n);
2131 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2132 Decode the text between START and END which is encoded in CODING-SYSTEM.
2133 This is useful if you've read in encoded text from a file without decoding
2134 it (e.g. you read in a JIS-formatted file but used the `binary' or
2135 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2136 Return length of decoded text.
2137 BUFFER defaults to the current buffer if unspecified.
2139 (start, end, coding_system, buffer))
2142 struct buffer *buf = decode_buffer (buffer, 0);
2143 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2144 Lstream *istr, *ostr;
2145 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2147 get_buffer_range_char (buf, start, end, &b, &e, 0);
2149 barf_if_buffer_read_only (buf, b, e);
2151 coding_system = Fget_coding_system (coding_system);
2152 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2153 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2154 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2156 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2157 Fget_coding_system (Qbinary));
2158 istr = XLSTREAM (instream);
2159 ostr = XLSTREAM (outstream);
2160 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2162 /* The chain of streams looks like this:
2164 [BUFFER] <----- send through
2165 ------> [ENCODE AS BINARY]
2166 ------> [DECODE AS SPECIFIED]
2172 char tempbuf[1024]; /* some random amount */
2173 Bufpos newpos, even_newer_pos;
2174 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2175 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2179 newpos = lisp_buffer_stream_startpos (istr);
2180 Lstream_write (ostr, tempbuf, size_in_bytes);
2181 even_newer_pos = lisp_buffer_stream_startpos (istr);
2182 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2185 Lstream_close (istr);
2186 Lstream_close (ostr);
2188 Lstream_delete (istr);
2189 Lstream_delete (ostr);
2190 Lstream_delete (XLSTREAM (de_outstream));
2191 Lstream_delete (XLSTREAM (lb_outstream));
2196 /************************************************************************/
2197 /* Converting to an external encoding ("encoding") */
2198 /************************************************************************/
2200 /* An encoding stream is an output stream. When you create the
2201 stream, you specify the coding system that governs the encoding
2202 and another stream that the resulting encoded data is to be
2203 sent to, and then start sending data to it. */
2205 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2207 struct encoding_stream
2209 /* Coding system that governs the conversion. */
2210 Lisp_Coding_System *codesys;
2212 /* Stream that we read the encoded data from or
2213 write the decoded data to. */
2216 /* If we are reading, then we can return only a fixed amount of
2217 data, so if the conversion resulted in too much data, we store it
2218 here for retrieval the next time around. */
2219 unsigned_char_dynarr *runoff;
2221 /* FLAGS holds flags indicating the current state of the encoding.
2222 Some of these flags are dependent on the coding system. */
2225 /* CH holds a partially built-up character. Since we only deal
2226 with one- and two-byte characters at the moment, we only use
2227 this to store the first byte of a two-byte character. */
2230 /* Additional information used by the ISO2022 encoder. */
2233 /* CHARSET holds the character sets currently assigned to the G0
2234 through G3 registers. It is initialized from the array
2235 INITIAL_CHARSET in CODESYS. */
2236 Lisp_Object charset[4];
2238 /* Which registers are currently invoked into the left (GL) and
2239 right (GR) halves of the 8-bit encoding space? */
2240 int register_left, register_right;
2242 /* Whether we need to explicitly designate the charset in the
2243 G? register before using it. It is initialized from the
2244 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2245 unsigned char force_charset_on_output[4];
2247 /* Other state variables that need to be preserved across
2249 Lisp_Object current_charset;
2251 int current_char_boundary;
2254 /* Additional information (the state of the running CCL program)
2255 used by the CCL encoder. */
2256 struct ccl_program ccl;
2260 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2261 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2263 static int encoding_rewinder (Lstream *stream);
2264 static int encoding_seekable_p (Lstream *stream);
2265 static int encoding_flusher (Lstream *stream);
2266 static int encoding_closer (Lstream *stream);
2268 static Lisp_Object encoding_marker (Lisp_Object stream,
2269 void (*markobj) (Lisp_Object));
2271 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2272 sizeof (struct encoding_stream));
2275 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2277 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2278 Lisp_Object str_obj;
2280 /* We do not need to mark the coding systems or charsets stored
2281 within the stream because they are stored in a global list
2282 and automatically marked. */
2284 XSETLSTREAM (str_obj, str);
2286 if (str->imp->marker)
2287 return (str->imp->marker) (str_obj, markobj);
2292 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2293 so we read data from the other end, encode it, and store it into DATA. */
2296 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2298 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2299 unsigned char *orig_data = data;
2301 int error_occurred = 0;
2303 /* We need to interface to mule_encode(), which expects to take some
2304 amount of data and store the result into a Dynarr. We have
2305 mule_encode() store into str->runoff, and take data from there
2308 /* We loop until we have enough data, reading chunks from the other
2309 end and encoding it. */
2312 /* Take data from the runoff if we can. Make sure to take at
2313 most SIZE bytes, and delete the data from the runoff. */
2314 if (Dynarr_length (str->runoff) > 0)
2316 int chunk = min ((int) size, Dynarr_length (str->runoff));
2317 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2318 Dynarr_delete_many (str->runoff, 0, chunk);
2324 break; /* No more room for data */
2326 if (str->flags & CODING_STATE_END)
2327 /* This means that on the previous iteration, we hit the EOF on
2328 the other end. We loop once more so that mule_encode() can
2329 output any final stuff it may be holding, or any "go back
2330 to a sane state" escape sequences. (This latter makes sense
2331 during encoding.) */
2334 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2335 left of storage in it, so it's OK to read directly into it.
2336 (We'll be overwriting above, after we've encoded it into the
2338 read_size = Lstream_read (str->other_end, data, size);
2345 /* There might be some more end data produced in the translation.
2346 See the comment above. */
2347 str->flags |= CODING_STATE_END;
2348 mule_encode (stream, data, str->runoff, read_size);
2351 if (data == orig_data)
2352 return error_occurred ? -1 : 0;
2354 return data - orig_data;
2358 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2360 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2363 /* Encode all our data into the runoff, and then attempt to write
2364 it all out to the other end. Remove whatever chunk we succeeded
2366 mule_encode (stream, data, str->runoff, size);
2367 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2368 Dynarr_length (str->runoff));
2370 Dynarr_delete_many (str->runoff, 0, retval);
2371 /* Do NOT return retval. The return value indicates how much
2372 of the incoming data was written, not how many bytes were
2378 reset_encoding_stream (struct encoding_stream *str)
2381 switch (CODING_SYSTEM_TYPE (str->codesys))
2383 case CODESYS_ISO2022:
2387 for (i = 0; i < 4; i++)
2389 str->iso2022.charset[i] =
2390 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2391 str->iso2022.force_charset_on_output[i] =
2392 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2394 str->iso2022.register_left = 0;
2395 str->iso2022.register_right = 1;
2396 str->iso2022.current_charset = Qnil;
2397 str->iso2022.current_half = 0;
2398 str->iso2022.current_char_boundary = 1;
2402 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2409 str->flags = str->ch = 0;
2413 encoding_rewinder (Lstream *stream)
2415 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2416 reset_encoding_stream (str);
2417 Dynarr_reset (str->runoff);
2418 return Lstream_rewind (str->other_end);
2422 encoding_seekable_p (Lstream *stream)
2424 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2425 return Lstream_seekable_p (str->other_end);
2429 encoding_flusher (Lstream *stream)
2431 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2432 return Lstream_flush (str->other_end);
2436 encoding_closer (Lstream *stream)
2438 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2439 if (stream->flags & LSTREAM_FL_WRITE)
2441 str->flags |= CODING_STATE_END;
2442 encoding_writer (stream, 0, 0);
2444 Dynarr_free (str->runoff);
2445 return Lstream_close (str->other_end);
2449 encoding_stream_coding_system (Lstream *stream)
2451 Lisp_Object coding_system;
2452 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2454 XSETCODING_SYSTEM (coding_system, str->codesys);
2455 return coding_system;
2459 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2461 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2462 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2464 reset_encoding_stream (str);
2468 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2471 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2472 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2476 str->runoff = Dynarr_new (unsigned_char);
2477 str->other_end = stream;
2478 set_encoding_stream_coding_system (lstr, codesys);
2479 XSETLSTREAM (obj, lstr);
2484 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2486 return make_encoding_stream_1 (stream, codesys, "r");
2490 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2492 return make_encoding_stream_1 (stream, codesys, "w");
2495 /* Convert N bytes of internally-formatted data stored in SRC to an
2496 external format, according to the encoding stream ENCODING.
2497 Store the encoded data into DST. */
2500 mule_encode (Lstream *encoding, CONST unsigned char *src,
2501 unsigned_char_dynarr *dst, unsigned int n)
2503 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2505 switch (CODING_SYSTEM_TYPE (str->codesys))
2508 case CODESYS_INTERNAL:
2509 Dynarr_add_many (dst, src, n);
2512 case CODESYS_AUTODETECT:
2513 /* If we got this far and still haven't decided on the coding
2514 system, then do no conversion. */
2515 case CODESYS_NO_CONVERSION:
2516 encode_coding_no_conversion (encoding, src, dst, n);
2519 case CODESYS_SHIFT_JIS:
2520 encode_coding_sjis (encoding, src, dst, n);
2523 encode_coding_big5 (encoding, src, dst, n);
2526 encode_coding_ucs4 (encoding, src, dst, n);
2529 encode_coding_utf8 (encoding, src, dst, n);
2532 str->ccl.last_block = str->flags & CODING_STATE_END;
2533 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2535 case CODESYS_ISO2022:
2536 encode_coding_iso2022 (encoding, src, dst, n);
2544 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2545 Encode the text between START and END using CODING-SYSTEM.
2546 This will, for example, convert Japanese characters into stuff such as
2547 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2548 text. BUFFER defaults to the current buffer if unspecified.
2550 (start, end, coding_system, buffer))
2553 struct buffer *buf = decode_buffer (buffer, 0);
2554 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2555 Lstream *istr, *ostr;
2556 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2558 get_buffer_range_char (buf, start, end, &b, &e, 0);
2560 barf_if_buffer_read_only (buf, b, e);
2562 coding_system = Fget_coding_system (coding_system);
2563 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2564 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2565 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2566 Fget_coding_system (Qbinary));
2567 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2569 istr = XLSTREAM (instream);
2570 ostr = XLSTREAM (outstream);
2571 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2572 /* The chain of streams looks like this:
2574 [BUFFER] <----- send through
2575 ------> [ENCODE AS SPECIFIED]
2576 ------> [DECODE AS BINARY]
2581 char tempbuf[1024]; /* some random amount */
2582 Bufpos newpos, even_newer_pos;
2583 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2584 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2588 newpos = lisp_buffer_stream_startpos (istr);
2589 Lstream_write (ostr, tempbuf, size_in_bytes);
2590 even_newer_pos = lisp_buffer_stream_startpos (istr);
2591 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2597 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2598 Lstream_close (istr);
2599 Lstream_close (ostr);
2601 Lstream_delete (istr);
2602 Lstream_delete (ostr);
2603 Lstream_delete (XLSTREAM (de_outstream));
2604 Lstream_delete (XLSTREAM (lb_outstream));
2605 return make_int (retlen);
2611 /************************************************************************/
2612 /* Shift-JIS methods */
2613 /************************************************************************/
2615 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2616 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2617 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2618 encoded by "position-code + 0x80". A character of JISX0208
2619 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2620 position-codes are divided and shifted so that it fit in the range
2623 --- CODE RANGE of Shift-JIS ---
2624 (character set) (range)
2626 JISX0201-Kana 0xA0 .. 0xDF
2627 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2628 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2629 -------------------------------
2633 /* Is this the first byte of a Shift-JIS two-byte char? */
2635 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2636 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2638 /* Is this the second byte of a Shift-JIS two-byte char? */
2640 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2641 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2643 #define BYTE_SJIS_KATAKANA_P(c) \
2644 ((c) >= 0xA1 && (c) <= 0xDF)
2647 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2655 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2657 if (st->shift_jis.in_second_byte)
2659 st->shift_jis.in_second_byte = 0;
2663 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2664 st->shift_jis.in_second_byte = 1;
2666 return CODING_CATEGORY_SHIFT_JIS_MASK;
2669 /* Convert Shift-JIS data to internal format. */
2672 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2673 unsigned_char_dynarr *dst, unsigned int n)
2676 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2677 unsigned int flags = str->flags;
2678 unsigned int ch = str->ch;
2679 eol_type_t eol_type = str->eol_type;
2687 /* Previous character was first byte of Shift-JIS Kanji char. */
2688 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2690 unsigned char e1, e2;
2692 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2693 DECODE_SJIS (ch, c, e1, e2);
2694 Dynarr_add (dst, e1);
2695 Dynarr_add (dst, e2);
2699 DECODE_ADD_BINARY_CHAR (ch, dst);
2700 DECODE_ADD_BINARY_CHAR (c, dst);
2706 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2707 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2709 else if (BYTE_SJIS_KATAKANA_P (c))
2711 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2712 Dynarr_add (dst, c);
2715 DECODE_ADD_BINARY_CHAR (c, dst);
2717 label_continue_loop:;
2720 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2726 /* Convert internally-formatted data to Shift-JIS. */
2729 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2730 unsigned_char_dynarr *dst, unsigned int n)
2733 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2734 unsigned int flags = str->flags;
2735 unsigned int ch = str->ch;
2736 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2743 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2744 Dynarr_add (dst, '\r');
2745 if (eol_type != EOL_CR)
2746 Dynarr_add (dst, '\n');
2749 else if (BYTE_ASCII_P (c))
2751 Dynarr_add (dst, c);
2754 else if (BUFBYTE_LEADING_BYTE_P (c))
2755 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2756 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2757 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2760 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2762 Dynarr_add (dst, c);
2765 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2766 ch == LEADING_BYTE_JAPANESE_JISX0208)
2770 unsigned char j1, j2;
2771 ENCODE_SJIS (ch, c, j1, j2);
2772 Dynarr_add (dst, j1);
2773 Dynarr_add (dst, j2);
2783 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2784 Decode a JISX0208 character of Shift-JIS coding-system.
2785 CODE is the character code in Shift-JIS as a cons of type bytes.
2786 Return the corresponding character.
2790 unsigned char c1, c2, s1, s2;
2793 CHECK_INT (XCAR (code));
2794 CHECK_INT (XCDR (code));
2795 s1 = XINT (XCAR (code));
2796 s2 = XINT (XCDR (code));
2797 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2798 BYTE_SJIS_TWO_BYTE_2_P (s2))
2800 DECODE_SJIS (s1, s2, c1, c2);
2801 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2802 c1 & 0x7F, c2 & 0x7F));
2808 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2809 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2810 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2814 Lisp_Object charset;
2817 CHECK_CHAR_COERCE_INT (ch);
2818 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2819 if (EQ (charset, Vcharset_japanese_jisx0208))
2821 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2822 return Fcons (make_int (s1), make_int (s2));
2829 /************************************************************************/
2831 /************************************************************************/
2833 /* BIG5 is a coding system encoding two character sets: ASCII and
2834 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2835 character set and is encoded in two-byte.
2837 --- CODE RANGE of BIG5 ---
2838 (character set) (range)
2840 Big5 (1st byte) 0xA1 .. 0xFE
2841 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2842 --------------------------
2844 Since the number of characters in Big5 is larger than maximum
2845 characters in Emacs' charset (96x96), it can't be handled as one
2846 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2847 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2848 contains frequently used characters and the latter contains less
2849 frequently used characters. */
2851 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2852 ((c) >= 0xA1 && (c) <= 0xFE)
2854 /* Is this the second byte of a Shift-JIS two-byte char? */
2856 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2857 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2859 /* Number of Big5 characters which have the same code in 1st byte. */
2861 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2863 /* Code conversion macros. These are macros because they are used in
2864 inner loops during code conversion.
2866 Note that temporary variables in macros introduce the classic
2867 dynamic-scoping problems with variable names. We use capital-
2868 lettered variables in the assumption that XEmacs does not use
2869 capital letters in variables except in a very formalized way
2872 /* Convert Big5 code (b1, b2) into its internal string representation
2875 /* There is a much simpler way to split the Big5 charset into two.
2876 For the moment I'm going to leave the algorithm as-is because it
2877 claims to separate out the most-used characters into a single
2878 charset, which perhaps will lead to optimizations in various
2881 The way the algorithm works is something like this:
2883 Big5 can be viewed as a 94x157 charset, where the row is
2884 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2885 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2886 the split between low and high column numbers is apparently
2887 meaningless; ascending rows produce less and less frequent chars.
2888 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2889 the first charset, and the upper half (0xC9 .. 0xFE) to the
2890 second. To do the conversion, we convert the character into
2891 a single number where 0 .. 156 is the first row, 157 .. 313
2892 is the second, etc. That way, the characters are ordered by
2893 decreasing frequency. Then we just chop the space in two
2894 and coerce the result into a 94x94 space.
2897 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2899 int B1 = b1, B2 = b2; \
2901 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2905 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2909 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2910 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2912 c1 = I / (0xFF - 0xA1) + 0xA1; \
2913 c2 = I % (0xFF - 0xA1) + 0xA1; \
2916 /* Convert the internal string representation of a Big5 character
2917 (lb, c1, c2) into Big5 code (b1, b2). */
2919 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2921 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2923 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2925 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2927 b1 = I / BIG5_SAME_ROW + 0xA1; \
2928 b2 = I % BIG5_SAME_ROW; \
2929 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2933 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2941 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2942 (c >= 0x80 && c <= 0xA0))
2944 if (st->big5.in_second_byte)
2946 st->big5.in_second_byte = 0;
2947 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2951 st->big5.in_second_byte = 1;
2953 return CODING_CATEGORY_BIG5_MASK;
2956 /* Convert Big5 data to internal format. */
2959 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2960 unsigned_char_dynarr *dst, unsigned int n)
2963 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2964 unsigned int flags = str->flags;
2965 unsigned int ch = str->ch;
2966 eol_type_t eol_type = str->eol_type;
2973 /* Previous character was first byte of Big5 char. */
2974 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2976 unsigned char b1, b2, b3;
2977 DECODE_BIG5 (ch, c, b1, b2, b3);
2978 Dynarr_add (dst, b1);
2979 Dynarr_add (dst, b2);
2980 Dynarr_add (dst, b3);
2984 DECODE_ADD_BINARY_CHAR (ch, dst);
2985 DECODE_ADD_BINARY_CHAR (c, dst);
2991 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2992 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2995 DECODE_ADD_BINARY_CHAR (c, dst);
2997 label_continue_loop:;
3000 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3006 /* Convert internally-formatted data to Big5. */
3009 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3010 unsigned_char_dynarr *dst, unsigned int n)
3013 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3014 unsigned int flags = str->flags;
3015 unsigned int ch = str->ch;
3016 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3023 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3024 Dynarr_add (dst, '\r');
3025 if (eol_type != EOL_CR)
3026 Dynarr_add (dst, '\n');
3028 else if (BYTE_ASCII_P (c))
3031 Dynarr_add (dst, c);
3033 else if (BUFBYTE_LEADING_BYTE_P (c))
3035 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3036 c == LEADING_BYTE_CHINESE_BIG5_2)
3038 /* A recognized leading byte. */
3040 continue; /* not done with this character. */
3042 /* otherwise just ignore this character. */
3044 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3045 ch == LEADING_BYTE_CHINESE_BIG5_2)
3047 /* Previous char was a recognized leading byte. */
3049 continue; /* not done with this character. */
3053 /* Encountering second byte of a Big5 character. */
3054 unsigned char b1, b2;
3056 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3057 Dynarr_add (dst, b1);
3058 Dynarr_add (dst, b2);
3069 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3070 Decode a Big5 character CODE of BIG5 coding-system.
3071 CODE is the character code in BIG5, a cons of two integers.
3072 Return the corresponding character.
3076 unsigned char c1, c2, b1, b2;
3079 CHECK_INT (XCAR (code));
3080 CHECK_INT (XCDR (code));
3081 b1 = XINT (XCAR (code));
3082 b2 = XINT (XCDR (code));
3083 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3084 BYTE_BIG5_TWO_BYTE_2_P (b2))
3087 Lisp_Object charset;
3088 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3089 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3090 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3096 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3097 Encode the Big5 character CH to BIG5 coding-system.
3098 Return the corresponding character code in Big5.
3102 Lisp_Object charset;
3105 CHECK_CHAR_COERCE_INT (ch);
3106 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3107 if (EQ (charset, Vcharset_chinese_big5_1) ||
3108 EQ (charset, Vcharset_chinese_big5_2))
3110 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3112 return Fcons (make_int (b1), make_int (b2));
3119 /************************************************************************/
3122 /* UCS-4 character codes are implemented as nonnegative integers. */
3124 /************************************************************************/
3126 Lisp_Object ucs_to_mule_table[65536];
3127 Lisp_Object mule_to_ucs_table;
3129 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3130 Map UCS-4 code CODE to Mule character CHARACTER.
3132 Return T on success, NIL on failure.
3138 CHECK_CHAR (character);
3142 if (c < sizeof (ucs_to_mule_table))
3144 ucs_to_mule_table[c] = character;
3152 ucs_to_char (unsigned long code)
3154 if (code < sizeof (ucs_to_mule_table))
3156 return ucs_to_mule_table[code];
3158 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3163 c = code % (94 * 94);
3165 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3166 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3167 CHARSET_LEFT_TO_RIGHT),
3168 c / 94 + 33, c % 94 + 33));
3174 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3175 Return Mule character corresponding to UCS code CODE (a positive integer).
3179 CHECK_NATNUM (code);
3180 return ucs_to_char (XINT (code));
3183 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3184 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3188 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3189 Fset_char_ucs is more restrictive on index arg, but should
3190 check code arg in a char_table method. */
3191 CHECK_CHAR (character);
3192 CHECK_NATNUM (code);
3193 return Fput_char_table (character, code, mule_to_ucs_table);
3196 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3197 Return the UCS code (a positive integer) corresponding to CHARACTER.
3201 return Fget_char_table (character, mule_to_ucs_table);
3204 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3205 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3206 is not found, instead.
3207 #### do something more appropriate (use blob?)
3208 Danger, Will Robinson! Data loss. Should we signal user? */
3210 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3212 Lisp_Object chr = ucs_to_char (ch);
3216 Bufbyte work[MAX_EMCHAR_LEN];
3221 simple_set_charptr_emchar (work, ch) :
3222 non_ascii_set_charptr_emchar (work, ch);
3223 Dynarr_add_many (dst, work, len);
3227 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3228 Dynarr_add (dst, 34 + 128);
3229 Dynarr_add (dst, 46 + 128);
3233 static unsigned long
3234 mule_char_to_ucs4 (Lisp_Object charset,
3235 unsigned char h, unsigned char l)
3238 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3245 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3246 (XCHARSET_CHARS (charset) == 94) )
3248 unsigned char final = XCHARSET_FINAL (charset);
3250 if ( ('@' <= final) && (final < 0x7f) )
3252 return 0xe00000 + (final - '@') * 94 * 94
3253 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3267 encode_ucs4 (Lisp_Object charset,
3268 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3270 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3271 Dynarr_add (dst, code >> 24);
3272 Dynarr_add (dst, (code >> 16) & 255);
3273 Dynarr_add (dst, (code >> 8) & 255);
3274 Dynarr_add (dst, code & 255);
3278 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3284 switch (st->ucs4.in_byte)
3293 st->ucs4.in_byte = 0;
3299 return CODING_CATEGORY_UCS4_MASK;
3303 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3304 unsigned_char_dynarr *dst, unsigned int n)
3306 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3307 unsigned int flags = str->flags;
3308 unsigned int ch = str->ch;
3312 unsigned char c = *src++;
3320 decode_ucs4 ( ( ch << 8 ) | c, dst);
3325 ch = ( ch << 8 ) | c;
3329 if (flags & CODING_STATE_END)
3330 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3337 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3338 unsigned_char_dynarr *dst, unsigned int n)
3340 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3341 unsigned int flags = str->flags;
3342 unsigned int ch = str->ch;
3343 unsigned char char_boundary = str->iso2022.current_char_boundary;
3344 Lisp_Object charset = str->iso2022.current_charset;
3346 #ifdef ENABLE_COMPOSITE_CHARS
3347 /* flags for handling composite chars. We do a little switcharoo
3348 on the source while we're outputting the composite char. */
3349 unsigned int saved_n = 0;
3350 CONST unsigned char *saved_src = NULL;
3351 int in_composite = 0;
3358 unsigned char c = *src++;
3360 if (BYTE_ASCII_P (c))
3361 { /* Processing ASCII character */
3363 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3366 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3367 { /* Processing Leading Byte */
3369 charset = CHARSET_BY_LEADING_BYTE (c);
3370 if (LEADING_BYTE_PREFIX_P(c))
3375 { /* Processing Non-ASCII character */
3377 if (EQ (charset, Vcharset_control_1))
3379 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3383 switch (XCHARSET_REP_BYTES (charset))
3386 encode_ucs4 (charset, c, 0, dst);
3389 if (XCHARSET_PRIVATE_P (charset))
3391 encode_ucs4 (charset, c, 0, dst);
3396 #ifdef ENABLE_COMPOSITE_CHARS
3397 if (EQ (charset, Vcharset_composite))
3401 /* #### Bother! We don't know how to
3403 Dynarr_add (dst, 0);
3404 Dynarr_add (dst, 0);
3405 Dynarr_add (dst, 0);
3406 Dynarr_add (dst, '~');
3410 Emchar emch = MAKE_CHAR (Vcharset_composite,
3411 ch & 0x7F, c & 0x7F);
3412 Lisp_Object lstr = composite_char_string (emch);
3416 src = XSTRING_DATA (lstr);
3417 n = XSTRING_LENGTH (lstr);
3421 #endif /* ENABLE_COMPOSITE_CHARS */
3423 encode_ucs4(charset, ch, c, dst);
3436 encode_ucs4 (charset, ch, c, dst);
3452 #ifdef ENABLE_COMPOSITE_CHARS
3458 goto back_to_square_n; /* Wheeeeeeeee ..... */
3460 #endif /* ENABLE_COMPOSITE_CHARS */
3464 str->iso2022.current_char_boundary = char_boundary;
3465 str->iso2022.current_charset = charset;
3467 /* Verbum caro factum est! */
3471 /************************************************************************/
3473 /************************************************************************/
3476 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3481 unsigned char c = *src++;
3482 switch (st->utf8.in_byte)
3485 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3488 st->utf8.in_byte = 5;
3490 st->utf8.in_byte = 4;
3492 st->utf8.in_byte = 3;
3494 st->utf8.in_byte = 2;
3496 st->utf8.in_byte = 1;
3501 if ((c & 0xc0) != 0x80)
3507 return CODING_CATEGORY_UTF8_MASK;
3511 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3512 unsigned_char_dynarr *dst, unsigned int n)
3514 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3515 unsigned int flags = str->flags;
3516 unsigned int ch = str->ch;
3517 eol_type_t eol_type = str->eol_type;
3521 unsigned char c = *src++;
3530 else if ( c >= 0xf8 )
3535 else if ( c >= 0xf0 )
3540 else if ( c >= 0xe0 )
3545 else if ( c >= 0xc0 )
3552 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3553 decode_ucs4 (c, dst);
3557 ch = ( ch << 6 ) | ( c & 0x3f );
3558 decode_ucs4 (ch, dst);
3563 ch = ( ch << 6 ) | ( c & 0x3f );
3566 label_continue_loop:;
3569 if (flags & CODING_STATE_END)
3570 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3577 encode_utf8 (Lisp_Object charset,
3578 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3580 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3583 Dynarr_add (dst, code);
3585 else if ( code <= 0x7ff )
3587 Dynarr_add (dst, (code >> 6) | 0xc0);
3588 Dynarr_add (dst, (code & 0x3f) | 0x80);
3590 else if ( code <= 0xffff )
3592 Dynarr_add (dst, (code >> 12) | 0xe0);
3593 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3594 Dynarr_add (dst, (code & 0x3f) | 0x80);
3596 else if ( code <= 0x1fffff )
3598 Dynarr_add (dst, (code >> 18) | 0xf0);
3599 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3600 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3601 Dynarr_add (dst, (code & 0x3f) | 0x80);
3603 else if ( code <= 0x3ffffff )
3605 Dynarr_add (dst, (code >> 24) | 0xf8);
3606 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3607 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3608 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3609 Dynarr_add (dst, (code & 0x3f) | 0x80);
3613 Dynarr_add (dst, (code >> 30) | 0xfc);
3614 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3615 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3616 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3617 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3618 Dynarr_add (dst, (code & 0x3f) | 0x80);
3623 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3624 unsigned_char_dynarr *dst, unsigned int n)
3626 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3627 unsigned int flags = str->flags;
3628 unsigned int ch = str->ch;
3629 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3630 unsigned char char_boundary = str->iso2022.current_char_boundary;
3631 Lisp_Object charset = str->iso2022.current_charset;
3633 #ifdef ENABLE_COMPOSITE_CHARS
3634 /* flags for handling composite chars. We do a little switcharoo
3635 on the source while we're outputting the composite char. */
3636 unsigned int saved_n = 0;
3637 CONST unsigned char *saved_src = NULL;
3638 int in_composite = 0;
3641 #endif /* ENABLE_COMPOSITE_CHARS */
3645 unsigned char c = *src++;
3647 if (BYTE_ASCII_P (c))
3648 { /* Processing ASCII character */
3652 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3653 Dynarr_add (dst, '\r');
3654 if (eol_type != EOL_CR)
3655 Dynarr_add (dst, c);
3658 encode_utf8 (Vcharset_ascii, c, 0, dst);
3661 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3662 { /* Processing Leading Byte */
3664 charset = CHARSET_BY_LEADING_BYTE (c);
3665 if (LEADING_BYTE_PREFIX_P(c))
3670 { /* Processing Non-ASCII character */
3672 if (EQ (charset, Vcharset_control_1))
3674 encode_utf8 (Vcharset_control_1, c, 0, dst);
3678 switch (XCHARSET_REP_BYTES (charset))
3681 encode_utf8 (charset, c, 0, dst);
3684 if (XCHARSET_PRIVATE_P (charset))
3686 encode_utf8 (charset, c, 0, dst);
3691 #ifdef ENABLE_COMPOSITE_CHARS
3692 if (EQ (charset, Vcharset_composite))
3696 /* #### Bother! We don't know how to
3698 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3702 Emchar emch = MAKE_CHAR (Vcharset_composite,
3703 ch & 0x7F, c & 0x7F);
3704 Lisp_Object lstr = composite_char_string (emch);
3708 src = XSTRING_DATA (lstr);
3709 n = XSTRING_LENGTH (lstr);
3713 #endif /* ENABLE_COMPOSITE_CHARS */
3715 encode_utf8 (charset, ch, c, dst);
3728 encode_utf8 (charset, ch, c, dst);
3744 #ifdef ENABLE_COMPOSITE_CHARS
3750 goto back_to_square_n; /* Wheeeeeeeee ..... */
3756 str->iso2022.current_char_boundary = char_boundary;
3757 str->iso2022.current_charset = charset;
3759 /* Verbum caro factum est! */
3763 /************************************************************************/
3764 /* ISO2022 methods */
3765 /************************************************************************/
3767 /* The following note describes the coding system ISO2022 briefly.
3768 Since the intention of this note is to help understand the
3769 functions in this file, some parts are NOT ACCURATE or OVERLY
3770 SIMPLIFIED. For thorough understanding, please refer to the
3771 original document of ISO2022.
3773 ISO2022 provides many mechanisms to encode several character sets
3774 in 7-bit and 8-bit environments. For 7-bit environments, all text
3775 is encoded using bytes less than 128. This may make the encoded
3776 text a little bit longer, but the text passes more easily through
3777 several gateways, some of which strip off MSB (Most Signigant Bit).
3779 There are two kinds of character sets: control character set and
3780 graphic character set. The former contains control characters such
3781 as `newline' and `escape' to provide control functions (control
3782 functions are also provided by escape sequences). The latter
3783 contains graphic characters such as 'A' and '-'. Emacs recognizes
3784 two control character sets and many graphic character sets.
3786 Graphic character sets are classified into one of the following
3787 four classes, according to the number of bytes (DIMENSION) and
3788 number of characters in one dimension (CHARS) of the set:
3789 - DIMENSION1_CHARS94
3790 - DIMENSION1_CHARS96
3791 - DIMENSION2_CHARS94
3792 - DIMENSION2_CHARS96
3794 In addition, each character set is assigned an identification tag,
3795 unique for each set, called "final character" (denoted as <F>
3796 hereafter). The <F> of each character set is decided by ECMA(*)
3797 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3798 (0x30..0x3F are for private use only).
3800 Note (*): ECMA = European Computer Manufacturers Association
3802 Here are examples of graphic character set [NAME(<F>)]:
3803 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3804 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3805 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3806 o DIMENSION2_CHARS96 -- none for the moment
3808 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3809 C0 [0x00..0x1F] -- control character plane 0
3810 GL [0x20..0x7F] -- graphic character plane 0
3811 C1 [0x80..0x9F] -- control character plane 1
3812 GR [0xA0..0xFF] -- graphic character plane 1
3814 A control character set is directly designated and invoked to C0 or
3815 C1 by an escape sequence. The most common case is that:
3816 - ISO646's control character set is designated/invoked to C0, and
3817 - ISO6429's control character set is designated/invoked to C1,
3818 and usually these designations/invocations are omitted in encoded
3819 text. In a 7-bit environment, only C0 can be used, and a control
3820 character for C1 is encoded by an appropriate escape sequence to
3821 fit into the environment. All control characters for C1 are
3822 defined to have corresponding escape sequences.
3824 A graphic character set is at first designated to one of four
3825 graphic registers (G0 through G3), then these graphic registers are
3826 invoked to GL or GR. These designations and invocations can be
3827 done independently. The most common case is that G0 is invoked to
3828 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3829 these invocations and designations are omitted in encoded text.
3830 In a 7-bit environment, only GL can be used.
3832 When a graphic character set of CHARS94 is invoked to GL, codes
3833 0x20 and 0x7F of the GL area work as control characters SPACE and
3834 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3837 There are two ways of invocation: locking-shift and single-shift.
3838 With locking-shift, the invocation lasts until the next different
3839 invocation, whereas with single-shift, the invocation affects the
3840 following character only and doesn't affect the locking-shift
3841 state. Invocations are done by the following control characters or
3844 ----------------------------------------------------------------------
3845 abbrev function cntrl escape seq description
3846 ----------------------------------------------------------------------
3847 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3848 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3849 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3850 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3851 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3852 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3853 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3854 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3855 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3856 ----------------------------------------------------------------------
3857 (*) These are not used by any known coding system.
3859 Control characters for these functions are defined by macros
3860 ISO_CODE_XXX in `coding.h'.
3862 Designations are done by the following escape sequences:
3863 ----------------------------------------------------------------------
3864 escape sequence description
3865 ----------------------------------------------------------------------
3866 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3867 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3868 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3869 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3870 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3871 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3872 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3873 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3874 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3875 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3876 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3877 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3878 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3879 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3880 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3881 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3882 ----------------------------------------------------------------------
3884 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3885 of dimension 1, chars 94, and final character <F>, etc...
3887 Note (*): Although these designations are not allowed in ISO2022,
3888 Emacs accepts them on decoding, and produces them on encoding
3889 CHARS96 character sets in a coding system which is characterized as
3890 7-bit environment, non-locking-shift, and non-single-shift.
3892 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3893 '(' can be omitted. We refer to this as "short-form" hereafter.
3895 Now you may notice that there are a lot of ways for encoding the
3896 same multilingual text in ISO2022. Actually, there exist many
3897 coding systems such as Compound Text (used in X11's inter client
3898 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3899 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3900 localized platforms), and all of these are variants of ISO2022.
3902 In addition to the above, Emacs handles two more kinds of escape
3903 sequences: ISO6429's direction specification and Emacs' private
3904 sequence for specifying character composition.
3906 ISO6429's direction specification takes the following form:
3907 o CSI ']' -- end of the current direction
3908 o CSI '0' ']' -- end of the current direction
3909 o CSI '1' ']' -- start of left-to-right text
3910 o CSI '2' ']' -- start of right-to-left text
3911 The control character CSI (0x9B: control sequence introducer) is
3912 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3914 Character composition specification takes the following form:
3915 o ESC '0' -- start character composition
3916 o ESC '1' -- end character composition
3917 Since these are not standard escape sequences of any ISO standard,
3918 their use with these meanings is restricted to Emacs only. */
3921 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3925 for (i = 0; i < 4; i++)
3927 if (!NILP (coding_system))
3929 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3931 iso->charset[i] = Qt;
3932 iso->invalid_designated[i] = 0;
3934 iso->esc = ISO_ESC_NOTHING;
3935 iso->esc_bytes_index = 0;
3936 iso->register_left = 0;
3937 iso->register_right = 1;
3938 iso->switched_dir_and_no_valid_charset_yet = 0;
3939 iso->invalid_switch_dir = 0;
3940 iso->output_direction_sequence = 0;
3941 iso->output_literally = 0;
3942 #ifdef ENABLE_COMPOSITE_CHARS
3943 if (iso->composite_chars)
3944 Dynarr_reset (iso->composite_chars);
3949 fit_to_be_escape_quoted (unsigned char c)
3966 /* Parse one byte of an ISO2022 escape sequence.
3967 If the result is an invalid escape sequence, return 0 and
3968 do not change anything in STR. Otherwise, if the result is
3969 an incomplete escape sequence, update ISO2022.ESC and
3970 ISO2022.ESC_BYTES and return -1. Otherwise, update
3971 all the state variables (but not ISO2022.ESC_BYTES) and
3974 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3975 or invocation of an invalid character set and treat that as
3976 an unrecognized escape sequence. */
3979 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3980 unsigned char c, unsigned int *flags,
3981 int check_invalid_charsets)
3983 /* (1) If we're at the end of a designation sequence, CS is the
3984 charset being designated and REG is the register to designate
3987 (2) If we're at the end of a locking-shift sequence, REG is
3988 the register to invoke and HALF (0 == left, 1 == right) is
3989 the half to invoke it into.
3991 (3) If we're at the end of a single-shift sequence, REG is
3992 the register to invoke. */
3993 Lisp_Object cs = Qnil;
3996 /* NOTE: This code does goto's all over the fucking place.
3997 The reason for this is that we're basically implementing
3998 a state machine here, and hierarchical languages like C
3999 don't really provide a clean way of doing this. */
4001 if (! (*flags & CODING_STATE_ESCAPE))
4002 /* At beginning of escape sequence; we need to reset our
4003 escape-state variables. */
4004 iso->esc = ISO_ESC_NOTHING;
4006 iso->output_literally = 0;
4007 iso->output_direction_sequence = 0;
4011 case ISO_ESC_NOTHING:
4012 iso->esc_bytes_index = 0;
4015 case ISO_CODE_ESC: /* Start escape sequence */
4016 *flags |= CODING_STATE_ESCAPE;
4020 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4021 *flags |= CODING_STATE_ESCAPE;
4022 iso->esc = ISO_ESC_5_11;
4025 case ISO_CODE_SO: /* locking shift 1 */
4028 case ISO_CODE_SI: /* locking shift 0 */
4032 case ISO_CODE_SS2: /* single shift */
4035 case ISO_CODE_SS3: /* single shift */
4039 default: /* Other control characters */
4046 /**** single shift ****/
4048 case 'N': /* single shift 2 */
4051 case 'O': /* single shift 3 */
4055 /**** locking shift ****/
4057 case '~': /* locking shift 1 right */
4060 case 'n': /* locking shift 2 */
4063 case '}': /* locking shift 2 right */
4066 case 'o': /* locking shift 3 */
4069 case '|': /* locking shift 3 right */
4073 #ifdef ENABLE_COMPOSITE_CHARS
4074 /**** composite ****/
4077 iso->esc = ISO_ESC_START_COMPOSITE;
4078 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4079 CODING_STATE_COMPOSITE;
4083 iso->esc = ISO_ESC_END_COMPOSITE;
4084 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4085 ~CODING_STATE_COMPOSITE;
4087 #endif /* ENABLE_COMPOSITE_CHARS */
4089 /**** directionality ****/
4092 iso->esc = ISO_ESC_5_11;
4095 /**** designation ****/
4097 case '$': /* multibyte charset prefix */
4098 iso->esc = ISO_ESC_2_4;
4102 if (0x28 <= c && c <= 0x2F)
4104 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4108 /* This function is called with CODESYS equal to nil when
4109 doing coding-system detection. */
4111 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4112 && fit_to_be_escape_quoted (c))
4114 iso->esc = ISO_ESC_LITERAL;
4115 *flags &= CODING_STATE_ISO2022_LOCK;
4125 /**** directionality ****/
4127 case ISO_ESC_5_11: /* ISO6429 direction control */
4130 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4131 goto directionality;
4133 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4134 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4135 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4139 case ISO_ESC_5_11_0:
4142 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4143 goto directionality;
4147 case ISO_ESC_5_11_1:
4150 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4151 goto directionality;
4155 case ISO_ESC_5_11_2:
4158 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4159 goto directionality;
4164 iso->esc = ISO_ESC_DIRECTIONALITY;
4165 /* Various junk here to attempt to preserve the direction sequences
4166 literally in the text if they would otherwise be swallowed due
4167 to invalid designations that don't show up as actual charset
4168 changes in the text. */
4169 if (iso->invalid_switch_dir)
4171 /* We already inserted a direction switch literally into the
4172 text. We assume (#### this may not be right) that the
4173 next direction switch is the one going the other way,
4174 and we need to output that literally as well. */
4175 iso->output_literally = 1;
4176 iso->invalid_switch_dir = 0;
4182 /* If we are in the thrall of an invalid designation,
4183 then stick the directionality sequence literally into the
4184 output stream so it ends up in the original text again. */
4185 for (jj = 0; jj < 4; jj++)
4186 if (iso->invalid_designated[jj])
4190 iso->output_literally = 1;
4191 iso->invalid_switch_dir = 1;
4194 /* Indicate that we haven't yet seen a valid designation,
4195 so that if a switch-dir is directly followed by an
4196 invalid designation, both get inserted literally. */
4197 iso->switched_dir_and_no_valid_charset_yet = 1;
4202 /**** designation ****/
4205 if (0x28 <= c && c <= 0x2F)
4207 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4210 if (0x40 <= c && c <= 0x42)
4212 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4213 *flags & CODING_STATE_R2L ?
4214 CHARSET_RIGHT_TO_LEFT :
4215 CHARSET_LEFT_TO_RIGHT);
4225 if (c < '0' || c > '~')
4226 return 0; /* bad final byte */
4228 if (iso->esc >= ISO_ESC_2_8 &&
4229 iso->esc <= ISO_ESC_2_15)
4231 type = ((iso->esc >= ISO_ESC_2_12) ?
4232 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4233 reg = (iso->esc - ISO_ESC_2_8) & 3;
4235 else if (iso->esc >= ISO_ESC_2_4_8 &&
4236 iso->esc <= ISO_ESC_2_4_15)
4238 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4239 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4240 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4244 /* Can this ever be reached? -slb */
4248 cs = CHARSET_BY_ATTRIBUTES (type, c,
4249 *flags & CODING_STATE_R2L ?
4250 CHARSET_RIGHT_TO_LEFT :
4251 CHARSET_LEFT_TO_RIGHT);
4257 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4261 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4262 /* can't invoke something that ain't there. */
4264 iso->esc = ISO_ESC_SINGLE_SHIFT;
4265 *flags &= CODING_STATE_ISO2022_LOCK;
4267 *flags |= CODING_STATE_SS2;
4269 *flags |= CODING_STATE_SS3;
4273 if (check_invalid_charsets &&
4274 !CHARSETP (iso->charset[reg]))
4275 /* can't invoke something that ain't there. */
4278 iso->register_right = reg;
4280 iso->register_left = reg;
4281 *flags &= CODING_STATE_ISO2022_LOCK;
4282 iso->esc = ISO_ESC_LOCKING_SHIFT;
4286 if (NILP (cs) && check_invalid_charsets)
4288 iso->invalid_designated[reg] = 1;
4289 iso->charset[reg] = Vcharset_ascii;
4290 iso->esc = ISO_ESC_DESIGNATE;
4291 *flags &= CODING_STATE_ISO2022_LOCK;
4292 iso->output_literally = 1;
4293 if (iso->switched_dir_and_no_valid_charset_yet)
4295 /* We encountered a switch-direction followed by an
4296 invalid designation. Ensure that the switch-direction
4297 gets outputted; otherwise it will probably get eaten
4298 when the text is written out again. */
4299 iso->switched_dir_and_no_valid_charset_yet = 0;
4300 iso->output_direction_sequence = 1;
4301 /* And make sure that the switch-dir going the other
4302 way gets outputted, as well. */
4303 iso->invalid_switch_dir = 1;
4307 /* This function is called with CODESYS equal to nil when
4308 doing coding-system detection. */
4309 if (!NILP (codesys))
4311 charset_conversion_spec_dynarr *dyn =
4312 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4318 for (i = 0; i < Dynarr_length (dyn); i++)
4320 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4321 if (EQ (cs, spec->from_charset))
4322 cs = spec->to_charset;
4327 iso->charset[reg] = cs;
4328 iso->esc = ISO_ESC_DESIGNATE;
4329 *flags &= CODING_STATE_ISO2022_LOCK;
4330 if (iso->invalid_designated[reg])
4332 iso->invalid_designated[reg] = 0;
4333 iso->output_literally = 1;
4335 if (iso->switched_dir_and_no_valid_charset_yet)
4336 iso->switched_dir_and_no_valid_charset_yet = 0;
4341 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4346 /* #### There are serious deficiencies in the recognition mechanism
4347 here. This needs to be much smarter if it's going to cut it.
4348 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4349 it should be detected as Latin-1.
4350 All the ISO2022 stuff in this file should be synced up with the
4351 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4352 Perhaps we should wait till R2L works in FSF Emacs? */
4354 if (!st->iso2022.initted)
4356 reset_iso2022 (Qnil, &st->iso2022.iso);
4357 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4358 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4359 CODING_CATEGORY_ISO_8_1_MASK |
4360 CODING_CATEGORY_ISO_8_2_MASK |
4361 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4362 st->iso2022.flags = 0;
4363 st->iso2022.high_byte_count = 0;
4364 st->iso2022.saw_single_shift = 0;
4365 st->iso2022.initted = 1;
4368 mask = st->iso2022.mask;
4375 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4376 st->iso2022.high_byte_count++;
4380 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4382 if (st->iso2022.high_byte_count & 1)
4383 /* odd number of high bytes; assume not iso-8-2 */
4384 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4386 st->iso2022.high_byte_count = 0;
4387 st->iso2022.saw_single_shift = 0;
4389 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4391 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4392 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4393 { /* control chars */
4396 /* Allow and ignore control characters that you might
4397 reasonably see in a text file */
4402 case 8: /* backspace */
4403 case 11: /* vertical tab */
4404 case 12: /* form feed */
4405 case 26: /* MS-DOS C-z junk */
4406 case 31: /* '^_' -- for info */
4407 goto label_continue_loop;
4414 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4417 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4418 &st->iso2022.flags, 0))
4420 switch (st->iso2022.iso.esc)
4422 case ISO_ESC_DESIGNATE:
4423 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4424 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4426 case ISO_ESC_LOCKING_SHIFT:
4427 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4428 goto ran_out_of_chars;
4429 case ISO_ESC_SINGLE_SHIFT:
4430 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4431 st->iso2022.saw_single_shift = 1;
4440 goto ran_out_of_chars;
4443 label_continue_loop:;
4452 postprocess_iso2022_mask (int mask)
4454 /* #### kind of cheesy */
4455 /* If seven-bit ISO is allowed, then assume that the encoding is
4456 entirely seven-bit and turn off the eight-bit ones. */
4457 if (mask & CODING_CATEGORY_ISO_7_MASK)
4458 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4459 CODING_CATEGORY_ISO_8_1_MASK |
4460 CODING_CATEGORY_ISO_8_2_MASK);
4464 /* If FLAGS is a null pointer or specifies right-to-left motion,
4465 output a switch-dir-to-left-to-right sequence to DST.
4466 Also update FLAGS if it is not a null pointer.
4467 If INTERNAL_P is set, we are outputting in internal format and
4468 need to handle the CSI differently. */
4471 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4472 unsigned_char_dynarr *dst,
4473 unsigned int *flags,
4476 if (!flags || (*flags & CODING_STATE_R2L))
4478 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4480 Dynarr_add (dst, ISO_CODE_ESC);
4481 Dynarr_add (dst, '[');
4483 else if (internal_p)
4484 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4486 Dynarr_add (dst, ISO_CODE_CSI);
4487 Dynarr_add (dst, '0');
4488 Dynarr_add (dst, ']');
4490 *flags &= ~CODING_STATE_R2L;
4494 /* If FLAGS is a null pointer or specifies a direction different from
4495 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4496 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4497 sequence to DST. Also update FLAGS if it is not a null pointer.
4498 If INTERNAL_P is set, we are outputting in internal format and
4499 need to handle the CSI differently. */
4502 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4503 unsigned_char_dynarr *dst, unsigned int *flags,
4506 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4507 direction == CHARSET_LEFT_TO_RIGHT)
4508 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4509 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4510 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4511 direction == CHARSET_RIGHT_TO_LEFT)
4513 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4515 Dynarr_add (dst, ISO_CODE_ESC);
4516 Dynarr_add (dst, '[');
4518 else if (internal_p)
4519 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4521 Dynarr_add (dst, ISO_CODE_CSI);
4522 Dynarr_add (dst, '2');
4523 Dynarr_add (dst, ']');
4525 *flags |= CODING_STATE_R2L;
4529 /* Convert ISO2022-format data to internal format. */
4532 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4533 unsigned_char_dynarr *dst, unsigned int n)
4535 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4536 unsigned int flags = str->flags;
4537 unsigned int ch = str->ch;
4538 eol_type_t eol_type = str->eol_type;
4539 #ifdef ENABLE_COMPOSITE_CHARS
4540 unsigned_char_dynarr *real_dst = dst;
4542 Lisp_Object coding_system;
4544 XSETCODING_SYSTEM (coding_system, str->codesys);
4546 #ifdef ENABLE_COMPOSITE_CHARS
4547 if (flags & CODING_STATE_COMPOSITE)
4548 dst = str->iso2022.composite_chars;
4549 #endif /* ENABLE_COMPOSITE_CHARS */
4553 unsigned char c = *src++;
4554 if (flags & CODING_STATE_ESCAPE)
4555 { /* Within ESC sequence */
4556 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4561 switch (str->iso2022.esc)
4563 #ifdef ENABLE_COMPOSITE_CHARS
4564 case ISO_ESC_START_COMPOSITE:
4565 if (str->iso2022.composite_chars)
4566 Dynarr_reset (str->iso2022.composite_chars);
4568 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4569 dst = str->iso2022.composite_chars;
4571 case ISO_ESC_END_COMPOSITE:
4573 Bufbyte comstr[MAX_EMCHAR_LEN];
4575 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4576 Dynarr_length (dst));
4578 len = set_charptr_emchar (comstr, emch);
4579 Dynarr_add_many (dst, comstr, len);
4582 #endif /* ENABLE_COMPOSITE_CHARS */
4584 case ISO_ESC_LITERAL:
4585 DECODE_ADD_BINARY_CHAR (c, dst);
4589 /* Everything else handled already */
4594 /* Attempted error recovery. */
4595 if (str->iso2022.output_direction_sequence)
4596 ensure_correct_direction (flags & CODING_STATE_R2L ?
4597 CHARSET_RIGHT_TO_LEFT :
4598 CHARSET_LEFT_TO_RIGHT,
4599 str->codesys, dst, 0, 1);
4600 /* More error recovery. */
4601 if (!retval || str->iso2022.output_literally)
4603 /* Output the (possibly invalid) sequence */
4605 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4606 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4607 flags &= CODING_STATE_ISO2022_LOCK;
4609 n++, src--;/* Repeat the loop with the same character. */
4612 /* No sense in reprocessing the final byte of the
4613 escape sequence; it could mess things up anyway.
4615 DECODE_ADD_BINARY_CHAR (c, dst);
4620 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4621 { /* Control characters */
4623 /***** Error-handling *****/
4625 /* If we were in the middle of a character, dump out the
4626 partial character. */
4627 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4629 /* If we just saw a single-shift character, dump it out.
4630 This may dump out the wrong sort of single-shift character,
4631 but least it will give an indication that something went
4633 if (flags & CODING_STATE_SS2)
4635 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4636 flags &= ~CODING_STATE_SS2;
4638 if (flags & CODING_STATE_SS3)
4640 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4641 flags &= ~CODING_STATE_SS3;
4644 /***** Now handle the control characters. *****/
4647 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4649 flags &= CODING_STATE_ISO2022_LOCK;
4651 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4652 DECODE_ADD_BINARY_CHAR (c, dst);
4655 { /* Graphic characters */
4656 Lisp_Object charset;
4660 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4662 /* Now determine the charset. */
4663 reg = ((flags & CODING_STATE_SS2) ? 2
4664 : (flags & CODING_STATE_SS3) ? 3
4665 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4666 : str->iso2022.register_left);
4667 charset = str->iso2022.charset[reg];
4669 /* Error checking: */
4670 if (! CHARSETP (charset)
4671 || str->iso2022.invalid_designated[reg]
4672 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4673 && XCHARSET_CHARS (charset) == 94))
4674 /* Mrmph. We are trying to invoke a register that has no
4675 or an invalid charset in it, or trying to add a character
4676 outside the range of the charset. Insert that char literally
4677 to preserve it for the output. */
4679 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4680 DECODE_ADD_BINARY_CHAR (c, dst);
4685 /* Things are probably hunky-dorey. */
4687 /* Fetch reverse charset, maybe. */
4688 if (((flags & CODING_STATE_R2L) &&
4689 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4691 (!(flags & CODING_STATE_R2L) &&
4692 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4694 Lisp_Object new_charset =
4695 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4696 if (!NILP (new_charset))
4697 charset = new_charset;
4700 lb = XCHARSET_LEADING_BYTE (charset);
4701 switch (XCHARSET_REP_BYTES (charset))
4704 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4705 Dynarr_add (dst, c & 0x7F);
4708 case 2: /* one-byte official */
4709 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4710 Dynarr_add (dst, lb);
4711 Dynarr_add (dst, c | 0x80);
4714 case 3: /* one-byte private or two-byte official */
4715 if (XCHARSET_PRIVATE_P (charset))
4717 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4718 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4719 Dynarr_add (dst, lb);
4720 Dynarr_add (dst, c | 0x80);
4726 Dynarr_add (dst, lb);
4727 Dynarr_add (dst, ch | 0x80);
4728 Dynarr_add (dst, c | 0x80);
4736 default: /* two-byte private */
4739 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4740 Dynarr_add (dst, lb);
4741 Dynarr_add (dst, ch | 0x80);
4742 Dynarr_add (dst, c | 0x80);
4751 flags &= CODING_STATE_ISO2022_LOCK;
4754 label_continue_loop:;
4757 if (flags & CODING_STATE_END)
4758 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4765 /***** ISO2022 encoder *****/
4767 /* Designate CHARSET into register REG. */
4770 iso2022_designate (Lisp_Object charset, unsigned char reg,
4771 struct encoding_stream *str, unsigned_char_dynarr *dst)
4773 static CONST char inter94[] = "()*+";
4774 static CONST char inter96[] = ",-./";
4776 unsigned char final;
4777 Lisp_Object old_charset = str->iso2022.charset[reg];
4779 str->iso2022.charset[reg] = charset;
4780 if (!CHARSETP (charset))
4781 /* charset might be an initial nil or t. */
4783 type = XCHARSET_TYPE (charset);
4784 final = XCHARSET_FINAL (charset);
4785 if (!str->iso2022.force_charset_on_output[reg] &&
4786 CHARSETP (old_charset) &&
4787 XCHARSET_TYPE (old_charset) == type &&
4788 XCHARSET_FINAL (old_charset) == final)
4791 str->iso2022.force_charset_on_output[reg] = 0;
4794 charset_conversion_spec_dynarr *dyn =
4795 str->codesys->iso2022.output_conv;
4801 for (i = 0; i < Dynarr_length (dyn); i++)
4803 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4804 if (EQ (charset, spec->from_charset))
4805 charset = spec->to_charset;
4810 Dynarr_add (dst, ISO_CODE_ESC);
4813 case CHARSET_TYPE_94:
4814 Dynarr_add (dst, inter94[reg]);
4816 case CHARSET_TYPE_96:
4817 Dynarr_add (dst, inter96[reg]);
4819 case CHARSET_TYPE_94X94:
4820 Dynarr_add (dst, '$');
4822 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4825 Dynarr_add (dst, inter94[reg]);
4827 case CHARSET_TYPE_96X96:
4828 Dynarr_add (dst, '$');
4829 Dynarr_add (dst, inter96[reg]);
4832 Dynarr_add (dst, final);
4836 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4838 if (str->iso2022.register_left != 0)
4840 Dynarr_add (dst, ISO_CODE_SI);
4841 str->iso2022.register_left = 0;
4846 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4848 if (str->iso2022.register_left != 1)
4850 Dynarr_add (dst, ISO_CODE_SO);
4851 str->iso2022.register_left = 1;
4855 /* Convert internally-formatted data to ISO2022 format. */
4858 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4859 unsigned_char_dynarr *dst, unsigned int n)
4861 unsigned char charmask, c;
4862 unsigned char char_boundary;
4863 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4864 unsigned int flags = str->flags;
4865 unsigned int ch = str->ch;
4866 Lisp_Coding_System *codesys = str->codesys;
4867 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4869 Lisp_Object charset;
4872 #ifdef ENABLE_COMPOSITE_CHARS
4873 /* flags for handling composite chars. We do a little switcharoo
4874 on the source while we're outputting the composite char. */
4875 unsigned int saved_n = 0;
4876 CONST unsigned char *saved_src = NULL;
4877 int in_composite = 0;
4878 #endif /* ENABLE_COMPOSITE_CHARS */
4880 char_boundary = str->iso2022.current_char_boundary;
4881 charset = str->iso2022.current_charset;
4882 half = str->iso2022.current_half;
4884 #ifdef ENABLE_COMPOSITE_CHARS
4891 if (BYTE_ASCII_P (c))
4892 { /* Processing ASCII character */
4895 restore_left_to_right_direction (codesys, dst, &flags, 0);
4897 /* Make sure G0 contains ASCII */
4898 if ((c > ' ' && c < ISO_CODE_DEL) ||
4899 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4901 ensure_normal_shift (str, dst);
4902 iso2022_designate (Vcharset_ascii, 0, str, dst);
4905 /* If necessary, restore everything to the default state
4908 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4910 restore_left_to_right_direction (codesys, dst, &flags, 0);
4912 ensure_normal_shift (str, dst);
4914 for (i = 0; i < 4; i++)
4916 Lisp_Object initial_charset =
4917 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4918 iso2022_designate (initial_charset, i, str, dst);
4923 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4924 Dynarr_add (dst, '\r');
4925 if (eol_type != EOL_CR)
4926 Dynarr_add (dst, c);
4930 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4931 && fit_to_be_escape_quoted (c))
4932 Dynarr_add (dst, ISO_CODE_ESC);
4933 Dynarr_add (dst, c);
4938 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4939 { /* Processing Leading Byte */
4941 charset = CHARSET_BY_LEADING_BYTE (c);
4942 if (LEADING_BYTE_PREFIX_P(c))
4944 else if (!EQ (charset, Vcharset_control_1)
4945 #ifdef ENABLE_COMPOSITE_CHARS
4946 && !EQ (charset, Vcharset_composite)
4952 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4953 codesys, dst, &flags, 0);
4955 /* Now determine which register to use. */
4957 for (i = 0; i < 4; i++)
4959 if (EQ (charset, str->iso2022.charset[i]) ||
4961 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4970 if (XCHARSET_GRAPHIC (charset) != 0)
4972 if (!NILP (str->iso2022.charset[1]) &&
4973 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4974 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4976 else if (!NILP (str->iso2022.charset[2]))
4978 else if (!NILP (str->iso2022.charset[3]))
4987 iso2022_designate (charset, reg, str, dst);
4989 /* Now invoke that register. */
4993 ensure_normal_shift (str, dst);
4998 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5000 ensure_shift_out (str, dst);
5008 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5010 Dynarr_add (dst, ISO_CODE_ESC);
5011 Dynarr_add (dst, 'N');
5016 Dynarr_add (dst, ISO_CODE_SS2);
5022 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5024 Dynarr_add (dst, ISO_CODE_ESC);
5025 Dynarr_add (dst, 'O');
5030 Dynarr_add (dst, ISO_CODE_SS3);
5042 { /* Processing Non-ASCII character */
5043 charmask = (half == 0 ? 0x7F : 0xFF);
5045 if (EQ (charset, Vcharset_control_1))
5047 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5048 && fit_to_be_escape_quoted (c))
5049 Dynarr_add (dst, ISO_CODE_ESC);
5050 /* you asked for it ... */
5051 Dynarr_add (dst, c - 0x20);
5055 switch (XCHARSET_REP_BYTES (charset))
5058 Dynarr_add (dst, c & charmask);
5061 if (XCHARSET_PRIVATE_P (charset))
5063 Dynarr_add (dst, c & charmask);
5068 #ifdef ENABLE_COMPOSITE_CHARS
5069 if (EQ (charset, Vcharset_composite))
5073 /* #### Bother! We don't know how to
5075 Dynarr_add (dst, '~');
5079 Emchar emch = MAKE_CHAR (Vcharset_composite,
5080 ch & 0x7F, c & 0x7F);
5081 Lisp_Object lstr = composite_char_string (emch);
5085 src = XSTRING_DATA (lstr);
5086 n = XSTRING_LENGTH (lstr);
5087 Dynarr_add (dst, ISO_CODE_ESC);
5088 Dynarr_add (dst, '0'); /* start composing */
5092 #endif /* ENABLE_COMPOSITE_CHARS */
5094 Dynarr_add (dst, ch & charmask);
5095 Dynarr_add (dst, c & charmask);
5108 Dynarr_add (dst, ch & charmask);
5109 Dynarr_add (dst, c & charmask);
5125 #ifdef ENABLE_COMPOSITE_CHARS
5131 Dynarr_add (dst, ISO_CODE_ESC);
5132 Dynarr_add (dst, '1'); /* end composing */
5133 goto back_to_square_n; /* Wheeeeeeeee ..... */
5135 #endif /* ENABLE_COMPOSITE_CHARS */
5137 if (char_boundary && flags & CODING_STATE_END)
5139 restore_left_to_right_direction (codesys, dst, &flags, 0);
5140 ensure_normal_shift (str, dst);
5141 for (i = 0; i < 4; i++)
5143 Lisp_Object initial_charset =
5144 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5145 iso2022_designate (initial_charset, i, str, dst);
5151 str->iso2022.current_char_boundary = char_boundary;
5152 str->iso2022.current_charset = charset;
5153 str->iso2022.current_half = half;
5155 /* Verbum caro factum est! */
5159 /************************************************************************/
5160 /* No-conversion methods */
5161 /************************************************************************/
5163 /* This is used when reading in "binary" files -- i.e. files that may
5164 contain all 256 possible byte values and that are not to be
5165 interpreted as being in any particular decoding. */
5167 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5168 unsigned_char_dynarr *dst, unsigned int n)
5171 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5172 unsigned int flags = str->flags;
5173 unsigned int ch = str->ch;
5174 eol_type_t eol_type = str->eol_type;
5180 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5181 DECODE_ADD_BINARY_CHAR (c, dst);
5182 label_continue_loop:;
5185 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5192 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5193 unsigned_char_dynarr *dst, unsigned int n)
5196 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5197 unsigned int flags = str->flags;
5198 unsigned int ch = str->ch;
5199 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5206 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5207 Dynarr_add (dst, '\r');
5208 if (eol_type != EOL_CR)
5209 Dynarr_add (dst, '\n');
5212 else if (BYTE_ASCII_P (c))
5215 Dynarr_add (dst, c);
5217 else if (BUFBYTE_LEADING_BYTE_P (c))
5220 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5221 c == LEADING_BYTE_CONTROL_1)
5224 Dynarr_add (dst, '~'); /* untranslatable character */
5228 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5229 Dynarr_add (dst, c);
5230 else if (ch == LEADING_BYTE_CONTROL_1)
5233 Dynarr_add (dst, c - 0x20);
5235 /* else it should be the second or third byte of an
5236 untranslatable character, so ignore it */
5246 /************************************************************************/
5247 /* Simple internal/external functions */
5248 /************************************************************************/
5250 static Extbyte_dynarr *conversion_out_dynarr;
5251 static Bufbyte_dynarr *conversion_in_dynarr;
5253 /* Determine coding system from coding format */
5255 /* #### not correct for all values of `fmt'! */
5257 external_data_format_to_coding_system (enum external_data_format fmt)
5261 case FORMAT_FILENAME:
5262 case FORMAT_TERMINAL:
5263 if (EQ (Vfile_name_coding_system, Qnil) ||
5264 EQ (Vfile_name_coding_system, Qbinary))
5267 return Fget_coding_system (Vfile_name_coding_system);
5270 return Fget_coding_system (Qctext);
5278 convert_to_external_format (CONST Bufbyte *ptr,
5281 enum external_data_format fmt)
5283 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5285 if (!conversion_out_dynarr)
5286 conversion_out_dynarr = Dynarr_new (Extbyte);
5288 Dynarr_reset (conversion_out_dynarr);
5290 if (NILP (coding_system))
5292 CONST Bufbyte *end = ptr + len;
5297 (BYTE_ASCII_P (*ptr)) ? *ptr :
5298 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5299 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5302 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5306 #ifdef ERROR_CHECK_BUFPOS
5307 assert (ptr == end);
5312 Lisp_Object instream, outstream, da_outstream;
5313 Lstream *istr, *ostr;
5314 struct gcpro gcpro1, gcpro2, gcpro3;
5315 char tempbuf[1024]; /* some random amount */
5317 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5318 da_outstream = make_dynarr_output_stream
5319 ((unsigned_char_dynarr *) conversion_out_dynarr);
5321 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5322 istr = XLSTREAM (instream);
5323 ostr = XLSTREAM (outstream);
5324 GCPRO3 (instream, outstream, da_outstream);
5327 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5330 Lstream_write (ostr, tempbuf, size_in_bytes);
5332 Lstream_close (istr);
5333 Lstream_close (ostr);
5335 Lstream_delete (istr);
5336 Lstream_delete (ostr);
5337 Lstream_delete (XLSTREAM (da_outstream));
5340 *len_out = Dynarr_length (conversion_out_dynarr);
5341 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5342 return Dynarr_atp (conversion_out_dynarr, 0);
5346 convert_from_external_format (CONST Extbyte *ptr,
5349 enum external_data_format fmt)
5351 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5353 if (!conversion_in_dynarr)
5354 conversion_in_dynarr = Dynarr_new (Bufbyte);
5356 Dynarr_reset (conversion_in_dynarr);
5358 if (NILP (coding_system))
5360 CONST Extbyte *end = ptr + len;
5361 for (; ptr < end; ptr++)
5364 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5369 Lisp_Object instream, outstream, da_outstream;
5370 Lstream *istr, *ostr;
5371 struct gcpro gcpro1, gcpro2, gcpro3;
5372 char tempbuf[1024]; /* some random amount */
5374 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5375 da_outstream = make_dynarr_output_stream
5376 ((unsigned_char_dynarr *) conversion_in_dynarr);
5378 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5379 istr = XLSTREAM (instream);
5380 ostr = XLSTREAM (outstream);
5381 GCPRO3 (instream, outstream, da_outstream);
5384 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5387 Lstream_write (ostr, tempbuf, size_in_bytes);
5389 Lstream_close (istr);
5390 Lstream_close (ostr);
5392 Lstream_delete (istr);
5393 Lstream_delete (ostr);
5394 Lstream_delete (XLSTREAM (da_outstream));
5397 *len_out = Dynarr_length (conversion_in_dynarr);
5398 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5399 return Dynarr_atp (conversion_in_dynarr, 0);
5403 /************************************************************************/
5404 /* Initialization */
5405 /************************************************************************/
5408 syms_of_file_coding (void)
5410 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5411 deferror (&Qcoding_system_error, "coding-system-error",
5412 "Coding-system error", Qio_error);
5414 DEFSUBR (Fcoding_system_p);
5415 DEFSUBR (Ffind_coding_system);
5416 DEFSUBR (Fget_coding_system);
5417 DEFSUBR (Fcoding_system_list);
5418 DEFSUBR (Fcoding_system_name);
5419 DEFSUBR (Fmake_coding_system);
5420 DEFSUBR (Fcopy_coding_system);
5421 DEFSUBR (Fdefine_coding_system_alias);
5422 DEFSUBR (Fsubsidiary_coding_system);
5424 DEFSUBR (Fcoding_system_type);
5425 DEFSUBR (Fcoding_system_doc_string);
5427 DEFSUBR (Fcoding_system_charset);
5429 DEFSUBR (Fcoding_system_property);
5431 DEFSUBR (Fcoding_category_list);
5432 DEFSUBR (Fset_coding_priority_list);
5433 DEFSUBR (Fcoding_priority_list);
5434 DEFSUBR (Fset_coding_category_system);
5435 DEFSUBR (Fcoding_category_system);
5437 DEFSUBR (Fdetect_coding_region);
5438 DEFSUBR (Fdecode_coding_region);
5439 DEFSUBR (Fencode_coding_region);
5441 DEFSUBR (Fdecode_shift_jis_char);
5442 DEFSUBR (Fencode_shift_jis_char);
5443 DEFSUBR (Fdecode_big5_char);
5444 DEFSUBR (Fencode_big5_char);
5445 DEFSUBR (Fset_ucs_char);
5446 DEFSUBR (Fucs_char);
5447 DEFSUBR (Fset_char_ucs);
5448 DEFSUBR (Fchar_ucs);
5450 defsymbol (&Qcoding_system_p, "coding-system-p");
5451 defsymbol (&Qno_conversion, "no-conversion");
5452 defsymbol (&Qraw_text, "raw-text");
5454 defsymbol (&Qbig5, "big5");
5455 defsymbol (&Qshift_jis, "shift-jis");
5456 defsymbol (&Qucs4, "ucs-4");
5457 defsymbol (&Qutf8, "utf-8");
5458 defsymbol (&Qccl, "ccl");
5459 defsymbol (&Qiso2022, "iso2022");
5461 defsymbol (&Qmnemonic, "mnemonic");
5462 defsymbol (&Qeol_type, "eol-type");
5463 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5464 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5466 defsymbol (&Qcr, "cr");
5467 defsymbol (&Qlf, "lf");
5468 defsymbol (&Qcrlf, "crlf");
5469 defsymbol (&Qeol_cr, "eol-cr");
5470 defsymbol (&Qeol_lf, "eol-lf");
5471 defsymbol (&Qeol_crlf, "eol-crlf");
5473 defsymbol (&Qcharset_g0, "charset-g0");
5474 defsymbol (&Qcharset_g1, "charset-g1");
5475 defsymbol (&Qcharset_g2, "charset-g2");
5476 defsymbol (&Qcharset_g3, "charset-g3");
5477 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5478 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5479 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5480 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5481 defsymbol (&Qno_iso6429, "no-iso6429");
5482 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5483 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5485 defsymbol (&Qshort, "short");
5486 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5487 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5488 defsymbol (&Qseven, "seven");
5489 defsymbol (&Qlock_shift, "lock-shift");
5490 defsymbol (&Qescape_quoted, "escape-quoted");
5492 defsymbol (&Qencode, "encode");
5493 defsymbol (&Qdecode, "decode");
5496 defsymbol (&Qctext, "ctext");
5497 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5499 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5501 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5503 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5505 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5507 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5509 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5511 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5513 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5516 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5521 lstream_type_create_file_coding (void)
5523 LSTREAM_HAS_METHOD (decoding, reader);
5524 LSTREAM_HAS_METHOD (decoding, writer);
5525 LSTREAM_HAS_METHOD (decoding, rewinder);
5526 LSTREAM_HAS_METHOD (decoding, seekable_p);
5527 LSTREAM_HAS_METHOD (decoding, flusher);
5528 LSTREAM_HAS_METHOD (decoding, closer);
5529 LSTREAM_HAS_METHOD (decoding, marker);
5531 LSTREAM_HAS_METHOD (encoding, reader);
5532 LSTREAM_HAS_METHOD (encoding, writer);
5533 LSTREAM_HAS_METHOD (encoding, rewinder);
5534 LSTREAM_HAS_METHOD (encoding, seekable_p);
5535 LSTREAM_HAS_METHOD (encoding, flusher);
5536 LSTREAM_HAS_METHOD (encoding, closer);
5537 LSTREAM_HAS_METHOD (encoding, marker);
5541 vars_of_file_coding (void)
5545 /* Initialize to something reasonable ... */
5546 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5548 coding_category_system[i] = Qnil;
5549 coding_category_by_priority[i] = i;
5552 Fprovide (intern ("file-coding"));
5554 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5555 Coding system used for TTY keyboard input.
5556 Not used under a windowing system.
5558 Vkeyboard_coding_system = Qnil;
5560 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5561 Coding system used for TTY display output.
5562 Not used under a windowing system.
5564 Vterminal_coding_system = Qnil;
5566 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5567 Overriding coding system used when writing a file or process.
5568 You should *bind* this, not set it. If this is non-nil, it specifies
5569 the coding system that will be used when a file or process is read
5570 in, and overrides `buffer-file-coding-system-for-read',
5571 `insert-file-contents-pre-hook', etc. Use those variables instead of
5572 this one for permanent changes to the environment.
5574 Vcoding_system_for_read = Qnil;
5576 DEFVAR_LISP ("coding-system-for-write",
5577 &Vcoding_system_for_write /*
5578 Overriding coding system used when writing a file or process.
5579 You should *bind* this, not set it. If this is non-nil, it specifies
5580 the coding system that will be used when a file or process is wrote
5581 in, and overrides `buffer-file-coding-system',
5582 `write-region-pre-hook', etc. Use those variables instead of this one
5583 for permanent changes to the environment.
5585 Vcoding_system_for_write = Qnil;
5587 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5588 Coding system used to convert pathnames when accessing files.
5590 Vfile_name_coding_system = Qnil;
5592 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5593 Non-nil means the buffer contents are regarded as multi-byte form
5594 of characters, not a binary code. This affects the display, file I/O,
5595 and behaviors of various editing commands.
5597 Setting this to nil does not do anything.
5599 enable_multibyte_characters = 1;
5603 complex_vars_of_file_coding (void)
5605 staticpro (&Vcoding_system_hash_table);
5606 Vcoding_system_hash_table =
5607 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5609 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5611 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5613 struct codesys_prop csp; \
5615 csp.prop_type = (Prop_Type); \
5616 Dynarr_add (the_codesys_prop_dynarr, csp); \
5619 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5620 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5621 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5622 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5623 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5624 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5625 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5627 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5628 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5629 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5630 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5631 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5632 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5633 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5634 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5635 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5636 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5637 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5638 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5639 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5640 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5641 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5642 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5643 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5645 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5646 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5648 /* Need to create this here or we're really screwed. */
5650 (Qraw_text, Qno_conversion,
5651 build_string ("Raw text, which means it converts only line-break-codes."),
5652 list2 (Qmnemonic, build_string ("Raw")));
5655 (Qbinary, Qno_conversion,
5656 build_string ("Binary, which means it does not convert anything."),
5657 list4 (Qeol_type, Qlf,
5658 Qmnemonic, build_string ("Binary")));
5660 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5662 /* Need this for bootstrapping */
5663 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5664 Fget_coding_system (Qraw_text);
5670 for (i = 0; i < 65536; i++)
5671 ucs_to_mule_table[i] = Qnil;
5673 staticpro (&mule_to_ucs_table);
5674 mule_to_ucs_table = Fmake_char_table(Qgeneric);