1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any later version.
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
36 #include "file-coding.h"
/* File-level Lisp symbols (Q...) and Lisp variables (V...) used by the
   coding-system code below.  NOTE(review): the code that interns the
   symbols and declares the variables to Lisp is not visible here.  */
38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
40 Lisp_Object Vkeyboard_coding_system;
41 Lisp_Object Vterminal_coding_system;
42 Lisp_Object Vcoding_system_for_read;
43 Lisp_Object Vcoding_system_for_write;
44 Lisp_Object Vfile_name_coding_system;
46 /* Table of symbols identifying each coding category. */
47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
49 /* Coding system currently associated with each coding category. */
50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
52 /* Table of all coding categories in decreasing order of priority.
53 This describes a permutation of the possible coding categories. */
54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
56 Lisp_Object Qcoding_system_p;
/* Coding-system type symbols.  Qno_conversion, Qccl and Qiso2022 are
   compared against the TYPE argument of `make-coding-system'.  */
58 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
59 /* Qinternal in general.c */
/* Property keys that `make-coding-system' accepts for every type.  */
61 Lisp_Object Qmnemonic, Qeol_type;
/* Values of the eol-type property recognized by symbol_to_eol_type.  */
62 Lisp_Object Qcr, Qcrlf, Qlf;
63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
64 Lisp_Object Qpost_read_conversion;
65 Lisp_Object Qpre_write_conversion;
/* More type symbols (ucs4, utf8, big5, shift_jis), followed by the
   property keys that `make-coding-system' accepts only for ISO2022
   coding systems.  */
68 Lisp_Object Qucs4, Qutf8;
69 Lisp_Object Qbig5, Qshift_jis;
70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
73 Lisp_Object Qno_iso6429;
74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
75 Lisp_Object Qctext, Qescape_quoted;
76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
/* Property keys accepted for CCL coding systems; their values must be
   vectors (see `make-coding-system').  */
78 Lisp_Object Qencode, Qdecode;
/* Hash table keyed by coding-system name symbol; values are the
   coding-system objects.  Written by `make-coding-system',
   `copy-coding-system' and `define-coding-system-alias', read by
   `find-coding-system'.  */
80 Lisp_Object Vcoding_system_hash_table;
82 int enable_multibyte_characters;
/* State kept while decoding or detecting ISO2022 text: the charsets
   currently designated to G0-G3, which registers are invoked into GL and
   GR, the bytes of a partially read escape sequence, and the flags used
   to pass invalid designation / direction-switch sequences through
   literally.
   NOTE(review): in this copy the comment that begins "We try to do
   similar things" is never terminated, so the two bit-field lines that
   follow it (switched_dir_and_no_valid_charset_yet, invalid_switch_dir)
   are lexically part of that comment.  The struct braces and the index
   member announced by "Index for next byte to store" are also not
   visible.  Confirm against the upstream file.  */
85 /* Additional information used by the ISO2022 decoder and detector. */
86 struct iso2022_decoder
88 /* CHARSET holds the character sets currently assigned to the G0
89 through G3 variables. It is initialized from the array
90 INITIAL_CHARSET in CODESYS. */
91 Lisp_Object charset[4];
93 /* Which registers are currently invoked into the left (GL) and
94 right (GR) halves of the 8-bit encoding space? */
95 int register_left, register_right;
97 /* ISO_ESC holds a value indicating part of an escape sequence
98 that has already been seen. */
99 enum iso_esc_flag esc;
101 /* This records the bytes we've seen so far in an escape sequence,
102 in case the sequence is invalid (we spit out the bytes unchanged). */
103 unsigned char esc_bytes[8];
105 /* Index for next byte to store in ISO escape sequence. */
108 #ifdef ENABLE_COMPOSITE_CHARS
109 /* Stuff seen so far when composing a string. */
110 unsigned_char_dynarr *composite_chars;
113 /* If we saw an invalid designation sequence for a particular
114 register, we flag it here and switch to ASCII. The next time we
115 see a valid designation for this register, we turn off the flag
116 and do the designation normally, but pretend the sequence was
117 invalid. The effect of all this is that (most of the time) the
118 escape sequences for both the switch to the unknown charset, and
119 the switch back to the known charset, get inserted literally into
120 the buffer and saved out as such. The hope is that we can
121 preserve the escape sequences so that the resulting written out
122 file makes sense. If we don't do any of this, the designation
123 to the invalid charset will be preserved but that switch back
124 to the known charset will probably get eaten because it was
125 the same charset that was already present in the register. */
126 unsigned char invalid_designated[4];
128 /* We try to do similar things as above for direction-switching
129 sequences. If we encountered a direction switch while an
130 invalid designation was present, or an invalid designation
131 just after a direction switch (i.e. no valid designation
132 encountered yet), we insert the direction-switch escape
133 sequence literally into the output stream, and later on
134 insert the corresponding direction-restoring escape sequence
136 unsigned int switched_dir_and_no_valid_charset_yet :1;
137 unsigned int invalid_switch_dir :1;
139 /* Tells the decoder to output the escape sequence literally
140 even though it was valid. Used in the games we play to
141 avoid lossage when we encounter invalid designations. */
142 unsigned int output_literally :1;
143 /* We encountered a direction switch followed by an invalid
144 designation. We didn't output the direction switch
145 literally because we didn't know about the invalid designation;
146 but we have to do so now. */
147 unsigned int output_direction_sequence :1;
/* Forward declarations.  EXFUN declares the Lisp primitive
   Fcopy_coding_system (2 arguments), which setup_eol_coding_systems calls
   before its definition.  The remaining prototypes are file-local
   (static) detect / decode / encode routines, one family per encoding
   (sjis, big5, ucs4, utf8, iso2022, no_conversion), plus the generic
   mule_decode / mule_encode.  The decoders and encoders whose prototypes
   are complete here all take
     (Lstream *, CONST unsigned char *src, unsigned_char_dynarr *dst,
      unsigned int n).
   NOTE(review): several prototypes are cut off in mid parameter list in
   this copy -- confirm against the upstream file.  */
150 EXFUN (Fcopy_coding_system, 2);
152 struct detection_state;
153 static int detect_coding_sjis (struct detection_state *st,
154 CONST unsigned char *src,
156 static void decode_coding_sjis (Lstream *decoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static void encode_coding_sjis (Lstream *encoding,
161 CONST unsigned char *src,
162 unsigned_char_dynarr *dst,
164 static int detect_coding_big5 (struct detection_state *st,
165 CONST unsigned char *src,
167 static void decode_coding_big5 (Lstream *decoding,
168 CONST unsigned char *src,
169 unsigned_char_dynarr *dst, unsigned int n);
170 static void encode_coding_big5 (Lstream *encoding,
171 CONST unsigned char *src,
172 unsigned_char_dynarr *dst, unsigned int n);
173 static int detect_coding_ucs4 (struct detection_state *st,
174 CONST unsigned char *src,
176 static void decode_coding_ucs4 (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static void encode_coding_ucs4 (Lstream *encoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
182 static int detect_coding_utf8 (struct detection_state *st,
183 CONST unsigned char *src,
185 static void decode_coding_utf8 (Lstream *decoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst, unsigned int n);
188 static void encode_coding_utf8 (Lstream *encoding,
189 CONST unsigned char *src,
190 unsigned_char_dynarr *dst, unsigned int n);
191 static int postprocess_iso2022_mask (int mask);
192 static void reset_iso2022 (Lisp_Object coding_system,
193 struct iso2022_decoder *iso);
194 static int detect_coding_iso2022 (struct detection_state *st,
195 CONST unsigned char *src,
197 static void decode_coding_iso2022 (Lstream *decoding,
198 CONST unsigned char *src,
199 unsigned_char_dynarr *dst, unsigned int n);
200 static void encode_coding_iso2022 (Lstream *encoding,
201 CONST unsigned char *src,
202 unsigned_char_dynarr *dst, unsigned int n);
204 static void decode_coding_no_conversion (Lstream *decoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst,
208 static void encode_coding_no_conversion (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst,
212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
213 unsigned_char_dynarr *dst, unsigned int n);
214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
/* Table of known coding-system properties.  `coding-system-property'
   scans the_codesys_prop_dynarr for an entry whose `sym' equals the
   requested property and uses its `prop_type' (a codesys_prop_enum value
   such as CODESYS_PROP_ALL_OK, CODESYS_PROP_ISO2022 or CODESYS_PROP_CCL)
   to decide whether the property applies to the coding system's type.
   NOTE(review): the body of struct codesys_prop and most of the enum are
   not visible in this copy.  */
217 typedef struct codesys_prop codesys_prop;
226 Dynarr_declare (codesys_prop);
227 } codesys_prop_dynarr;
229 codesys_prop_dynarr *the_codesys_prop_dynarr;
231 enum codesys_prop_enum
234 CODESYS_PROP_ISO2022,
239 /************************************************************************/
240 /* Coding system functions */
241 /************************************************************************/
/* Declare the three lrecord methods defined below, then define the
   "coding-system" lrecord implementation around struct Lisp_Coding_System
   with mark_coding_system, print_coding_system and finalize_coding_system
   as its mark, print and finalize methods.  The two remaining method
   slots are passed as 0.  */
243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
245 static void finalize_coding_system (void *header, int for_disksave);
247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
248 mark_coding_system, print_coding_system,
249 finalize_coding_system,
250 0, 0, struct Lisp_Coding_System);
/* Mark method for coding-system objects.  Calls MARKOBJ on every Lisp
   slot of OBJ (name, doc string, mnemonic, the three EOL subsidiaries,
   the type-specific slots, and the pre-write conversion function) and
   returns the post-read conversion slot instead of marking it here.
   NOTE(review): presumably the caller marks the returned object --
   confirm against the lrecord mark-method contract.  */
253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
255 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
/* Slots present in every coding system.  */
257 markobj (CODING_SYSTEM_NAME (codesys));
258 markobj (CODING_SYSTEM_DOC_STRING (codesys));
259 markobj (CODING_SYSTEM_MNEMONIC (codesys));
260 markobj (CODING_SYSTEM_EOL_LF (codesys));
261 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
262 markobj (CODING_SYSTEM_EOL_CR (codesys));
264 switch (CODING_SYSTEM_TYPE (codesys))
/* ISO2022: the four initial charsets, then both charsets of every entry
   in the input and output conversion tables (each table may be null).  */
268 case CODESYS_ISO2022:
269 for (i = 0; i < 4; i++)
270 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
271 if (codesys->iso2022.input_conv)
273 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
275 struct charset_conversion_spec *ccs =
276 Dynarr_atp (codesys->iso2022.input_conv, i);
277 markobj (ccs->from_charset);
278 markobj (ccs->to_charset);
281 if (codesys->iso2022.output_conv)
283 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
285 struct charset_conversion_spec *ccs =
286 Dynarr_atp (codesys->iso2022.output_conv, i);
287 markobj (ccs->from_charset);
288 markobj (ccs->to_charset);
/* CCL decode and encode programs.  NOTE(review): the case label for
   these two lines is not visible in this copy.  */
294 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
295 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
302 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
303 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding-system objects.  Writes "#<coding_system ",
   the coding system's name (via print_internal) and ">" to PRINTCHARFUN.
   The error call reports that the object is unreadable.
   NOTE(review): the condition guarding that error call and the third
   parameter are not visible in this copy.  */
307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
310 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
312 error ("printing unreadable object #<coding_system 0x%x>",
315 write_c_string ("#<coding_system ", printcharfun);
316 print_internal (c->name, printcharfun, 1);
317 write_c_string (">", printcharfun);
/* Finalize method for coding-system objects.  HEADER is the
   Lisp_Coding_System being finalized.  Unless FOR_DISKSAVE is non-zero,
   an ISO2022 coding system frees its input and output charset-conversion
   dynarrs (when allocated) and clears the pointers so they cannot be
   freed twice through this object.  */
321 finalize_coding_system (void *header, int for_disksave)
323 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
324 /* Since coding systems never go away, this function is not
325 necessary. But it would be necessary if we changed things
326 so that coding systems could go away. */
327 if (!for_disksave) /* see comment in lstream.c */
329 switch (CODING_SYSTEM_TYPE (c))
332 case CODESYS_ISO2022:
333 if (c->iso2022.input_conv)
335 Dynarr_free (c->iso2022.input_conv);
336 c->iso2022.input_conv = 0;
338 if (c->iso2022.output_conv)
340 Dynarr_free (c->iso2022.output_conv);
341 c->iso2022.output_conv = 0;
/* Convert the Lisp symbol SYMBOL into an eol_type value:
   nil -> EOL_AUTODETECT, lf -> EOL_LF, crlf -> EOL_CRLF, cr -> EOL_CR.
   A non-symbol is rejected by CHECK_SYMBOL; any other symbol signals
   "Unrecognized eol type".  */
352 symbol_to_eol_type (Lisp_Object symbol)
354 CHECK_SYMBOL (symbol);
355 if (NILP (symbol)) return EOL_AUTODETECT;
356 if (EQ (symbol, Qlf)) return EOL_LF;
357 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
358 if (EQ (symbol, Qcr)) return EOL_CR;
360 signal_simple_error ("Unrecognized eol type", symbol);
361 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: return the symbol for TYPE
   (EOL_LF -> lf, EOL_CRLF -> crlf, EOL_CR -> cr, EOL_AUTODETECT -> nil).  */
365 eol_type_to_symbol (enum eol_type type)
370 case EOL_LF: return Qlf;
371 case EOL_CRLF: return Qcrlf;
372 case EOL_CR: return Qcr;
373 case EOL_AUTODETECT: return Qnil;
/* Create the three EOL subsidiaries of CODESYS.  For each of
   ("unix", EOL_LF), ("dos", EOL_CRLF) and ("mac", EOL_CR) the macro
   DEFINE_SUB_CODESYS
     - appends "-unix" / "-dos" / "-mac" to a stack copy of the name,
     - interns that name and copies CODESYS to it with Fcopy_coding_system,
     - sets the copy's EOL type,
     - gives the copy a mnemonic made of the parent's mnemonic plus the
       suffix "", ":T" or ":t", and
     - stores the copy in the parent's EOL_LF / EOL_CRLF / EOL_CR slot.
   Both scratch buffers are alloca'd with 7 spare bytes; the longest
   suffix appended is "-unix" (5 characters plus the terminating NUL).
   The mnemonic buffer is only allocated when the parent's mnemonic is a
   string.
   NOTE(review): the declaration of mlen and any guards around the
   mnemonic statements inside the macro are not visible in this copy.  */
378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
380 Lisp_Object codesys_obj;
381 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
382 char *codesys_name = (char *) alloca (len + 7);
384 char *codesys_mnemonic=0;
386 Lisp_Object codesys_name_sym, sub_codesys_obj;
390 XSETCODING_SYSTEM (codesys_obj, codesys);
392 memcpy (codesys_name,
393 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
395 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
397 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
398 codesys_mnemonic = (char *) alloca (mlen + 7);
399 memcpy (codesys_mnemonic,
400 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
404 strcpy (codesys_name + len, "-" op_sys); \
406 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
407 codesys_name_sym = intern (codesys_name); \
408 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
409 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
411 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
412 build_string (codesys_mnemonic); \
413 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
416 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
417 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
418 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp primitive `coding-system-p' (exactly one argument): returns Qt
   when OBJECT satisfies CODING_SYSTEMP and Qnil otherwise.  A symbol
   that merely names a coding system is not looked up here.  */
421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
422 Return t if OBJECT is a coding system.
423 A coding system is an object that defines how text containing multiple
424 character sets is encoded into a stream of (typically 8-bit) bytes.
425 The coding system is used to decode the stream into a series of
426 characters (which may be from multiple charsets) when the text is read
427 from a file or process, and is used to encode the text back into the
428 same format when it is written out to a file or process.
430 For example, many ISO2022-compliant coding systems (such as Compound
431 Text, which is used for inter-client data under the X Window System)
432 use escape sequences to switch between different charsets -- Japanese
433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
435 `make-coding-system' for more information.
437 Coding systems are normally identified using a symbol, and the
438 symbol is accepted in place of the actual coding system object whenever
439 a coding system is called for. (This is similar to how faces work.)
443 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* Lisp primitive `find-coding-system'.  A coding-system object is
   returned unchanged.  nil is replaced by Qbinary before the lookup.
   Anything else must be a symbol (CHECK_SYMBOL) and is looked up in
   Vcoding_system_hash_table, with Qnil returned when there is no entry.  */
446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
447 Retrieve the coding system of the given name.
449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
450 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
451 If there is no such coding system, nil is returned. Otherwise the
452 associated coding system object is returned.
454 (coding_system_or_name))
456 if (CODING_SYSTEMP (coding_system_or_name))
457 return coding_system_or_name;
459 if (NILP (coding_system_or_name))
460 coding_system_or_name = Qbinary;
462 CHECK_SYMBOL (coding_system_or_name);
464 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
/* Lisp primitive `get-coding-system': the same lookup as
   Ffind_coding_system, but a nil result signals "No such coding system"
   with NAME as the error data.  Most accessors in this file call this to
   normalize their coding-system argument.  */
467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
468 Retrieve the coding system of the given name.
469 Same as `find-coding-system' except that if there is no such
470 coding system, an error is signaled instead of returning nil.
474 Lisp_Object coding_system = Ffind_coding_system (name);
476 if (NILP (coding_system))
477 signal_simple_error ("No such coding system", name);
478 return coding_system;
481 /* We store the coding systems in hash tables with the names as the key and the
482 actual coding system object as the value. Occasionally we need to use them
483 in a list format. These routines provide us with that. */
/* Closure handed to add_coding_system_to_list_mapper through
   elisp_maphash: it holds a pointer to the list being accumulated.  */
484 struct coding_system_list_closure
486 Lisp_Object *coding_system_list;
/* elisp_maphash callback for Fcoding_system_list.  Pushes the name
   stored in the coding-system object VALUE onto the list in the closure.
   KEY is not used, so what gets pushed is the object's own name rather
   than the hash key.
   NOTE(review): `define-coding-system-alias' stores the same object
   under a second key, so an aliased coding system would appear to
   contribute its canonical name once per key -- confirm whether that is
   intended.  */
490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
491 void *coding_system_list_closure)
493 /* This function can GC */
494 struct coding_system_list_closure *cscl =
495 (struct coding_system_list_closure *) coding_system_list_closure;
496 Lisp_Object *coding_system_list = cscl->coding_system_list;
498 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
499 *coding_system_list);
/* Lisp primitive `coding-system-list' (no arguments).  Starts from Qnil,
   protects the accumulator with GCPRO1 because the mapper conses, and
   runs add_coding_system_to_list_mapper over every entry of
   Vcoding_system_hash_table.  The order of the result is whatever order
   elisp_maphash visits the entries in.
   NOTE(review): the matching UNGCPRO is not visible in this copy.  */
503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
504 Return a list of the names of all defined coding systems.
508 Lisp_Object coding_system_list = Qnil;
510 struct coding_system_list_closure coding_system_list_closure;
512 GCPRO1 (coding_system_list);
513 coding_system_list_closure.coding_system_list = &coding_system_list;
514 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
515 &coding_system_list_closure);
518 return coding_system_list;
/* Lisp primitive `coding-system-name'.  Normalizes the argument with
   Fget_coding_system (which signals an error for an unknown name) and
   returns the name slot of the resulting object.  */
521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
522 Return the name of the given coding system.
526 coding_system = Fget_coding_system (coding_system);
527 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a new Lisp_Coding_System lcrecord of type TYPE named NAME.
   The record is zeroed first; then every Lisp slot that the mark method
   visits is set explicitly (Qnil for all but the name), because a zeroed
   slot is not necessarily a valid Lisp_Object.  The EOL type starts out
   as EOL_AUTODETECT.  The object is not entered into
   Vcoding_system_hash_table here; callers do that.
   NOTE(review): the return statement is not visible in this copy;
   presumably it returns codesys.  */
530 static Lisp_Coding_System *
531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
533 Lisp_Coding_System *codesys =
534 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
536 zero_lcrecord (codesys);
537 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
538 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
539 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
540 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
541 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
542 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
543 CODING_SYSTEM_TYPE (codesys) = type;
544 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
/* Type-specific Lisp slots: the four ISO2022 initial charsets, or the
   two CCL programs.  */
546 if (type == CODESYS_ISO2022)
549 for (i = 0; i < 4; i++)
550 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
552 else if (type == CODESYS_CCL)
554 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
555 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
558 CODING_SYSTEM_NAME (codesys) = name;
564 /* Given a list of charset conversion specs as specified in a Lisp
565 program, parse it into STORE_HERE. */
/* SPEC_LIST is walked with EXTERNAL_LIST_LOOP.  Each element yields one
   charset_conversion_spec appended to STORE_HERE.  Errors are signaled,
   not returned; specs appended before the failing element stay in
   STORE_HERE.  */
568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
569 Lisp_Object spec_list)
573 EXTERNAL_LIST_LOOP (rest, spec_list)
575 Lisp_Object car = XCAR (rest);
576 Lisp_Object from, to;
577 struct charset_conversion_spec spec;
/* Each spec must be a proper list of exactly two elements.  */
579 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
580 signal_simple_error ("Invalid charset conversion spec", car);
/* Resolve both elements to charset objects through Fget_charset.  */
581 from = Fget_charset (XCAR (car));
582 to = Fget_charset (XCAR (XCDR (car)));
/* Conversion is only allowed between charsets of the same type.  */
583 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
584 signal_simple_error_2
585 ("Attempted conversion between different charset types",
587 spec.from_charset = from;
588 spec.to_charset = to;
590 Dynarr_add (store_here, spec);
594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
595 specs, return the equivalent as the Lisp programmer would see it.
597 If LOAD_HERE is 0, return Qnil. */
/* Each entry becomes a two-element list (FROM TO).  The lists are consed
   onto the front of the result, so the final Fnreverse restores the
   order of the dynarr.
   NOTE(review): the null check described above is not visible in this
   copy.  */
600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
607 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
609 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
610 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
613 return Fnreverse (result);
/* Lisp primitive `make-coding-system'.  Steps, in order:
     1. Map TYPE to a coding_system_type constant (nil and `undecided'
        both give CODESYS_AUTODETECT); an unknown symbol signals
        "Invalid coding system type".
     2. Allocate the object with allocate_coding_system.
     3. Default DOC-STRING to "" when nil, and require it to be a string.
     4. Walk the property list PROPS: generic keys first, then keys that
        depend on the type.  An unknown key signals "Unrecognized
        property".
     5. Build the -unix / -dos / -mac subsidiaries unless a non-nil
        eol-type was given.
     6. Store the object in Vcoding_system_hash_table under NAME.
   Note that the object is allocated before PROPS is validated, so an
   error in step 3 or 4 leaves it unregistered.  */
618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
619 Register symbol NAME as a coding system.
621 TYPE describes the conversion method used and should be one of
624 Automatic conversion. XEmacs attempts to detect the coding system
627 No conversion. Use this for binary files and such. On output,
628 graphic characters that are not in ASCII or Latin-1 will be
629 replaced by a ?. (For a no-conversion-encoded buffer, these
630 characters will only be present if you explicitly insert them.)
632 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
634 ISO 10646 UCS-4 encoding.
636 ISO 10646 UTF-8 encoding.
638 Any ISO2022-compliant encoding. Among other things, this includes
639 JIS (the Japanese encoding commonly used for e-mail), EUC (the
640 standard Unix encoding for Japanese and other languages), and
641 Compound Text (the encoding used in X11). You can specify more
642 specific information about the conversion with the FLAGS argument.
644 Big5 (the encoding commonly used for Taiwanese).
646 The conversion is performed using a user-written pseudo-code
647 program. CCL (Code Conversion Language) is the name of this
650 Write out or read in the raw contents of the memory representing
651 the buffer's text. This is primarily useful for debugging
652 purposes, and is only enabled when XEmacs has been compiled with
653 DEBUG_XEMACS defined (via the --debug configure option).
654 WARNING: Reading in a file using 'internal conversion can result
655 in an internal inconsistency in the memory representing a
656 buffer's text, which will produce unpredictable results and may
657 cause XEmacs to crash. Under normal circumstances you should
658 never use 'internal conversion.
660 DOC-STRING is a string describing the coding system.
662 PROPS is a property list, describing the specific nature of the
663 character set. Recognized properties are:
666 String to be displayed in the modeline when this coding system is
670 End-of-line conversion to be used. It should be one of
673 Automatically detect the end-of-line type (LF, CRLF,
674 or CR). Also generate subsidiary coding systems named
675 `NAME-unix', `NAME-dos', and `NAME-mac', that are
676 identical to this coding system but have an EOL-TYPE
677 value of 'lf, 'crlf, and 'cr, respectively.
679 The end of a line is marked externally using ASCII LF.
680 Since this is also the way that XEmacs represents an
681 end-of-line internally, specifying this option results
682 in no end-of-line conversion. This is the standard
683 format for Unix text files.
685 The end of a line is marked externally using ASCII
686 CRLF. This is the standard format for MS-DOS text
689 The end of a line is marked externally using ASCII CR.
690 This is the standard format for Macintosh text files.
692 Automatically detect the end-of-line type but do not
693 generate subsidiary coding systems. (This value is
694 converted to nil when stored internally, and
695 `coding-system-property' will return nil.)
697 'post-read-conversion
698 Function called after a file has been read in, to perform the
699 decoding. Called with two arguments, BEG and END, denoting
700 a region of the current buffer to be decoded.
702 'pre-write-conversion
703 Function called before a file is written out, to perform the
704 encoding. Called with two arguments, BEG and END, denoting
705 a region of the current buffer to be encoded.
708 The following additional properties are recognized if TYPE is 'iso2022:
714 The character set initially designated to the G0 - G3 registers.
715 The value should be one of
717 -- A charset object (designate that character set)
718 -- nil (do not ever use this register)
719 -- t (no character set is initially designated to
720 the register, but may be later on; this automatically
721 sets the corresponding `force-g*-on-output' property)
727 If non-nil, send an explicit designation sequence on output before
728 using the specified register.
731 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
732 "ESC $ B" on output in place of the full designation sequences
733 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
736 If non-nil, don't designate ASCII to G0 at each end of line on output.
737 Setting this to non-nil also suppresses other state-resetting that
738 normally happens at the end of a line.
741 If non-nil, don't designate ASCII to G0 before control chars on output.
744 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
748 If non-nil, use locking-shift (SO/SI) instead of single-shift
749 or designation by escape sequence.
752 If non-nil, don't use ISO6429's direction specification.
755 If non-nil, literal control characters that are the same as
756 the beginning of a recognized ISO2022 or ISO6429 escape sequence
757 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
758 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
759 so that they can be properly distinguished from an escape sequence.
760 (Note that doing this results in a non-portable encoding.) This
761 encoding flag is used for byte-compiled files. Note that ESC
762 is a good choice for a quoting character because there are no
763 escape sequences whose second byte is a character from the Control-0
764 or Control-1 character sets; this is explicitly disallowed by the
767 'input-charset-conversion
768 A list of conversion specifications, specifying conversion of
769 characters in one charset to another when decoding is performed.
770 Each specification is a list of two elements: the source charset,
771 and the destination charset.
773 'output-charset-conversion
774 A list of conversion specifications, specifying conversion of
775 characters in one charset to another when encoding is performed.
776 The form of each specification is the same as for
777 'input-charset-conversion.
780 The following additional properties are recognized (and required)
784 CCL program used for decoding (converting to internal format).
787 CCL program used for encoding (converting to external format).
789 (name, type, doc_string, props))
791 Lisp_Coding_System *codesys;
792 Lisp_Object rest, key, value;
793 enum coding_system_type ty;
794 int need_to_setup_eol_systems = 1;
796 /* Convert type to constant */
797 if (NILP (type) || EQ (type, Qundecided))
798 { ty = CODESYS_AUTODETECT; }
800 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
801 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
802 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
803 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
804 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
805 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
807 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
809 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
812 signal_simple_error ("Invalid coding system type", type);
816 codesys = allocate_coding_system (ty, name);
818 if (NILP (doc_string))
819 doc_string = build_string ("");
821 CHECK_STRING (doc_string);
822 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
/* Walk PROPS as a property list.  A key given more than once is simply
   processed again, so the last occurrence wins.  */
824 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
826 if (EQ (key, Qmnemonic))
829 CHECK_STRING (value);
830 CODING_SYSTEM_MNEMONIC (codesys) = value;
833 else if (EQ (key, Qeol_type))
/* Subsidiaries are generated only for a nil eol-type.  The flag starts
   at 1, so leaving the property out behaves like nil.  */
835 need_to_setup_eol_systems = NILP (value);
838 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
841 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
842 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
844 else if (ty == CODESYS_ISO2022)
846 #define FROB_INITIAL_CHARSET(charset_num) \
847 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
848 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
850 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
851 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
852 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
853 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
855 #define FROB_FORCE_CHARSET(charset_num) \
856 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
858 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
859 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
860 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
861 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
863 #define FROB_BOOLEAN_PROPERTY(prop) \
864 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
866 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
867 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
868 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
869 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
870 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
871 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
872 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
/* NOTE(review): each conversion key allocates a fresh dynarr without
   freeing one stored earlier, so a key that appears twice in PROPS
   appears to leak the first table.  If parse_charset_conversion_specs
   signals an error, the new table is reachable only through the
   unregistered codesys -- confirm that the finalizer reclaims it.  */
874 else if (EQ (key, Qinput_charset_conversion))
876 codesys->iso2022.input_conv =
877 Dynarr_new (charset_conversion_spec);
878 parse_charset_conversion_specs (codesys->iso2022.input_conv,
881 else if (EQ (key, Qoutput_charset_conversion))
883 codesys->iso2022.output_conv =
884 Dynarr_new (charset_conversion_spec);
885 parse_charset_conversion_specs (codesys->iso2022.output_conv,
889 signal_simple_error ("Unrecognized property", key);
/* NOTE(review): this tests the TYPE symbol, whereas the ISO2022 branch
   tests ty.  Given the type chain above the two are equivalent
   (ty == CODESYS_CCL exactly when TYPE is Qccl), but the styles differ.
   The doc string calls the CCL properties required, yet no visible check
   enforces that both were supplied.  */
891 else if (EQ (type, Qccl))
893 if (EQ (key, Qdecode))
895 CHECK_VECTOR (value);
896 CODING_SYSTEM_CCL_DECODE (codesys) = value;
898 else if (EQ (key, Qencode))
900 CHECK_VECTOR (value);
901 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
904 signal_simple_error ("Unrecognized property", key);
908 signal_simple_error ("Unrecognized property", key);
/* Create NAME-unix, NAME-dos and NAME-mac when eol-type was nil or
   absent.  This happens before NAME itself is registered below.  */
911 if (need_to_setup_eol_systems)
912 setup_eol_coding_systems (codesys);
915 Lisp_Object codesys_obj;
916 XSETCODING_SYSTEM (codesys_obj, codesys);
917 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* Lisp primitive `copy-coding-system'.  OLD-CODING-SYSTEM is resolved
   with Fget_coding_system (error if unknown).  If NEW-NAME does not yet
   name a coding system, a fresh object of the same type is allocated and
   registered under NEW-NAME in Vcoding_system_hash_table.  The contents
   of the old object, everything after the lcrecord header, are then
   copied over the new one with memcpy, and the new object is returned.
   NOTE(review): the memcpy is a shallow copy.  For ISO2022 coding
   systems both objects end up pointing at the same input_conv /
   output_conv dynarrs, which finalize_coding_system frees per object; it
   also copies the name slot and the EOL subsidiary slots.  Lines not
   visible in this copy may repair some of this (for example by resetting
   the name) -- confirm against the upstream file.  */
922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
923 Copy OLD-CODING-SYSTEM to NEW-NAME.
924 If NEW-NAME does not name an existing coding system, a new one will
927 (old_coding_system, new_name))
929 Lisp_Object new_coding_system;
930 old_coding_system = Fget_coding_system (old_coding_system);
931 new_coding_system = Ffind_coding_system (new_name);
932 if (NILP (new_coding_system))
934 XSETCODING_SYSTEM (new_coding_system,
935 allocate_coding_system
936 (XCODING_SYSTEM_TYPE (old_coding_system),
938 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
942 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
943 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
944 memcpy (((char *) to ) + sizeof (to->header),
945 ((char *) from) + sizeof (from->header),
946 sizeof (*from) - sizeof (from->header));
949 return new_coding_system;
/* Lisp primitive `define-coding-system-alias'.  ALIAS must be a symbol
   that does not already name a coding system (otherwise "Symbol already
   names a coding system" is signaled).  The object CODING-SYSTEM
   resolves to is stored under ALIAS in Vcoding_system_hash_table; no
   copy is made, so the alias and the original share one object.  When
   that object's EOL type is EOL_AUTODETECT, the FROB macro recursively
   defines an alias for each non-nil subsidiary, named by concatenating
   ALIAS's symbol name with the suffix passed to FROB.
   NOTE(review): the FROB invocations and the return statement are not
   visible in this copy.  */
952 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
953 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
955 (alias, coding_system))
957 CHECK_SYMBOL (alias);
958 if (!NILP (Ffind_coding_system (alias)))
959 signal_simple_error ("Symbol already names a coding system", alias);
960 coding_system = Fget_coding_system (coding_system);
961 Fputhash (alias, coding_system, Vcoding_system_hash_table);
963 /* Set up aliases for subsidiaries. */
964 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
967 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
968 #define FROB(type, name) \
970 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
971 if (!NILP (subsidiary)) \
972 Fdefine_coding_system_alias \
973 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
980 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
981 but it doesn't look intentional, so I'd rather return something
982 meaningful or nothing at all. */
/* Return the variant of CODING_SYSTEM that uses end-of-line convention
   TYPE.  CODING_SYSTEM itself is returned when
     - its own EOL type is already fixed (not EOL_AUTODETECT),
     - TYPE is EOL_AUTODETECT, or
     - the matching subsidiary slot is nil.
   Otherwise the subsidiary stored in the EOL_LF / EOL_CR / EOL_CRLF slot
   is returned.  CODING_SYSTEM must already be a coding-system object;
   no lookup is done here.  */
987 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
989 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
990 Lisp_Object new_coding_system;
992 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
993 return coding_system;
997 case EOL_AUTODETECT: return coding_system;
998 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
999 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1000 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1004 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* Lisp primitive `subsidiary-coding-system': resolves CODING-SYSTEM with
   Fget_coding_system, converts EOL-TYPE with symbol_to_eol_type (which
   signals an error for an unrecognized symbol), and delegates to
   subsidiary_coding_system.  */
1007 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1008 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1010 (coding_system, eol_type))
1012 coding_system = Fget_coding_system (coding_system);
1014 return subsidiary_coding_system (coding_system,
1015 symbol_to_eol_type (eol_type));
1019 /************************************************************************/
1020 /* Coding system accessors */
1021 /************************************************************************/
/* Lisp primitive `coding-system-doc-string': resolves the argument with
   Fget_coding_system and returns the doc-string slot.  `make-coding-system'
   always stores a string there ("" when none was given).  */
1023 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1024 Return the doc string for CODING-SYSTEM.
1028 coding_system = Fget_coding_system (coding_system);
1029 return XCODING_SYSTEM_DOC_STRING (coding_system);
1032 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1033 Return the type of CODING-SYSTEM.
1037 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1040 case CODESYS_AUTODETECT: return Qundecided;
1042 case CODESYS_SHIFT_JIS: return Qshift_jis;
1043 case CODESYS_ISO2022: return Qiso2022;
1044 case CODESYS_BIG5: return Qbig5;
1045 case CODESYS_UCS4: return Qucs4;
1046 case CODESYS_UTF8: return Qutf8;
1047 case CODESYS_CCL: return Qccl;
1049 case CODESYS_NO_CONVERSION: return Qno_conversion;
1051 case CODESYS_INTERNAL: return Qinternal;
1058 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1061 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1063 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1066 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1067 Return initial charset of CODING-SYSTEM designated to GNUM.
1070 (coding_system, gnum))
1072 coding_system = Fget_coding_system (coding_system);
1075 return coding_system_charset (coding_system, XINT (gnum));
1079 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1080 Return the PROP property of CODING-SYSTEM.
1082 (coding_system, prop))
1085 enum coding_system_type type;
1087 coding_system = Fget_coding_system (coding_system);
1088 CHECK_SYMBOL (prop);
1089 type = XCODING_SYSTEM_TYPE (coding_system);
1091 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1092 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1095 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1097 case CODESYS_PROP_ALL_OK:
1100 case CODESYS_PROP_ISO2022:
1101 if (type != CODESYS_ISO2022)
1103 ("Property only valid in ISO2022 coding systems",
1107 case CODESYS_PROP_CCL:
1108 if (type != CODESYS_CCL)
1110 ("Property only valid in CCL coding systems",
1120 signal_simple_error ("Unrecognized property", prop);
1122 if (EQ (prop, Qname))
1123 return XCODING_SYSTEM_NAME (coding_system);
1124 else if (EQ (prop, Qtype))
1125 return Fcoding_system_type (coding_system);
1126 else if (EQ (prop, Qdoc_string))
1127 return XCODING_SYSTEM_DOC_STRING (coding_system);
1128 else if (EQ (prop, Qmnemonic))
1129 return XCODING_SYSTEM_MNEMONIC (coding_system);
1130 else if (EQ (prop, Qeol_type))
1131 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1132 else if (EQ (prop, Qeol_lf))
1133 return XCODING_SYSTEM_EOL_LF (coding_system);
1134 else if (EQ (prop, Qeol_crlf))
1135 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1136 else if (EQ (prop, Qeol_cr))
1137 return XCODING_SYSTEM_EOL_CR (coding_system);
1138 else if (EQ (prop, Qpost_read_conversion))
1139 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1140 else if (EQ (prop, Qpre_write_conversion))
1141 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1143 else if (type == CODESYS_ISO2022)
1145 if (EQ (prop, Qcharset_g0))
1146 return coding_system_charset (coding_system, 0);
1147 else if (EQ (prop, Qcharset_g1))
1148 return coding_system_charset (coding_system, 1);
1149 else if (EQ (prop, Qcharset_g2))
1150 return coding_system_charset (coding_system, 2);
1151 else if (EQ (prop, Qcharset_g3))
1152 return coding_system_charset (coding_system, 3);
1154 #define FORCE_CHARSET(charset_num) \
1155 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1156 (coding_system, charset_num) ? Qt : Qnil)
1158 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1159 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1160 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1161 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1163 #define LISP_BOOLEAN(prop) \
1164 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1166 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1167 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1168 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1169 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1170 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1171 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1172 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1174 else if (EQ (prop, Qinput_charset_conversion))
1176 unparse_charset_conversion_specs
1177 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1178 else if (EQ (prop, Qoutput_charset_conversion))
1180 unparse_charset_conversion_specs
1181 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1185 else if (type == CODESYS_CCL)
1187 if (EQ (prop, Qdecode))
1188 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1189 else if (EQ (prop, Qencode))
1190 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1198 return Qnil; /* not reached */
1202 /************************************************************************/
1203 /* Coding category functions */
1204 /************************************************************************/
1207 decode_coding_category (Lisp_Object symbol)
1211 CHECK_SYMBOL (symbol);
1212 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1213 if (EQ (coding_category_symbol[i], symbol))
1216 signal_simple_error ("Unrecognized coding category", symbol);
1217 return 0; /* not reached */
1220 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1221 Return a list of all recognized coding categories.
1226 Lisp_Object list = Qnil;
1228 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1229 list = Fcons (coding_category_symbol[i], list);
1233 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1234 Change the priority order of the coding categories.
1235 LIST should be a list of coding categories, in descending order of
1236 priority. Unspecified coding categories will be lower in priority
1237 than all specified ones, in the same relative order they were in
1242 int category_to_priority[CODING_CATEGORY_LAST + 1];
1246 /* First generate a list that maps coding categories to priorities. */
1248 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1249 category_to_priority[i] = -1;
1251 /* Highest priority comes from the specified list. */
1253 EXTERNAL_LIST_LOOP (rest, list)
1255 int cat = decode_coding_category (XCAR (rest));
1257 if (category_to_priority[cat] >= 0)
1258 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1259 category_to_priority[cat] = i++;
1262 /* Now go through the existing categories by priority to retrieve
1263 the categories not yet specified and preserve their priority
1265 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1267 int cat = coding_category_by_priority[j];
1268 if (category_to_priority[cat] < 0)
1269 category_to_priority[cat] = i++;
1272 /* Now we need to construct the inverse of the mapping we just
1275 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1276 coding_category_by_priority[category_to_priority[i]] = i;
1278 /* Phew! That was confusing. */
1282 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1283 Return a list of coding categories in descending order of priority.
1288 Lisp_Object list = Qnil;
1290 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1291 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1296 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1297 Change the coding system associated with a coding category.
1299 (coding_category, coding_system))
1301 int cat = decode_coding_category (coding_category);
1303 coding_system = Fget_coding_system (coding_system);
1304 coding_category_system[cat] = coding_system;
1308 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1309 Return the coding system associated with a coding category.
1313 int cat = decode_coding_category (coding_category);
1314 Lisp_Object sys = coding_category_system[cat];
1317 return XCODING_SYSTEM_NAME (sys);
1322 /************************************************************************/
1323 /* Detecting the encoding of data */
1324 /************************************************************************/
1326 struct detection_state
1328 enum eol_type eol_type;
1364 struct iso2022_decoder iso;
1366 int high_byte_count;
1367 unsigned int saw_single_shift:1;
1380 acceptable_control_char_p (int c)
1384 /* Allow and ignore control characters that you might
1385 reasonably see in a text file */
1390 case 8: /* backspace */
1391 case 11: /* vertical tab */
1392 case 12: /* form feed */
1393 case 26: /* MS-DOS C-z junk */
1394 case 31: /* '^_' -- for info */
1402 mask_has_at_most_one_bit_p (int mask)
1404 /* Perhaps the only thing useful you learn from intensive Microsoft
1405 technical interviews */
1406 return (mask & (mask - 1)) == 0;
1409 static enum eol_type
1410 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1419 st->eol.just_saw_cr = 1;
1424 if (st->eol.just_saw_cr)
1426 else if (st->eol.seen_anything)
1429 else if (st->eol.just_saw_cr)
1431 st->eol.just_saw_cr = 0;
1433 st->eol.seen_anything = 1;
1436 return EOL_AUTODETECT;
1439 /* Attempt to determine the encoding and EOL type of the given text.
1440 Before calling this function for the first time, you must initialize
1441 st->eol_type as appropriate and initialize st->mask to ~0.
1443 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1446 st->mask holds the determined coding category mask, or ~0 if only
1447 ASCII has been seen so far.
1451 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1452 is present in st->mask
1453 1 == definitive answers are here for both st->eol_type and st->mask
1457 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1458 unsigned int n, int just_do_eol)
1462 if (st->eol_type == EOL_AUTODETECT)
1463 st->eol_type = detect_eol_type (st, src, n);
1466 return st->eol_type != EOL_AUTODETECT;
1468 if (!st->seen_non_ascii)
1470 for (; n; n--, src++)
1473 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1475 st->seen_non_ascii = 1;
1477 st->shift_jis.mask = ~0;
1481 st->iso2022.mask = ~0;
1491 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1492 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1493 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1494 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1495 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1496 st->big5.mask = detect_coding_big5 (st, src, n);
1497 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1498 st->utf8.mask = detect_coding_utf8 (st, src, n);
1499 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1500 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1503 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1504 | st->utf8.mask | st->ucs4.mask;
1507 int retval = mask_has_at_most_one_bit_p (st->mask);
1508 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1509 return retval && st->eol_type != EOL_AUTODETECT;
1514 coding_system_from_mask (int mask)
1518 /* If the file was entirely or basically ASCII, use the
1519 default value of `buffer-file-coding-system'. */
1520 Lisp_Object retval =
1521 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1524 retval = Ffind_coding_system (retval);
1528 (Qbad_variable, Qwarning,
1529 "Invalid `default-buffer-file-coding-system', set to nil");
1530 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1534 retval = Fget_coding_system (Qraw_text);
1542 mask = postprocess_iso2022_mask (mask);
1544 /* Look through the coding categories by priority and find
1545 the first one that is allowed. */
1546 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1548 cat = coding_category_by_priority[i];
1549 if ((mask & (1 << cat)) &&
1550 !NILP (coding_category_system[cat]))
1554 return coding_category_system[cat];
1556 return Fget_coding_system (Qraw_text);
1560 /* Given a seekable read stream and potential coding system and EOL type
1561 as specified, do any autodetection that is called for. If the
1562 coding system and/or EOL type are not autodetect, they will be left
1563 alone; but this function will never return an autodetect coding system
1566 This function does not automatically fetch subsidiary coding systems;
1567 that should be unnecessary with the explicit eol-type argument. */
1570 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1571 enum eol_type *eol_type_in_out)
1573 struct detection_state decst;
1575 if (*eol_type_in_out == EOL_AUTODETECT)
1576 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1579 decst.eol_type = *eol_type_in_out;
1582 /* If autodetection is called for, do it now. */
1583 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1584 *eol_type_in_out == EOL_AUTODETECT)
1589 unsigned char random_buffer[4096];
1592 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1595 if (detect_coding_type (&decst, random_buffer, nread,
1596 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1597 CODESYS_AUTODETECT))
1601 *eol_type_in_out = decst.eol_type;
1602 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1603 *codesys_in_out = coding_system_from_mask (decst.mask);
1606 /* If we absolutely can't determine the EOL type, just assume LF. */
1607 if (*eol_type_in_out == EOL_AUTODETECT)
1608 *eol_type_in_out = EOL_LF;
1610 Lstream_rewind (stream);
1613 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1614 Detect coding system of the text in the region between START and END.
1615 Return a list of possible coding systems ordered by priority.
1616 If only ASCII characters are found, it returns 'undecided or one of
1617 its subsidiary coding systems according to a detected end-of-line
1618 type. Optional arg BUFFER defaults to the current buffer.
1620 (start, end, buffer))
1622 Lisp_Object val = Qnil;
1623 struct buffer *buf = decode_buffer (buffer, 0);
1625 Lisp_Object instream, lb_instream;
1626 Lstream *istr, *lb_istr;
1627 struct detection_state decst;
1628 struct gcpro gcpro1, gcpro2;
1630 get_buffer_range_char (buf, start, end, &b, &e, 0);
1631 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1632 lb_istr = XLSTREAM (lb_instream);
1633 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1634 istr = XLSTREAM (instream);
1635 GCPRO2 (instream, lb_instream);
1637 decst.eol_type = EOL_AUTODETECT;
1641 unsigned char random_buffer[4096];
1642 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1646 if (detect_coding_type (&decst, random_buffer, nread, 0))
1650 if (decst.mask == ~0)
1651 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1659 decst.mask = postprocess_iso2022_mask (decst.mask);
1661 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1663 int sys = coding_category_by_priority[i];
1664 if (decst.mask & (1 << sys))
1666 Lisp_Object codesys = coding_category_system[sys];
1667 if (!NILP (codesys))
1668 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1669 val = Fcons (codesys, val);
1673 Lstream_close (istr);
1675 Lstream_delete (istr);
1676 Lstream_delete (lb_istr);
1681 /************************************************************************/
1682 /* Converting to internal Mule format ("decoding") */
1683 /************************************************************************/
1685 /* A decoding stream is a stream used for decoding text (i.e.
1686 converting from some external format to internal format).
1687 The decoding-stream object keeps track of the actual coding
1688 stream, the stream that is at the other end, and data that
1689 needs to be persistent across the lifetime of the stream. */
1691 /* Handle the EOL stuff related to just-read-in character C.
1692 EOL_TYPE is the EOL type of the coding stream.
1693 FLAGS is the current value of FLAGS in the coding stream, and may
1694 be modified by this macro. (The macro only looks at the
1695 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1696 bytes are to be written. You need to also define a local goto
1697 label "label_continue_loop" that is at the end of the main
1698 character-reading loop.
1700 If C is a CR character, then this macro handles it entirely and
1701 jumps to label_continue_loop. Otherwise, this macro does not add
1702 anything to DST, and continues normally. You should continue
1703 processing C normally after this macro. */
1705 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1709 if (eol_type == EOL_CR) \
1710 Dynarr_add (dst, '\n'); \
1711 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1712 Dynarr_add (dst, c); \
1714 flags |= CODING_STATE_CR; \
1715 goto label_continue_loop; \
1717 else if (flags & CODING_STATE_CR) \
1718 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1720 Dynarr_add (dst, '\r'); \
1721 flags &= ~CODING_STATE_CR; \
1725 /* C should be a binary character in the range 0 - 255; convert
1726 to internal format and add to Dynarr DST. */
1728 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1730 if (BYTE_ASCII_P (c)) \
1731 Dynarr_add (dst, c); \
1732 else if (BYTE_C1_P (c)) \
1734 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1735 Dynarr_add (dst, c + 0x20); \
1739 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1740 Dynarr_add (dst, c); \
1744 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1748 DECODE_ADD_BINARY_CHAR (ch, dst); \
1753 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1755 if (flags & CODING_STATE_END) \
1757 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1758 if (flags & CODING_STATE_CR) \
1759 Dynarr_add (dst, '\r'); \
1763 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1765 struct decoding_stream
1767 /* Coding system that governs the conversion. */
1768 Lisp_Coding_System *codesys;
1770 /* Stream that we read the encoded data from or
1771 write the decoded data to. */
1774 /* If we are reading, then we can return only a fixed amount of
1775 data, so if the conversion resulted in too much data, we store it
1776 here for retrieval the next time around. */
1777 unsigned_char_dynarr *runoff;
1779 /* FLAGS holds flags indicating the current state of the decoding.
1780 Some of these flags are dependent on the coding system. */
1783 /* CH holds a partially built-up character. Since we only deal
1784 with one- and two-byte characters at the moment, we only use
1785 this to store the first byte of a two-byte character. */
1788 /* EOL_TYPE specifies the type of end-of-line conversion that
1789 currently applies. We need to keep this separate from the
1790 EOL type stored in CODESYS because the latter might indicate
1791 automatic EOL-type detection while the former will always
1792 indicate a particular EOL type. */
1793 enum eol_type eol_type;
1795 /* Additional ISO2022 information. We define the structure above
1796 because it's also needed by the detection routines. */
1797 struct iso2022_decoder iso2022;
1799 /* Additional information (the state of the running CCL program)
1800 used by the CCL decoder. */
1801 struct ccl_program ccl;
1803 struct detection_state decst;
1806 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1807 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1808 static int decoding_rewinder (Lstream *stream);
1809 static int decoding_seekable_p (Lstream *stream);
1810 static int decoding_flusher (Lstream *stream);
1811 static int decoding_closer (Lstream *stream);
1813 static Lisp_Object decoding_marker (Lisp_Object stream,
1814 void (*markobj) (Lisp_Object));
1816 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1817 sizeof (struct decoding_stream));
1820 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1822 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1823 Lisp_Object str_obj;
1825 /* We do not need to mark the coding systems or charsets stored
1826 within the stream because they are stored in a global list
1827 and automatically marked. */
1829 XSETLSTREAM (str_obj, str);
1831 if (str->imp->marker)
1832 return (str->imp->marker) (str_obj, markobj);
1837 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1838 so we read data from the other end, decode it, and store it into DATA. */
1841 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1843 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1844 unsigned char *orig_data = data;
1846 int error_occurred = 0;
1848 /* We need to interface to mule_decode(), which expects to take some
1849 amount of data and store the result into a Dynarr. We have
1850 mule_decode() store into str->runoff, and take data from there
1853 /* We loop until we have enough data, reading chunks from the other
1854 end and decoding it. */
1857 /* Take data from the runoff if we can. Make sure to take at
1858 most SIZE bytes, and delete the data from the runoff. */
1859 if (Dynarr_length (str->runoff) > 0)
1861 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1862 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1863 Dynarr_delete_many (str->runoff, 0, chunk);
1869 break; /* No more room for data */
1871 if (str->flags & CODING_STATE_END)
1872 /* This means that on the previous iteration, we hit the EOF on
1873 the other end. We loop once more so that mule_decode() can
1874 output any final stuff it may be holding, or any "go back
1875 to a sane state" escape sequences. (This latter makes sense
1876 during encoding.) */
1879 /* Exhausted the runoff, so get some more. DATA has at least
1880 SIZE bytes left of storage in it, so it's OK to read directly
1881 into it. (We'll be overwriting above, after we've decoded it
1882 into the runoff.) */
1883 read_size = Lstream_read (str->other_end, data, size);
1890 /* There might be some more end data produced in the translation.
1891 See the comment above. */
1892 str->flags |= CODING_STATE_END;
1893 mule_decode (stream, data, str->runoff, read_size);
1896 if (data - orig_data == 0)
1897 return error_occurred ? -1 : 0;
1899 return data - orig_data;
1903 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1905 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1908 /* Decode all our data into the runoff, and then attempt to write
1909 it all out to the other end. Remove whatever chunk we succeeded
1911 mule_decode (stream, data, str->runoff, size);
1912 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1913 Dynarr_length (str->runoff));
1915 Dynarr_delete_many (str->runoff, 0, retval);
1916 /* Do NOT return retval. The return value indicates how much
1917 of the incoming data was written, not how many bytes were
1923 reset_decoding_stream (struct decoding_stream *str)
1926 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1928 Lisp_Object coding_system;
1929 XSETCODING_SYSTEM (coding_system, str->codesys);
1930 reset_iso2022 (coding_system, &str->iso2022);
1932 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1934 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1937 str->flags = str->ch = 0;
1941 decoding_rewinder (Lstream *stream)
1943 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1944 reset_decoding_stream (str);
1945 Dynarr_reset (str->runoff);
1946 return Lstream_rewind (str->other_end);
1950 decoding_seekable_p (Lstream *stream)
1952 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1953 return Lstream_seekable_p (str->other_end);
1957 decoding_flusher (Lstream *stream)
1959 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1960 return Lstream_flush (str->other_end);
1964 decoding_closer (Lstream *stream)
1966 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1967 if (stream->flags & LSTREAM_FL_WRITE)
1969 str->flags |= CODING_STATE_END;
1970 decoding_writer (stream, 0, 0);
1972 Dynarr_free (str->runoff);
1974 #ifdef ENABLE_COMPOSITE_CHARS
1975 if (str->iso2022.composite_chars)
1976 Dynarr_free (str->iso2022.composite_chars);
1979 return Lstream_close (str->other_end);
1983 decoding_stream_coding_system (Lstream *stream)
1985 Lisp_Object coding_system;
1986 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1988 XSETCODING_SYSTEM (coding_system, str->codesys);
1989 return subsidiary_coding_system (coding_system, str->eol_type);
1993 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1995 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1996 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1998 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1999 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2000 reset_decoding_stream (str);
2003 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2004 stream for writing, no automatic code detection will be performed.
2005 The reason for this is that automatic code detection requires a
2006 seekable input. Things will also fail if you open a decoding
2007 stream for reading using a non-fully-specified coding system and
2008 a non-seekable input stream. */
2011 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2014 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2015 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2019 str->other_end = stream;
2020 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2021 str->eol_type = EOL_AUTODETECT;
2022 if (!strcmp (mode, "r")
2023 && Lstream_seekable_p (stream))
2024 /* We can determine the coding system now. */
2025 determine_real_coding_system (stream, &codesys, &str->eol_type);
2026 set_decoding_stream_coding_system (lstr, codesys);
2027 str->decst.eol_type = str->eol_type;
2028 str->decst.mask = ~0;
2029 XSETLSTREAM (obj, lstr);
2034 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2036 return make_decoding_stream_1 (stream, codesys, "r");
2040 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2042 return make_decoding_stream_1 (stream, codesys, "w");
2045 /* Note: the decode_coding_* functions all take the same
2046 arguments as mule_decode(), which is to say some SRC data of
2047 size N, which is to be stored into dynamic array DST.
2048 DECODING is the stream within which the decoding is
2049 taking place, but no data is actually read from or
2050 written to that stream; that is handled in decoding_reader()
2051 or decoding_writer(). This allows the same functions to
2052 be used for both reading and writing. */
2055 mule_decode (Lstream *decoding, CONST unsigned char *src,
2056 unsigned_char_dynarr *dst, unsigned int n)
2058 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2060 /* If necessary, do encoding-detection now. We do this when
2061 we're a writing stream or a non-seekable reading stream,
2062 meaning that we can't just process the whole input,
2063 rewind, and start over. */
2065 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2066 str->eol_type == EOL_AUTODETECT)
2068 Lisp_Object codesys;
2070 XSETCODING_SYSTEM (codesys, str->codesys);
2071 detect_coding_type (&str->decst, src, n,
2072 CODING_SYSTEM_TYPE (str->codesys) !=
2073 CODESYS_AUTODETECT);
2074 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2075 str->decst.mask != ~0)
2076 /* #### This is cheesy. What we really ought to do is
2077 buffer up a certain amount of data so as to get a
2078 less random result. */
2079 codesys = coding_system_from_mask (str->decst.mask);
2080 str->eol_type = str->decst.eol_type;
2081 if (XCODING_SYSTEM (codesys) != str->codesys)
2083 /* Preserve the CODING_STATE_END flag in case it was set.
2084 If we erase it, bad things might happen. */
2085 int was_end = str->flags & CODING_STATE_END;
2086 set_decoding_stream_coding_system (decoding, codesys);
2088 str->flags |= CODING_STATE_END;
2092 switch (CODING_SYSTEM_TYPE (str->codesys))
2095 case CODESYS_INTERNAL:
2096 Dynarr_add_many (dst, src, n);
2099 case CODESYS_AUTODETECT:
2100 /* If we got this far and still haven't decided on the coding
2101 system, then do no conversion. */
2102 case CODESYS_NO_CONVERSION:
2103 decode_coding_no_conversion (decoding, src, dst, n);
2106 case CODESYS_SHIFT_JIS:
2107 decode_coding_sjis (decoding, src, dst, n);
2110 decode_coding_big5 (decoding, src, dst, n);
2113 decode_coding_ucs4 (decoding, src, dst, n);
2116 decode_coding_utf8 (decoding, src, dst, n);
2119 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2121 case CODESYS_ISO2022:
2122 decode_coding_iso2022 (decoding, src, dst, n);
2130 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2131 Decode the text between START and END which is encoded in CODING-SYSTEM.
2132 This is useful if you've read in encoded text from a file without decoding
2133 it (e.g. you read in a JIS-formatted file but used the `binary' or
2134 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2135 Return length of decoded text.
2136 BUFFER defaults to the current buffer if unspecified.
2138 (start, end, coding_system, buffer))
2141 struct buffer *buf = decode_buffer (buffer, 0);
2142 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2143 Lstream *istr, *ostr;
2144 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2146 get_buffer_range_char (buf, start, end, &b, &e, 0);
2148 barf_if_buffer_read_only (buf, b, e);
2150 coding_system = Fget_coding_system (coding_system);
2151 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2152 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2153 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2155 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2156 Fget_coding_system (Qbinary));
2157 istr = XLSTREAM (instream);
2158 ostr = XLSTREAM (outstream);
2159 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2161 /* The chain of streams looks like this:
2163 [BUFFER] <----- send through
2164 ------> [ENCODE AS BINARY]
2165 ------> [DECODE AS SPECIFIED]
2171 char tempbuf[1024]; /* some random amount */
2172 Bufpos newpos, even_newer_pos;
2173 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2174 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2178 newpos = lisp_buffer_stream_startpos (istr);
2179 Lstream_write (ostr, tempbuf, size_in_bytes);
2180 even_newer_pos = lisp_buffer_stream_startpos (istr);
2181 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2184 Lstream_close (istr);
2185 Lstream_close (ostr);
2187 Lstream_delete (istr);
2188 Lstream_delete (ostr);
2189 Lstream_delete (XLSTREAM (de_outstream));
2190 Lstream_delete (XLSTREAM (lb_outstream));
2195 /************************************************************************/
2196 /* Converting to an external encoding ("encoding") */
2197 /************************************************************************/
2199 /* An encoding stream is an output stream. When you create the
2200 stream, you specify the coding system that governs the encoding
2201 and another stream that the resulting encoded data is to be
2202 sent to, and then start sending data to it. */
2204 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2206 struct encoding_stream
2208 /* Coding system that governs the conversion. */
2209 Lisp_Coding_System *codesys;
2211 /* Stream that we read the data to be encoded from or
2212 write the encoded data to. */
2215 /* If we are reading, then we can return only a fixed amount of
2216 data, so if the conversion resulted in too much data, we store it
2217 here for retrieval the next time around. */
2218 unsigned_char_dynarr *runoff;
2220 /* FLAGS holds flags indicating the current state of the encoding.
2221 Some of these flags are dependent on the coding system. */
2224 /* CH holds a partially built-up character. Since we only deal
2225 with one- and two-byte characters at the moment, we only use
2226 this to store the first byte of a two-byte character. */
2229 /* Additional information used by the ISO2022 encoder. */
2232 /* CHARSET holds the character sets currently assigned to the G0
2233 through G3 registers. It is initialized from the array
2234 INITIAL_CHARSET in CODESYS. */
2235 Lisp_Object charset[4];
2237 /* Which registers are currently invoked into the left (GL) and
2238 right (GR) halves of the 8-bit encoding space? */
2239 int register_left, register_right;
2241 /* Whether we need to explicitly designate the charset in the
2242 G? register before using it. It is initialized from the
2243 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2244 unsigned char force_charset_on_output[4];
2246 /* Other state variables that need to be preserved across
2248 Lisp_Object current_charset;
2250 int current_char_boundary;
2253 /* Additional information (the state of the running CCL program)
2254 used by the CCL encoder. */
2255 struct ccl_program ccl;
/* Prototypes for the encoding-stream methods defined below, followed by
   the definition of the "encoding" lstream implementation
   (lstream_encoding), whose per-stream data is a struct
   encoding_stream. */
2259 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2260 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2262 static int encoding_rewinder (Lstream *stream);
2263 static int encoding_seekable_p (Lstream *stream);
2264 static int encoding_flusher (Lstream *stream);
2265 static int encoding_closer (Lstream *stream);
2267 static Lisp_Object encoding_marker (Lisp_Object stream,
2268 void (*markobj) (Lisp_Object));
2270 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2271 sizeof (struct encoding_stream));
/* Mark method for encoding streams.  Wraps the stream at the other end
   in a Lisp object and, if that stream's implementation has a marker
   method, returns the result of calling it with MARKOBJ. */
2274 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2276 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2277 Lisp_Object str_obj;
2279 /* We do not need to mark the coding systems or charsets stored
2280 within the stream because they are stored in a global list
2281 and automatically marked. */
2283 XSETLSTREAM (str_obj, str);
2285 if (str->imp->marker)
2286 return (str->imp->marker) (str_obj, markobj);
2291 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2292 so we read data from the other end, encode it, and store it into DATA.
   Returns the number of bytes stored into DATA; if none were stored,
   returns -1 when error_occurred is set and 0 otherwise. */
2295 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2297 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2298 unsigned char *orig_data = data;
2300 int error_occurred = 0;
2302 /* We need to interface to mule_encode(), which expects to take some
2303 amount of data and store the result into a Dynarr. We have
2304 mule_encode() store into str->runoff, and take data from there
2307 /* We loop until we have enough data, reading chunks from the other
2308 end and encoding it. */
2311 /* Take data from the runoff if we can. Make sure to take at
2312 most SIZE bytes, and delete the data from the runoff. */
2313 if (Dynarr_length (str->runoff) > 0)
2315 int chunk = min ((int) size, Dynarr_length (str->runoff));
2316 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2317 Dynarr_delete_many (str->runoff, 0, chunk);
2323 break; /* No more room for data */
2325 if (str->flags & CODING_STATE_END)
2326 /* This means that on the previous iteration, we hit the EOF on
2327 the other end. We loop once more so that mule_encode() can
2328 output any final stuff it may be holding, or any "go back
2329 to a sane state" escape sequences. (This latter makes sense
2330 during encoding.) */
2333 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2334 left of storage in it, so it's OK to read directly into it.
2335 (We'll be overwriting above, after we've encoded it into the
2337 read_size = Lstream_read (str->other_end, data, size);
2344 /* There might be some more end data produced in the translation.
2345 See the comment above. */
2346 str->flags |= CODING_STATE_END;
2347 mule_encode (stream, data, str->runoff, read_size);
2350 if (data == orig_data)
2351 return error_occurred ? -1 : 0;
2353 return data - orig_data;
/* Write method for encoding streams.  SIZE bytes from DATA are encoded
   by mule_encode into str->runoff; the runoff is then handed to
   Lstream_write on the other end, and the number of bytes that call
   reports (retval) is removed from the front of the runoff. */
2357 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2359 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2362 /* Encode all our data into the runoff, and then attempt to write
2363 it all out to the other end. Remove whatever chunk we succeeded
2365 mule_encode (stream, data, str->runoff, size);
2366 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2367 Dynarr_length (str->runoff));
2369 Dynarr_delete_many (str->runoff, 0, retval);
2370 /* Do NOT return retval. The return value indicates how much
2371 of the incoming data was written, not how many bytes were
/* Put the conversion state of STR back to its starting values.
   For an ISO2022 coding system the four charset[] and
   force_charset_on_output[] slots are reloaded from STR->codesys,
   register_left/register_right become 0/1, current_charset becomes
   Qnil, current_half becomes 0 and current_char_boundary becomes 1.
   The CCL program state is set up from CODING_SYSTEM_CCL_ENCODE.
   Finally flags and ch are cleared. */
2377 reset_encoding_stream (struct encoding_stream *str)
2380 switch (CODING_SYSTEM_TYPE (str->codesys))
2382 case CODESYS_ISO2022:
2386 for (i = 0; i < 4; i++)
2388 str->iso2022.charset[i] =
2389 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2390 str->iso2022.force_charset_on_output[i] =
2391 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2393 str->iso2022.register_left = 0;
2394 str->iso2022.register_right = 1;
2395 str->iso2022.current_charset = Qnil;
2396 str->iso2022.current_half = 0;
2397 str->iso2022.current_char_boundary = 1;
2401 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2408 str->flags = str->ch = 0;
/* Rewind method: reset the conversion state, discard anything pending
   in the runoff, and return the result of rewinding the other end. */
2412 encoding_rewinder (Lstream *stream)
2414 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2415 reset_encoding_stream (str);
2416 Dynarr_reset (str->runoff);
2417 return Lstream_rewind (str->other_end);
/* An encoding stream is seekable exactly when the stream at its other
   end reports that it is. */
2421 encoding_seekable_p (Lstream *stream)
2423 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2424 return Lstream_seekable_p (str->other_end);
/* Flush method: flush the stream at the other end and return its
   result. */
2428 encoding_flusher (Lstream *stream)
2430 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2431 return Lstream_flush (str->other_end);
/* Close method.  For a stream opened for writing, CODING_STATE_END is
   set and encoding_writer is called once with no data (presumably so
   the encoder can emit its final output, as described in
   encoding_reader).  The runoff is then freed and the result of closing
   the other end is returned. */
2435 encoding_closer (Lstream *stream)
2437 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2438 if (stream->flags & LSTREAM_FL_WRITE)
2440 str->flags |= CODING_STATE_END;
2441 encoding_writer (stream, 0, 0);
2443 Dynarr_free (str->runoff);
2444 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system stored in the encoding
   stream STREAM. */
2448 encoding_stream_coding_system (Lstream *stream)
2450 Lisp_Object coding_system;
2451 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2453 XSETCODING_SYSTEM (coding_system, str->codesys);
2454 return coding_system;
/* Called by make_encoding_stream_1 to give the encoding stream LSTR the
   coding system CODESYS; the conversion state is reset through
   reset_encoding_stream as part of the change. */
2458 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2460 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2461 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2463 reset_encoding_stream (str);
/* Common constructor for encoding streams.  Creates a new
   lstream_encoding stream in the given mode, gives it an empty runoff
   array, records STREAM as its other end, installs CODESYS, and wraps
   the new stream in a Lisp object. */
2467 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2470 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2471 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2475 str->runoff = Dynarr_new (unsigned_char);
2476 str->other_end = stream;
2477 set_encoding_stream_coding_system (lstr, codesys);
2478 XSETLSTREAM (obj, lstr);
/* Create an encoding stream over STREAM in mode "r" (data is obtained
   through encoding_reader). */
2483 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2485 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream over STREAM in mode "w" (data is supplied
   through encoding_writer). */
2489 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2491 return make_encoding_stream_1 (stream, codesys, "w");
2494 /* Convert N bytes of internally-formatted data stored in SRC to an
2495 external format, according to the encoding stream ENCODING.
2496 Store the encoded data into DST.
   The work is dispatched on the type of the stream's coding system:
   internal data is copied to DST unchanged, and the other types are
   handed to the matching encode_coding_* routine or to ccl_driver. */
2499 mule_encode (Lstream *encoding, CONST unsigned char *src,
2500 unsigned_char_dynarr *dst, unsigned int n)
2502 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2504 switch (CODING_SYSTEM_TYPE (str->codesys))
2507 case CODESYS_INTERNAL:
2508 Dynarr_add_many (dst, src, n);
2511 case CODESYS_AUTODETECT:
2512 /* If we got this far and still haven't decided on the coding
2513 system, then do no conversion. */
2514 case CODESYS_NO_CONVERSION:
2515 encode_coding_no_conversion (encoding, src, dst, n);
2518 case CODESYS_SHIFT_JIS:
2519 encode_coding_sjis (encoding, src, dst, n);
2522 encode_coding_big5 (encoding, src, dst, n);
2525 encode_coding_ucs4 (encoding, src, dst, n);
2528 encode_coding_utf8 (encoding, src, dst, n);
2531 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2533 case CODESYS_ISO2022:
2534 encode_coding_iso2022 (encoding, src, dst, n);
/* Lisp primitive encode-coding-region.  The region is read through a
   buffer input stream in chunks of up to sizeof (tempbuf) bytes; each
   chunk is written to an encoding output stream for CODING-SYSTEM,
   which feeds a binary decoding stream, which feeds a buffer output
   stream positioned at the start of the region.  After each chunk the
   text that was just read is deleted from the buffer.  All four
   streams are GC-protected while in use and deleted at the end; the
   value returned is make_int (retlen). */
2542 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2543 Encode the text between START and END using CODING-SYSTEM.
2544 This will, for example, convert Japanese characters into stuff such as
2545 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2546 text. BUFFER defaults to the current buffer if unspecified.
2548 (start, end, coding_system, buffer))
2551 struct buffer *buf = decode_buffer (buffer, 0);
2552 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2553 Lstream *istr, *ostr;
2554 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2556 get_buffer_range_char (buf, start, end, &b, &e, 0);
2558 barf_if_buffer_read_only (buf, b, e);
2560 coding_system = Fget_coding_system (coding_system);
2561 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2562 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2563 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2564 Fget_coding_system (Qbinary));
2565 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2567 istr = XLSTREAM (instream);
2568 ostr = XLSTREAM (outstream);
2569 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2570 /* The chain of streams looks like this:
2572 [BUFFER] <----- send through
2573 ------> [ENCODE AS SPECIFIED]
2574 ------> [DECODE AS BINARY]
2579 char tempbuf[1024]; /* some random amount */
2580 Bufpos newpos, even_newer_pos;
2581 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2582 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2586 newpos = lisp_buffer_stream_startpos (istr);
2587 Lstream_write (ostr, tempbuf, size_in_bytes);
2588 even_newer_pos = lisp_buffer_stream_startpos (istr);
2589 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2595 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2596 Lstream_close (istr);
2597 Lstream_close (ostr);
2599 Lstream_delete (istr);
2600 Lstream_delete (ostr);
2601 Lstream_delete (XLSTREAM (de_outstream));
2602 Lstream_delete (XLSTREAM (lb_outstream));
2603 return make_int (retlen);
2609 /************************************************************************/
2610 /* Shift-JIS methods */
2611 /************************************************************************/
2613 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2614 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2615 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2616 encoded by "position-code + 0x80". A character of JISX0208
2617 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
2618 position-codes are divided and shifted so that they fit in the range
2621 --- CODE RANGE of Shift-JIS ---
2622 (character set) (range)
2624 JISX0201-Kana 0xA1 .. 0xDF
2625 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2626 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2627 -------------------------------
2631 /* Is this the first byte of a Shift-JIS two-byte char? */
2633 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2634 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2636 /* Is this the second byte of a Shift-JIS two-byte char? */
2638 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2639 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a byte in the range 0xA1 .. 0xDF, which decode_coding_sjis
   emits as a JISX0201 katakana character? */
2641 #define BYTE_SJIS_KATAKANA_P(c) \
2642 ((c) >= 0xA1 && (c) <= 0xDF)
/* Detection routine for Shift-JIS.  st->shift_jis.in_second_byte
   records whether the previous byte was taken as a lead byte
   (0x80 .. 0x9F, or 0xE0 and above); ESC, SI and SO get separate
   handling.  When the scan finishes without an earlier return, the
   result is CODING_CATEGORY_SHIFT_JIS_MASK. */
2645 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2653 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2655 if (st->shift_jis.in_second_byte)
2657 st->shift_jis.in_second_byte = 0;
2661 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2662 st->shift_jis.in_second_byte = 1;
2664 return CODING_CATEGORY_SHIFT_JIS_MASK;
2667 /* Convert Shift-JIS data to internal format.
   The conversion state (flags, ch, eol_type) is taken from the
   decoding stream.  A valid second byte that follows a saved first
   byte is emitted as LEADING_BYTE_JAPANESE_JISX0208 followed by the
   two bytes computed by DECODE_SJIS; otherwise both bytes go through
   DECODE_ADD_BINARY_CHAR.  A katakana byte is emitted after
   LEADING_BYTE_KATAKANA_JISX0201. */
2670 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2671 unsigned_char_dynarr *dst, unsigned int n)
2674 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2675 unsigned int flags = str->flags;
2676 unsigned int ch = str->ch;
2677 eol_type_t eol_type = str->eol_type;
2685 /* Previous character was first byte of Shift-JIS Kanji char. */
2686 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2688 unsigned char e1, e2;
2690 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2691 DECODE_SJIS (ch, c, e1, e2);
2692 Dynarr_add (dst, e1);
2693 Dynarr_add (dst, e2);
2697 DECODE_ADD_BINARY_CHAR (ch, dst);
2698 DECODE_ADD_BINARY_CHAR (c, dst);
2704 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2705 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2707 else if (BYTE_SJIS_KATAKANA_P (c))
2709 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2710 Dynarr_add (dst, c);
2713 DECODE_ADD_BINARY_CHAR (c, dst);
2715 label_continue_loop:;
2718 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2724 /* Convert internally-formatted data to Shift-JIS.
   A line end is written as CR, LF or CR LF depending on the coding
   system's EOL type.  ASCII bytes are copied through.  Of the leading
   bytes only katakana-JISX0201 and the two JISX0208 charsets are
   remembered in CH; any other leading byte sets CH to 0.  A katakana
   byte is copied as is, and a JISX0208 pair is converted with
   ENCODE_SJIS. */
2727 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2728 unsigned_char_dynarr *dst, unsigned int n)
2731 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2732 unsigned int flags = str->flags;
2733 unsigned int ch = str->ch;
2734 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2741 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2742 Dynarr_add (dst, '\r');
2743 if (eol_type != EOL_CR)
2744 Dynarr_add (dst, '\n');
2747 else if (BYTE_ASCII_P (c))
2749 Dynarr_add (dst, c);
2752 else if (BUFBYTE_LEADING_BYTE_P (c))
2753 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2754 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2755 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2758 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2760 Dynarr_add (dst, c);
2763 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2764 ch == LEADING_BYTE_JAPANESE_JISX0208)
2768 unsigned char j1, j2;
2769 ENCODE_SJIS (ch, c, j1, j2);
2770 Dynarr_add (dst, j1);
2771 Dynarr_add (dst, j2);
/* Lisp primitive decode-shift-jis-char.  Both halves of the cons CODE
   must be integers.  When they form a valid Shift-JIS first/second
   byte pair they are converted with DECODE_SJIS and returned as a
   character of Vcharset_japanese_jisx0208. */
2781 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2782 Decode a JISX0208 character of Shift-JIS coding-system.
2783 CODE is the character code in Shift-JIS as a cons of two bytes.
2784 Return the corresponding character.
2788 unsigned char c1, c2, s1, s2;
2791 CHECK_INT (XCAR (code));
2792 CHECK_INT (XCDR (code));
2793 s1 = XINT (XCAR (code));
2794 s2 = XINT (XCDR (code));
2795 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2796 BYTE_SJIS_TWO_BYTE_2_P (s2))
2798 DECODE_SJIS (s1, s2, c1, c2);
2799 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2800 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive encode-shift-jis-char.  The character is split with
   BREAKUP_CHAR; if its charset is Vcharset_japanese_jisx0208, the two
   position codes (with the high bit set) are converted with
   ENCODE_SJIS and returned as a cons of two integers. */
2806 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2807 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2808 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2812 Lisp_Object charset;
2815 CHECK_CHAR_COERCE_INT (ch);
2816 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2817 if (EQ (charset, Vcharset_japanese_jisx0208))
2819 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2820 return Fcons (make_int (s1), make_int (s2));
2827 /************************************************************************/
2829 /************************************************************************/
2831 /* BIG5 is a coding system encoding two character sets: ASCII and
2832 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2833 character set and is encoded in two bytes.
2835 --- CODE RANGE of BIG5 ---
2836 (character set) (range)
2838 Big5 (1st byte) 0xA1 .. 0xFE
2839 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2840 --------------------------
2842 Since the number of characters in Big5 is larger than maximum
2843 characters in Emacs' charset (96x96), it can't be handled as one
2844 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2845 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2846 contains frequently used characters and the latter contains less
2847 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char? */
2849 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2850 ((c) >= 0xA1 && (c) <= 0xFE)
2852 /* Is this the second byte of a Big5 two-byte char? */
2854 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2855 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2857 /* Number of Big5 characters which have the same code in 1st byte. */
2859 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2861 /* Code conversion macros. These are macros because they are used in
2862 inner loops during code conversion.
2864 Note that temporary variables in macros introduce the classic
2865 dynamic-scoping problems with variable names. We use capital-
2866 lettered variables in the assumption that XEmacs does not use
2867 capital letters in variables except in a very formalized way
2870 /* Convert Big5 code (b1, b2) into its internal string representation
2873 /* There is a much simpler way to split the Big5 charset into two.
2874 For the moment I'm going to leave the algorithm as-is because it
2875 claims to separate out the most-used characters into a single
2876 charset, which perhaps will lead to optimizations in various
2879 The way the algorithm works is something like this:
2881 Big5 can be viewed as a 94x157 charset, where the row is
2882 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2883 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2884 the split between low and high column numbers is apparently
2885 meaningless; ascending rows produce less and less frequent chars.
2886 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2887 the first charset, and the upper half (0xC9 .. 0xFE) to the
2888 second. To do the conversion, we convert the character into
2889 a single number where 0 .. 156 is the first row, 157 .. 313
2890 is the second, etc. That way, the characters are ordered by
2891 decreasing frequency. Then we just chop the space in two
2892 and coerce the result into a 94x94 space.
/* DECODE_BIG5 (b1, b2, lb, c1, c2): turn the Big5 byte pair (b1, b2)
   into a linear index with BIG5_SAME_ROW positions per row, choose
   LEADING_BYTE_CHINESE_BIG5_1 or LEADING_BYTE_CHINESE_BIG5_2 for LB
   (rebasing the index by the rows 0xA1 .. 0xC8 in the second case),
   and split the index into the two position bytes C1 and C2, each
   offset by 0xA1.  LB, C1 and C2 are assigned to, so they must be
   lvalues. */
2895 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2897 int B1 = b1, B2 = b2; \
2899 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2903 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2907 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2908 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2910 c1 = I / (0xFF - 0xA1) + 0xA1; \
2911 c2 = I % (0xFF - 0xA1) + 0xA1; \
2914 /* Convert the internal string representation of a Big5 character
2915 (lb, c1, c2) into Big5 code (b1, b2).
   The position bytes are combined into a linear index; for
   LEADING_BYTE_CHINESE_BIG5_2 the index is moved past the rows
   0xA1 .. 0xC8.  B1 is the row plus 0xA1 and B2 is the column, offset
   by 0x40 or 0x62 depending on which half of the row it falls in.
   B1 and B2 are assigned to, so they must be lvalues. */
2917 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2919 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2921 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2923 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2925 b1 = I / BIG5_SAME_ROW + 0xA1; \
2926 b2 = I % BIG5_SAME_ROW; \
2927 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Detection routine for Big5.  ESC, SI, SO and bytes in
   0x80 .. 0xA0 get separate handling as a first byte.
   st->big5.in_second_byte records that the previous byte was taken as
   a lead byte; a following byte below 0x40 or in 0x80 .. 0xA0 is
   tested for specially.  When the scan finishes without an earlier
   return, the result is CODING_CATEGORY_BIG5_MASK. */
2931 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2939 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2940 (c >= 0x80 && c <= 0xA0))
2942 if (st->big5.in_second_byte)
2944 st->big5.in_second_byte = 0;
2945 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2949 st->big5.in_second_byte = 1;
2951 return CODING_CATEGORY_BIG5_MASK;
2954 /* Convert Big5 data to internal format.
   The conversion state (flags, ch, eol_type) is taken from the
   decoding stream.  A valid second byte that follows a saved first
   byte is converted with DECODE_BIG5 and emitted as three bytes
   (leading byte and two position bytes); otherwise both bytes go
   through DECODE_ADD_BINARY_CHAR. */
2957 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2958 unsigned_char_dynarr *dst, unsigned int n)
2961 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2962 unsigned int flags = str->flags;
2963 unsigned int ch = str->ch;
2964 eol_type_t eol_type = str->eol_type;
2971 /* Previous character was first byte of Big5 char. */
2972 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2974 unsigned char b1, b2, b3;
2975 DECODE_BIG5 (ch, c, b1, b2, b3);
2976 Dynarr_add (dst, b1);
2977 Dynarr_add (dst, b2);
2978 Dynarr_add (dst, b3);
2982 DECODE_ADD_BINARY_CHAR (ch, dst);
2983 DECODE_ADD_BINARY_CHAR (c, dst);
2989 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2990 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2993 DECODE_ADD_BINARY_CHAR (c, dst);
2995 label_continue_loop:;
2998 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3004 /* Convert internally-formatted data to Big5.
   A line end is written as CR, LF or CR LF depending on the coding
   system's EOL type, and ASCII bytes are copied through.  Only the two
   Big5 leading bytes are recognized; other leading bytes are ignored.
   When the second position byte arrives, CH holds the leading byte in
   its upper part and the first position byte in its low 8 bits, and
   the pair is converted with ENCODE_BIG5. */
3007 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3008 unsigned_char_dynarr *dst, unsigned int n)
3011 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3012 unsigned int flags = str->flags;
3013 unsigned int ch = str->ch;
3014 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3021 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3022 Dynarr_add (dst, '\r');
3023 if (eol_type != EOL_CR)
3024 Dynarr_add (dst, '\n');
3026 else if (BYTE_ASCII_P (c))
3029 Dynarr_add (dst, c);
3031 else if (BUFBYTE_LEADING_BYTE_P (c))
3033 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3034 c == LEADING_BYTE_CHINESE_BIG5_2)
3036 /* A recognized leading byte. */
3038 continue; /* not done with this character. */
3040 /* otherwise just ignore this character. */
3042 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3043 ch == LEADING_BYTE_CHINESE_BIG5_2)
3045 /* Previous char was a recognized leading byte. */
3047 continue; /* not done with this character. */
3051 /* Encountering second byte of a Big5 character. */
3052 unsigned char b1, b2;
3054 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3055 Dynarr_add (dst, b1);
3056 Dynarr_add (dst, b2);
/* Lisp primitive decode-big5-char.  Both halves of the cons CODE must
   be integers.  When they form a valid Big5 first/second byte pair
   they are converted with DECODE_BIG5, the charset is looked up from
   the resulting leading byte, and the character is returned. */
3067 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3068 Decode a Big5 character CODE of BIG5 coding-system.
3069 CODE is the character code in BIG5, a cons of two integers.
3070 Return the corresponding character.
3074 unsigned char c1, c2, b1, b2;
3077 CHECK_INT (XCAR (code));
3078 CHECK_INT (XCDR (code));
3079 b1 = XINT (XCAR (code));
3080 b2 = XINT (XCDR (code));
3081 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3082 BYTE_BIG5_TWO_BYTE_2_P (b2))
3085 Lisp_Object charset;
3086 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3087 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3088 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive encode-big5-char.  The character is split with
   BREAKUP_CHAR; if its charset is one of the two Big5 charsets, the
   charset's leading byte and the two position codes (with the high
   bit set) are converted with ENCODE_BIG5 and returned as a cons of
   two integers. */
3094 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3095 Encode the Big5 character CH to BIG5 coding-system.
3096 Return the corresponding character code in Big5.
3100 Lisp_Object charset;
3103 CHECK_CHAR_COERCE_INT (ch);
3104 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3105 if (EQ (charset, Vcharset_chinese_big5_1) ||
3106 EQ (charset, Vcharset_chinese_big5_2))
3108 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3110 return Fcons (make_int (b1), make_int (b2));
3117 /************************************************************************/
3120 /* UCS-4 character codes are implemented as nonnegative integers. */
3122 /************************************************************************/
/* Table indexed by UCS code (65536 entries) holding the Mule character
   for that code; written by set-ucs-char and read by ucs_to_char. */
3124 Lisp_Object ucs_to_mule_table[65536];
/* Char table mapping a Mule character to its UCS code; written by
   set-char-ucs and read by char-ucs and mule_char_to_ucs4. */
3125 Lisp_Object mule_to_ucs_table;
/* Lisp primitive set-ucs-char: store CHARACTER in ucs_to_mule_table at
   index CODE.  The index must be checked against the number of
   elements in the table.  sizeof (ucs_to_mule_table) alone is the
   table's size in bytes, which is several times larger than the element
   count and would let the store land beyond the end of the array. */
3127 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3128 Map UCS-4 code CODE to Mule character CHARACTER.
3130 Return T on success, NIL on failure.
3136 CHECK_CHAR (character);
3140 if (c < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
3142 ucs_to_mule_table[c] = character;
/* Return the Mule character for UCS code CODE.  Codes that index into
   ucs_to_mule_table are looked up there; codes in the range starting
   at 0xe00000 are mapped arithmetically onto 94x94 charsets selected
   by final byte.  The table bound is the number of elements:
   sizeof (ucs_to_mule_table) alone is the size in bytes and would
   allow reads beyond the end of the array. */
3150 ucs_to_char (unsigned long code)
3152 if (code < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
3154 return ucs_to_mule_table[code];
3156 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3161 c = code % (94 * 94);
3163 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3164 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3165 CHARSET_LEFT_TO_RIGHT),
3166 c / 94 + 33, c % 94 + 33));
/* Lisp primitive ucs-char: CODE must pass CHECK_NATNUM; the result is
   whatever ucs_to_char returns for it. */
3172 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3173 Return Mule character corresponding to UCS code CODE (a positive integer).
3177 CHECK_NATNUM (code);
3178 return ucs_to_char (XINT (code));
/* Lisp primitive set-char-ucs: after checking both arguments, store
   CODE for CHARACTER in mule_to_ucs_table and return the result of
   Fput_char_table. */
3181 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3182 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3186 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3187 Fset_char_ucs is more restrictive on index arg, but should
3188 check code arg in a char_table method. */
3189 CHECK_CHAR (character);
3190 CHECK_NATNUM (code);
3191 return Fput_char_table (character, code, mule_to_ucs_table);
/* Lisp primitive char-ucs: look CHARACTER up in mule_to_ucs_table and
   return the entry found there. */
3194 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3195 Return the UCS code (a positive integer) corresponding to CHARACTER.
3199 return Fget_char_table (character, mule_to_ucs_table);
3202 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3203 <GETA MARK> (U+3013) of JIS X 0208 instead, which indicates that the
3204 correct character was not found.
3205 #### do something more appropriate (use blob?)
3206 Danger, Will Robinson! Data loss. Should we signal user? */
3208 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3210 Lisp_Object chr = ucs_to_char (ch);
3214 Bufbyte work[MAX_EMCHAR_LEN];
3219 simple_set_charptr_emchar (work, ch) :
3220 non_ascii_set_charptr_emchar (work, ch);
3221 Dynarr_add_many (dst, work, len);
3225 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3226 Dynarr_add (dst, 34 + 128);
3227 Dynarr_add (dst, 46 + 128);
/* Return the UCS-4 code for the Mule character made from CHARSET and
   the position bytes H and L.  The character is first looked up with
   Fget_char_table.  A later branch handles two-dimensional 94-character
   charsets whose final byte is in '@' .. 0x7e: these are mapped
   arithmetically into the range starting at 0xe00000, the same
   layout ucs_to_char decodes. */
3231 static unsigned long
3232 mule_char_to_ucs4 (Lisp_Object charset,
3233 unsigned char h, unsigned char l)
3236 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3243 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3244 (XCHARSET_CHARS (charset) == 94) )
3246 unsigned char final = XCHARSET_FINAL (charset);
3248 if ( ('@' <= final) && (final < 0x7f) )
3250 return 0xe00000 + (final - '@') * 94 * 94
3251 + ((h & 127) - 33) * 94 + (l & 127) - 33;
/* Append to DST the UCS-4 code of the character (CHARSET, H, L) as
   four bytes, most significant byte first. */
3265 encode_ucs4 (Lisp_Object charset,
3266 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3268 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3269 Dynarr_add (dst, code >> 24);
3270 Dynarr_add (dst, (code >> 16) & 255);
3271 Dynarr_add (dst, (code >> 8) & 255);
3272 Dynarr_add (dst, code & 255);
/* Detection routine for UCS-4.  The position within the current
   four-byte unit is tracked in st->ucs4.in_byte; when the scan
   finishes without an earlier return, the result is
   CODING_CATEGORY_UCS4_MASK. */
3276 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3282 switch (st->ucs4.in_byte)
3291 st->ucs4.in_byte = 0;
3297 return CODING_CATEGORY_UCS4_MASK;
/* Convert UCS-4 data to internal format.  Incoming bytes are
   accumulated into CH eight bits at a time; when a code is complete
   it is handed to decode_ucs4.  If CODING_STATE_END is set, a
   partially accumulated code is output with
   DECODE_OUTPUT_PARTIAL_CHAR. */
3301 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3302 unsigned_char_dynarr *dst, unsigned int n)
3304 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3305 unsigned int flags = str->flags;
3306 unsigned int ch = str->ch;
3310 unsigned char c = *src++;
3318 decode_ucs4 ( ( ch << 8 ) | c, dst);
3323 ch = ( ch << 8 ) | c;
3327 if (flags & CODING_STATE_END)
3328 DECODE_OUTPUT_PARTIAL_CHAR (ch);
/* Convert internally-formatted data to UCS-4.  ASCII bytes are encoded
   with Vcharset_ascii; a leading byte selects the charset through
   CHARSET_BY_LEADING_BYTE; the following position bytes are encoded
   with that charset through encode_ucs4.  The current charset and the
   character-boundary flag are read from the iso2022 member of the
   stream at entry and stored back there at the end, so they persist
   between calls.  With ENABLE_COMPOSITE_CHARS, a composite character
   is expanded by temporarily switching SRC and N to the string
   returned by composite_char_string. */
3335 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3336 unsigned_char_dynarr *dst, unsigned int n)
3338 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3339 unsigned int flags = str->flags;
3340 unsigned int ch = str->ch;
3341 unsigned char char_boundary = str->iso2022.current_char_boundary;
3342 Lisp_Object charset = str->iso2022.current_charset;
3344 #ifdef ENABLE_COMPOSITE_CHARS
3345 /* flags for handling composite chars. We do a little switcharoo
3346 on the source while we're outputting the composite char. */
3347 unsigned int saved_n = 0;
3348 CONST unsigned char *saved_src = NULL;
3349 int in_composite = 0;
3356 unsigned char c = *src++;
3358 if (BYTE_ASCII_P (c))
3359 { /* Processing ASCII character */
3361 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3364 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3365 { /* Processing Leading Byte */
3367 charset = CHARSET_BY_LEADING_BYTE (c);
3368 if (LEADING_BYTE_PREFIX_P(c))
3373 { /* Processing Non-ASCII character */
3375 if (EQ (charset, Vcharset_control_1))
3377 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3381 switch (XCHARSET_REP_BYTES (charset))
3384 encode_ucs4 (charset, c, 0, dst);
3387 if (XCHARSET_PRIVATE_P (charset))
3389 encode_ucs4 (charset, c, 0, dst);
3394 #ifdef ENABLE_COMPOSITE_CHARS
3395 if (EQ (charset, Vcharset_composite))
3399 /* #### Bother! We don't know how to
3401 Dynarr_add (dst, 0);
3402 Dynarr_add (dst, 0);
3403 Dynarr_add (dst, 0);
3404 Dynarr_add (dst, '~');
3408 Emchar emch = MAKE_CHAR (Vcharset_composite,
3409 ch & 0x7F, c & 0x7F);
3410 Lisp_Object lstr = composite_char_string (emch);
3414 src = XSTRING_DATA (lstr);
3415 n = XSTRING_LENGTH (lstr);
3419 #endif /* ENABLE_COMPOSITE_CHARS */
3421 encode_ucs4(charset, ch, c, dst);
3434 encode_ucs4 (charset, ch, c, dst);
3450 #ifdef ENABLE_COMPOSITE_CHARS
3456 goto back_to_square_n; /* Wheeeeeeeee ..... */
3458 #endif /* ENABLE_COMPOSITE_CHARS */
3462 str->iso2022.current_char_boundary = char_boundary;
3463 str->iso2022.current_charset = charset;
3465 /* Verbum caro factum est! */
3469 /************************************************************************/
3471 /************************************************************************/
/* Detection routine for UTF-8.  st->utf8.in_byte is set to a value
   from 5 down to 1 when a first byte is seen (presumably the number of
   continuation bytes still expected); ESC, SI and SO get separate
   handling; a byte seen while in_byte is non-zero is tested against
   the 10xxxxxx continuation pattern.  When the scan finishes without
   an earlier return, the result is CODING_CATEGORY_UTF8_MASK. */
3474 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3479 unsigned char c = *src++;
3480 switch (st->utf8.in_byte)
3483 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3486 st->utf8.in_byte = 5;
3488 st->utf8.in_byte = 4;
3490 st->utf8.in_byte = 3;
3492 st->utf8.in_byte = 2;
3494 st->utf8.in_byte = 1;
3499 if ((c & 0xc0) != 0x80)
3505 return CODING_CATEGORY_UTF8_MASK;
/* Convert UTF-8 data to internal format.  A first byte is classified
   by its value (thresholds 0xf8, 0xf0, 0xe0, 0xc0); a single-byte
   character goes through DECODE_HANDLE_EOL_TYPE and decode_ucs4.
   Each continuation byte contributes its low six bits to CH, and the
   completed code is handed to decode_ucs4.  If CODING_STATE_END is
   set, a partially accumulated code is output with
   DECODE_OUTPUT_PARTIAL_CHAR. */
3509 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3510 unsigned_char_dynarr *dst, unsigned int n)
3512 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3513 unsigned int flags = str->flags;
3514 unsigned int ch = str->ch;
3515 eol_type_t eol_type = str->eol_type;
3519 unsigned char c = *src++;
3528 else if ( c >= 0xf8 )
3533 else if ( c >= 0xf0 )
3538 else if ( c >= 0xe0 )
3543 else if ( c >= 0xc0 )
3550 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3551 decode_ucs4 (c, dst);
3555 ch = ( ch << 6 ) | ( c & 0x3f );
3556 decode_ucs4 (ch, dst);
3561 ch = ( ch << 6 ) | ( c & 0x3f );
3564 label_continue_loop:;
3567 if (flags & CODING_STATE_END)
3568 DECODE_OUTPUT_PARTIAL_CHAR (ch);
/* Append to DST the UTF-8 encoding of the character (CHARSET, H, L).
   The UCS code is obtained from mule_char_to_ucs4 and written as one
   to six bytes depending on its magnitude: the first byte carries the
   length marker (0xc0, 0xe0, 0xf0, 0xf8 or 0xfc) and every following
   byte carries six bits of the code with the top bits set to 10. */
3575 encode_utf8 (Lisp_Object charset,
3576 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3578 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3581 Dynarr_add (dst, code);
3583 else if ( code <= 0x7ff )
3585 Dynarr_add (dst, (code >> 6) | 0xc0);
3586 Dynarr_add (dst, (code & 0x3f) | 0x80);
3588 else if ( code <= 0xffff )
3590 Dynarr_add (dst, (code >> 12) | 0xe0);
3591 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3592 Dynarr_add (dst, (code & 0x3f) | 0x80);
3594 else if ( code <= 0x1fffff )
3596 Dynarr_add (dst, (code >> 18) | 0xf0);
3597 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3598 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3599 Dynarr_add (dst, (code & 0x3f) | 0x80);
3601 else if ( code <= 0x3ffffff )
3603 Dynarr_add (dst, (code >> 24) | 0xf8);
3604 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3605 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3606 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3607 Dynarr_add (dst, (code & 0x3f) | 0x80);
3611 Dynarr_add (dst, (code >> 30) | 0xfc);
3612 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3613 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3614 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3615 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3616 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Convert internally-formatted data to UTF-8.  A line end is written
   as CR, LF or CR LF depending on the coding system's EOL type; other
   ASCII bytes are encoded with Vcharset_ascii.  A leading byte selects
   the charset through CHARSET_BY_LEADING_BYTE and the following
   position bytes are encoded with that charset through encode_utf8.
   The current charset and the character-boundary flag are read from
   the iso2022 member of the stream at entry and stored back there at
   the end, so they persist between calls.  With
   ENABLE_COMPOSITE_CHARS, a composite character is expanded by
   temporarily switching SRC and N to the string returned by
   composite_char_string. */
3621 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3622 unsigned_char_dynarr *dst, unsigned int n)
3624 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3625 unsigned int flags = str->flags;
3626 unsigned int ch = str->ch;
3627 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3628 unsigned char char_boundary = str->iso2022.current_char_boundary;
3629 Lisp_Object charset = str->iso2022.current_charset;
3631 #ifdef ENABLE_COMPOSITE_CHARS
3632 /* flags for handling composite chars. We do a little switcharoo
3633 on the source while we're outputting the composite char. */
3634 unsigned int saved_n = 0;
3635 CONST unsigned char *saved_src = NULL;
3636 int in_composite = 0;
3639 #endif /* ENABLE_COMPOSITE_CHARS */
3643 unsigned char c = *src++;
3645 if (BYTE_ASCII_P (c))
3646 { /* Processing ASCII character */
3650 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3651 Dynarr_add (dst, '\r');
3652 if (eol_type != EOL_CR)
3653 Dynarr_add (dst, c);
3656 encode_utf8 (Vcharset_ascii, c, 0, dst);
3659 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3660 { /* Processing Leading Byte */
3662 charset = CHARSET_BY_LEADING_BYTE (c);
3663 if (LEADING_BYTE_PREFIX_P(c))
3668 { /* Processing Non-ASCII character */
3670 if (EQ (charset, Vcharset_control_1))
3672 encode_utf8 (Vcharset_control_1, c, 0, dst);
3676 switch (XCHARSET_REP_BYTES (charset))
3679 encode_utf8 (charset, c, 0, dst);
3682 if (XCHARSET_PRIVATE_P (charset))
3684 encode_utf8 (charset, c, 0, dst);
3689 #ifdef ENABLE_COMPOSITE_CHARS
3690 if (EQ (charset, Vcharset_composite))
3694 /* #### Bother! We don't know how to
3696 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3700 Emchar emch = MAKE_CHAR (Vcharset_composite,
3701 ch & 0x7F, c & 0x7F);
3702 Lisp_Object lstr = composite_char_string (emch);
3706 src = XSTRING_DATA (lstr);
3707 n = XSTRING_LENGTH (lstr);
3711 #endif /* ENABLE_COMPOSITE_CHARS */
3713 encode_utf8 (charset, ch, c, dst);
3726 encode_utf8 (charset, ch, c, dst);
3742 #ifdef ENABLE_COMPOSITE_CHARS
3748 goto back_to_square_n; /* Wheeeeeeeee ..... */
3754 str->iso2022.current_char_boundary = char_boundary;
3755 str->iso2022.current_charset = charset;
3757 /* Verbum caro factum est! */
3761 /************************************************************************/
3762 /* ISO2022 methods */
3763 /************************************************************************/
3765 /* The following note describes the coding system ISO2022 briefly.
3766 Since the intention of this note is to help understand the
3767 functions in this file, some parts are NOT ACCURATE or OVERLY
3768 SIMPLIFIED. For thorough understanding, please refer to the
3769 original document of ISO2022.
3771 ISO2022 provides many mechanisms to encode several character sets
3772 in 7-bit and 8-bit environments. For 7-bit environments, all text
3773 is encoded using bytes less than 128. This may make the encoded
3774 text a little bit longer, but the text passes more easily through
3775 several gateways, some of which strip off the MSB (Most Significant Bit).
3777 There are two kinds of character sets: control character set and
3778 graphic character set. The former contains control characters such
3779 as `newline' and `escape' to provide control functions (control
3780 functions are also provided by escape sequences). The latter
3781 contains graphic characters such as 'A' and '-'. Emacs recognizes
3782 two control character sets and many graphic character sets.
3784 Graphic character sets are classified into one of the following
3785 four classes, according to the number of bytes (DIMENSION) and
3786 number of characters in one dimension (CHARS) of the set:
3787 - DIMENSION1_CHARS94
3788 - DIMENSION1_CHARS96
3789 - DIMENSION2_CHARS94
3790 - DIMENSION2_CHARS96
3792 In addition, each character set is assigned an identification tag,
3793 unique for each set, called "final character" (denoted as <F>
3794 hereafter). The <F> of each character set is decided by ECMA(*)
3795 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3796 (0x30..0x3F are for private use only).
3798 Note (*): ECMA = European Computer Manufacturers Association
3800 Here are examples of graphic character set [NAME(<F>)]:
3801 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3802 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3803 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3804 o DIMENSION2_CHARS96 -- none for the moment
3806 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3807 C0 [0x00..0x1F] -- control character plane 0
3808 GL [0x20..0x7F] -- graphic character plane 0
3809 C1 [0x80..0x9F] -- control character plane 1
3810 GR [0xA0..0xFF] -- graphic character plane 1
3812 A control character set is directly designated and invoked to C0 or
3813 C1 by an escape sequence. The most common case is that:
3814 - ISO646's control character set is designated/invoked to C0, and
3815 - ISO6429's control character set is designated/invoked to C1,
3816 and usually these designations/invocations are omitted in encoded
3817 text. In a 7-bit environment, only C0 can be used, and a control
3818 character for C1 is encoded by an appropriate escape sequence to
3819 fit into the environment. All control characters for C1 are
3820 defined to have corresponding escape sequences.
3822 A graphic character set is at first designated to one of four
3823 graphic registers (G0 through G3), then these graphic registers are
3824 invoked to GL or GR. These designations and invocations can be
3825 done independently. The most common case is that G0 is invoked to
3826 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3827 these invocations and designations are omitted in encoded text.
3828 In a 7-bit environment, only GL can be used.
3830 When a graphic character set of CHARS94 is invoked to GL, codes
3831 0x20 and 0x7F of the GL area work as control characters SPACE and
3832 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used.
3835 There are two ways of invocation: locking-shift and single-shift.
3836 With locking-shift, the invocation lasts until the next different
3837 invocation, whereas with single-shift, the invocation affects the
3838 following character only and doesn't affect the locking-shift
3839 state. Invocations are done by the following control characters or escape sequences:
3842 ----------------------------------------------------------------------
3843 abbrev function cntrl escape seq description
3844 ----------------------------------------------------------------------
3845 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3846 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3847 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3848 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3849 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3850 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3851 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3852 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3853 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3854 ----------------------------------------------------------------------
3855 (*) These are not used by any known coding system.
3857 Control characters for these functions are defined by macros
3858 ISO_CODE_XXX in `coding.h'.
3860 Designations are done by the following escape sequences:
3861 ----------------------------------------------------------------------
3862 escape sequence description
3863 ----------------------------------------------------------------------
3864 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3865 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3866 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3867 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3868 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3869 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3870 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3871 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3872 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3873 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3874 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3875 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3876 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3877 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3878 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3879 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3880 ----------------------------------------------------------------------
3882 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3883 of dimension 1, chars 94, and final character <F>, etc...
3885 Note (*): Although these designations are not allowed in ISO2022,
3886 Emacs accepts them on decoding, and produces them on encoding
3887 CHARS96 character sets in a coding system which is characterized as
3888 7-bit environment, non-locking-shift, and non-single-shift.
3890 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3891 '(' can be omitted. We refer to this as "short-form" hereafter.
3893 Now you may notice that there are a lot of ways for encoding the
3894 same multilingual text in ISO2022. Actually, there exist many
3895 coding systems such as Compound Text (used in X11's inter-client
3896 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3897 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3898 localized platforms), and all of these are variants of ISO2022.
3900 In addition to the above, Emacs handles two more kinds of escape
3901 sequences: ISO6429's direction specification and Emacs' private
3902 sequence for specifying character composition.
3904 ISO6429's direction specification takes the following form:
3905 o CSI ']' -- end of the current direction
3906 o CSI '0' ']' -- end of the current direction
3907 o CSI '1' ']' -- start of left-to-right text
3908 o CSI '2' ']' -- start of right-to-left text
3909 The control character CSI (0x9B: control sequence introducer) is
3910 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3912 Character composition specification takes the following form:
3913 o ESC '0' -- start character composition
3914 o ESC '1' -- end character composition
3915 Since these are not standard escape sequences of any ISO standard,
3916 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its starting
   condition.  For each of the four registers the invalid-designation
   marker is cleared.  When CODING_SYSTEM is non-nil the initial
   charset of that register is looked up with
   XCODING_SYSTEM_ISO2022_INITIAL_CHARSET; the listing also shows the
   register being set to Qt (the lines joining these two statements
   are missing here; presumably Qt is used when CODING_SYSTEM is nil
   -- confirm).  Escape parsing returns to ISO_ESC_NOTHING with no
   buffered escape bytes, register_left becomes 0, register_right
   becomes 1, and the direction and literal-output bookkeeping is
   cleared.  With ENABLE_COMPOSITE_CHARS, an existing composite_chars
   buffer is emptied with Dynarr_reset. */
3919 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3923 for (i = 0; i < 4; i++)
3925 if (!NILP (coding_system))
3927 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3929 iso->charset[i] = Qt;
3930 iso->invalid_designated[i] = 0;
3932 iso->esc = ISO_ESC_NOTHING;
3933 iso->esc_bytes_index = 0;
/* Default invocation: register 0 on the left, register 1 on the right. */
3934 iso->register_left = 0;
3935 iso->register_right = 1;
3936 iso->switched_dir_and_no_valid_charset_yet = 0;
3937 iso->invalid_switch_dir = 0;
3938 iso->output_direction_sequence = 0;
3939 iso->output_literally = 0;
3940 #ifdef ENABLE_COMPOSITE_CHARS
3941 if (iso->composite_chars)
3942 Dynarr_reset (iso->composite_chars);
/* Predicate on a single byte C.  Only the signature survives in this
   listing; the body is not visible.
   NOTE(review): the callers in this file combine a nonzero result
   with the ESCAPE_QUOTED property of the coding system to decide
   that C is handled as an escape-quoted literal -- confirm the exact
   set of accepted bytes against the full source. */
3947 fit_to_be_escape_quoted (unsigned char c)
3964 /* Parse one byte of an ISO2022 escape sequence.
3965 If the result is an invalid escape sequence, return 0 and
3966 do not change anything in STR. Otherwise, if the result is
3967 an incomplete escape sequence, update ISO2022.ESC and
3968 ISO2022.ESC_BYTES and return -1. Otherwise, update
3969 all the state variables (but not ISO2022.ESC_BYTES) and
3972 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3973 or invocation of an invalid character set and treat that as
3974 an unrecognized escape sequence. */
/* Additional notes.
   CODESYS is the coding system being decoded, or nil during
   coding-system detection.  ISO holds the parser state; its ESC
   member names the state of the escape-sequence state machine and is
   set to the kind of sequence just completed (ISO_ESC_DESIGNATE,
   ISO_ESC_LOCKING_SHIFT, ISO_ESC_SINGLE_SHIFT, ISO_ESC_DIRECTIONALITY,
   ISO_ESC_LITERAL, ...) so that the caller can dispatch on it.  C is
   the byte to consume.  *FLAGS carries the CODING_STATE_* bits:
   CODING_STATE_ESCAPE is set while a sequence is in progress, and a
   completed sequence masks the flags down to the
   CODING_STATE_ISO2022_LOCK bits, adjusting R2L, SS2, SS3 and
   COMPOSITE as the sequence requires.
   NOTE(review): short lines (switch headers, labels, braces, most
   return statements, the line of the header comment that names the
   success return value) are missing from this listing.  The callers
   visible in this file test the result for zero. */
3977 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3978 unsigned char c, unsigned int *flags,
3979 int check_invalid_charsets)
3981 /* (1) If we're at the end of a designation sequence, CS is the
3982 charset being designated and REG is the register to designate
3985 (2) If we're at the end of a locking-shift sequence, REG is
3986 the register to invoke and HALF (0 == left, 1 == right) is
3987 the half to invoke it into.
3989 (3) If we're at the end of a single-shift sequence, REG is
3990 the register to invoke. */
3991 Lisp_Object cs = Qnil;
3994 /* NOTE: This code does goto's all over the fucking place.
3995 The reason for this is that we're basically implementing
3996 a state machine here, and hierarchical languages like C
3997 don't really provide a clean way of doing this. */
3999 if (! (*flags & CODING_STATE_ESCAPE))
4000 /* At beginning of escape sequence; we need to reset our
4001 escape-state variables. */
4002 iso->esc = ISO_ESC_NOTHING;
4004 iso->output_literally = 0;
4005 iso->output_direction_sequence = 0;
4009 case ISO_ESC_NOTHING:
4010 iso->esc_bytes_index = 0;
4013 case ISO_CODE_ESC: /* Start escape sequence */
4014 *flags |= CODING_STATE_ESCAPE;
4018 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4019 *flags |= CODING_STATE_ESCAPE;
4020 iso->esc = ISO_ESC_5_11;
4023 case ISO_CODE_SO: /* locking shift 1 */
4026 case ISO_CODE_SI: /* locking shift 0 */
4030 case ISO_CODE_SS2: /* single shift */
4033 case ISO_CODE_SS3: /* single shift */
4037 default: /* Other control characters */
4044 /**** single shift ****/
4046 case 'N': /* single shift 2 */
4049 case 'O': /* single shift 3 */
4053 /**** locking shift ****/
4055 case '~': /* locking shift 1 right */
4058 case 'n': /* locking shift 2 */
4061 case '}': /* locking shift 2 right */
4064 case 'o': /* locking shift 3 */
4067 case '|': /* locking shift 3 right */
4071 #ifdef ENABLE_COMPOSITE_CHARS
4072 /**** composite ****/
4075 iso->esc = ISO_ESC_START_COMPOSITE;
4076 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4077 CODING_STATE_COMPOSITE;
4081 iso->esc = ISO_ESC_END_COMPOSITE;
4082 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4083 ~CODING_STATE_COMPOSITE;
4085 #endif /* ENABLE_COMPOSITE_CHARS */
4087 /**** directionality ****/
4090 iso->esc = ISO_ESC_5_11;
4093 /**** designation ****/
4095 case '$': /* multibyte charset prefix */
4096 iso->esc = ISO_ESC_2_4;
/* Intermediate bytes 0x28..0x2F select one of eight consecutive
   parser states starting at ISO_ESC_2_8. */
4100 if (0x28 <= c && c <= 0x2F)
4102 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4106 /* This function is called with CODESYS equal to nil when
4107 doing coding-system detection. */
4109 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4110 && fit_to_be_escape_quoted (c))
4112 iso->esc = ISO_ESC_LITERAL;
4113 *flags &= CODING_STATE_ISO2022_LOCK;
4123 /**** directionality ****/
4125 case ISO_ESC_5_11: /* ISO6429 direction control */
4128 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4129 goto directionality;
4131 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4132 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4133 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4137 case ISO_ESC_5_11_0:
4140 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4141 goto directionality;
4145 case ISO_ESC_5_11_1:
/* Start of left-to-right text: keep only the lock bits that are
   currently set and drop R2L, exactly as the two end-of-direction
   cases above do.  This has to mask the current flags; a plain
   assignment would throw the current state away and force every
   lock bit other than R2L on. */
4148 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4149 goto directionality;
4153 case ISO_ESC_5_11_2:
4156 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4157 goto directionality;
4162 iso->esc = ISO_ESC_DIRECTIONALITY;
4163 /* Various junk here to attempt to preserve the direction sequences
4164 literally in the text if they would otherwise be swallowed due
4165 to invalid designations that don't show up as actual charset
4166 changes in the text. */
4167 if (iso->invalid_switch_dir)
4169 /* We already inserted a direction switch literally into the
4170 text. We assume (#### this may not be right) that the
4171 next direction switch is the one going the other way,
4172 and we need to output that literally as well. */
4173 iso->output_literally = 1;
4174 iso->invalid_switch_dir = 0;
4180 /* If we are in the thrall of an invalid designation,
4181 then stick the directionality sequence literally into the
4182 output stream so it ends up in the original text again. */
4183 for (jj = 0; jj < 4; jj++)
4184 if (iso->invalid_designated[jj])
4188 iso->output_literally = 1;
4189 iso->invalid_switch_dir = 1;
4192 /* Indicate that we haven't yet seen a valid designation,
4193 so that if a switch-dir is directly followed by an
4194 invalid designation, both get inserted literally. */
4195 iso->switched_dir_and_no_valid_charset_yet = 1;
4200 /**** designation ****/
4203 if (0x28 <= c && c <= 0x2F)
4205 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
/* Final bytes 0x40..0x42 directly after the multibyte prefix name a
   94x94 charset without an intermediate byte. */
4208 if (0x40 <= c && c <= 0x42)
4210 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4211 *flags & CODING_STATE_R2L ?
4212 CHARSET_RIGHT_TO_LEFT :
4213 CHARSET_LEFT_TO_RIGHT);
4223 if (c < '0' || c > '~')
4224 return 0; /* bad final byte */
/* Recover the charset type and the target register from the parser
   state: the low two bits of the state offset give the register. */
4226 if (iso->esc >= ISO_ESC_2_8 &&
4227 iso->esc <= ISO_ESC_2_15)
4229 type = ((iso->esc >= ISO_ESC_2_12) ?
4230 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4231 reg = (iso->esc - ISO_ESC_2_8) & 3;
4233 else if (iso->esc >= ISO_ESC_2_4_8 &&
4234 iso->esc <= ISO_ESC_2_4_15)
4236 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4237 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4238 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4242 /* Can this ever be reached? -slb */
4246 cs = CHARSET_BY_ATTRIBUTES (type, c,
4247 *flags & CODING_STATE_R2L ?
4248 CHARSET_RIGHT_TO_LEFT :
4249 CHARSET_LEFT_TO_RIGHT);
/* Buffer the byte so that the caller can replay the sequence
   literally if it turns out to be unusable. */
4255 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4259 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4260 /* can't invoke something that ain't there. */
4262 iso->esc = ISO_ESC_SINGLE_SHIFT;
4263 *flags &= CODING_STATE_ISO2022_LOCK;
4265 *flags |= CODING_STATE_SS2;
4267 *flags |= CODING_STATE_SS3;
4271 if (check_invalid_charsets &&
4272 !CHARSETP (iso->charset[reg]))
4273 /* can't invoke something that ain't there. */
4276 iso->register_right = reg;
4278 iso->register_left = reg;
4279 *flags &= CODING_STATE_ISO2022_LOCK;
4280 iso->esc = ISO_ESC_LOCKING_SHIFT;
/* Designation of a charset that could not be found: mark the
   register invalid, fall back to ASCII in it, and ask for the
   sequence to be copied to the output literally. */
4284 if (NILP (cs) && check_invalid_charsets)
4286 iso->invalid_designated[reg] = 1;
4287 iso->charset[reg] = Vcharset_ascii;
4288 iso->esc = ISO_ESC_DESIGNATE;
4289 *flags &= CODING_STATE_ISO2022_LOCK;
4290 iso->output_literally = 1;
4291 if (iso->switched_dir_and_no_valid_charset_yet)
4293 /* We encountered a switch-direction followed by an
4294 invalid designation. Ensure that the switch-direction
4295 gets outputted; otherwise it will probably get eaten
4296 when the text is written out again. */
4297 iso->switched_dir_and_no_valid_charset_yet = 0;
4298 iso->output_direction_sequence = 1;
4299 /* And make sure that the switch-dir going the other
4300 way gets outputted, as well. */
4301 iso->invalid_switch_dir = 1;
4305 /* This function is called with CODESYS equal to nil when
4306 doing coding-system detection. */
4307 if (!NILP (codesys))
4309 charset_conversion_spec_dynarr *dyn =
4310 XCODING_SYSTEM (codesys)->iso2022.input_conv;
/* Apply the input charset-conversion table of the coding system to
   the designated charset. */
4316 for (i = 0; i < Dynarr_length (dyn); i++)
4318 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4319 if (EQ (cs, spec->from_charset))
4320 cs = spec->to_charset;
4325 iso->charset[reg] = cs;
4326 iso->esc = ISO_ESC_DESIGNATE;
4327 *flags &= CODING_STATE_ISO2022_LOCK;
4328 if (iso->invalid_designated[reg])
4330 iso->invalid_designated[reg] = 0;
4331 iso->output_literally = 1;
4333 if (iso->switched_dir_and_no_valid_charset_yet)
4334 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Examine bytes from SRC and narrow down which ISO2022 coding
   categories the data could belong to.  The running state lives in
   ST->iso2022 so that detection can continue over several calls; on
   first use it is initialised with all five ISO categories allowed.
   Escape sequences are recognised by running parse_iso2022_esc with
   a nil coding system and with invalid-charset checking turned off.
   NOTE(review): the rest of the parameter list, the loop header and
   the return statements are missing from this listing. */
4339 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4344 /* #### There are serious deficiencies in the recognition mechanism
4345 here. This needs to be much smarter if it's going to cut it.
4346 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4347 it should be detected as Latin-1.
4348 All the ISO2022 stuff in this file should be synced up with the
4349 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4350 Perhaps we should wait till R2L works in FSF Emacs? */
4352 if (!st->iso2022.initted)
4354 reset_iso2022 (Qnil, &st->iso2022.iso);
4355 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4356 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4357 CODING_CATEGORY_ISO_8_1_MASK |
4358 CODING_CATEGORY_ISO_8_2_MASK |
4359 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4360 st->iso2022.flags = 0;
4361 st->iso2022.high_byte_count = 0;
4362 st->iso2022.saw_single_shift = 0;
4363 st->iso2022.initted = 1;
4366 mask = st->iso2022.mask;
/* The seven-bit category is ruled out and the run of high bytes is
   counted (the guarding condition is on a line missing from this
   listing; presumably it tests for a byte with the high bit set). */
4373 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4374 st->iso2022.high_byte_count++;
/* A finished run of high bytes that was not introduced by a single
   shift: an odd run length counts against ISO_8_2. */
4378 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4380 if (st->iso2022.high_byte_count & 1)
4381 /* odd number of high bytes; assume not iso-8-2 */
4382 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4384 st->iso2022.high_byte_count = 0;
4385 st->iso2022.saw_single_shift = 0;
4387 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4389 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4390 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4391 { /* control chars */
4394 /* Allow and ignore control characters that you might
4395 reasonably see in a text file */
4400 case 8: /* backspace */
4401 case 11: /* vertical tab */
4402 case 12: /* form feed */
4403 case 26: /* MS-DOS C-z junk */
4404 case 31: /* '^_' -- for info */
4405 goto label_continue_loop;
4412 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4415 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4416 &st->iso2022.flags, 0))
/* Act on the kind of sequence just parsed: a designation rules out
   ISO_8_1 and ISO_8_2, a locking shift leaves only the LOCK_SHIFT
   category and stops scanning, and a single shift rules out
   ISO_8_DESIGNATE and is remembered for the high-byte accounting. */
4418 switch (st->iso2022.iso.esc)
4420 case ISO_ESC_DESIGNATE:
4421 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4422 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4424 case ISO_ESC_LOCKING_SHIFT:
4425 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4426 goto ran_out_of_chars;
4427 case ISO_ESC_SINGLE_SHIFT:
4428 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4429 st->iso2022.saw_single_shift = 1;
4438 goto ran_out_of_chars;
4441 label_continue_loop:;
/* Tidy up a detection MASK of coding categories: if the seven-bit
   ISO category is still possible, the three eight-bit ISO categories
   (ISO_8_DESIGNATE, ISO_8_1, ISO_8_2) are removed from the mask.
   The statement that hands the result back is on a line missing
   from this listing. */
4450 postprocess_iso2022_mask (int mask)
4452 /* #### kind of cheesy */
4453 /* If seven-bit ISO is allowed, then assume that the encoding is
4454 entirely seven-bit and turn off the eight-bit ones. */
4455 if (mask & CODING_CATEGORY_ISO_7_MASK)
4456 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4457 CODING_CATEGORY_ISO_8_1_MASK |
4458 CODING_CATEGORY_ISO_8_2_MASK);
4462 /* If FLAGS is a null pointer or specifies right-to-left motion,
4463 output a switch-dir-to-left-to-right sequence to DST.
4464 Also update FLAGS if it is not a null pointer.
4465 If INTERNAL_P is set, we are outputting in internal format and
4466 need to handle the CSI differently. */
/* The sequence written is CSI 0 ], the end-of-direction form.
   CODESYS decides how the CSI itself is spelled. */
4469 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4470 unsigned_char_dynarr *dst,
4471 unsigned int *flags,
4474 if (!flags || (*flags & CODING_STATE_R2L))
/* Seven-bit coding systems write the CSI as the two bytes ESC [. */
4476 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4478 Dynarr_add (dst, ISO_CODE_ESC);
4479 Dynarr_add (dst, '[');
/* Internal format: the CSI byte goes through DECODE_ADD_BINARY_CHAR
   instead of being added raw. */
4481 else if (internal_p)
4482 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4484 Dynarr_add (dst, ISO_CODE_CSI);
4485 Dynarr_add (dst, '0');
4486 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null on this path (see the test at the
   top); the guard for this store is presumably on a line missing
   from this listing -- confirm. */
4488 *flags &= ~CODING_STATE_R2L;
4492 /* If FLAGS is a null pointer or specifies a direction different from
4493 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4494 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4495 sequence to DST. Also update FLAGS if it is not a null pointer.
4496 If INTERNAL_P is set, we are outputting in internal format and
4497 need to handle the CSI differently. */
/* Switching to left-to-right is delegated to
   restore_left_to_right_direction.  Switching to right-to-left
   writes CSI 2 ] and is skipped entirely when the coding system has
   the NO_ISO6429 property. */
4500 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4501 unsigned_char_dynarr *dst, unsigned int *flags,
4504 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4505 direction == CHARSET_LEFT_TO_RIGHT)
4506 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4507 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4508 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4509 direction == CHARSET_RIGHT_TO_LEFT)
/* Seven-bit coding systems write the CSI as the two bytes ESC [. */
4511 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4513 Dynarr_add (dst, ISO_CODE_ESC);
4514 Dynarr_add (dst, '[');
4516 else if (internal_p)
4517 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4519 Dynarr_add (dst, ISO_CODE_CSI);
4520 Dynarr_add (dst, '2');
4521 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null on this path; the guard for this
   store is presumably on a line missing from this listing --
   confirm. */
4523 *flags |= CODING_STATE_R2L;
4527 /* Convert ISO2022-format data to internal format. */
/* Bytes are read from SRC and the decoded text is appended to DST.
   The flags, the pending byte CH and the EOL type come from the
   decoding stream data.  While CODING_STATE_ESCAPE is set, bytes are
   fed to parse_iso2022_esc; C0 and C1 control bytes are either EOL
   candidates or possible sequence starters; every other byte is a
   graphic character, resolved through the register chosen by the
   single-shift flags or by the left/right invocation, and written
   out in the leading-byte form of the internal representation.
   NOTE(review): short lines (loop header, braces, case labels, some
   conditions, the tails of a few comments) are missing from this
   listing, so the exact branch structure cannot be confirmed here. */
4530 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4531 unsigned_char_dynarr *dst, unsigned int n)
4533 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4534 unsigned int flags = str->flags;
4535 unsigned int ch = str->ch;
4536 eol_type_t eol_type = str->eol_type;
4537 #ifdef ENABLE_COMPOSITE_CHARS
4538 unsigned_char_dynarr *real_dst = dst;
4540 Lisp_Object coding_system;
4542 XSETCODING_SYSTEM (coding_system, str->codesys);
4544 #ifdef ENABLE_COMPOSITE_CHARS
/* Resuming inside a composite sequence: keep collecting into the
   composite buffer rather than the real destination. */
4545 if (flags & CODING_STATE_COMPOSITE)
4546 dst = str->iso2022.composite_chars;
4547 #endif /* ENABLE_COMPOSITE_CHARS */
4551 unsigned char c = *src++;
4552 if (flags & CODING_STATE_ESCAPE)
4553 { /* Within ESC sequence */
4554 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4559 switch (str->iso2022.esc)
4561 #ifdef ENABLE_COMPOSITE_CHARS
4562 case ISO_ESC_START_COMPOSITE:
/* Reuse the composite buffer if there is one, else create it, and
   redirect output into it. */
4563 if (str->iso2022.composite_chars)
4564 Dynarr_reset (str->iso2022.composite_chars);
4566 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4567 dst = str->iso2022.composite_chars;
4569 case ISO_ESC_END_COMPOSITE:
4571 Bufbyte comstr[MAX_EMCHAR_LEN];
4573 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4574 Dynarr_length (dst));
4576 len = set_charptr_emchar (comstr, emch);
4577 Dynarr_add_many (dst, comstr, len);
4580 #endif /* ENABLE_COMPOSITE_CHARS */
4582 case ISO_ESC_LITERAL:
4583 DECODE_ADD_BINARY_CHAR (c, dst);
4587 /* Everything else handled already */
4592 /* Attempted error recovery. */
4593 if (str->iso2022.output_direction_sequence)
4594 ensure_correct_direction (flags & CODING_STATE_R2L ?
4595 CHARSET_RIGHT_TO_LEFT :
4596 CHARSET_LEFT_TO_RIGHT,
4597 str->codesys, dst, 0, 1);
4598 /* More error recovery. */
4599 if (!retval || str->iso2022.output_literally)
4601 /* Output the (possibly invalid) sequence */
4603 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4604 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4605 flags &= CODING_STATE_ISO2022_LOCK;
4607 n++, src--;/* Repeat the loop with the same character. */
4610 /* No sense in reprocessing the final byte of the
4611 escape sequence; it could mess things up anyway.
4613 DECODE_ADD_BINARY_CHAR (c, dst);
4618 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4619 { /* Control characters */
4621 /***** Error-handling *****/
4623 /* If we were in the middle of a character, dump out the
4624 partial character. */
4625 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4627 /* If we just saw a single-shift character, dump it out.
4628 This may dump out the wrong sort of single-shift character,
4629 but at least it will give an indication that something went
4631 if (flags & CODING_STATE_SS2)
4633 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4634 flags &= ~CODING_STATE_SS2;
4636 if (flags & CODING_STATE_SS3)
4638 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4639 flags &= ~CODING_STATE_SS3;
4642 /***** Now handle the control characters. *****/
4645 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4647 flags &= CODING_STATE_ISO2022_LOCK;
/* A control byte that does not start a recognised sequence is
   passed through as a binary character. */
4649 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4650 DECODE_ADD_BINARY_CHAR (c, dst);
4653 { /* Graphic characters */
4654 Lisp_Object charset;
4658 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4660 /* Now determine the charset. */
4661 reg = ((flags & CODING_STATE_SS2) ? 2
4662 : (flags & CODING_STATE_SS3) ? 3
4663 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4664 : str->iso2022.register_left);
4665 charset = str->iso2022.charset[reg];
4667 /* Error checking: */
4668 if (! CHARSETP (charset)
4669 || str->iso2022.invalid_designated[reg]
4670 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4671 && XCHARSET_CHARS (charset) == 94))
4672 /* Mrmph. We are trying to invoke a register that has no
4673 or an invalid charset in it, or trying to add a character
4674 outside the range of the charset. Insert that char literally
4675 to preserve it for the output. */
4677 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4678 DECODE_ADD_BINARY_CHAR (c, dst);
4683 /* Things are probably hunky-dorey. */
4685 /* Fetch reverse charset, maybe. */
4686 if (((flags & CODING_STATE_R2L) &&
4687 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4689 (!(flags & CODING_STATE_R2L) &&
4690 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4692 Lisp_Object new_charset =
4693 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4694 if (!NILP (new_charset))
4695 charset = new_charset;
/* Emit the character in internal form; the layout depends on how
   many bytes the charset uses in the internal representation. */
4698 lb = XCHARSET_LEADING_BYTE (charset);
4699 switch (XCHARSET_REP_BYTES (charset))
4702 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4703 Dynarr_add (dst, c & 0x7F);
4706 case 2: /* one-byte official */
4707 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4708 Dynarr_add (dst, lb);
4709 Dynarr_add (dst, c | 0x80);
4712 case 3: /* one-byte private or two-byte official */
4713 if (XCHARSET_PRIVATE_P (charset))
4715 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4716 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4717 Dynarr_add (dst, lb);
4718 Dynarr_add (dst, c | 0x80);
4724 Dynarr_add (dst, lb);
4725 Dynarr_add (dst, ch | 0x80);
4726 Dynarr_add (dst, c | 0x80);
4734 default: /* two-byte private */
4737 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4738 Dynarr_add (dst, lb);
4739 Dynarr_add (dst, ch | 0x80);
4740 Dynarr_add (dst, c | 0x80);
4749 flags &= CODING_STATE_ISO2022_LOCK;
4752 label_continue_loop:;
/* At end of input, flush any half-assembled character. */
4755 if (flags & CODING_STATE_END)
4756 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4763 /***** ISO2022 encoder *****/
4765 /* Designate CHARSET into register REG. */
/* The new charset is always recorded in STR->iso2022.charset[REG].
   Nothing is written when CHARSET is not a charset object, or when
   the register already held a charset with the same type and final
   byte and output is not being forced for that register.  Otherwise
   the force marker is cleared and the designation sequence is
   appended to DST: ESC, the intermediate byte or bytes selected by
   the charset type and REG, then the final byte of the charset.
   NOTE(review): short lines (braces, the early returns, the switch
   header, part of the short-form condition) are missing from this
   listing. */
4768 iso2022_designate (Lisp_Object charset, unsigned char reg,
4769 struct encoding_stream *str, unsigned_char_dynarr *dst)
/* Intermediate bytes indexed by register number, for 94-character
   and 96-character sets respectively. */
4771 static CONST char inter94[] = "()*+";
4772 static CONST char inter96[] = ",-./";
4774 unsigned char final;
4775 Lisp_Object old_charset = str->iso2022.charset[reg];
4777 str->iso2022.charset[reg] = charset;
4778 if (!CHARSETP (charset))
4779 /* charset might be an initial nil or t. */
4781 type = XCHARSET_TYPE (charset);
4782 final = XCHARSET_FINAL (charset);
4783 if (!str->iso2022.force_charset_on_output[reg] &&
4784 CHARSETP (old_charset) &&
4785 XCHARSET_TYPE (old_charset) == type &&
4786 XCHARSET_FINAL (old_charset) == final)
4789 str->iso2022.force_charset_on_output[reg] = 0;
4792 charset_conversion_spec_dynarr *dyn =
4793 str->codesys->iso2022.output_conv;
/* Apply the output charset-conversion table of the coding system.
   NOTE(review): TYPE and FINAL were computed before this loop and no
   line visible here reads CHARSET afterwards, so the converted
   charset does not appear to influence the bytes written -- confirm
   against the full source. */
4799 for (i = 0; i < Dynarr_length (dyn); i++)
4801 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4802 if (EQ (charset, spec->from_charset))
4803 charset = spec->to_charset;
4808 Dynarr_add (dst, ISO_CODE_ESC);
4811 case CHARSET_TYPE_94:
4812 Dynarr_add (dst, inter94[reg]);
4814 case CHARSET_TYPE_96:
4815 Dynarr_add (dst, inter96[reg]);
4817 case CHARSET_TYPE_94X94:
4818 Dynarr_add (dst, '$');
4820 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4823 Dynarr_add (dst, inter94[reg]);
4825 case CHARSET_TYPE_96X96:
4826 Dynarr_add (dst, '$');
4827 Dynarr_add (dst, inter96[reg]);
4830 Dynarr_add (dst, final);
/* Make sure register 0 is the one recorded as invoked on the left
   in the encoder state STR: if it is not, append the SI control
   character to DST and record register 0 as the left register. */
4834 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4836 if (str->iso2022.register_left != 0)
4838 Dynarr_add (dst, ISO_CODE_SI);
4839 str->iso2022.register_left = 0;
/* Make sure register 1 is the one recorded as invoked on the left
   in the encoder state STR: if it is not, append the SO control
   character to DST and record register 1 as the left register. */
4844 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4846 if (str->iso2022.register_left != 1)
4848 Dynarr_add (dst, ISO_CODE_SO);
4849 str->iso2022.register_left = 1;
4853 /* Convert internally-formatted data to ISO2022 format. */
/* ENCODING is the encoding stream; its ENCODING_STREAM_DATA supplies the
   coding system and the state carried between calls (flags, the pending
   byte ch, and the iso2022 current_char_boundary / current_charset /
   current_half fields).  SRC and N describe the internal-format input;
   output bytes are appended to DST with Dynarr_add.  The three iso2022
   current_* fields are stored back into the stream at the end. */
4856 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4857 unsigned_char_dynarr *dst, unsigned int n)
4859 unsigned char charmask, c;
4860 unsigned char char_boundary;
4861 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4862 unsigned int flags = str->flags;
4863 unsigned int ch = str->ch;
4864 Lisp_Coding_System *codesys = str->codesys;
4865 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4867 Lisp_Object charset;
4870 #ifdef ENABLE_COMPOSITE_CHARS
4871 /* flags for handling composite chars. We do a little switcharoo
4872 on the source while we're outputting the composite char. */
4873 unsigned int saved_n = 0;
4874 CONST unsigned char *saved_src = NULL;
4875 int in_composite = 0;
4876 #endif /* ENABLE_COMPOSITE_CHARS */
/* Resume from the encoder state stored in the stream. */
4878 char_boundary = str->iso2022.current_char_boundary;
4879 charset = str->iso2022.current_charset;
4880 half = str->iso2022.current_half;
4882 #ifdef ENABLE_COMPOSITE_CHARS
4889 if (BYTE_ASCII_P (c))
4890 { /* Processing ASCII character */
4893 restore_left_to_right_direction (codesys, dst, &flags, 0);
4895 /* Make sure G0 contains ASCII */
4896 if ((c > ' ' && c < ISO_CODE_DEL) ||
4897 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4899 ensure_normal_shift (str, dst);
4900 iso2022_designate (Vcharset_ascii, 0, str, dst);
4903 /* If necessary, restore everything to the default state
4906 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4908 restore_left_to_right_direction (codesys, dst, &flags, 0);
4910 ensure_normal_shift (str, dst);
4912 for (i = 0; i < 4; i++)
4914 Lisp_Object initial_charset =
4915 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4916 iso2022_designate (initial_charset, i, str, dst);
4921 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4922 Dynarr_add (dst, '\r');
4923 if (eol_type != EOL_CR)
4924 Dynarr_add (dst, c);
4928 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4929 && fit_to_be_escape_quoted (c))
4930 Dynarr_add (dst, ISO_CODE_ESC);
4931 Dynarr_add (dst, c);
4936 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4937 { /* Processing Leading Byte */
4939 charset = CHARSET_BY_LEADING_BYTE (c);
4940 if (LEADING_BYTE_PREFIX_P(c))
4942 else if (!EQ (charset, Vcharset_control_1)
4943 #ifdef ENABLE_COMPOSITE_CHARS
4944 && !EQ (charset, Vcharset_composite)
4950 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4951 codesys, dst, &flags, 0);
4953 /* Now determine which register to use. */
4955 for (i = 0; i < 4; i++)
4957 if (EQ (charset, str->iso2022.charset[i]) ||
4959 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
/* For a charset whose graphic value is non-zero, registers 1, 2 and 3
   are tested in turn for a non-nil designation; register 1 is only
   considered when the coding system is not 7-bit or uses locking
   shift. */
4968 if (XCHARSET_GRAPHIC (charset) != 0)
4970 if (!NILP (str->iso2022.charset[1]) &&
4971 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4972 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4974 else if (!NILP (str->iso2022.charset[2]))
4976 else if (!NILP (str->iso2022.charset[3]))
4985 iso2022_designate (charset, reg, str, dst);
4987 /* Now invoke that register. */
4991 ensure_normal_shift (str, dst);
4996 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4998 ensure_shift_out (str, dst);
/* Single shift 2: a 7-bit coding system gets the two-byte sequence
   ESC N; ISO_CODE_SS2 is the one-byte form emitted below. */
5006 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5008 Dynarr_add (dst, ISO_CODE_ESC);
5009 Dynarr_add (dst, 'N');
5014 Dynarr_add (dst, ISO_CODE_SS2);
/* Single shift 3: same scheme, ESC O versus ISO_CODE_SS3. */
5020 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5022 Dynarr_add (dst, ISO_CODE_ESC);
5023 Dynarr_add (dst, 'O');
5028 Dynarr_add (dst, ISO_CODE_SS3);
5040 { /* Processing Non-ASCII character */
/* With half == 0 the high bit of every output byte is cleared; any
   other value of half leaves the bytes unmasked. */
5041 charmask = (half == 0 ? 0x7F : 0xFF);
5043 if (EQ (charset, Vcharset_control_1))
5045 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5046 && fit_to_be_escape_quoted (c))
5047 Dynarr_add (dst, ISO_CODE_ESC);
5048 /* you asked for it ... */
5049 Dynarr_add (dst, c - 0x20);
5053 switch (XCHARSET_REP_BYTES (charset))
5056 Dynarr_add (dst, c & charmask);
5059 if (XCHARSET_PRIVATE_P (charset))
5061 Dynarr_add (dst, c & charmask);
5066 #ifdef ENABLE_COMPOSITE_CHARS
5067 if (EQ (charset, Vcharset_composite))
5071 /* #### Bother! We don't know how to
5073 Dynarr_add (dst, '~');
5077 Emchar emch = MAKE_CHAR (Vcharset_composite,
5078 ch & 0x7F, c & 0x7F);
5079 Lisp_Object lstr = composite_char_string (emch);
5083 src = XSTRING_DATA (lstr);
5084 n = XSTRING_LENGTH (lstr);
5085 Dynarr_add (dst, ISO_CODE_ESC);
5086 Dynarr_add (dst, '0'); /* start composing */
5090 #endif /* ENABLE_COMPOSITE_CHARS */
5092 Dynarr_add (dst, ch & charmask);
5093 Dynarr_add (dst, c & charmask);
5106 Dynarr_add (dst, ch & charmask);
5107 Dynarr_add (dst, c & charmask);
5123 #ifdef ENABLE_COMPOSITE_CHARS
5129 Dynarr_add (dst, ISO_CODE_ESC);
5130 Dynarr_add (dst, '1'); /* end composing */
5131 goto back_to_square_n; /* Wheeeeeeeee ..... */
5133 #endif /* ENABLE_COMPOSITE_CHARS */
/* At the end of the data, and only on a character boundary, return the
   output to its default state: left-to-right direction, normal shift,
   and the coding system's initial charset designated into each of the
   four registers. */
5135 if (char_boundary && flags & CODING_STATE_END)
5137 restore_left_to_right_direction (codesys, dst, &flags, 0);
5138 ensure_normal_shift (str, dst);
5139 for (i = 0; i < 4; i++)
5141 Lisp_Object initial_charset =
5142 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5143 iso2022_designate (initial_charset, i, str, dst);
/* Save the encoder state in the stream for the next call. */
5149 str->iso2022.current_char_boundary = char_boundary;
5150 str->iso2022.current_charset = charset;
5151 str->iso2022.current_half = half;
5153 /* Verbum caro factum est! */
5157 /************************************************************************/
5158 /* No-conversion methods */
5159 /************************************************************************/
5161 /* This is used when reading in "binary" files -- i.e. files that may
5162 contain all 256 possible byte values and that are not to be
5163 interpreted as being in any particular decoding. */
/* DECODING is the decoding stream; flags, the pending byte ch and the
   EOL type are read from its DECODING_STREAM_DATA.  Each input byte c
   is first passed to DECODE_HANDLE_EOL_TYPE and then added to DST with
   DECODE_ADD_BINARY_CHAR.  NOTE(review): label_continue_loop is
   presumably the jump target used by DECODE_HANDLE_EOL_TYPE -- confirm
   against the macro definition. */
5165 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5166 unsigned_char_dynarr *dst, unsigned int n)
5169 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5170 unsigned int flags = str->flags;
5171 unsigned int ch = str->ch;
5172 eol_type_t eol_type = str->eol_type;
5178 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5179 DECODE_ADD_BINARY_CHAR (c, dst);
5180 label_continue_loop:;
/* Runs once the input bytes have been processed. */
5183 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encode internal-format data for a no-conversion coding system.
   ENCODING is the encoding stream; flags and the pending byte ch come
   from its ENCODING_STREAM_DATA and the EOL type from the stream's
   coding system.  Output bytes are appended to DST.  Only line ends and
   characters introduced by the Latin-1 and Control-1 leading bytes are
   translated; other non-ASCII characters come out as a tilde. */
5190 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5191 unsigned_char_dynarr *dst, unsigned int n)
5194 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5195 unsigned int flags = str->flags;
5196 unsigned int ch = str->ch;
5197 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Line end: emit CR unless the EOL type is LF or autodetect, then emit
   LF unless the EOL type is CR. */
5204 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5205 Dynarr_add (dst, '\r');
5206 if (eol_type != EOL_CR)
5207 Dynarr_add (dst, '\n');
5210 else if (BYTE_ASCII_P (c))
5213 Dynarr_add (dst, c);
5215 else if (BUFBYTE_LEADING_BYTE_P (c))
/* Latin-1 and Control-1 leading bytes are singled out here; the other
   branch emits a tilde in place of the character. */
5218 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5219 c == LEADING_BYTE_CONTROL_1)
5222 Dynarr_add (dst, '~'); /* untranslatable character */
/* Non-leading byte: ch is compared against the two translatable leading
   bytes.  A Latin-1 byte is output unchanged, a Control-1 byte is
   output minus 0x20. */
5226 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5227 Dynarr_add (dst, c);
5228 else if (ch == LEADING_BYTE_CONTROL_1)
5231 Dynarr_add (dst, c - 0x20);
5233 /* else it should be the second or third byte of an
5234 untranslatable character, so ignore it */
5244 /************************************************************************/
5245 /* Simple internal/external functions */
5246 /************************************************************************/
/* Scratch buffers for convert_to_external_format (out) and
   convert_from_external_format (in).  Each is created on first use,
   reset at the start of every call, and the pointer those functions
   return points into it. */
5248 static Extbyte_dynarr *conversion_out_dynarr;
5249 static Bufbyte_dynarr *conversion_in_dynarr;
5251 /* Determine coding system from coding format */
5253 /* #### not correct for all values of `fmt'! */
/* FORMAT_FILENAME and FORMAT_TERMINAL share one branch: when
   Vfile_name_coding_system is nil or binary a separate result is
   produced (presumably nil, which the callers test with NILP -- confirm),
   otherwise the coding system it names is looked up with
   Fget_coding_system.  The remaining return looks up ctext.
   NOTE(review): FORMAT_TERMINAL consults Vfile_name_coding_system, not
   Vterminal_coding_system. */
5255 external_data_format_to_coding_system (enum external_data_format fmt)
5259 case FORMAT_FILENAME:
5260 case FORMAT_TERMINAL:
5261 if (EQ (Vfile_name_coding_system, Qnil) ||
5262 EQ (Vfile_name_coding_system, Qbinary))
5265 return Fget_coding_system (Vfile_name_coding_system);
5268 return Fget_coding_system (Qctext);
/* Convert internal-format data at PTR into the external format FMT.
   The result is built in conversion_out_dynarr (created on first use,
   reset on every call).  *LEN_OUT receives the number of result bytes,
   not counting the zero byte appended afterwards, and the return value
   points at the first element of that dynarr, so the next call reuses
   the same storage. */
5276 convert_to_external_format (CONST Bufbyte *ptr,
5279 enum external_data_format fmt)
5281 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5283 if (!conversion_out_dynarr)
5284 conversion_out_dynarr = Dynarr_new (Extbyte);
5286 Dynarr_reset (conversion_out_dynarr);
/* No coding system: translate directly.  An ASCII byte is copied, a
   Control-1 leading byte yields the following byte minus 0x20, and a
   Latin-1 leading byte yields the following byte unchanged. */
5288 if (NILP (coding_system))
5290 CONST Bufbyte *end = ptr + len;
5295 (BYTE_ASCII_P (*ptr)) ? *ptr :
5296 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5297 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5300 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5304 #ifdef ERROR_CHECK_BUFPOS
5305 assert (ptr == end);
/* Stream path: a fixed-buffer input stream over PTR feeds, through a
   1024-byte stack buffer, an encoding stream layered on a dynarr output
   stream that writes into conversion_out_dynarr.  The three stream
   objects are GC-protected while in use, then closed and deleted. */
5310 Lisp_Object instream, outstream, da_outstream;
5311 Lstream *istr, *ostr;
5312 struct gcpro gcpro1, gcpro2, gcpro3;
5313 char tempbuf[1024]; /* some random amount */
5315 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5316 da_outstream = make_dynarr_output_stream
5317 ((unsigned_char_dynarr *) conversion_out_dynarr);
5319 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5320 istr = XLSTREAM (instream);
5321 ostr = XLSTREAM (outstream);
5322 GCPRO3 (instream, outstream, da_outstream);
5325 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5328 Lstream_write (ostr, tempbuf, size_in_bytes);
5330 Lstream_close (istr);
5331 Lstream_close (ostr);
5333 Lstream_delete (istr);
5334 Lstream_delete (ostr);
5335 Lstream_delete (XLSTREAM (da_outstream));
/* The length is taken before the terminator is added, so it excludes
   the trailing zero byte. */
5338 *len_out = Dynarr_length (conversion_out_dynarr);
5339 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5340 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert external-format data at PTR (format FMT) into internal
   format.  The result is built in conversion_in_dynarr (created on
   first use, reset on every call).  *LEN_OUT receives the number of
   result bytes, not counting the zero byte appended afterwards, and the
   return value points at the first element of that dynarr, so the next
   call reuses the same storage. */
5344 convert_from_external_format (CONST Extbyte *ptr,
5347 enum external_data_format fmt)
5349 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5351 if (!conversion_in_dynarr)
5352 conversion_in_dynarr = Dynarr_new (Bufbyte);
5354 Dynarr_reset (conversion_in_dynarr);
/* No coding system: every input byte is added individually with
   DECODE_ADD_BINARY_CHAR. */
5356 if (NILP (coding_system))
5358 CONST Extbyte *end = ptr + len;
5359 for (; ptr < end; ptr++)
5362 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
/* Stream path: a fixed-buffer input stream over PTR feeds, through a
   1024-byte stack buffer, a decoding stream layered on a dynarr output
   stream that writes into conversion_in_dynarr.  The three stream
   objects are GC-protected while in use, then closed and deleted. */
5367 Lisp_Object instream, outstream, da_outstream;
5368 Lstream *istr, *ostr;
5369 struct gcpro gcpro1, gcpro2, gcpro3;
5370 char tempbuf[1024]; /* some random amount */
5372 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5373 da_outstream = make_dynarr_output_stream
5374 ((unsigned_char_dynarr *) conversion_in_dynarr);
5376 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5377 istr = XLSTREAM (instream);
5378 ostr = XLSTREAM (outstream);
5379 GCPRO3 (instream, outstream, da_outstream);
5382 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5385 Lstream_write (ostr, tempbuf, size_in_bytes);
5387 Lstream_close (istr);
5388 Lstream_close (ostr);
5390 Lstream_delete (istr);
5391 Lstream_delete (ostr);
5392 Lstream_delete (XLSTREAM (da_outstream));
/* The length is taken before the terminator is added, so it excludes
   the trailing zero byte. */
5395 *len_out = Dynarr_length (conversion_in_dynarr);
5396 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5397 return Dynarr_atp (conversion_in_dynarr, 0);
5401 /************************************************************************/
5402 /* Initialization */
5403 /************************************************************************/
/* Register what this file exports to Lisp: the symbols (defsymbol), the
   coding-system-error error type, whose parent is Qio_error (deferror),
   and the Lisp-callable primitives (DEFSUBR). */
5406 syms_of_file_coding (void)
5408 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5409 deferror (&Qcoding_system_error, "coding-system-error",
5410 "Coding-system error", Qio_error);
/* Lisp primitives defined in this file. */
5412 DEFSUBR (Fcoding_system_p);
5413 DEFSUBR (Ffind_coding_system);
5414 DEFSUBR (Fget_coding_system);
5415 DEFSUBR (Fcoding_system_list);
5416 DEFSUBR (Fcoding_system_name);
5417 DEFSUBR (Fmake_coding_system);
5418 DEFSUBR (Fcopy_coding_system);
5419 DEFSUBR (Fdefine_coding_system_alias);
5420 DEFSUBR (Fsubsidiary_coding_system);
5422 DEFSUBR (Fcoding_system_type);
5423 DEFSUBR (Fcoding_system_doc_string);
5425 DEFSUBR (Fcoding_system_charset);
5427 DEFSUBR (Fcoding_system_property);
5429 DEFSUBR (Fcoding_category_list);
5430 DEFSUBR (Fset_coding_priority_list);
5431 DEFSUBR (Fcoding_priority_list);
5432 DEFSUBR (Fset_coding_category_system);
5433 DEFSUBR (Fcoding_category_system);
5435 DEFSUBR (Fdetect_coding_region);
5436 DEFSUBR (Fdecode_coding_region);
5437 DEFSUBR (Fencode_coding_region);
5439 DEFSUBR (Fdecode_shift_jis_char);
5440 DEFSUBR (Fencode_shift_jis_char);
5441 DEFSUBR (Fdecode_big5_char);
5442 DEFSUBR (Fencode_big5_char);
5443 DEFSUBR (Fset_ucs_char);
5444 DEFSUBR (Fucs_char);
5445 DEFSUBR (Fset_char_ucs);
5446 DEFSUBR (Fchar_ucs);
5448 defsymbol (&Qcoding_system_p, "coding-system-p");
5449 defsymbol (&Qno_conversion, "no-conversion");
5450 defsymbol (&Qraw_text, "raw-text");
5452 defsymbol (&Qbig5, "big5");
5453 defsymbol (&Qshift_jis, "shift-jis");
5454 defsymbol (&Qucs4, "ucs-4");
5455 defsymbol (&Qutf8, "utf-8");
5456 defsymbol (&Qccl, "ccl");
5457 defsymbol (&Qiso2022, "iso2022");
/* The symbols from here through Qpre_write_conversion, together with
   the eol-cr / eol-lf / eol-crlf symbols further down, are the ones
   complex_vars_of_file_coding registers as CODESYS_PROP_ALL_OK
   properties. */
5459 defsymbol (&Qmnemonic, "mnemonic");
5460 defsymbol (&Qeol_type, "eol-type");
5461 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5462 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5464 defsymbol (&Qcr, "cr");
5465 defsymbol (&Qlf, "lf");
5466 defsymbol (&Qcrlf, "crlf");
5467 defsymbol (&Qeol_cr, "eol-cr");
5468 defsymbol (&Qeol_lf, "eol-lf");
5469 defsymbol (&Qeol_crlf, "eol-crlf");
/* The symbols from here through Qescape_quoted are the ones
   complex_vars_of_file_coding registers as CODESYS_PROP_ISO2022
   properties. */
5471 defsymbol (&Qcharset_g0, "charset-g0");
5472 defsymbol (&Qcharset_g1, "charset-g1");
5473 defsymbol (&Qcharset_g2, "charset-g2");
5474 defsymbol (&Qcharset_g3, "charset-g3");
5475 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5476 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5477 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5478 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5479 defsymbol (&Qno_iso6429, "no-iso6429");
5480 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5481 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5483 defsymbol (&Qshort, "short");
5484 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5485 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5486 defsymbol (&Qseven, "seven");
5487 defsymbol (&Qlock_shift, "lock-shift");
5488 defsymbol (&Qescape_quoted, "escape-quoted");
/* Registered as CODESYS_PROP_CCL properties in
   complex_vars_of_file_coding. */
5490 defsymbol (&Qencode, "encode");
5491 defsymbol (&Qdecode, "decode");
5494 defsymbol (&Qctext, "ctext");
/* One symbol per coding category, stored in coding_category_symbol[]
   at the index of the category constant. */
5495 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5497 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5499 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5501 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5503 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5505 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5507 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5509 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5511 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5514 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare the lstream methods of the decoding and encoding stream
   types.  Both types list the same seven methods: reader, writer,
   rewinder, seekable_p, flusher, closer and marker.
   NOTE(review): LSTREAM_HAS_METHOD presumably installs the matching
   method function into the stream type's method table -- confirm
   against its definition. */
5519 lstream_type_create_file_coding (void)
5521 LSTREAM_HAS_METHOD (decoding, reader);
5522 LSTREAM_HAS_METHOD (decoding, writer);
5523 LSTREAM_HAS_METHOD (decoding, rewinder);
5524 LSTREAM_HAS_METHOD (decoding, seekable_p);
5525 LSTREAM_HAS_METHOD (decoding, flusher);
5526 LSTREAM_HAS_METHOD (decoding, closer);
5527 LSTREAM_HAS_METHOD (decoding, marker);
5529 LSTREAM_HAS_METHOD (encoding, reader);
5530 LSTREAM_HAS_METHOD (encoding, writer);
5531 LSTREAM_HAS_METHOD (encoding, rewinder);
5532 LSTREAM_HAS_METHOD (encoding, seekable_p);
5533 LSTREAM_HAS_METHOD (encoding, flusher);
5534 LSTREAM_HAS_METHOD (encoding, closer);
5535 LSTREAM_HAS_METHOD (encoding, marker);
/* Give this file's variables their initial values.  Every coding
   category starts with no coding system (Qnil) and the priority table
   starts as the identity permutation.  The feature file-coding is
   provided.  The Lisp variables keyboard-coding-system,
   terminal-coding-system, coding-system-for-read,
   coding-system-for-write and file-name-coding-system are defined and
   set to Qnil; enable-multibyte-characters is defined and set to 1. */
5539 vars_of_file_coding (void)
5543 /* Initialize to something reasonable ... */
5544 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5546 coding_category_system[i] = Qnil;
5547 coding_category_by_priority[i] = i;
5550 Fprovide (intern ("file-coding"));
5552 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5553 Coding system used for TTY keyboard input.
5554 Not used under a windowing system.
5556 Vkeyboard_coding_system = Qnil;
5558 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5559 Coding system used for TTY display output.
5560 Not used under a windowing system.
5562 Vterminal_coding_system = Qnil;
5564 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5565 Overriding coding system used when reading a file or process.
5566 You should *bind* this, not set it. If this is non-nil, it specifies
5567 the coding system that will be used when a file or process is read
5568 in, and overrides `buffer-file-coding-system-for-read',
5569 `insert-file-contents-pre-hook', etc. Use those variables instead of
5570 this one for permanent changes to the environment.
5572 Vcoding_system_for_read = Qnil;
5574 DEFVAR_LISP ("coding-system-for-write",
5575 &Vcoding_system_for_write /*
5576 Overriding coding system used when writing a file or process.
5577 You should *bind* this, not set it. If this is non-nil, it specifies
5578 the coding system that will be used when a file or process is written
5579 to, and overrides `buffer-file-coding-system',
5580 `write-region-pre-hook', etc. Use those variables instead of this one
5581 for permanent changes to the environment.
5583 Vcoding_system_for_write = Qnil;
5585 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5586 Coding system used to convert pathnames when accessing files.
5588 Vfile_name_coding_system = Qnil;
5590 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5591 Non-nil means the buffer contents are regarded as multi-byte form
5592 of characters, not a binary code. This affects the display, file I/O,
5593 and behaviors of various editing commands.
5595 Setting this to nil does not do anything.
5597 enable_multibyte_characters = 1;
/* Build the Lisp-level data structures of this file.  In order: create
   the GC-protected coding-system hash table (non-weak, EQ, initial
   size 50); create the_codesys_prop_dynarr and fill it with the known
   coding-system property symbols, each tagged with a property type;
   define the raw-text and binary coding systems, both of type
   no-conversion; make no-conversion an alias of raw-text; install
   raw-text as the coding system of the NO_CONVERSION category; clear
   ucs_to_mule_table and create mule_to_ucs_table. */
5601 complex_vars_of_file_coding (void)
5603 staticpro (&Vcoding_system_hash_table);
5604 Vcoding_system_hash_table =
5605 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5607 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
/* DEFINE_CODESYS_PROP appends one codesys_prop entry, with its
   prop_type set to Prop_Type, to the_codesys_prop_dynarr.  The entries
   below use three property types: CODESYS_PROP_ALL_OK,
   CODESYS_PROP_ISO2022 and CODESYS_PROP_CCL. */
5609 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5611 struct codesys_prop csp; \
5613 csp.prop_type = (Prop_Type); \
5614 Dynarr_add (the_codesys_prop_dynarr, csp); \
5617 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5618 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5619 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5620 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5621 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5622 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5623 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5625 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5626 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5627 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5628 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5629 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5630 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5631 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5632 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5633 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5634 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5635 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5636 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5637 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5638 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5639 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5640 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5641 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5643 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5644 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5646 /* Need to create this here or we're really screwed. */
/* raw-text: type no-conversion, mnemonic "Raw", no eol-type given. */
5648 (Qraw_text, Qno_conversion,
5649 build_string ("Raw text, which means it converts only line-break-codes."),
5650 list2 (Qmnemonic, build_string ("Raw")));
/* binary: type no-conversion, eol-type fixed to lf, mnemonic "Binary". */
5653 (Qbinary, Qno_conversion,
5654 build_string ("Binary, which means it does not convert anything."),
5655 list4 (Qeol_type, Qlf,
5656 Qmnemonic, build_string ("Binary")));
5658 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5660 /* Need this for bootstrapping */
5661 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5662 Fget_coding_system (Qraw_text);
/* Start with every one of the 65536 ucs_to_mule_table slots set to
   Qnil. */
5668 for (i = 0; i < 65536; i++)
5669 ucs_to_mule_table[i] = Qnil;
/* mule_to_ucs_table is a generic char table; it is registered with
   staticpro before being assigned. */
5671 staticpro (&mule_to_ucs_table);
5672 mule_to_ucs_table = Fmake_char_table(Qgeneric);