1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
51 /* Coding system currently associated with each coding category. */
52 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
54 /* Table of all coding categories in decreasing order of priority.
55 This describes a permutation of the possible coding categories. */
56 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
58 Lisp_Object Qcoding_system_p;
60 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
61 /* Qinternal in general.c */
63 Lisp_Object Qmnemonic, Qeol_type;
64 Lisp_Object Qcr, Qcrlf, Qlf;
65 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
66 Lisp_Object Qpost_read_conversion;
67 Lisp_Object Qpre_write_conversion;
70 Lisp_Object Qucs4, Qutf8;
71 Lisp_Object Qbig5, Qshift_jis;
72 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
73 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
74 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
75 Lisp_Object Qno_iso6429;
76 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
77 Lisp_Object Qctext, Qescape_quoted;
78 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
80 Lisp_Object Qencode, Qdecode;
82 Lisp_Object Vcoding_system_hash_table;
84 int enable_multibyte_characters;
87 /* Additional information used by the ISO2022 decoder and detector. */
88 struct iso2022_decoder
90 /* CHARSET holds the character sets currently assigned to the G0
91 through G3 variables. It is initialized from the array
92 INITIAL_CHARSET in CODESYS. */
93 Lisp_Object charset[4];
95 /* Which registers are currently invoked into the left (GL) and
96 right (GR) halves of the 8-bit encoding space? */
97 int register_left, register_right;
99 /* ESC holds a value indicating part of an escape sequence
100 that has already been seen. */
101 enum iso_esc_flag esc;
103 /* This records the bytes we've seen so far in an escape sequence,
104 in case the sequence is invalid (we spit out the bytes unchanged). */
105 unsigned char esc_bytes[8];
107 /* Index for next byte to store in ISO escape sequence. */
110 #ifdef ENABLE_COMPOSITE_CHARS
111 /* Stuff seen so far when composing a string. */
112 unsigned_char_dynarr *composite_chars;
115 /* If we saw an invalid designation sequence for a particular
116 register, we flag it here and switch to ASCII. The next time we
117 see a valid designation for this register, we turn off the flag
118 and do the designation normally, but pretend the sequence was
119 invalid. The effect of all this is that (most of the time) the
120 escape sequences for both the switch to the unknown charset, and
121 the switch back to the known charset, get inserted literally into
122 the buffer and saved out as such. The hope is that we can
123 preserve the escape sequences so that the resulting written out
124 file makes sense. If we don't do any of this, the designation
125 to the invalid charset will be preserved but the switch back
126 to the known charset will probably get eaten because it was
127 the same charset that was already present in the register. */
128 unsigned char invalid_designated[4];
130 /* We try to do similar things as above for direction-switching
131 sequences. If we encountered a direction switch while an
132 invalid designation was present, or an invalid designation
133 just after a direction switch (i.e. no valid designation
134 encountered yet), we insert the direction-switch escape
135 sequence literally into the output stream, and later on
136 insert the corresponding direction-restoring escape sequence
138 unsigned int switched_dir_and_no_valid_charset_yet :1;
139 unsigned int invalid_switch_dir :1;
141 /* Tells the decoder to output the escape sequence literally
142 even though it was valid. Used in the games we play to
143 avoid lossage when we encounter invalid designations. */
144 unsigned int output_literally :1;
145 /* We encountered a direction switch followed by an invalid
146 designation. We didn't output the direction switch
147 literally because we didn't know about the invalid designation;
148 but we have to do so now. */
149 unsigned int output_direction_sequence :1;
152 EXFUN (Fcopy_coding_system, 2);
154 struct detection_state;
155 static int detect_coding_sjis (struct detection_state *st,
156 CONST unsigned char *src,
158 static void decode_coding_sjis (Lstream *decoding,
159 CONST unsigned char *src,
160 unsigned_char_dynarr *dst,
162 static void encode_coding_sjis (Lstream *encoding,
163 CONST unsigned char *src,
164 unsigned_char_dynarr *dst,
166 static int detect_coding_big5 (struct detection_state *st,
167 CONST unsigned char *src,
169 static void decode_coding_big5 (Lstream *decoding,
170 CONST unsigned char *src,
171 unsigned_char_dynarr *dst, unsigned int n);
172 static void encode_coding_big5 (Lstream *encoding,
173 CONST unsigned char *src,
174 unsigned_char_dynarr *dst, unsigned int n);
175 static int detect_coding_ucs4 (struct detection_state *st,
176 CONST unsigned char *src,
178 static void decode_coding_ucs4 (Lstream *decoding,
179 CONST unsigned char *src,
180 unsigned_char_dynarr *dst, unsigned int n);
181 static void encode_coding_ucs4 (Lstream *encoding,
182 CONST unsigned char *src,
183 unsigned_char_dynarr *dst, unsigned int n);
184 static int detect_coding_utf8 (struct detection_state *st,
185 CONST unsigned char *src,
187 static void decode_coding_utf8 (Lstream *decoding,
188 CONST unsigned char *src,
189 unsigned_char_dynarr *dst, unsigned int n);
190 static void encode_coding_utf8 (Lstream *encoding,
191 CONST unsigned char *src,
192 unsigned_char_dynarr *dst, unsigned int n);
193 static int postprocess_iso2022_mask (int mask);
194 static void reset_iso2022 (Lisp_Object coding_system,
195 struct iso2022_decoder *iso);
196 static int detect_coding_iso2022 (struct detection_state *st,
197 CONST unsigned char *src,
199 static void decode_coding_iso2022 (Lstream *decoding,
200 CONST unsigned char *src,
201 unsigned_char_dynarr *dst, unsigned int n);
202 static void encode_coding_iso2022 (Lstream *encoding,
203 CONST unsigned char *src,
204 unsigned_char_dynarr *dst, unsigned int n);
206 static void decode_coding_no_conversion (Lstream *decoding,
207 CONST unsigned char *src,
208 unsigned_char_dynarr *dst,
210 static void encode_coding_no_conversion (Lstream *encoding,
211 CONST unsigned char *src,
212 unsigned_char_dynarr *dst,
214 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
216 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
217 unsigned_char_dynarr *dst, unsigned int n);
219 typedef struct codesys_prop codesys_prop;
228 Dynarr_declare (codesys_prop);
229 } codesys_prop_dynarr;
231 codesys_prop_dynarr *the_codesys_prop_dynarr;
233 enum codesys_prop_enum
236 CODESYS_PROP_ISO2022,
241 /************************************************************************/
242 /* Coding system functions */
243 /************************************************************************/
245 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
246 static void print_coding_system (Lisp_Object, Lisp_Object, int);
247 static void finalize_coding_system (void *header, int for_disksave);
250 static const struct lrecord_description ccs_description_1[] = {
251 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
255 static const struct struct_description ccs_description = {
256 sizeof(charset_conversion_spec),
260 static const struct lrecord_description ccsd_description_1[] = {
261 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
265 static const struct struct_description ccsd_description = {
266 sizeof(charset_conversion_spec_dynarr),
271 static const struct lrecord_description coding_system_description[] = {
272 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
273 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
274 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
276 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
277 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
278 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
279 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
284 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
285 mark_coding_system, print_coding_system,
286 finalize_coding_system,
287 0, 0, coding_system_description,
288 struct Lisp_Coding_System);
/* Mark method for coding-system objects (installed through
   DEFINE_LRECORD_IMPLEMENTATION).  Calls MARKOBJ on the name, doc string,
   mnemonic and the three per-EOL coding systems.  For CODESYS_ISO2022 it
   also marks the four initial charsets and both charsets of every entry
   in the input and output conversion dynarrs (either dynarr pointer may
   be null).  The CCL decode/encode objects are marked too; the case label
   guarding them is not visible in this excerpt.  The post-read conversion
   is returned instead of being passed to MARKOBJ -- presumably the caller
   marks the returned object; confirm against the lrecord mark protocol. */
291 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
293 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
295 markobj (CODING_SYSTEM_NAME (codesys));
296 markobj (CODING_SYSTEM_DOC_STRING (codesys));
297 markobj (CODING_SYSTEM_MNEMONIC (codesys));
298 markobj (CODING_SYSTEM_EOL_LF (codesys));
299 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
300 markobj (CODING_SYSTEM_EOL_CR (codesys));
302 switch (CODING_SYSTEM_TYPE (codesys))
306 case CODESYS_ISO2022:
307 for (i = 0; i < 4; i++)
308 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
309 if (codesys->iso2022.input_conv)
311 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
313 struct charset_conversion_spec *ccs =
314 Dynarr_atp (codesys->iso2022.input_conv, i);
315 markobj (ccs->from_charset);
316 markobj (ccs->to_charset);
319 if (codesys->iso2022.output_conv)
321 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
323 struct charset_conversion_spec *ccs =
324 Dynarr_atp (codesys->iso2022.output_conv, i);
325 markobj (ccs->from_charset);
326 markobj (ccs->to_charset);
332 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
333 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
/* The two conversion hooks: one is marked here, the other is handed back
   as the return value. */
340 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
341 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding-system objects.  Writes "#<coding_system ",
   the coding system's name (via print_internal) and ">" to PRINTCHARFUN.
   The error() call refuses to print the object as unreadable;
   NOTE(review): its guarding condition and the remaining parameter are
   not visible in this excerpt. */
345 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
348 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
350 error ("printing unreadable object #<coding_system 0x%x>",
353 write_c_string ("#<coding_system ", printcharfun);
354 print_internal (c->name, printcharfun, 1);
355 write_c_string (">", printcharfun);
/* Finalizer for coding-system objects.  HEADER is the object being
   finalized.  Unless FOR_DISKSAVE is nonzero, an ISO2022 coding system
   has its input and output charset-conversion dynarrs freed.  Each
   pointer is tested before the free and zeroed afterwards, so running
   the finalizer again on the same object finds nothing left to free. */
359 finalize_coding_system (void *header, int for_disksave)
361 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
362 /* Since coding systems never go away, this function is not
363 necessary. But it would be necessary if we changed things
364 so that coding systems could go away. */
365 if (!for_disksave) /* see comment in lstream.c */
367 switch (CODING_SYSTEM_TYPE (c))
370 case CODESYS_ISO2022:
371 if (c->iso2022.input_conv)
373 Dynarr_free (c->iso2022.input_conv);
374 c->iso2022.input_conv = 0;
376 if (c->iso2022.output_conv)
378 Dynarr_free (c->iso2022.output_conv);
379 c->iso2022.output_conv = 0;
/* Convert the Lisp symbol SYMBOL to an enum eol_type:
     nil   -> EOL_AUTODETECT
     Qlf   -> EOL_LF
     Qcrlf -> EOL_CRLF
     Qcr   -> EOL_CR
   A non-symbol is rejected by CHECK_SYMBOL; any other symbol signals
   "Unrecognized eol type". */
390 symbol_to_eol_type (Lisp_Object symbol)
392 CHECK_SYMBOL (symbol);
393 if (NILP (symbol)) return EOL_AUTODETECT;
394 if (EQ (symbol, Qlf)) return EOL_LF;
395 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
396 if (EQ (symbol, Qcr)) return EOL_CR;
398 signal_simple_error ("Unrecognized eol type", symbol);
399 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: EOL_LF, EOL_CRLF and EOL_CR map to Qlf,
   Qcrlf and Qcr, and EOL_AUTODETECT maps to Qnil. */
403 eol_type_to_symbol (enum eol_type type)
408 case EOL_LF: return Qlf;
409 case EOL_CRLF: return Qcrlf;
410 case EOL_CR: return Qcr;
411 case EOL_AUTODETECT: return Qnil;
/* Create the three end-of-line subsidiaries of CODESYS.

   The name of CODESYS is copied into a stack buffer; for each of "unix",
   "dos" and "mac", DEFINE_SUB_CODESYS appends "-" plus that suffix,
   interns the result, copies CODESYS to the new name with
   Fcopy_coding_system, sets the copy's EOL type, and stores the copy in
   the matching CODING_SYSTEM_EOL_LF / _CRLF / _CR slot of CODESYS.

   When the mnemonic of CODESYS is a string it is copied as well, and the
   macro appends "", ":T" or ":t" to it for the copy's mnemonic.
   NOTE(review): the lines that guard the mnemonic steps inside the macro
   are not visible in this excerpt.

   Both alloca buffers reserve 7 bytes beyond the base string, enough for
   the longest appended suffix ("-unix", 5 bytes) plus the terminating
   NUL. */
416 setup_eol_coding_systems (Lisp_Coding_System *codesys)
418 Lisp_Object codesys_obj;
419 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
420 char *codesys_name = (char *) alloca (len + 7);
422 char *codesys_mnemonic=0;
424 Lisp_Object codesys_name_sym, sub_codesys_obj;
428 XSETCODING_SYSTEM (codesys_obj, codesys);
430 memcpy (codesys_name,
431 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
433 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
435 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
436 codesys_mnemonic = (char *) alloca (mlen + 7);
437 memcpy (codesys_mnemonic,
438 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
441 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
442 strcpy (codesys_name + len, "-" op_sys); \
444 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
445 codesys_name_sym = intern (codesys_name); \
446 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
447 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
449 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
450 build_string (codesys_mnemonic); \
451 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
/* Arguments: name suffix, mnemonic suffix, EOL type of the subsidiary. */
454 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
455 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
456 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp predicate: yields Qt when OBJECT satisfies CODING_SYSTEMP and Qnil
   otherwise.  Symbols that merely name a coding system are not looked up
   here. */
459 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
460 Return t if OBJECT is a coding system.
461 A coding system is an object that defines how text containing multiple
462 character sets is encoded into a stream of (typically 8-bit) bytes.
463 The coding system is used to decode the stream into a series of
464 characters (which may be from multiple charsets) when the text is read
465 from a file or process, and is used to encode the text back into the
466 same format when it is written out to a file or process.
468 For example, many ISO2022-compliant coding systems (such as Compound
469 Text, which is used for inter-client data under the X Window System)
470 use escape sequences to switch between different charsets -- Japanese
471 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
472 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
473 `make-coding-system' for more information.
475 Coding systems are normally identified using a symbol, and the
476 symbol is accepted in place of the actual coding system object whenever
477 a coding system is called for. (This is similar to how faces work.)
481 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* C-level behavior of find-coding-system:
   - a coding-system object is returned unchanged;
   - nil is replaced by Qbinary before the lookup (the Lisp doc string
     does not mention this);
   - anything else must pass CHECK_SYMBOL and is looked up in
     Vcoding_system_hash_table, with Qnil as the result when there is no
     entry. */
484 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
485 Retrieve the coding system of the given name.
487 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
488 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
489 If there is no such coding system, nil is returned. Otherwise the
490 associated coding system object is returned.
492 (coding_system_or_name))
494 if (CODING_SYSTEMP (coding_system_or_name))
495 return coding_system_or_name;
497 if (NILP (coding_system_or_name))
498 coding_system_or_name = Qbinary;
500 CHECK_SYMBOL (coding_system_or_name);
502 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
/* Strict variant of Ffind_coding_system: delegates the lookup to it and
   signals "No such coding system" (with NAME as the error datum) when the
   result is nil. */
505 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
506 Retrieve the coding system of the given name.
507 Same as `find-coding-system' except that if there is no such
508 coding system, an error is signaled instead of returning nil.
512 Lisp_Object coding_system = Ffind_coding_system (name);
514 if (NILP (coding_system))
515 signal_simple_error ("No such coding system", name);
516 return coding_system;
519 /* We store the coding systems in hash tables with the names as the key and the
520 actual coding system object as the value. Occasionally we need to use them
521 in a list format. These routines provide us with that. */
522 struct coding_system_list_closure
524 Lisp_Object *coding_system_list;
/* Hash-table mapper passed to elisp_maphash by Fcoding_system_list.
   The closure argument points at a struct coding_system_list_closure;
   the name stored in the coding system VALUE (not the hash KEY) is consed
   onto the front of the list that closure refers to.
   NOTE(review): aliases are entered in the same table with the aliased
   object as value, so the same name can be pushed more than once. */
528 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
529 void *coding_system_list_closure)
531 /* This function can GC */
532 struct coding_system_list_closure *cscl =
533 (struct coding_system_list_closure *) coding_system_list_closure;
534 Lisp_Object *coding_system_list = cscl->coding_system_list;
536 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
537 *coding_system_list);
/* Builds the result by running add_coding_system_to_list_mapper over
   every entry of Vcoding_system_hash_table.  The list under construction
   is GC-protected with GCPRO1 while the table is traversed, and the
   mapper reaches it through the closure's pointer. */
541 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
542 Return a list of the names of all defined coding systems.
546 Lisp_Object coding_system_list = Qnil;
548 struct coding_system_list_closure coding_system_list_closure;
550 GCPRO1 (coding_system_list);
551 coding_system_list_closure.coding_system_list = &coding_system_list;
552 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
553 &coding_system_list_closure);
556 return coding_system_list;
/* Resolves the argument with Fget_coding_system (which signals an error
   for an unknown name) and returns the name stored in the resulting
   object. */
559 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
560 Return the name of the given coding system.
564 coding_system = Fget_coding_system (coding_system);
565 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a new coding-system lcrecord of type TYPE named NAME.

   The record is zeroed first, then the Lisp_Object fields are explicitly
   set to Qnil: the pre-write and post-read conversions, the three per-EOL
   coding systems, the mnemonic, and, depending on TYPE, either the four
   ISO2022 initial charsets or the CCL decode/encode slots.  The EOL type
   starts out as EOL_AUTODETECT.

   NOTE(review): no visible line assigns the doc string here; in the lines
   shown, Fmake_coding_system sets it after calling this function. */
568 static Lisp_Coding_System *
569 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
571 Lisp_Coding_System *codesys =
572 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
574 zero_lcrecord (codesys);
575 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
576 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
577 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
578 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
579 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
580 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
581 CODING_SYSTEM_TYPE (codesys) = type;
582 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
/* Type-specific Lisp slots. */
584 if (type == CODESYS_ISO2022)
587 for (i = 0; i < 4; i++)
588 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
590 else if (type == CODESYS_CCL)
592 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
593 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
596 CODING_SYSTEM_NAME (codesys) = name;
602 /* Given a list of charset conversion specs as specified in a Lisp
603 program, parse it into STORE_HERE. */
/* Each element of SPEC_LIST must be a list of exactly two items; anything
   else signals "Invalid charset conversion spec".  Both items are
   resolved with Fget_charset, and the two charsets must have the same
   XCHARSET_TYPE.  Accepted specs are appended to STORE_HERE in list
   order.  Specs appended before a later element is rejected remain in
   STORE_HERE, since the error is signaled from inside the loop. */
606 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
607 Lisp_Object spec_list)
611 EXTERNAL_LIST_LOOP (rest, spec_list)
613 Lisp_Object car = XCAR (rest);
614 Lisp_Object from, to;
615 struct charset_conversion_spec spec;
617 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
618 signal_simple_error ("Invalid charset conversion spec", car);
619 from = Fget_charset (XCAR (car));
620 to = Fget_charset (XCAR (XCDR (car)));
621 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
622 signal_simple_error_2
623 ("Attempted conversion between different charset types",
625 spec.from_charset = from;
626 spec.to_charset = to;
628 Dynarr_add (store_here, spec);
632 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
633 specs, return the equivalent as the Lisp programmer would see it.
635 If LOAD_HERE is 0, return Qnil. */
638 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
/* Each spec becomes a two-element list (FROM TO).  The lists are consed
   onto the front of RESULT, so the final Fnreverse restores the dynarr's
   order. */
645 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
647 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
648 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
651 return Fnreverse (result);
/* C-level outline of make-coding-system:
   1. TYPE is mapped to an enum coding_system_type; nil and `undecided'
      both select CODESYS_AUTODETECT, and an unrecognized TYPE signals
      "Invalid coding system type".
   2. The object is allocated with allocate_coding_system; a nil
      DOC-STRING is replaced by an empty string.
   3. PROPS is walked as a property list.  Generic keys are handled
      first, then the keys specific to ISO2022 or CCL; any other key
      signals "Unrecognized property".
   4. The EOL subsidiaries are created if still requested, and the object
      is stored under NAME in Vcoding_system_hash_table. */
656 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
657 Register symbol NAME as a coding system.
659 TYPE describes the conversion method used and should be one of
662 Automatic conversion. XEmacs attempts to detect the coding system
665 No conversion. Use this for binary files and such. On output,
666 graphic characters that are not in ASCII or Latin-1 will be
667 replaced by a ?. (For a no-conversion-encoded buffer, these
668 characters will only be present if you explicitly insert them.)
670 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
672 ISO 10646 UCS-4 encoding.
674 ISO 10646 UTF-8 encoding.
676 Any ISO2022-compliant encoding. Among other things, this includes
677 JIS (the Japanese encoding commonly used for e-mail), EUC (the
678 standard Unix encoding for Japanese and other languages), and
679 Compound Text (the encoding used in X11). You can specify more
680 specific information about the conversion with the FLAGS argument.
682 Big5 (the encoding commonly used for Taiwanese).
684 The conversion is performed using a user-written pseudo-code
685 program. CCL (Code Conversion Language) is the name of this
688 Write out or read in the raw contents of the memory representing
689 the buffer's text. This is primarily useful for debugging
690 purposes, and is only enabled when XEmacs has been compiled with
691 DEBUG_XEMACS defined (via the --debug configure option).
692 WARNING: Reading in a file using 'internal conversion can result
693 in an internal inconsistency in the memory representing a
694 buffer's text, which will produce unpredictable results and may
695 cause XEmacs to crash. Under normal circumstances you should
696 never use 'internal conversion.
698 DOC-STRING is a string describing the coding system.
700 PROPS is a property list, describing the specific nature of the
701 character set. Recognized properties are:
704 String to be displayed in the modeline when this coding system is
708 End-of-line conversion to be used. It should be one of
711 Automatically detect the end-of-line type (LF, CRLF,
712 or CR). Also generate subsidiary coding systems named
713 `NAME-unix', `NAME-dos', and `NAME-mac', that are
714 identical to this coding system but have an EOL-TYPE
715 value of 'lf, 'crlf, and 'cr, respectively.
717 The end of a line is marked externally using ASCII LF.
718 Since this is also the way that XEmacs represents an
719 end-of-line internally, specifying this option results
720 in no end-of-line conversion. This is the standard
721 format for Unix text files.
723 The end of a line is marked externally using ASCII
724 CRLF. This is the standard format for MS-DOS text
727 The end of a line is marked externally using ASCII CR.
728 This is the standard format for Macintosh text files.
730 Automatically detect the end-of-line type but do not
731 generate subsidiary coding systems. (This value is
732 converted to nil when stored internally, and
733 `coding-system-property' will return nil.)
735 'post-read-conversion
736 Function called after a file has been read in, to perform the
737 decoding. Called with two arguments, BEG and END, denoting
738 a region of the current buffer to be decoded.
740 'pre-write-conversion
741 Function called before a file is written out, to perform the
742 encoding. Called with two arguments, BEG and END, denoting
743 a region of the current buffer to be encoded.
746 The following additional properties are recognized if TYPE is 'iso2022:
752 The character set initially designated to the G0 - G3 registers.
753 The value should be one of
755 -- A charset object (designate that character set)
756 -- nil (do not ever use this register)
757 -- t (no character set is initially designated to
758 the register, but may be later on; this automatically
759 sets the corresponding `force-g*-on-output' property)
765 If non-nil, send an explicit designation sequence on output before
766 using the specified register.
769 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
770 "ESC $ B" on output in place of the full designation sequences
771 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
774 If non-nil, don't designate ASCII to G0 at each end of line on output.
775 Setting this to non-nil also suppresses other state-resetting that
776 normally happens at the end of a line.
779 If non-nil, don't designate ASCII to G0 before control chars on output.
782 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
786 If non-nil, use locking-shift (SO/SI) instead of single-shift
787 or designation by escape sequence.
790 If non-nil, don't use ISO6429's direction specification.
793 If non-nil, literal control characters that are the same as
794 the beginning of a recognized ISO2022 or ISO6429 escape sequence
795 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
796 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
797 so that they can be properly distinguished from an escape sequence.
798 (Note that doing this results in a non-portable encoding.) This
799 encoding flag is used for byte-compiled files. Note that ESC
800 is a good choice for a quoting character because there are no
801 escape sequences whose second byte is a character from the Control-0
802 or Control-1 character sets; this is explicitly disallowed by the
805 'input-charset-conversion
806 A list of conversion specifications, specifying conversion of
807 characters in one charset to another when decoding is performed.
808 Each specification is a list of two elements: the source charset,
809 and the destination charset.
811 'output-charset-conversion
812 A list of conversion specifications, specifying conversion of
813 characters in one charset to another when encoding is performed.
814 The form of each specification is the same as for
815 'input-charset-conversion.
818 The following additional properties are recognized (and required)
822 CCL program used for decoding (converting to internal format).
825 CCL program used for encoding (converting to external format).
827 (name, type, doc_string, props))
829 Lisp_Coding_System *codesys;
830 Lisp_Object rest, key, value;
831 enum coding_system_type ty;
832 int need_to_setup_eol_systems = 1;
834 /* Convert type to constant */
835 if (NILP (type) || EQ (type, Qundecided))
836 { ty = CODESYS_AUTODETECT; }
838 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
839 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
840 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
841 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
842 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
843 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
845 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
847 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
850 signal_simple_error ("Invalid coding system type", type);
/* The record is allocated before DOC-STRING and PROPS are validated, so
   a validation error below is signaled with the new object already
   created but not yet entered in the hash table. */
854 codesys = allocate_coding_system (ty, name);
856 if (NILP (doc_string))
857 doc_string = build_string ("");
859 CHECK_STRING (doc_string);
860 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
862 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
864 if (EQ (key, Qmnemonic))
867 CHECK_STRING (value);
868 CODING_SYSTEM_MNEMONIC (codesys) = value;
871 else if (EQ (key, Qeol_type))
873 need_to_setup_eol_systems = NILP (value);
876 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
879 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
880 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
882 else if (ty == CODESYS_ISO2022)
884 #define FROB_INITIAL_CHARSET(charset_num) \
885 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
886 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
888 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
889 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
890 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
891 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
893 #define FROB_FORCE_CHARSET(charset_num) \
894 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
896 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
897 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
898 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
899 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
901 #define FROB_BOOLEAN_PROPERTY(prop) \
902 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
904 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
905 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
906 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
907 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
908 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
909 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
910 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
/* NOTE(review): each conversion key installs a fresh dynarr without
   looking at the slot's previous value, so a key that appears twice in
   PROPS drops the first dynarr without freeing it. */
912 else if (EQ (key, Qinput_charset_conversion))
914 codesys->iso2022.input_conv =
915 Dynarr_new (charset_conversion_spec);
916 parse_charset_conversion_specs (codesys->iso2022.input_conv,
919 else if (EQ (key, Qoutput_charset_conversion))
921 codesys->iso2022.output_conv =
922 Dynarr_new (charset_conversion_spec);
923 parse_charset_conversion_specs (codesys->iso2022.output_conv,
927 signal_simple_error ("Unrecognized property", key);
/* This arm tests the TYPE symbol rather than ty; ty is CODESYS_CCL
   exactly when TYPE is Qccl, so the two tests select the same case. */
929 else if (EQ (type, Qccl))
931 if (EQ (key, Qdecode))
933 CHECK_VECTOR (value);
934 CODING_SYSTEM_CCL_DECODE (codesys) = value;
936 else if (EQ (key, Qencode))
938 CHECK_VECTOR (value);
939 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
942 signal_simple_error ("Unrecognized property", key);
946 signal_simple_error ("Unrecognized property", key);
/* need_to_setup_eol_systems starts at 1 and is overwritten with
   NILP (value) whenever an eol-type property is processed. */
949 if (need_to_setup_eol_systems)
950 setup_eol_coding_systems (codesys);
953 Lisp_Object codesys_obj;
954 XSETCODING_SYSTEM (codesys_obj, codesys);
955 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* C-level notes for copy-coding-system:
   - OLD-CODING-SYSTEM is resolved with Fget_coding_system, so an unknown
     name signals an error; NEW-NAME is looked up with
     Ffind_coding_system, and a new object of the same type is allocated
     and entered in Vcoding_system_hash_table only when that lookup
     yields nil.
   - The copy is a raw memcpy of everything after the record header.
     NOTE(review): that includes the name field and the
     iso2022.input_conv / output_conv dynarr pointers, so source and copy
     share the same dynarrs.  finalize_coding_system frees those
     pointers, which would be a double free if coding systems were ever
     finalized; its own comment says they never go away.  Whether the
     name is reassigned after the memcpy is not visible in this
     excerpt. */
960 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
961 Copy OLD-CODING-SYSTEM to NEW-NAME.
962 If NEW-NAME does not name an existing coding system, a new one will
965 (old_coding_system, new_name))
967 Lisp_Object new_coding_system;
968 old_coding_system = Fget_coding_system (old_coding_system);
969 new_coding_system = Ffind_coding_system (new_name);
970 if (NILP (new_coding_system))
972 XSETCODING_SYSTEM (new_coding_system,
973 allocate_coding_system
974 (XCODING_SYSTEM_TYPE (old_coding_system),
976 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
980 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
981 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
982 memcpy (((char *) to ) + sizeof (to->header),
983 ((char *) from) + sizeof (from->header),
984 sizeof (*from) - sizeof (from->header));
987 return new_coding_system;
/* C-level notes for define-coding-system-alias:
   - ALIAS must be a symbol that does not already resolve through
     Ffind_coding_system; otherwise "Symbol already names a coding
     system" is signaled.
   - The alias is a second hash-table key for the same coding-system
     object; nothing is copied.
   - When the target's EOL type is EOL_AUTODETECT, FROB recursively
     defines an alias for each non-nil subsidiary under ALIAS plus the
     given suffix. */
990 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
991 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
993 (alias, coding_system))
995 CHECK_SYMBOL (alias);
996 if (!NILP (Ffind_coding_system (alias)))
997 signal_simple_error ("Symbol already names a coding system", alias);
998 coding_system = Fget_coding_system (coding_system);
999 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1001 /* Set up aliases for subsidiaries. */
1002 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1005 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1006 #define FROB(type, name) \
1008 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1009 if (!NILP (subsidiary)) \
1010 Fdefine_coding_system_alias \
1011 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1014 FROB (CRLF, "-dos");
1018 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1019 but it doesn't look intentional, so I'd rather return something
1020 meaningful or nothing at all. */
/* Return the variant of CODING_SYSTEM for end-of-line type TYPE.
   CODING_SYSTEM itself is returned when its own EOL type is not
   EOL_AUTODETECT, or when TYPE is EOL_AUTODETECT.  Otherwise the stored
   LF / CR / CRLF subsidiary is selected, with CODING_SYSTEM as the
   fallback when that slot is nil. */
1025 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1027 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1028 Lisp_Object new_coding_system;
1030 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1031 return coding_system;
1035 case EOL_AUTODETECT: return coding_system;
1036 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1037 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1038 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1042 return NILP (new_coding_system) ? coding_system : new_coding_system;
1045 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1046 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1048 (coding_system, eol_type))
1050 coding_system = Fget_coding_system (coding_system);
1052 return subsidiary_coding_system (coding_system,
1053 symbol_to_eol_type (eol_type));
1057 /************************************************************************/
1058 /* Coding system accessors */
1059 /************************************************************************/
1061 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1062 Return the doc string for CODING-SYSTEM.
1066 coding_system = Fget_coding_system (coding_system);
1067 return XCODING_SYSTEM_DOC_STRING (coding_system);
1070 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1071 Return the type of CODING-SYSTEM.
1075 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1078 case CODESYS_AUTODETECT: return Qundecided;
1080 case CODESYS_SHIFT_JIS: return Qshift_jis;
1081 case CODESYS_ISO2022: return Qiso2022;
1082 case CODESYS_BIG5: return Qbig5;
1083 case CODESYS_UCS4: return Qucs4;
1084 case CODESYS_UTF8: return Qutf8;
1085 case CODESYS_CCL: return Qccl;
1087 case CODESYS_NO_CONVERSION: return Qno_conversion;
1089 case CODESYS_INTERNAL: return Qinternal;
1096 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1099 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1101 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1104 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1105 Return initial charset of CODING-SYSTEM designated to GNUM.
1108 (coding_system, gnum))
1110 coding_system = Fget_coding_system (coding_system);
1113 return coding_system_charset (coding_system, XINT (gnum));
1117 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1118 Return the PROP property of CODING-SYSTEM.
1120 (coding_system, prop))
1123 enum coding_system_type type;
1125 coding_system = Fget_coding_system (coding_system);
1126 CHECK_SYMBOL (prop);
1127 type = XCODING_SYSTEM_TYPE (coding_system);
1129 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1130 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1133 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1135 case CODESYS_PROP_ALL_OK:
1138 case CODESYS_PROP_ISO2022:
1139 if (type != CODESYS_ISO2022)
1141 ("Property only valid in ISO2022 coding systems",
1145 case CODESYS_PROP_CCL:
1146 if (type != CODESYS_CCL)
1148 ("Property only valid in CCL coding systems",
1158 signal_simple_error ("Unrecognized property", prop);
1160 if (EQ (prop, Qname))
1161 return XCODING_SYSTEM_NAME (coding_system);
1162 else if (EQ (prop, Qtype))
1163 return Fcoding_system_type (coding_system);
1164 else if (EQ (prop, Qdoc_string))
1165 return XCODING_SYSTEM_DOC_STRING (coding_system);
1166 else if (EQ (prop, Qmnemonic))
1167 return XCODING_SYSTEM_MNEMONIC (coding_system);
1168 else if (EQ (prop, Qeol_type))
1169 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1170 else if (EQ (prop, Qeol_lf))
1171 return XCODING_SYSTEM_EOL_LF (coding_system);
1172 else if (EQ (prop, Qeol_crlf))
1173 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1174 else if (EQ (prop, Qeol_cr))
1175 return XCODING_SYSTEM_EOL_CR (coding_system);
1176 else if (EQ (prop, Qpost_read_conversion))
1177 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1178 else if (EQ (prop, Qpre_write_conversion))
1179 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1181 else if (type == CODESYS_ISO2022)
1183 if (EQ (prop, Qcharset_g0))
1184 return coding_system_charset (coding_system, 0);
1185 else if (EQ (prop, Qcharset_g1))
1186 return coding_system_charset (coding_system, 1);
1187 else if (EQ (prop, Qcharset_g2))
1188 return coding_system_charset (coding_system, 2);
1189 else if (EQ (prop, Qcharset_g3))
1190 return coding_system_charset (coding_system, 3);
1192 #define FORCE_CHARSET(charset_num) \
1193 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1194 (coding_system, charset_num) ? Qt : Qnil)
1196 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1197 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1198 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1199 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1201 #define LISP_BOOLEAN(prop) \
1202 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1204 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1205 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1206 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1207 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1208 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1209 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1210 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1212 else if (EQ (prop, Qinput_charset_conversion))
1214 unparse_charset_conversion_specs
1215 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1216 else if (EQ (prop, Qoutput_charset_conversion))
1218 unparse_charset_conversion_specs
1219 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1223 else if (type == CODESYS_CCL)
1225 if (EQ (prop, Qdecode))
1226 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1227 else if (EQ (prop, Qencode))
1228 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1236 return Qnil; /* not reached */
1240 /************************************************************************/
1241 /* Coding category functions */
1242 /************************************************************************/
1245 decode_coding_category (Lisp_Object symbol)
1249 CHECK_SYMBOL (symbol);
1250 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1251 if (EQ (coding_category_symbol[i], symbol))
1254 signal_simple_error ("Unrecognized coding category", symbol);
1255 return 0; /* not reached */
1258 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1259 Return a list of all recognized coding categories.
1264 Lisp_Object list = Qnil;
1266 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1267 list = Fcons (coding_category_symbol[i], list);
1271 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1272 Change the priority order of the coding categories.
1273 LIST should be list of coding categories, in descending order of
1274 priority. Unspecified coding categories will be lower in priority
1275 than all specified ones, in the same relative order they were in
1280 int category_to_priority[CODING_CATEGORY_LAST + 1];
1284 /* First generate a list that maps coding categories to priorities. */
1286 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1287 category_to_priority[i] = -1;
1289 /* Highest priority comes from the specified list. */
1291 EXTERNAL_LIST_LOOP (rest, list)
1293 int cat = decode_coding_category (XCAR (rest));
1295 if (category_to_priority[cat] >= 0)
1296 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1297 category_to_priority[cat] = i++;
1300 /* Now go through the existing categories by priority to retrieve
1301 the categories not yet specified and preserve their priority
1303 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1305 int cat = coding_category_by_priority[j];
1306 if (category_to_priority[cat] < 0)
1307 category_to_priority[cat] = i++;
1310 /* Now we need to construct the inverse of the mapping we just
1313 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1314 coding_category_by_priority[category_to_priority[i]] = i;
1316 /* Phew! That was confusing. */
1320 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1321 Return a list of coding categories in descending order of priority.
1326 Lisp_Object list = Qnil;
1328 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1329 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1334 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1335 Change the coding system associated with a coding category.
1337 (coding_category, coding_system))
1339 int cat = decode_coding_category (coding_category);
1341 coding_system = Fget_coding_system (coding_system);
1342 coding_category_system[cat] = coding_system;
1346 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1347 Return the coding system associated with a coding category.
1351 int cat = decode_coding_category (coding_category);
1352 Lisp_Object sys = coding_category_system[cat];
1355 return XCODING_SYSTEM_NAME (sys);
1360 /************************************************************************/
1361 /* Detecting the encoding of data */
1362 /************************************************************************/
1364 struct detection_state
1366 enum eol_type eol_type;
1402 struct iso2022_decoder iso;
1404 int high_byte_count;
1405 unsigned int saw_single_shift:1;
1418 acceptable_control_char_p (int c)
1422 /* Allow and ignore control characters that you might
1423 reasonably see in a text file */
1428 case 8: /* backspace */
1429 case 11: /* vertical tab */
1430 case 12: /* form feed */
1431 case 26: /* MS-DOS C-z junk */
1432 case 31: /* '^_' -- for info */
1440 mask_has_at_most_one_bit_p (int mask)
1442 /* Perhaps the only thing useful you learn from intensive Microsoft
1443 technical interviews */
1444 return (mask & (mask - 1)) == 0;
1447 static enum eol_type
1448 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1457 st->eol.just_saw_cr = 1;
1462 if (st->eol.just_saw_cr)
1464 else if (st->eol.seen_anything)
1467 else if (st->eol.just_saw_cr)
1469 st->eol.just_saw_cr = 0;
1471 st->eol.seen_anything = 1;
1474 return EOL_AUTODETECT;
1477 /* Attempt to determine the encoding and EOL type of the given text.
1478 Before calling this function for the first time, you must initialize
1479 st->eol_type as appropriate and initialize st->mask to ~0.
1481 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1484 st->mask holds the determined coding category mask, or ~0 if only
1485 ASCII has been seen so far.
1489 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1490 is present in st->mask
1491 1 == definitive answers are here for both st->eol_type and st->mask
1495 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1496 unsigned int n, int just_do_eol)
1500 if (st->eol_type == EOL_AUTODETECT)
1501 st->eol_type = detect_eol_type (st, src, n);
1504 return st->eol_type != EOL_AUTODETECT;
1506 if (!st->seen_non_ascii)
1508 for (; n; n--, src++)
1511 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1513 st->seen_non_ascii = 1;
1515 st->shift_jis.mask = ~0;
1519 st->iso2022.mask = ~0;
1529 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1530 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1531 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1532 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1533 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1534 st->big5.mask = detect_coding_big5 (st, src, n);
1535 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1536 st->utf8.mask = detect_coding_utf8 (st, src, n);
1537 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1538 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1541 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1542 | st->utf8.mask | st->ucs4.mask;
1545 int retval = mask_has_at_most_one_bit_p (st->mask);
1546 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1547 return retval && st->eol_type != EOL_AUTODETECT;
1552 coding_system_from_mask (int mask)
1556 /* If the file was entirely or basically ASCII, use the
1557 default value of `buffer-file-coding-system'. */
1558 Lisp_Object retval =
1559 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1562 retval = Ffind_coding_system (retval);
1566 (Qbad_variable, Qwarning,
1567 "Invalid `default-buffer-file-coding-system', set to nil");
1568 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1572 retval = Fget_coding_system (Qraw_text);
1580 mask = postprocess_iso2022_mask (mask);
1582 /* Look through the coding categories by priority and find
1583 the first one that is allowed. */
1584 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1586 cat = coding_category_by_priority[i];
1587 if ((mask & (1 << cat)) &&
1588 !NILP (coding_category_system[cat]))
1592 return coding_category_system[cat];
1594 return Fget_coding_system (Qraw_text);
1598 /* Given a seekable read stream and potential coding system and EOL type
1599 as specified, do any autodetection that is called for. If the
1600 coding system and/or EOL type are not autodetect, they will be left
1601 alone; but this function will never return an autodetect coding system
1604 This function does not automatically fetch subsidiary coding systems;
1605 that should be unnecessary with the explicit eol-type argument. */
1608 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1609 enum eol_type *eol_type_in_out)
1611 struct detection_state decst;
1613 if (*eol_type_in_out == EOL_AUTODETECT)
1614 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1617 decst.eol_type = *eol_type_in_out;
1620 /* If autodetection is called for, do it now. */
1621 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1622 *eol_type_in_out == EOL_AUTODETECT)
1627 unsigned char random_buffer[4096];
1630 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1633 if (detect_coding_type (&decst, random_buffer, nread,
1634 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1635 CODESYS_AUTODETECT))
1639 *eol_type_in_out = decst.eol_type;
1640 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1641 *codesys_in_out = coding_system_from_mask (decst.mask);
1644 /* If we absolutely can't determine the EOL type, just assume LF. */
1645 if (*eol_type_in_out == EOL_AUTODETECT)
1646 *eol_type_in_out = EOL_LF;
1648 Lstream_rewind (stream);
1651 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1652 Detect coding system of the text in the region between START and END.
1653 Returned a list of possible coding systems ordered by priority.
1654 If only ASCII characters are found, it returns 'undecided or one of
1655 its subsidiary coding systems according to a detected end-of-line
1656 type. Optional arg BUFFER defaults to the current buffer.
1658 (start, end, buffer))
1660 Lisp_Object val = Qnil;
1661 struct buffer *buf = decode_buffer (buffer, 0);
1663 Lisp_Object instream, lb_instream;
1664 Lstream *istr, *lb_istr;
1665 struct detection_state decst;
1666 struct gcpro gcpro1, gcpro2;
1668 get_buffer_range_char (buf, start, end, &b, &e, 0);
1669 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1670 lb_istr = XLSTREAM (lb_instream);
1671 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1672 istr = XLSTREAM (instream);
1673 GCPRO2 (instream, lb_instream);
1675 decst.eol_type = EOL_AUTODETECT;
1679 unsigned char random_buffer[4096];
1680 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1684 if (detect_coding_type (&decst, random_buffer, nread, 0))
1688 if (decst.mask == ~0)
1689 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1697 decst.mask = postprocess_iso2022_mask (decst.mask);
1699 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1701 int sys = coding_category_by_priority[i];
1702 if (decst.mask & (1 << sys))
1704 Lisp_Object codesys = coding_category_system[sys];
1705 if (!NILP (codesys))
1706 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1707 val = Fcons (codesys, val);
1711 Lstream_close (istr);
1713 Lstream_delete (istr);
1714 Lstream_delete (lb_istr);
1719 /************************************************************************/
1720 /* Converting to internal Mule format ("decoding") */
1721 /************************************************************************/
1723 /* A decoding stream is a stream used for decoding text (i.e.
1724 converting from some external format to internal format).
1725 The decoding-stream object keeps track of the actual coding
1726 stream, the stream that is at the other end, and data that
1727 needs to be persistent across the lifetime of the stream. */
1729 /* Handle the EOL stuff related to just-read-in character C.
1730 EOL_TYPE is the EOL type of the coding stream.
1731 FLAGS is the current value of FLAGS in the coding stream, and may
1732 be modified by this macro. (The macro only looks at the
1733 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1734 bytes are to be written. You need to also define a local goto
1735 label "label_continue_loop" that is at the end of the main
1736 character-reading loop.
1738 If C is a CR character, then this macro handles it entirely and
1739 jumps to label_continue_loop. Otherwise, this macro does not add
1740 anything to DST, and continues normally. You should continue
1741 processing C normally after this macro. */
1743 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1747 if (eol_type == EOL_CR) \
1748 Dynarr_add (dst, '\n'); \
1749 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1750 Dynarr_add (dst, c); \
1752 flags |= CODING_STATE_CR; \
1753 goto label_continue_loop; \
1755 else if (flags & CODING_STATE_CR) \
1756 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1758 Dynarr_add (dst, '\r'); \
1759 flags &= ~CODING_STATE_CR; \
1763 /* C should be a binary character in the range 0 - 255; convert
1764 to internal format and add to Dynarr DST. */
1766 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1768 if (BYTE_ASCII_P (c)) \
1769 Dynarr_add (dst, c); \
1770 else if (BYTE_C1_P (c)) \
1772 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1773 Dynarr_add (dst, c + 0x20); \
1777 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1778 Dynarr_add (dst, c); \
1782 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1786 DECODE_ADD_BINARY_CHAR (ch, dst); \
1791 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1793 if (flags & CODING_STATE_END) \
1795 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1796 if (flags & CODING_STATE_CR) \
1797 Dynarr_add (dst, '\r'); \
1801 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1803 struct decoding_stream
1805 /* Coding system that governs the conversion. */
1806 Lisp_Coding_System *codesys;
1808 /* Stream that we read the encoded data from or
1809 write the decoded data to. */
1812 /* If we are reading, then we can return only a fixed amount of
1813 data, so if the conversion resulted in too much data, we store it
1814 here for retrieval the next time around. */
1815 unsigned_char_dynarr *runoff;
1817 /* FLAGS holds flags indicating the current state of the decoding.
1818 Some of these flags are dependent on the coding system. */
1821 /* CH holds a partially built-up character. Since we only deal
1822 with one- and two-byte characters at the moment, we only use
1823 this to store the first byte of a two-byte character. */
1826 /* EOL_TYPE specifies the type of end-of-line conversion that
1827 currently applies. We need to keep this separate from the
1828 EOL type stored in CODESYS because the latter might indicate
1829 automatic EOL-type detection while the former will always
1830 indicate a particular EOL type. */
1831 enum eol_type eol_type;
1833 /* Additional ISO2022 information. We define the structure above
1834 because it's also needed by the detection routines. */
1835 struct iso2022_decoder iso2022;
1837 /* Additional information (the state of the running CCL program)
1838 used by the CCL decoder. */
1839 struct ccl_program ccl;
1841 struct detection_state decst;
/* Forward declarations of the decoding-stream methods; their definitions
   appear further down in this file.  Each one takes the Lstream it
   operates on as its first argument. */
1844 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1845 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1846 static int decoding_rewinder (Lstream *stream);
1847 static int decoding_seekable_p (Lstream *stream);
1848 static int decoding_flusher (Lstream *stream);
1849 static int decoding_closer (Lstream *stream);
/* Unlike the methods above, the marker receives the stream as a
   Lisp_Object, together with a MARKOBJ callback. */
1851 static Lisp_Object decoding_marker (Lisp_Object stream,
1852 void (*markobj) (Lisp_Object));
/* Define the lstream implementation named "decoding" (lstream_decoding).
   NOTE(review): the third argument is presumably the size of the
   per-stream data area (a struct decoding_stream, as accessed through
   DECODING_STREAM_DATA) -- confirm against DEFINE_LSTREAM_IMPLEMENTATION. */
1854 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1855 sizeof (struct decoding_stream));
1858 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1860 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1861 Lisp_Object str_obj;
1863 /* We do not need to mark the coding systems or charsets stored
1864 within the stream because they are stored in a global list
1865 and automatically marked. */
1867 XSETLSTREAM (str_obj, str);
1869 if (str->imp->marker)
1870 return (str->imp->marker) (str_obj, markobj);
1875 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1876 so we read data from the other end, decode it, and store it into DATA. */
1879 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1881 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1882 unsigned char *orig_data = data;
1884 int error_occurred = 0;
1886 /* We need to interface to mule_decode(), which expects to take some
1887 amount of data and store the result into a Dynarr. We have
1888 mule_decode() store into str->runoff, and take data from there
1891 /* We loop until we have enough data, reading chunks from the other
1892 end and decoding it. */
1895 /* Take data from the runoff if we can. Make sure to take at
1896 most SIZE bytes, and delete the data from the runoff. */
1897 if (Dynarr_length (str->runoff) > 0)
1899 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1900 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1901 Dynarr_delete_many (str->runoff, 0, chunk);
1907 break; /* No more room for data */
1909 if (str->flags & CODING_STATE_END)
1910 /* This means that on the previous iteration, we hit the EOF on
1911 the other end. We loop once more so that mule_decode() can
1912 output any final stuff it may be holding, or any "go back
1913 to a sane state" escape sequences. (This latter makes sense
1914 during encoding.) */
1917 /* Exhausted the runoff, so get some more. DATA has at least
1918 SIZE bytes left of storage in it, so it's OK to read directly
1919 into it. (We'll be overwriting above, after we've decoded it
1920 into the runoff.) */
1921 read_size = Lstream_read (str->other_end, data, size);
1928 /* There might be some more end data produced in the translation.
1929 See the comment above. */
1930 str->flags |= CODING_STATE_END;
1931 mule_decode (stream, data, str->runoff, read_size);
1934 if (data - orig_data == 0)
1935 return error_occurred ? -1 : 0;
1937 return data - orig_data;
1941 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1943 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1946 /* Decode all our data into the runoff, and then attempt to write
1947 it all out to the other end. Remove whatever chunk we succeeded
1949 mule_decode (stream, data, str->runoff, size);
1950 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1951 Dynarr_length (str->runoff));
1953 Dynarr_delete_many (str->runoff, 0, retval);
1954 /* Do NOT return retval. The return value indicates how much
1955 of the incoming data was written, not how many bytes were
1961 reset_decoding_stream (struct decoding_stream *str)
1964 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1966 Lisp_Object coding_system;
1967 XSETCODING_SYSTEM (coding_system, str->codesys);
1968 reset_iso2022 (coding_system, &str->iso2022);
1970 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1972 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1975 str->flags = str->ch = 0;
1979 decoding_rewinder (Lstream *stream)
1981 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1982 reset_decoding_stream (str);
1983 Dynarr_reset (str->runoff);
1984 return Lstream_rewind (str->other_end);
1988 decoding_seekable_p (Lstream *stream)
1990 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1991 return Lstream_seekable_p (str->other_end);
1995 decoding_flusher (Lstream *stream)
1997 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1998 return Lstream_flush (str->other_end);
2002 decoding_closer (Lstream *stream)
2004 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2005 if (stream->flags & LSTREAM_FL_WRITE)
2007 str->flags |= CODING_STATE_END;
2008 decoding_writer (stream, 0, 0);
2010 Dynarr_free (str->runoff);
2012 #ifdef ENABLE_COMPOSITE_CHARS
2013 if (str->iso2022.composite_chars)
2014 Dynarr_free (str->iso2022.composite_chars);
2017 return Lstream_close (str->other_end);
2021 decoding_stream_coding_system (Lstream *stream)
2023 Lisp_Object coding_system;
2024 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2026 XSETCODING_SYSTEM (coding_system, str->codesys);
2027 return subsidiary_coding_system (coding_system, str->eol_type);
2031 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2033 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2034 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2036 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2037 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2038 reset_decoding_stream (str);
2041 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2042 stream for writing, no automatic code detection will be performed.
2043 The reason for this is that automatic code detection requires a
2044 seekable input. Things will also fail if you open a decoding
2045 stream for reading using a non-fully-specified coding system and
2046 a non-seekable input stream. */
2049 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2052 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2053 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2057 str->other_end = stream;
2058 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2059 str->eol_type = EOL_AUTODETECT;
2060 if (!strcmp (mode, "r")
2061 && Lstream_seekable_p (stream))
2062 /* We can determine the coding system now. */
2063 determine_real_coding_system (stream, &codesys, &str->eol_type);
2064 set_decoding_stream_coding_system (lstr, codesys);
2065 str->decst.eol_type = str->eol_type;
2066 str->decst.mask = ~0;
2067 XSETLSTREAM (obj, lstr);
2072 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2074 return make_decoding_stream_1 (stream, codesys, "r");
2078 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2080 return make_decoding_stream_1 (stream, codesys, "w");
2083 /* Note: the decode_coding_* functions all take the same
2084 arguments as mule_decode(), which is to say some SRC data of
2085 size N, which is to be stored into dynamic array DST.
2086 DECODING is the stream within which the decoding is
2087 taking place, but no data is actually read from or
2088 written to that stream; that is handled in decoding_reader()
2089 or decoding_writer(). This allows the same functions to
2090 be used for both reading and writing. */
2093 mule_decode (Lstream *decoding, CONST unsigned char *src,
2094 unsigned_char_dynarr *dst, unsigned int n)
2096 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2098 /* If necessary, do encoding-detection now. We do this when
2099 we're a writing stream or a non-seekable reading stream,
2100 meaning that we can't just process the whole input,
2101 rewind, and start over. */
2103 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2104 str->eol_type == EOL_AUTODETECT)
2106 Lisp_Object codesys;
2108 XSETCODING_SYSTEM (codesys, str->codesys);
2109 detect_coding_type (&str->decst, src, n,
2110 CODING_SYSTEM_TYPE (str->codesys) !=
2111 CODESYS_AUTODETECT);
2112 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2113 str->decst.mask != ~0)
2114 /* #### This is cheesy. What we really ought to do is
2115 buffer up a certain amount of data so as to get a
2116 less random result. */
2117 codesys = coding_system_from_mask (str->decst.mask);
2118 str->eol_type = str->decst.eol_type;
2119 if (XCODING_SYSTEM (codesys) != str->codesys)
2121 /* Preserve the CODING_STATE_END flag in case it was set.
2122 If we erase it, bad things might happen. */
2123 int was_end = str->flags & CODING_STATE_END;
2124 set_decoding_stream_coding_system (decoding, codesys);
2126 str->flags |= CODING_STATE_END;
2130 switch (CODING_SYSTEM_TYPE (str->codesys))
2133 case CODESYS_INTERNAL:
2134 Dynarr_add_many (dst, src, n);
2137 case CODESYS_AUTODETECT:
2138 /* If we got this far and still haven't decided on the coding
2139 system, then do no conversion. */
2140 case CODESYS_NO_CONVERSION:
2141 decode_coding_no_conversion (decoding, src, dst, n);
2144 case CODESYS_SHIFT_JIS:
2145 decode_coding_sjis (decoding, src, dst, n);
2148 decode_coding_big5 (decoding, src, dst, n);
2151 decode_coding_ucs4 (decoding, src, dst, n);
2154 decode_coding_utf8 (decoding, src, dst, n);
2157 str->ccl.last_block = str->flags & CODING_STATE_END;
2158 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2160 case CODESYS_ISO2022:
2161 decode_coding_iso2022 (decoding, src, dst, n);
2169 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2170 Decode the text between START and END which is encoded in CODING-SYSTEM.
2171 This is useful if you've read in encoded text from a file without decoding
2172 it (e.g. you read in a JIS-formatted file but used the `binary' or
2173 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2174 Return length of decoded text.
2175 BUFFER defaults to the current buffer if unspecified.
2177 (start, end, coding_system, buffer))
2180 struct buffer *buf = decode_buffer (buffer, 0);
2181 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2182 Lstream *istr, *ostr;
2183 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2185 get_buffer_range_char (buf, start, end, &b, &e, 0);
2187 barf_if_buffer_read_only (buf, b, e);
2189 coding_system = Fget_coding_system (coding_system);
2190 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2191 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2192 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2194 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2195 Fget_coding_system (Qbinary));
2196 istr = XLSTREAM (instream);
2197 ostr = XLSTREAM (outstream);
2198 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2200 /* The chain of streams looks like this:
2202 [BUFFER] <----- send through
2203 ------> [ENCODE AS BINARY]
2204 ------> [DECODE AS SPECIFIED]
2210 char tempbuf[1024]; /* some random amount */
2211 Bufpos newpos, even_newer_pos;
2212 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2213 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2217 newpos = lisp_buffer_stream_startpos (istr);
2218 Lstream_write (ostr, tempbuf, size_in_bytes);
2219 even_newer_pos = lisp_buffer_stream_startpos (istr);
2220 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2223 Lstream_close (istr);
2224 Lstream_close (ostr);
2226 Lstream_delete (istr);
2227 Lstream_delete (ostr);
2228 Lstream_delete (XLSTREAM (de_outstream));
2229 Lstream_delete (XLSTREAM (lb_outstream));
2234 /************************************************************************/
2235 /* Converting to an external encoding ("encoding") */
2236 /************************************************************************/
2238 /* An encoding stream is an output stream. When you create the
2239 stream, you specify the coding system that governs the encoding
2240 and another stream that the resulting encoded data is to be
2241 sent to, and then start sending data to it. */
/* Access the encoding-specific data of an lstream; expands to
   LSTREAM_TYPE_DATA (stream, encoding). */
2243 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
/* State kept for each encoding lstream; reached through
   ENCODING_STREAM_DATA. */
2245 struct encoding_stream
2247 /* Coding system that governs the conversion. */
2248 Lisp_Coding_System *codesys;
2250 /* Stream that we read the unencoded data from or
2251 write the encoded data to. */
2254 /* If we are reading, then we can return only a fixed amount of
2255 data, so if the conversion resulted in too much data, we store it
2256 here for retrieval the next time around. */
2257 unsigned_char_dynarr *runoff;
2259 /* FLAGS holds flags indicating the current state of the encoding.
2260 Some of these flags are dependent on the coding system. */
2263 /* CH holds a partially built-up character. Since we only deal
2264 with one- and two-byte characters at the moment, we only use
2265 this to store the first byte of a two-byte character. */
2268 /* Additional information used by the ISO2022 encoder. */
2271 /* CHARSET holds the character sets currently assigned to the G0
2272 through G3 registers. It is initialized from the array
2273 INITIAL_CHARSET in CODESYS. */
2274 Lisp_Object charset[4];
2276 /* Which registers are currently invoked into the left (GL) and
2277 right (GR) halves of the 8-bit encoding space? */
2278 int register_left, register_right;
2280 /* Whether we need to explicitly designate the charset in the
2281 G? register before using it. It is initialized from the
2282 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2283 unsigned char force_charset_on_output[4];
2285 /* Other state variables that need to be preserved across
2287 Lisp_Object current_charset;
2289 int current_char_boundary;
2292 /* Additional information (the state of the running CCL program)
2293 used by the CCL encoder. */
2294 struct ccl_program ccl;
/* Forward declarations of the encoding lstream methods, followed by the
   definition of the "encoding" lstream implementation, whose per-stream
   data has the size of struct encoding_stream. */
2298 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2299 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2301 static int encoding_rewinder (Lstream *stream);
2302 static int encoding_seekable_p (Lstream *stream);
2303 static int encoding_flusher (Lstream *stream);
2304 static int encoding_closer (Lstream *stream);
2306 static Lisp_Object encoding_marker (Lisp_Object stream,
2307 void (*markobj) (Lisp_Object));
2309 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2310 sizeof (struct encoding_stream));
/* Marking method of an encoding stream: wrap the stream at the other end
   in a Lisp object and, when that stream's implementation has a marker
   of its own, hand the marking over to it. */
2313 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2315 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2316 Lisp_Object str_obj;
2318 /* We do not need to mark the coding systems or charsets stored
2319 within the stream because they are stored in a global list
2320 and automatically marked. */
2322 XSETLSTREAM (str_obj, str);
2324 if (str->imp->marker)
2325 return (str->imp->marker) (str_obj, markobj);
2330 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2331 so we read data from the other end, encode it, and store it into DATA.
Returns the number of bytes stored into DATA; when nothing was stored,
returns -1 if an error occurred and 0 otherwise. */
2334 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2336 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2337 unsigned char *orig_data = data;
2339 int error_occurred = 0;
2341 /* We need to interface to mule_encode(), which expects to take some
2342 amount of data and store the result into a Dynarr. We have
2343 mule_encode() store into str->runoff, and take data from there
2346 /* We loop until we have enough data, reading chunks from the other
2347 end and encoding it. */
2350 /* Take data from the runoff if we can. Make sure to take at
2351 most SIZE bytes, and delete the data from the runoff. */
2352 if (Dynarr_length (str->runoff) > 0)
2354 int chunk = min ((int) size, Dynarr_length (str->runoff));
2355 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2356 Dynarr_delete_many (str->runoff, 0, chunk);
2362 break; /* No more room for data */
2364 if (str->flags & CODING_STATE_END)
2365 /* This means that on the previous iteration, we hit the EOF on
2366 the other end. We loop once more so that mule_encode() can
2367 output any final stuff it may be holding, or any "go back
2368 to a sane state" escape sequences. (This latter makes sense
2369 during encoding.) */
2372 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2373 left of storage in it, so it's OK to read directly into it.
2374 (We'll be overwriting above, after we've encoded it into the
2376 read_size = Lstream_read (str->other_end, data, size);
2383 /* There might be some more end data produced in the translation.
2384 See the comment above. */
2385 str->flags |= CODING_STATE_END;
2386 mule_encode (stream, data, str->runoff, read_size);
2389 if (data == orig_data)
2390 return error_occurred ? -1 : 0;
2392 return data - orig_data;
/* Write SIZE bytes of DATA to an encoding stream: the data is encoded
   into the runoff buffer, the runoff is offered to the stream at the
   other end, and the part that stream accepted is removed from the
   runoff. */
2396 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2398 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2401 /* Encode all our data into the runoff, and then attempt to write
2402 it all out to the other end. Remove whatever chunk we succeeded
2404 mule_encode (stream, data, str->runoff, size);
2405 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2406 Dynarr_length (str->runoff));
2408 Dynarr_delete_many (str->runoff, 0, retval);
2409 /* Do NOT return retval. The return value indicates how much
2410 of the incoming data was written, not how many bytes were
/* Put the conversion state of STR back to its initial value, depending
   on the type of STR's coding system.  For ISO2022 the four charset
   registers and force-designation flags are reloaded from the coding
   system, register_left/register_right are set to 0/1, and the current
   charset, half and char-boundary state are reinitialized.  The CCL
   program is set up from the coding system's encode program.  FLAGS
   and CH are cleared at the end. */
2416 reset_encoding_stream (struct encoding_stream *str)
2419 switch (CODING_SYSTEM_TYPE (str->codesys))
2421 case CODESYS_ISO2022:
2425 for (i = 0; i < 4; i++)
2427 str->iso2022.charset[i] =
2428 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2429 str->iso2022.force_charset_on_output[i] =
2430 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2432 str->iso2022.register_left = 0;
2433 str->iso2022.register_right = 1;
2434 str->iso2022.current_charset = Qnil;
2435 str->iso2022.current_half = 0;
2436 str->iso2022.current_char_boundary = 1;
2440 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2447 str->flags = str->ch = 0;
/* Rewind an encoding stream: reset the conversion state, empty the
   runoff buffer, and rewind the stream at the other end, whose result
   is returned. */
2451 encoding_rewinder (Lstream *stream)
2453 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2454 reset_encoding_stream (str);
2455 Dynarr_reset (str->runoff);
2456 return Lstream_rewind (str->other_end);
/* An encoding stream is seekable exactly when the stream at the other
   end is. */
2460 encoding_seekable_p (Lstream *stream)
2462 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2463 return Lstream_seekable_p (str->other_end);
/* Flushing an encoding stream flushes the stream at the other end and
   returns that flush's result. */
2467 encoding_flusher (Lstream *stream)
2469 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2470 return Lstream_flush (str->other_end);
/* Close an encoding stream.  For a stream opened for writing, set
   CODING_STATE_END and run the writer once more with no input
   (presumably so that the encoder can emit any trailing output --
   compare the end-of-data handling in encoding_reader).  Then free the
   runoff buffer and close the stream at the other end, returning the
   result of that close. */
2474 encoding_closer (Lstream *stream)
2476 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2477 if (stream->flags & LSTREAM_FL_WRITE)
2479 str->flags |= CODING_STATE_END;
2480 encoding_writer (stream, 0, 0);
2482 Dynarr_free (str->runoff);
2483 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system that governs encoding
   stream STREAM. */
2487 encoding_stream_coding_system (Lstream *stream)
2489 Lisp_Object coding_system;
2490 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2492 XSETCODING_SYSTEM (coding_system, str->codesys);
2493 return coding_system;
/* Switch encoding stream LSTR over to coding system CODESYS (which must
   already be a coding-system object, since XCODING_SYSTEM is applied to
   it directly); the conversion state of the stream is reset. */
2497 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2499 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2500 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2502 reset_encoding_stream (str);
/* Common constructor behind make_encoding_input_stream and
   make_encoding_output_stream: create a new encoding lstream opened
   with MODE, give it an empty runoff buffer, attach STREAM as its
   other end, install CODESYS, and wrap the new lstream in a Lisp
   object. */
2506 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2509 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2510 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2514 str->runoff = Dynarr_new (unsigned_char);
2515 str->other_end = stream;
2516 set_encoding_stream_coding_system (lstr, codesys);
2517 XSETLSTREAM (obj, lstr);
/* Create an encoding stream opened for reading ("r") on top of STREAM,
   governed by CODESYS. */
2522 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2524 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream opened for writing ("w") on top of STREAM,
   governed by CODESYS. */
2528 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2530 return make_encoding_stream_1 (stream, codesys, "w");
2533 /* Convert N bytes of internally-formatted data stored in SRC to an
2534 external format, according to the encoding stream ENCODING.
2535 Store the encoded data into DST.
The work is dispatched on the type of the stream's coding system;
internal data is copied to DST unchanged, and for CCL the stream's
CODING_STATE_END flag is handed to the CCL program as last_block. */
2538 mule_encode (Lstream *encoding, CONST unsigned char *src,
2539 unsigned_char_dynarr *dst, unsigned int n)
2541 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2543 switch (CODING_SYSTEM_TYPE (str->codesys))
2546 case CODESYS_INTERNAL:
2547 Dynarr_add_many (dst, src, n);
2550 case CODESYS_AUTODETECT:
2551 /* If we got this far and still haven't decided on the coding
2552 system, then do no conversion. */
2553 case CODESYS_NO_CONVERSION:
2554 encode_coding_no_conversion (encoding, src, dst, n);
2557 case CODESYS_SHIFT_JIS:
2558 encode_coding_sjis (encoding, src, dst, n);
2561 encode_coding_big5 (encoding, src, dst, n);
2564 encode_coding_ucs4 (encoding, src, dst, n);
2567 encode_coding_utf8 (encoding, src, dst, n);
2570 str->ccl.last_block = str->flags & CODING_STATE_END;
2571 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2573 case CODESYS_ISO2022:
2574 encode_coding_iso2022 (encoding, src, dst, n);
/* Lisp primitive `encode-coding-region'.  The region is read in chunks
   of up to 1024 bytes through a buffer input stream and written to an
   encoding stream for CODING-SYSTEM, which feeds a decoding stream for
   `binary', which in turn feeds a buffer output stream positioned at
   the start of the region; source text is removed from the buffer with
   buffer_delete_range as the chunks are processed.  All four streams
   are GC-protected while in use and deleted afterwards. */
2582 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2583 Encode the text between START and END using CODING-SYSTEM.
2584 This will, for example, convert Japanese characters into stuff such as
2585 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2586 text. BUFFER defaults to the current buffer if unspecified.
2588 (start, end, coding_system, buffer))
2591 struct buffer *buf = decode_buffer (buffer, 0);
2592 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2593 Lstream *istr, *ostr;
2594 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2596 get_buffer_range_char (buf, start, end, &b, &e, 0);
2598 barf_if_buffer_read_only (buf, b, e);
2600 coding_system = Fget_coding_system (coding_system);
2601 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2602 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2603 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2604 Fget_coding_system (Qbinary));
2605 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2607 istr = XLSTREAM (instream);
2608 ostr = XLSTREAM (outstream);
2609 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2610 /* The chain of streams looks like this:
2612 [BUFFER] <----- send through
2613 ------> [ENCODE AS SPECIFIED]
2614 ------> [DECODE AS BINARY]
2619 char tempbuf[1024]; /* some random amount */
2620 Bufpos newpos, even_newer_pos;
2621 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2622 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2626 newpos = lisp_buffer_stream_startpos (istr);
2627 Lstream_write (ostr, tempbuf, size_in_bytes);
2628 even_newer_pos = lisp_buffer_stream_startpos (istr);
2629 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2635 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2636 Lstream_close (istr);
2637 Lstream_close (ostr);
2639 Lstream_delete (istr);
2640 Lstream_delete (ostr);
2641 Lstream_delete (XLSTREAM (de_outstream));
2642 Lstream_delete (XLSTREAM (lb_outstream));
2643 return make_int (retlen);
2649 /************************************************************************/
2650 /* Shift-JIS methods */
2651 /************************************************************************/
2653 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2654 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2655 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2656 encoded by "position-code + 0x80". A character of JISX0208
2657 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
2658 position-codes are divided and shifted so that they fit in the range
2661 --- CODE RANGE of Shift-JIS ---
2662 (character set) (range)
2664 JISX0201-Kana 0xA0 .. 0xDF
2665 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2666 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2667 -------------------------------
2671 /* Is this the first byte of a Shift-JIS two-byte char? */
2673 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2674 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2676 /* Is this the second byte of a Shift-JIS two-byte char? */
2678 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2679 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a byte in the range 0xA1 .. 0xDF, which the Shift-JIS decoder
   treats as a single-byte JISX0201-Kana character? */
2681 #define BYTE_SJIS_KATAKANA_P(c) \
2682 ((c) >= 0xA1 && (c) <= 0xDF)
/* Shift-JIS detector.  ESC, SI and SO are tested for explicitly
   (NOTE(review): presumably they rule Shift-JIS out -- confirm).
   st->shift_jis.in_second_byte records across calls that the previous
   byte, one in 0x80 .. 0x9F or >= 0xE0, opened a two-byte character.
   Returns CODING_CATEGORY_SHIFT_JIS_MASK once the scan is finished. */
2685 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2693 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2695 if (st->shift_jis.in_second_byte)
2697 st->shift_jis.in_second_byte = 0;
2701 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2702 st->shift_jis.in_second_byte = 1;
2704 return CODING_CATEGORY_SHIFT_JIS_MASK;
2707 /* Convert Shift-JIS data to internal format.
A valid two-byte sequence is written to DST as
LEADING_BYTE_JAPANESE_JISX0208 followed by the two bytes computed by
DECODE_SJIS; when the second byte is invalid, both bytes are written
with DECODE_ADD_BINARY_CHAR.  A katakana byte is written as
LEADING_BYTE_KATAKANA_JISX0201 followed by the byte itself, and other
bytes go through DECODE_ADD_BINARY_CHAR after end-of-line handling. */
2710 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2711 unsigned_char_dynarr *dst, unsigned int n)
2714 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2715 unsigned int flags = str->flags;
2716 unsigned int ch = str->ch;
2717 eol_type_t eol_type = str->eol_type;
2725 /* Previous character was first byte of Shift-JIS Kanji char. */
2726 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2728 unsigned char e1, e2;
2730 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2731 DECODE_SJIS (ch, c, e1, e2);
2732 Dynarr_add (dst, e1);
2733 Dynarr_add (dst, e2);
2737 DECODE_ADD_BINARY_CHAR (ch, dst);
2738 DECODE_ADD_BINARY_CHAR (c, dst);
2744 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2745 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2747 else if (BYTE_SJIS_KATAKANA_P (c))
2749 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2750 Dynarr_add (dst, c);
2753 DECODE_ADD_BINARY_CHAR (c, dst);
2755 label_continue_loop:;
2758 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2764 /* Convert internally-formatted data to Shift-JIS.
Line ends are written as CR, LF or CRLF according to the coding
system's EOL type and ASCII bytes are copied to DST.  A leading byte
is remembered in CH only when it is that of JISX0201 katakana or of
one of the two JISX0208 charsets; any other leading byte sets CH to 0.
A katakana position byte is written as is, and a JISX0208 character
is written as the two bytes computed by ENCODE_SJIS. */
2767 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2768 unsigned_char_dynarr *dst, unsigned int n)
2771 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2772 unsigned int flags = str->flags;
2773 unsigned int ch = str->ch;
2774 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2781 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2782 Dynarr_add (dst, '\r');
2783 if (eol_type != EOL_CR)
2784 Dynarr_add (dst, '\n');
2787 else if (BYTE_ASCII_P (c))
2789 Dynarr_add (dst, c);
2792 else if (BUFBYTE_LEADING_BYTE_P (c))
2793 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2794 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2795 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2798 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2800 Dynarr_add (dst, c);
2803 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2804 ch == LEADING_BYTE_JAPANESE_JISX0208)
2808 unsigned char j1, j2;
2809 ENCODE_SJIS (ch, c, j1, j2);
2810 Dynarr_add (dst, j1);
2811 Dynarr_add (dst, j2);
/* Lisp primitive `decode-shift-jis-char'.  Both halves of the cons CODE
   must be integers.  When they satisfy BYTE_SJIS_TWO_BYTE_1_P and
   BYTE_SJIS_TWO_BYTE_2_P respectively, they are converted with
   DECODE_SJIS and the matching japanese-jisx0208 character is
   returned. */
2821 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2822 Decode a JISX0208 character of Shift-JIS coding-system.
2823 CODE is the character code in Shift-JIS as a cons of two bytes.
2824 Return the corresponding character.
2828 unsigned char c1, c2, s1, s2;
2831 CHECK_INT (XCAR (code));
2832 CHECK_INT (XCDR (code));
2833 s1 = XINT (XCAR (code));
2834 s2 = XINT (XCDR (code));
2835 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2836 BYTE_SJIS_TWO_BYTE_2_P (s2))
2838 DECODE_SJIS (s1, s2, c1, c2);
2839 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2840 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-shift-jis-char'.  The character is split into
   its charset and position codes; when the charset is
   japanese-jisx0208, the position codes (with the high bit set) are
   converted with ENCODE_SJIS and returned as a cons of two integers. */
2846 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2847 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2848 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2852 Lisp_Object charset;
2855 CHECK_CHAR_COERCE_INT (ch);
2856 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2857 if (EQ (charset, Vcharset_japanese_jisx0208))
2859 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2860 return Fcons (make_int (s1), make_int (s2));
2867 /************************************************************************/
2869 /************************************************************************/
2871 /* BIG5 is a coding system encoding two character sets: ASCII and
2872 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2873 character set and is encoded in two bytes.
2875 --- CODE RANGE of BIG5 ---
2876 (character set) (range)
2878 Big5 (1st byte) 0xA1 .. 0xFE
2879 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2880 --------------------------
2882 Since the number of characters in Big5 is larger than maximum
2883 characters in Emacs' charset (96x96), it can't be handled as one
2884 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2885 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2886 contains frequently used characters and the latter contains less
2887 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char? */
2889 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2890 ((c) >= 0xA1 && (c) <= 0xFE)
2892 /* Is this the second byte of a Big5 two-byte char? */
2894 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2895 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2897 /* Number of Big5 characters which have the same code in 1st byte. */
2899 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2901 /* Code conversion macros. These are macros because they are used in
2902 inner loops during code conversion.
2904 Note that temporary variables in macros introduce the classic
2905 dynamic-scoping problems with variable names. We use capital-
2906 lettered variables in the assumption that XEmacs does not use
2907 capital letters in variables except in a very formalized way
2910 /* Convert Big5 code (b1, b2) into its internal string representation
2913 /* There is a much simpler way to split the Big5 charset into two.
2914 For the moment I'm going to leave the algorithm as-is because it
2915 claims to separate out the most-used characters into a single
2916 charset, which perhaps will lead to optimizations in various
2919 The way the algorithm works is something like this:
2921 Big5 can be viewed as a 94x157 charset, where the row is
2922 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2923 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2924 the split between low and high column numbers is apparently
2925 meaningless; ascending rows produce less and less frequent chars.
2926 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2927 the first charset, and the upper half (0xC9 .. 0xFE) to the
2928 second. To do the conversion, we convert the character into
2929 a single number where 0 .. 156 is the first row, 157 .. 313
2930 is the second, etc. That way, the characters are ordered by
2931 decreasing frequency. Then we just chop the space in two
2932 and coerce the result into a 94x94 space.
/* DECODE_BIG5 (b1, b2, lb, c1, c2): turn the Big5 byte pair (b1, b2)
   into a linear index (row * BIG5_SAME_ROW + column), select
   LEADING_BYTE_CHINESE_BIG5_1 or LEADING_BYTE_CHINESE_BIG5_2 for LB
   (for the latter the index is rebased by the rows 0xA1 .. 0xC8), and
   split the index into the two position bytes C1 and C2, each offset
   by 0xA1. */
2935 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2937 int B1 = b1, B2 = b2; \
2939 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2943 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2947 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2948 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2950 c1 = I / (0xFF - 0xA1) + 0xA1; \
2951 c2 = I % (0xFF - 0xA1) + 0xA1; \
2954 /* Convert the internal string representation of a Big5 character
2955 (lb, c1, c2) into Big5 code (b1, b2).
The position bytes are combined into a linear index, which is shifted
up by the rows 0xA1 .. 0xC8 when LB is LEADING_BYTE_CHINESE_BIG5_2;
the index is then split into a row byte B1 and a column byte B2, the
column being mapped into 0x40 .. 0x7E or 0xA1 .. 0xFE. */
2957 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2959 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2961 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2963 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2965 b1 = I / BIG5_SAME_ROW + 0xA1; \
2966 b2 = I % BIG5_SAME_ROW; \
2967 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Big5 detector.  ESC, SI, SO and bytes in 0x80 .. 0xA0 are tested for
   explicitly (NOTE(review): presumably they rule Big5 out -- confirm).
   st->big5.in_second_byte records across calls that the previous byte
   opened a two-byte character; the second byte is then checked against
   < 0x40 and 0x80 .. 0xA0.  Returns CODING_CATEGORY_BIG5_MASK once the
   scan is finished. */
2971 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2979 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2980 (c >= 0x80 && c <= 0xA0))
2982 if (st->big5.in_second_byte)
2984 st->big5.in_second_byte = 0;
2985 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2989 st->big5.in_second_byte = 1;
2991 return CODING_CATEGORY_BIG5_MASK;
2994 /* Convert Big5 data to internal format.
A valid two-byte sequence is written to DST as the three bytes
computed by DECODE_BIG5 (leading byte and two position bytes); when
the second byte is invalid, both bytes are written with
DECODE_ADD_BINARY_CHAR.  Bytes that do not start a Big5 character go
through DECODE_ADD_BINARY_CHAR after end-of-line handling. */
2997 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2998 unsigned_char_dynarr *dst, unsigned int n)
3001 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3002 unsigned int flags = str->flags;
3003 unsigned int ch = str->ch;
3004 eol_type_t eol_type = str->eol_type;
3011 /* Previous character was first byte of Big5 char. */
3012 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3014 unsigned char b1, b2, b3;
3015 DECODE_BIG5 (ch, c, b1, b2, b3);
3016 Dynarr_add (dst, b1);
3017 Dynarr_add (dst, b2);
3018 Dynarr_add (dst, b3);
3022 DECODE_ADD_BINARY_CHAR (ch, dst);
3023 DECODE_ADD_BINARY_CHAR (c, dst);
3029 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3030 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3033 DECODE_ADD_BINARY_CHAR (c, dst);
3035 label_continue_loop:;
3038 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3044 /* Convert internally-formatted data to Big5.
Line ends are written as CR, LF or CRLF according to the coding
system's EOL type and ASCII bytes are copied to DST.  Only the leading
bytes of the two Big5 charsets are recognized; other characters are
ignored.  By the time the second position byte arrives, CH carries
the leading byte in its upper part and the first position byte in its
low eight bits, and the character is written as the two bytes
computed by ENCODE_BIG5. */
3047 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3048 unsigned_char_dynarr *dst, unsigned int n)
3051 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3052 unsigned int flags = str->flags;
3053 unsigned int ch = str->ch;
3054 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3061 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3062 Dynarr_add (dst, '\r');
3063 if (eol_type != EOL_CR)
3064 Dynarr_add (dst, '\n');
3066 else if (BYTE_ASCII_P (c))
3069 Dynarr_add (dst, c);
3071 else if (BUFBYTE_LEADING_BYTE_P (c))
3073 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3074 c == LEADING_BYTE_CHINESE_BIG5_2)
3076 /* A recognized leading byte. */
3078 continue; /* not done with this character. */
3080 /* otherwise just ignore this character. */
3082 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3083 ch == LEADING_BYTE_CHINESE_BIG5_2)
3085 /* Previous char was a recognized leading byte. */
3087 continue; /* not done with this character. */
3091 /* Encountering second byte of a Big5 character. */
3092 unsigned char b1, b2;
3094 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3095 Dynarr_add (dst, b1);
3096 Dynarr_add (dst, b2);
/* Lisp primitive `decode-big5-char'.  Both halves of the cons CODE must
   be integers.  When they satisfy BYTE_BIG5_TWO_BYTE_1_P and
   BYTE_BIG5_TWO_BYTE_2_P respectively, they are converted with
   DECODE_BIG5 and the character of the charset that belongs to the
   resulting leading byte is returned. */
3107 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3108 Decode a Big5 character CODE of BIG5 coding-system.
3109 CODE is the character code in BIG5, a cons of two integers.
3110 Return the corresponding character.
3114 unsigned char c1, c2, b1, b2;
3117 CHECK_INT (XCAR (code));
3118 CHECK_INT (XCDR (code));
3119 b1 = XINT (XCAR (code));
3120 b2 = XINT (XCDR (code));
3121 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3122 BYTE_BIG5_TWO_BYTE_2_P (b2))
3125 Lisp_Object charset;
3126 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3127 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3128 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-big5-char'.  The character is split into its
   charset and position codes; when the charset is chinese-big5-1 or
   chinese-big5-2, the charset's leading byte and the position codes
   (with the high bit set) are converted with ENCODE_BIG5 and the
   result is returned as a cons of two integers. */
3134 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3135 Encode the Big5 character CH to BIG5 coding-system.
3136 Return the corresponding character code in Big5.
3140 Lisp_Object charset;
3143 CHECK_CHAR_COERCE_INT (ch);
3144 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3145 if (EQ (charset, Vcharset_chinese_big5_1) ||
3146 EQ (charset, Vcharset_chinese_big5_2))
3148 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3150 return Fcons (make_int (b1), make_int (b2));
3157 /************************************************************************/
3160 /* UCS-4 character codes are implemented as nonnegative integers. */
3162 /************************************************************************/
/* Table indexed by UCS code (65536 entries) giving the corresponding
   Mule character; filled in by `set-ucs-char' and consulted by
   ucs_to_char. */
3164 Lisp_Object ucs_to_mule_table[65536];
/* Char table mapping a Mule character to its UCS code; written by
   `set-char-ucs' and read by `char-ucs'. */
3165 Lisp_Object mule_to_ucs_table;
/* Lisp primitive `set-ucs-char': record CHARACTER as the Mule character
   for UCS code CODE in ucs_to_mule_table.  The index is checked against
   the number of ELEMENTS of the table.  Comparing against
   sizeof (ucs_to_mule_table), which is a byte count several times
   larger than the element count, accepted codes beyond the last entry
   and stored past the end of the array. */
3167 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3168 Map UCS-4 code CODE to Mule character CHARACTER.
3170 Return T on success, NIL on failure.
3176 CHECK_CHAR (character);
3180 if (c < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
3182 ucs_to_mule_table[c] = character;
/* Return the Mule character for UCS code CODE.  Codes that index
   ucs_to_mule_table are looked up there; the bound is the number of
   ELEMENTS of the table, because sizeof (ucs_to_mule_table) alone is a
   byte count and let codes beyond the last entry read past the end of
   the array.  Codes in the range starting at 0xe00000 are mapped
   arithmetically onto a 94x94 charset selected by its final byte
   (code / (94 * 94) + '@'), with position codes c / 94 + 33 and
   c % 94 + 33. */
3190 ucs_to_char (unsigned long code)
3192 if (code < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
3194 return ucs_to_mule_table[code];
3196 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3201 c = code % (94 * 94);
3203 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3204 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3205 CHARSET_LEFT_TO_RIGHT),
3206 c / 94 + 33, c % 94 + 33));
/* Lisp primitive `ucs-char': check that CODE is a non-negative integer
   and return the result of ucs_to_char for it. */
3212 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3213 Return Mule character corresponding to UCS code CODE (a positive integer).
3217 CHECK_NATNUM (code);
3218 return ucs_to_char (XINT (code));
/* Lisp primitive `set-char-ucs': after checking that CHARACTER is a
   character and CODE a non-negative integer, store CODE for CHARACTER
   in mule_to_ucs_table and return the result of Fput_char_table. */
3221 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3222 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3226 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3227 Fset_char_ucs is more restrictive on index arg, but should
3228 check code arg in a char_table method. */
3229 CHECK_CHAR (character);
3230 CHECK_NATNUM (code);
3231 return Fput_char_table (character, code, mule_to_ucs_table);
/* Lisp primitive `char-ucs': look CHARACTER up in mule_to_ucs_table
   and return what Fget_char_table yields. */
3234 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3235 Return the UCS code (a positive integer) corresponding to CHARACTER.
3239 return Fget_char_table (character, mule_to_ucs_table);
3242 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3243 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3244 is not found, instead.
3245 #### do something more appropriate (use blob?)
3246 Danger, Will Robinson! Data loss. Should we signal user?
The lookup is done with ucs_to_char.  A character that is found is
converted to its internal byte sequence in a MAX_EMCHAR_LEN work
buffer and appended to DST; the fallback appends
LEADING_BYTE_JAPANESE_JISX0208 followed by the bytes 34 + 128 and
46 + 128. */
3248 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3250 Lisp_Object chr = ucs_to_char (ch);
3254 Bufbyte work[MAX_EMCHAR_LEN];
3259 simple_set_charptr_emchar (work, ch) :
3260 non_ascii_set_charptr_emchar (work, ch);
3261 Dynarr_add_many (dst, work, len);
3265 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3266 Dynarr_add (dst, 34 + 128);
3267 Dynarr_add (dst, 46 + 128);
/* Return the UCS-4 code of the Mule character given by CHARSET and the
   position bytes H and L, of which only the low seven bits are used.
   A char-table lookup of the character comes first.  For a
   two-dimensional charset of 94 characters per dimension whose final
   byte lies in '@' .. 0x7e, the code is otherwise computed as
   0xe00000 + (final - '@') * 94 * 94 + (h - 33) * 94 + (l - 33). */
3271 static unsigned long
3272 mule_char_to_ucs4 (Lisp_Object charset,
3273 unsigned char h, unsigned char l)
3276 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3283 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3284 (XCHARSET_CHARS (charset) == 94) )
3286 unsigned char final = XCHARSET_FINAL (charset);
3288 if ( ('@' <= final) && (final < 0x7f) )
3290 return 0xe00000 + (final - '@') * 94 * 94
3291 + ((h & 127) - 33) * 94 + (l & 127) - 33;
/* Append to DST the UCS-4 code of the character (CHARSET, H, L), as
   obtained from mule_char_to_ucs4, as four bytes with the most
   significant byte first. */
3305 encode_ucs4 (Lisp_Object charset,
3306 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3308 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3309 Dynarr_add (dst, code >> 24);
3310 Dynarr_add (dst, (code >> 16) & 255);
3311 Dynarr_add (dst, (code >> 8) & 255);
3312 Dynarr_add (dst, code & 255);
/* UCS-4 detector.  The state st->ucs4.in_byte is switched on and reset
   to 0 during the scan (NOTE(review): presumably it counts the position
   inside the current four-byte unit -- confirm).  Returns
   CODING_CATEGORY_UCS4_MASK once the scan is finished. */
3316 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3322 switch (st->ucs4.in_byte)
3331 st->ucs4.in_byte = 0;
3337 return CODING_CATEGORY_UCS4_MASK;
/* Convert UCS-4 data to internal format.  Incoming bytes are
   accumulated into CH most significant byte first (ch << 8 | c); a
   completed code is handed to decode_ucs4, which appends the character
   to DST.  When CODING_STATE_END is set, a partially accumulated
   character is put out with DECODE_OUTPUT_PARTIAL_CHAR. */
3341 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3342 unsigned_char_dynarr *dst, unsigned int n)
3344 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3345 unsigned int flags = str->flags;
3346 unsigned int ch = str->ch;
3350 unsigned char c = *src++;
3358 decode_ucs4 ( ( ch << 8 ) | c, dst);
3363 ch = ( ch << 8 ) | c;
3367 if (flags & CODING_STATE_END)
3368 DECODE_OUTPUT_PARTIAL_CHAR (ch);
/* Convert internally-formatted data to UCS-4.  ASCII bytes and control-1
   characters are passed to encode_ucs4 directly.  A leading byte
   selects the charset with CHARSET_BY_LEADING_BYTE; the following
   position bytes are passed to encode_ucs4 depending on
   XCHARSET_REP_BYTES and on whether the charset is private.  With
   ENABLE_COMPOSITE_CHARS, a composite character is expanded by
   temporarily redirecting SRC and N to the string returned by
   composite_char_string.  The current charset and char-boundary state
   are loaded from and stored back into the iso2022 fields of the
   stream, so that they survive across calls. */
3375 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3376 unsigned_char_dynarr *dst, unsigned int n)
3378 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3379 unsigned int flags = str->flags;
3380 unsigned int ch = str->ch;
3381 unsigned char char_boundary = str->iso2022.current_char_boundary;
3382 Lisp_Object charset = str->iso2022.current_charset;
3384 #ifdef ENABLE_COMPOSITE_CHARS
3385 /* flags for handling composite chars. We do a little switcharoo
3386 on the source while we're outputting the composite char. */
3387 unsigned int saved_n = 0;
3388 CONST unsigned char *saved_src = NULL;
3389 int in_composite = 0;
3396 unsigned char c = *src++;
3398 if (BYTE_ASCII_P (c))
3399 { /* Processing ASCII character */
3401 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3404 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3405 { /* Processing Leading Byte */
3407 charset = CHARSET_BY_LEADING_BYTE (c);
3408 if (LEADING_BYTE_PREFIX_P(c))
3413 { /* Processing Non-ASCII character */
3415 if (EQ (charset, Vcharset_control_1))
3417 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3421 switch (XCHARSET_REP_BYTES (charset))
3424 encode_ucs4 (charset, c, 0, dst);
3427 if (XCHARSET_PRIVATE_P (charset))
3429 encode_ucs4 (charset, c, 0, dst);
3434 #ifdef ENABLE_COMPOSITE_CHARS
3435 if (EQ (charset, Vcharset_composite))
3439 /* #### Bother! We don't know how to
3441 Dynarr_add (dst, 0);
3442 Dynarr_add (dst, 0);
3443 Dynarr_add (dst, 0);
3444 Dynarr_add (dst, '~');
3448 Emchar emch = MAKE_CHAR (Vcharset_composite,
3449 ch & 0x7F, c & 0x7F);
3450 Lisp_Object lstr = composite_char_string (emch);
3454 src = XSTRING_DATA (lstr);
3455 n = XSTRING_LENGTH (lstr);
3459 #endif /* ENABLE_COMPOSITE_CHARS */
3461 encode_ucs4(charset, ch, c, dst);
3474 encode_ucs4 (charset, ch, c, dst);
3490 #ifdef ENABLE_COMPOSITE_CHARS
3496 goto back_to_square_n; /* Wheeeeeeeee ..... */
3498 #endif /* ENABLE_COMPOSITE_CHARS */
3502 str->iso2022.current_char_boundary = char_boundary;
3503 str->iso2022.current_charset = charset;
3505 /* Verbum caro factum est! */
3509 /************************************************************************/
3511 /************************************************************************/
/* UTF-8 detector.  ESC, SI and SO are tested for explicitly
   (NOTE(review): presumably they rule UTF-8 out -- confirm).
   st->utf8.in_byte is set to a value from 5 down to 1 (apparently the
   number of continuation bytes still expected after a lead byte), and
   a byte following a lead byte is checked for the form 10xxxxxx with
   (c & 0xc0) != 0x80.  Returns CODING_CATEGORY_UTF8_MASK once the scan
   is finished. */
3514 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3519 unsigned char c = *src++;
3520 switch (st->utf8.in_byte)
3523 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3526 st->utf8.in_byte = 5;
3528 st->utf8.in_byte = 4;
3530 st->utf8.in_byte = 3;
3532 st->utf8.in_byte = 2;
3534 st->utf8.in_byte = 1;
3539 if ((c & 0xc0) != 0x80)
3545 return CODING_CATEGORY_UTF8_MASK;
/* Convert UTF-8 data to internal format.  The first byte of a sequence
   is classified by magnitude (branches for >= 0xf8, 0xf0, 0xe0 and
   0xc0 are distinguished); a byte that starts no multibyte sequence
   goes through end-of-line handling and is passed to decode_ucs4.
   Each following byte contributes its low six bits to CH
   (ch << 6 | c & 0x3f), and the completed code is passed to
   decode_ucs4.  When CODING_STATE_END is set, a partially accumulated
   character is put out with DECODE_OUTPUT_PARTIAL_CHAR. */
3549 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3550 unsigned_char_dynarr *dst, unsigned int n)
3552 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3553 unsigned int flags = str->flags;
3554 unsigned int ch = str->ch;
3555 eol_type_t eol_type = str->eol_type;
3559 unsigned char c = *src++;
3568 else if ( c >= 0xf8 )
3573 else if ( c >= 0xf0 )
3578 else if ( c >= 0xe0 )
3583 else if ( c >= 0xc0 )
3590 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3591 decode_ucs4 (c, dst);
3595 ch = ( ch << 6 ) | ( c & 0x3f );
3596 decode_ucs4 (ch, dst);
3601 ch = ( ch << 6 ) | ( c & 0x3f );
3604 label_continue_loop:;
3607 if (flags & CODING_STATE_END)
3608 DECODE_OUTPUT_PARTIAL_CHAR (ch);
/* Append to DST the UTF-8 form of the character (CHARSET, H, L).  The
   UCS-4 code obtained from mule_char_to_ucs4 is written as a single
   byte for the smallest codes, as two bytes up to 0x7ff, three up to
   0xffff, four up to 0x1fffff, five up to 0x3ffffff, and six bytes
   otherwise; every byte after the first carries six bits of the code
   in the form 10xxxxxx. */
3615 encode_utf8 (Lisp_Object charset,
3616 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3618 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3621 Dynarr_add (dst, code);
3623 else if ( code <= 0x7ff )
3625 Dynarr_add (dst, (code >> 6) | 0xc0);
3626 Dynarr_add (dst, (code & 0x3f) | 0x80);
3628 else if ( code <= 0xffff )
3630 Dynarr_add (dst, (code >> 12) | 0xe0);
3631 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3632 Dynarr_add (dst, (code & 0x3f) | 0x80);
3634 else if ( code <= 0x1fffff )
3636 Dynarr_add (dst, (code >> 18) | 0xf0);
3637 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3638 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3639 Dynarr_add (dst, (code & 0x3f) | 0x80);
3641 else if ( code <= 0x3ffffff )
3643 Dynarr_add (dst, (code >> 24) | 0xf8);
3644 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3645 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3646 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3647 Dynarr_add (dst, (code & 0x3f) | 0x80);
3651 Dynarr_add (dst, (code >> 30) | 0xfc);
3652 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3653 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3654 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3655 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3656 Dynarr_add (dst, (code & 0x3f) | 0x80);
3661 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3662 unsigned_char_dynarr *dst, unsigned int n)
3664 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3665 unsigned int flags = str->flags;
3666 unsigned int ch = str->ch;
3667 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3668 unsigned char char_boundary = str->iso2022.current_char_boundary;
3669 Lisp_Object charset = str->iso2022.current_charset;
3671 #ifdef ENABLE_COMPOSITE_CHARS
3672 /* flags for handling composite chars. We do a little switcharoo
3673 on the source while we're outputting the composite char. */
3674 unsigned int saved_n = 0;
3675 CONST unsigned char *saved_src = NULL;
3676 int in_composite = 0;
3679 #endif /* ENABLE_COMPOSITE_CHARS */
3683 unsigned char c = *src++;
3685 if (BYTE_ASCII_P (c))
3686 { /* Processing ASCII character */
3690 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3691 Dynarr_add (dst, '\r');
3692 if (eol_type != EOL_CR)
3693 Dynarr_add (dst, c);
3696 encode_utf8 (Vcharset_ascii, c, 0, dst);
3699 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3700 { /* Processing Leading Byte */
3702 charset = CHARSET_BY_LEADING_BYTE (c);
3703 if (LEADING_BYTE_PREFIX_P(c))
3708 { /* Processing Non-ASCII character */
3710 if (EQ (charset, Vcharset_control_1))
3712 encode_utf8 (Vcharset_control_1, c, 0, dst);
3716 switch (XCHARSET_REP_BYTES (charset))
3719 encode_utf8 (charset, c, 0, dst);
3722 if (XCHARSET_PRIVATE_P (charset))
3724 encode_utf8 (charset, c, 0, dst);
3729 #ifdef ENABLE_COMPOSITE_CHARS
3730 if (EQ (charset, Vcharset_composite))
3734 /* #### Bother! We don't know how to
3736 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3740 Emchar emch = MAKE_CHAR (Vcharset_composite,
3741 ch & 0x7F, c & 0x7F);
3742 Lisp_Object lstr = composite_char_string (emch);
3746 src = XSTRING_DATA (lstr);
3747 n = XSTRING_LENGTH (lstr);
3751 #endif /* ENABLE_COMPOSITE_CHARS */
3753 encode_utf8 (charset, ch, c, dst);
3766 encode_utf8 (charset, ch, c, dst);
3782 #ifdef ENABLE_COMPOSITE_CHARS
3788 goto back_to_square_n; /* Wheeeeeeeee ..... */
3794 str->iso2022.current_char_boundary = char_boundary;
3795 str->iso2022.current_charset = charset;
3797 /* Verbum caro factum est! */
3801 /************************************************************************/
3802 /* ISO2022 methods */
3803 /************************************************************************/
3805 /* The following note describes the coding system ISO2022 briefly.
3806 Since the intention of this note is to help understand the
3807 functions in this file, some parts are NOT ACCURATE or OVERLY
3808 SIMPLIFIED. For thorough understanding, please refer to the
3809 original document of ISO2022.
3811 ISO2022 provides many mechanisms to encode several character sets
3812 in 7-bit and 8-bit environments. For 7-bit environments, all text
3813 is encoded using bytes less than 128. This may make the encoded
3814 text a little bit longer, but the text passes more easily through
3815 several gateways, some of which strip off MSB (Most Significant Bit).
3817 There are two kinds of character sets: control character set and
3818 graphic character set. The former contains control characters such
3819 as `newline' and `escape' to provide control functions (control
3820 functions are also provided by escape sequences). The latter
3821 contains graphic characters such as 'A' and '-'. Emacs recognizes
3822 two control character sets and many graphic character sets.
3824 Graphic character sets are classified into one of the following
3825 four classes, according to the number of bytes (DIMENSION) and
3826 number of characters in one dimension (CHARS) of the set:
3827 - DIMENSION1_CHARS94
3828 - DIMENSION1_CHARS96
3829 - DIMENSION2_CHARS94
3830 - DIMENSION2_CHARS96
3832 In addition, each character set is assigned an identification tag,
3833 unique for each set, called "final character" (denoted as <F>
3834 hereafter). The <F> of each character set is decided by ECMA(*)
3835 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3836 (0x30..0x3F are for private use only).
3838 Note (*): ECMA = European Computer Manufacturers Association
3840 Here are examples of graphic character set [NAME(<F>)]:
3841 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3842 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3843 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3844 o DIMENSION2_CHARS96 -- none for the moment
3846 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3847 C0 [0x00..0x1F] -- control character plane 0
3848 GL [0x20..0x7F] -- graphic character plane 0
3849 C1 [0x80..0x9F] -- control character plane 1
3850 GR [0xA0..0xFF] -- graphic character plane 1
3852 A control character set is directly designated and invoked to C0 or
3853 C1 by an escape sequence. The most common case is that:
3854 - ISO646's control character set is designated/invoked to C0, and
3855 - ISO6429's control character set is designated/invoked to C1,
3856 and usually these designations/invocations are omitted in encoded
3857 text. In a 7-bit environment, only C0 can be used, and a control
3858 character for C1 is encoded by an appropriate escape sequence to
3859 fit into the environment. All control characters for C1 are
3860 defined to have corresponding escape sequences.
3862 A graphic character set is at first designated to one of four
3863 graphic registers (G0 through G3), then these graphic registers are
3864 invoked to GL or GR. These designations and invocations can be
3865 done independently. The most common case is that G0 is invoked to
3866 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3867 these invocations and designations are omitted in encoded text.
3868 In a 7-bit environment, only GL can be used.
3870 When a graphic character set of CHARS94 is invoked to GL, codes
3871 0x20 and 0x7F of the GL area work as control characters SPACE and
3872 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3875 There are two ways of invocation: locking-shift and single-shift.
3876 With locking-shift, the invocation lasts until the next different
3877 invocation, whereas with single-shift, the invocation affects the
3878 following character only and doesn't affect the locking-shift
3879 state. Invocations are done by the following control characters or
3882 ----------------------------------------------------------------------
3883 abbrev function cntrl escape seq description
3884 ----------------------------------------------------------------------
3885 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3886 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3887 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3888 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3889 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3890 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3891 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3892 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3893 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3894 ----------------------------------------------------------------------
3895 (*) These are not used by any known coding system.
3897 Control characters for these functions are defined by macros
3898 ISO_CODE_XXX in `coding.h'.
3900 Designations are done by the following escape sequences:
3901 ----------------------------------------------------------------------
3902 escape sequence description
3903 ----------------------------------------------------------------------
3904 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3905 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3906 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3907 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3908 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3909 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3910 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3911 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3912 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3913 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3914 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3915 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3916 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3917 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3918 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3919 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3920 ----------------------------------------------------------------------
3922 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3923 of dimension 1, chars 94, and final character <F>, etc...
3925 Note (*): Although these designations are not allowed in ISO2022,
3926 Emacs accepts them on decoding, and produces them on encoding
3927 CHARS96 character sets in a coding system which is characterized as
3928 7-bit environment, non-locking-shift, and non-single-shift.
3930 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3931 '(' can be omitted. We refer to this as "short-form" hereafter.
3933 Now you may notice that there are a lot of ways for encoding the
3934 same multilingual text in ISO2022. Actually, there exist many
3935 coding systems such as Compound Text (used in X11's inter-client
3936 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3937 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3938 localized platforms), and all of these are variants of ISO2022.
3940 In addition to the above, Emacs handles two more kinds of escape
3941 sequences: ISO6429's direction specification and Emacs' private
3942 sequence for specifying character composition.
3944 ISO6429's direction specification takes the following form:
3945 o CSI ']' -- end of the current direction
3946 o CSI '0' ']' -- end of the current direction
3947 o CSI '1' ']' -- start of left-to-right text
3948 o CSI '2' ']' -- start of right-to-left text
3949 The control character CSI (0x9B: control sequence introducer) is
3950 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3952 Character composition specification takes the following form:
3953 o ESC '0' -- start character composition
3954 o ESC '1' -- end character composition
3955 Since these are not standard escape sequences of any ISO standard,
3956 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its starting condition.
   CODING_SYSTEM may be nil; when it is not, its per-register initial
   charsets are consulted while the four charset slots are set up. */
3959 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
/* One pass per charset register (four of them). */
3963 for (i = 0; i < 4; i++)
3965 if (!NILP (coding_system))
3967 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
/* Qt is stored as a non-charset placeholder value. */
3969 iso->charset[i] = Qt;
3970 iso->invalid_designated[i] = 0;
/* No escape sequence in progress and no escape bytes buffered. */
3972 iso->esc = ISO_ESC_NOTHING;
3973 iso->esc_bytes_index = 0;
/* Register 0 is invoked on the left and register 1 on the right. */
3974 iso->register_left = 0;
3975 iso->register_right = 1;
/* Clear the flags used for direction-switch and literal-output
   error recovery. */
3976 iso->switched_dir_and_no_valid_charset_yet = 0;
3977 iso->invalid_switch_dir = 0;
3978 iso->output_direction_sequence = 0;
3979 iso->output_literally = 0;
3980 #ifdef ENABLE_COMPOSITE_CHARS
/* Empty the composite-character buffer if one has been created. */
3981 if (iso->composite_chars)
3982 Dynarr_reset (iso->composite_chars);
3987 fit_to_be_escape_quoted (unsigned char c)
4004 /* Parse one byte of an ISO2022 escape sequence.
4005 If the result is an invalid escape sequence, return 0 and
4006 do not change anything in STR. Otherwise, if the result is
4007 an incomplete escape sequence, update ISO2022.ESC and
4008 ISO2022.ESC_BYTES and return -1. Otherwise, update
4009 all the state variables (but not ISO2022.ESC_BYTES) and
4012 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4013 or invocation of an invalid character set and treat that as
4014 an unrecognized escape sequence. */
4017 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4018 unsigned char c, unsigned int *flags,
4019 int check_invalid_charsets)
4021 /* (1) If we're at the end of a designation sequence, CS is the
4022 charset being designated and REG is the register to designate
4025 (2) If we're at the end of a locking-shift sequence, REG is
4026 the register to invoke and HALF (0 == left, 1 == right) is
4027 the half to invoke it into.
4029 (3) If we're at the end of a single-shift sequence, REG is
4030 the register to invoke. */
4031 Lisp_Object cs = Qnil;
4034 /* NOTE: This code does goto's all over the fucking place.
4035 The reason for this is that we're basically implementing
4036 a state machine here, and hierarchical languages like C
4037 don't really provide a clean way of doing this. */
4039 if (! (*flags & CODING_STATE_ESCAPE))
4040 /* At beginning of escape sequence; we need to reset our
4041 escape-state variables. */
4042 iso->esc = ISO_ESC_NOTHING;
4044 iso->output_literally = 0;
4045 iso->output_direction_sequence = 0;
4049 case ISO_ESC_NOTHING:
4050 iso->esc_bytes_index = 0;
4053 case ISO_CODE_ESC: /* Start escape sequence */
4054 *flags |= CODING_STATE_ESCAPE;
4058 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4059 *flags |= CODING_STATE_ESCAPE;
4060 iso->esc = ISO_ESC_5_11;
4063 case ISO_CODE_SO: /* locking shift 1 */
4066 case ISO_CODE_SI: /* locking shift 0 */
4070 case ISO_CODE_SS2: /* single shift */
4073 case ISO_CODE_SS3: /* single shift */
4077 default: /* Other control characters */
4084 /**** single shift ****/
4086 case 'N': /* single shift 2 */
4089 case 'O': /* single shift 3 */
4093 /**** locking shift ****/
4095 case '~': /* locking shift 1 right */
4098 case 'n': /* locking shift 2 */
4101 case '}': /* locking shift 2 right */
4104 case 'o': /* locking shift 3 */
4107 case '|': /* locking shift 3 right */
4111 #ifdef ENABLE_COMPOSITE_CHARS
4112 /**** composite ****/
4115 iso->esc = ISO_ESC_START_COMPOSITE;
4116 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4117 CODING_STATE_COMPOSITE;
4121 iso->esc = ISO_ESC_END_COMPOSITE;
4122 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4123 ~CODING_STATE_COMPOSITE;
4125 #endif /* ENABLE_COMPOSITE_CHARS */
4127 /**** directionality ****/
4130 iso->esc = ISO_ESC_5_11;
4133 /**** designation ****/
4135 case '$': /* multibyte charset prefix */
4136 iso->esc = ISO_ESC_2_4;
4140 if (0x28 <= c && c <= 0x2F)
4142 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4146 /* This function is called with CODESYS equal to nil when
4147 doing coding-system detection. */
4149 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4150 && fit_to_be_escape_quoted (c))
4152 iso->esc = ISO_ESC_LITERAL;
4153 *flags &= CODING_STATE_ISO2022_LOCK;
4163 /**** directionality ****/
4165 case ISO_ESC_5_11: /* ISO6429 direction control */
4168 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4169 goto directionality;
4171 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4172 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4173 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4177 case ISO_ESC_5_11_0:
4180 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4181 goto directionality;
/* CSI '1' ']' -- start of left-to-right text.  The flags must be
   masked, not overwritten: keep only those CODING_STATE_ISO2022_LOCK
   bits that are currently set and drop CODING_STATE_R2L, the same way
   the ISO_ESC_5_11 and ISO_ESC_5_11_0 cases do.  A plain assignment
   of the mask would switch on every lock bit regardless of the
   previous state. */
4185 case ISO_ESC_5_11_1:
4188 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4189 goto directionality;
4193 case ISO_ESC_5_11_2:
4196 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4197 goto directionality;
4202 iso->esc = ISO_ESC_DIRECTIONALITY;
4203 /* Various junk here to attempt to preserve the direction sequences
4204 literally in the text if they would otherwise be swallowed due
4205 to invalid designations that don't show up as actual charset
4206 changes in the text. */
4207 if (iso->invalid_switch_dir)
4209 /* We already inserted a direction switch literally into the
4210 text. We assume (#### this may not be right) that the
4211 next direction switch is the one going the other way,
4212 and we need to output that literally as well. */
4213 iso->output_literally = 1;
4214 iso->invalid_switch_dir = 0;
4220 /* If we are in the thrall of an invalid designation,
4221 then stick the directionality sequence literally into the
4222 output stream so it ends up in the original text again. */
4223 for (jj = 0; jj < 4; jj++)
4224 if (iso->invalid_designated[jj])
4228 iso->output_literally = 1;
4229 iso->invalid_switch_dir = 1;
4232 /* Indicate that we haven't yet seen a valid designation,
4233 so that if a switch-dir is directly followed by an
4234 invalid designation, both get inserted literally. */
4235 iso->switched_dir_and_no_valid_charset_yet = 1;
4240 /**** designation ****/
4243 if (0x28 <= c && c <= 0x2F)
4245 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4248 if (0x40 <= c && c <= 0x42)
4250 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4251 *flags & CODING_STATE_R2L ?
4252 CHARSET_RIGHT_TO_LEFT :
4253 CHARSET_LEFT_TO_RIGHT);
4263 if (c < '0' || c > '~')
4264 return 0; /* bad final byte */
4266 if (iso->esc >= ISO_ESC_2_8 &&
4267 iso->esc <= ISO_ESC_2_15)
4269 type = ((iso->esc >= ISO_ESC_2_12) ?
4270 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4271 reg = (iso->esc - ISO_ESC_2_8) & 3;
4273 else if (iso->esc >= ISO_ESC_2_4_8 &&
4274 iso->esc <= ISO_ESC_2_4_15)
4276 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4277 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4278 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4282 /* Can this ever be reached? -slb */
4286 cs = CHARSET_BY_ATTRIBUTES (type, c,
4287 *flags & CODING_STATE_R2L ?
4288 CHARSET_RIGHT_TO_LEFT :
4289 CHARSET_LEFT_TO_RIGHT);
4295 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4299 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4300 /* can't invoke something that ain't there. */
4302 iso->esc = ISO_ESC_SINGLE_SHIFT;
4303 *flags &= CODING_STATE_ISO2022_LOCK;
4305 *flags |= CODING_STATE_SS2;
4307 *flags |= CODING_STATE_SS3;
4311 if (check_invalid_charsets &&
4312 !CHARSETP (iso->charset[reg]))
4313 /* can't invoke something that ain't there. */
4316 iso->register_right = reg;
4318 iso->register_left = reg;
4319 *flags &= CODING_STATE_ISO2022_LOCK;
4320 iso->esc = ISO_ESC_LOCKING_SHIFT;
4324 if (NILP (cs) && check_invalid_charsets)
4326 iso->invalid_designated[reg] = 1;
4327 iso->charset[reg] = Vcharset_ascii;
4328 iso->esc = ISO_ESC_DESIGNATE;
4329 *flags &= CODING_STATE_ISO2022_LOCK;
4330 iso->output_literally = 1;
4331 if (iso->switched_dir_and_no_valid_charset_yet)
4333 /* We encountered a switch-direction followed by an
4334 invalid designation. Ensure that the switch-direction
4335 gets outputted; otherwise it will probably get eaten
4336 when the text is written out again. */
4337 iso->switched_dir_and_no_valid_charset_yet = 0;
4338 iso->output_direction_sequence = 1;
4339 /* And make sure that the switch-dir going the other
4340 way gets outputted, as well. */
4341 iso->invalid_switch_dir = 1;
4345 /* This function is called with CODESYS equal to nil when
4346 doing coding-system detection. */
4347 if (!NILP (codesys))
4349 charset_conversion_spec_dynarr *dyn =
4350 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4356 for (i = 0; i < Dynarr_length (dyn); i++)
4358 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4359 if (EQ (cs, spec->from_charset))
4360 cs = spec->to_charset;
4365 iso->charset[reg] = cs;
4366 iso->esc = ISO_ESC_DESIGNATE;
4367 *flags &= CODING_STATE_ISO2022_LOCK;
4368 if (iso->invalid_designated[reg])
4370 iso->invalid_designated[reg] = 0;
4371 iso->output_literally = 1;
4373 if (iso->switched_dir_and_no_valid_charset_yet)
4374 iso->switched_dir_and_no_valid_charset_yet = 0;
4379 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4384 /* #### There are serious deficiencies in the recognition mechanism
4385 here. This needs to be much smarter if it's going to cut it.
4386 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4387 it should be detected as Latin-1.
4388 All the ISO2022 stuff in this file should be synced up with the
4389 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4390 Perhaps we should wait till R2L works in FSF Emacs? */
4392 if (!st->iso2022.initted)
4394 reset_iso2022 (Qnil, &st->iso2022.iso);
4395 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4396 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4397 CODING_CATEGORY_ISO_8_1_MASK |
4398 CODING_CATEGORY_ISO_8_2_MASK |
4399 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4400 st->iso2022.flags = 0;
4401 st->iso2022.high_byte_count = 0;
4402 st->iso2022.saw_single_shift = 0;
4403 st->iso2022.initted = 1;
4406 mask = st->iso2022.mask;
4413 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4414 st->iso2022.high_byte_count++;
4418 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4420 if (st->iso2022.high_byte_count & 1)
4421 /* odd number of high bytes; assume not iso-8-2 */
4422 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4424 st->iso2022.high_byte_count = 0;
4425 st->iso2022.saw_single_shift = 0;
4427 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4429 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4430 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4431 { /* control chars */
4434 /* Allow and ignore control characters that you might
4435 reasonably see in a text file */
4440 case 8: /* backspace */
4441 case 11: /* vertical tab */
4442 case 12: /* form feed */
4443 case 26: /* MS-DOS C-z junk */
4444 case 31: /* '^_' -- for info */
4445 goto label_continue_loop;
4452 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4455 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4456 &st->iso2022.flags, 0))
4458 switch (st->iso2022.iso.esc)
4460 case ISO_ESC_DESIGNATE:
4461 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4462 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4464 case ISO_ESC_LOCKING_SHIFT:
4465 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4466 goto ran_out_of_chars;
4467 case ISO_ESC_SINGLE_SHIFT:
4468 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4469 st->iso2022.saw_single_shift = 1;
4478 goto ran_out_of_chars;
4481 label_continue_loop:;
/* Post-detection adjustment of MASK, a set of
   CODING_CATEGORY_ISO_*_MASK bits: when the seven-bit category is
   still set, the three eight-bit categories are cleared. */
4490 postprocess_iso2022_mask (int mask)
4492 /* #### kind of cheesy */
4493 /* If seven-bit ISO is allowed, then assume that the encoding is
4494 entirely seven-bit and turn off the eight-bit ones. */
4495 if (mask & CODING_CATEGORY_ISO_7_MASK)
4496 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4497 CODING_CATEGORY_ISO_8_1_MASK |
4498 CODING_CATEGORY_ISO_8_2_MASK);
4502 /* If FLAGS is a null pointer or specifies right-to-left motion,
4503 output a switch-dir-to-left-to-right sequence to DST.
4504 Also update FLAGS if it is not a null pointer.
4505 If INTERNAL_P is set, we are outputting in internal format and
4506 need to handle the CSI differently. */
4509 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4510 unsigned_char_dynarr *dst,
4511 unsigned int *flags,
4514 if (!flags || (*flags & CODING_STATE_R2L))
/* Emit the CSI introducer: as ESC '[' when the coding system is
   seven-bit, through DECODE_ADD_BINARY_CHAR when INTERNAL_P is set,
   or else as the bare ISO_CODE_CSI byte. */
4516 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4518 Dynarr_add (dst, ISO_CODE_ESC);
4519 Dynarr_add (dst, '[');
4521 else if (internal_p)
4522 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4524 Dynarr_add (dst, ISO_CODE_CSI);
/* '0' ']' completes the "end of the current direction" sequence. */
4525 Dynarr_add (dst, '0');
4526 Dynarr_add (dst, ']');
/* Clear the right-to-left bit in the caller's flags. */
4528 *flags &= ~CODING_STATE_R2L;
4532 /* If FLAGS is a null pointer or specifies a direction different from
4533 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4534 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4535 sequence to DST. Also update FLAGS if it is not a null pointer.
4536 If INTERNAL_P is set, we are outputting in internal format and
4537 need to handle the CSI differently. */
4540 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4541 unsigned_char_dynarr *dst, unsigned int *flags,
4544 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4545 direction == CHARSET_LEFT_TO_RIGHT)
/* Right-to-left is in effect (or FLAGS is null) but left-to-right
   is wanted: delegate to the restore routine. */
4546 restore_left_to_right_direction (codesys, dst, flags, internal_p);
/* The opposite case: right-to-left is wanted and not yet in effect.
   Skipped entirely when CODING_SYSTEM_ISO2022_NO_ISO6429 holds for
   the coding system. */
4547 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4548 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4549 direction == CHARSET_RIGHT_TO_LEFT)
/* CSI introducer, chosen the same way as in
   restore_left_to_right_direction. */
4551 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4553 Dynarr_add (dst, ISO_CODE_ESC);
4554 Dynarr_add (dst, '[');
4556 else if (internal_p)
4557 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4559 Dynarr_add (dst, ISO_CODE_CSI);
/* '2' ']' completes the "start of right-to-left text" sequence. */
4560 Dynarr_add (dst, '2');
4561 Dynarr_add (dst, ']');
/* Set the right-to-left bit in the caller's flags. */
4563 *flags |= CODING_STATE_R2L;
4567 /* Convert ISO2022-format data to internal format. */
4570 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4571 unsigned_char_dynarr *dst, unsigned int n)
4573 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4574 unsigned int flags = str->flags;
4575 unsigned int ch = str->ch;
4576 eol_type_t eol_type = str->eol_type;
4577 #ifdef ENABLE_COMPOSITE_CHARS
4578 unsigned_char_dynarr *real_dst = dst;
4580 Lisp_Object coding_system;
4582 XSETCODING_SYSTEM (coding_system, str->codesys);
4584 #ifdef ENABLE_COMPOSITE_CHARS
4585 if (flags & CODING_STATE_COMPOSITE)
4586 dst = str->iso2022.composite_chars;
4587 #endif /* ENABLE_COMPOSITE_CHARS */
4591 unsigned char c = *src++;
4592 if (flags & CODING_STATE_ESCAPE)
4593 { /* Within ESC sequence */
4594 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4599 switch (str->iso2022.esc)
4601 #ifdef ENABLE_COMPOSITE_CHARS
4602 case ISO_ESC_START_COMPOSITE:
4603 if (str->iso2022.composite_chars)
4604 Dynarr_reset (str->iso2022.composite_chars);
4606 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4607 dst = str->iso2022.composite_chars;
4609 case ISO_ESC_END_COMPOSITE:
4611 Bufbyte comstr[MAX_EMCHAR_LEN];
4613 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4614 Dynarr_length (dst));
4616 len = set_charptr_emchar (comstr, emch);
4617 Dynarr_add_many (dst, comstr, len);
4620 #endif /* ENABLE_COMPOSITE_CHARS */
4622 case ISO_ESC_LITERAL:
4623 DECODE_ADD_BINARY_CHAR (c, dst);
4627 /* Everything else handled already */
4632 /* Attempted error recovery. */
4633 if (str->iso2022.output_direction_sequence)
4634 ensure_correct_direction (flags & CODING_STATE_R2L ?
4635 CHARSET_RIGHT_TO_LEFT :
4636 CHARSET_LEFT_TO_RIGHT,
4637 str->codesys, dst, 0, 1);
4638 /* More error recovery. */
4639 if (!retval || str->iso2022.output_literally)
4641 /* Output the (possibly invalid) sequence */
4643 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4644 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4645 flags &= CODING_STATE_ISO2022_LOCK;
4647 n++, src--;/* Repeat the loop with the same character. */
4650 /* No sense in reprocessing the final byte of the
4651 escape sequence; it could mess things up anyway.
4653 DECODE_ADD_BINARY_CHAR (c, dst);
4658 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4659 { /* Control characters */
4661 /***** Error-handling *****/
4663 /* If we were in the middle of a character, dump out the
4664 partial character. */
4665 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4667 /* If we just saw a single-shift character, dump it out.
4668 This may dump out the wrong sort of single-shift character,
4669 but at least it will give an indication that something went
4671 if (flags & CODING_STATE_SS2)
4673 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4674 flags &= ~CODING_STATE_SS2;
4676 if (flags & CODING_STATE_SS3)
4678 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4679 flags &= ~CODING_STATE_SS3;
4682 /***** Now handle the control characters. *****/
4685 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4687 flags &= CODING_STATE_ISO2022_LOCK;
4689 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4690 DECODE_ADD_BINARY_CHAR (c, dst);
4693 { /* Graphic characters */
4694 Lisp_Object charset;
4698 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4700 /* Now determine the charset. */
4701 reg = ((flags & CODING_STATE_SS2) ? 2
4702 : (flags & CODING_STATE_SS3) ? 3
4703 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4704 : str->iso2022.register_left);
4705 charset = str->iso2022.charset[reg];
4707 /* Error checking: */
4708 if (! CHARSETP (charset)
4709 || str->iso2022.invalid_designated[reg]
4710 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4711 && XCHARSET_CHARS (charset) == 94))
4712 /* Mrmph. We are trying to invoke a register that has no
4713 or an invalid charset in it, or trying to add a character
4714 outside the range of the charset. Insert that char literally
4715 to preserve it for the output. */
4717 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4718 DECODE_ADD_BINARY_CHAR (c, dst);
4723 /* Things are probably hunky-dorey. */
4725 /* Fetch reverse charset, maybe. */
4726 if (((flags & CODING_STATE_R2L) &&
4727 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4729 (!(flags & CODING_STATE_R2L) &&
4730 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4732 Lisp_Object new_charset =
4733 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4734 if (!NILP (new_charset))
4735 charset = new_charset;
4738 lb = XCHARSET_LEADING_BYTE (charset);
4739 switch (XCHARSET_REP_BYTES (charset))
4742 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4743 Dynarr_add (dst, c & 0x7F);
4746 case 2: /* one-byte official */
4747 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4748 Dynarr_add (dst, lb);
4749 Dynarr_add (dst, c | 0x80);
4752 case 3: /* one-byte private or two-byte official */
4753 if (XCHARSET_PRIVATE_P (charset))
4755 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4756 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4757 Dynarr_add (dst, lb);
4758 Dynarr_add (dst, c | 0x80);
4764 Dynarr_add (dst, lb);
4765 Dynarr_add (dst, ch | 0x80);
4766 Dynarr_add (dst, c | 0x80);
4774 default: /* two-byte private */
4777 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4778 Dynarr_add (dst, lb);
4779 Dynarr_add (dst, ch | 0x80);
4780 Dynarr_add (dst, c | 0x80);
4789 flags &= CODING_STATE_ISO2022_LOCK;
4792 label_continue_loop:;
4795 if (flags & CODING_STATE_END)
4796 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4803 /***** ISO2022 encoder *****/
4805 /* Designate CHARSET into register REG. */
4808 iso2022_designate (Lisp_Object charset, unsigned char reg,
4809 struct encoding_stream *str, unsigned_char_dynarr *dst)
/* Intermediate bytes of the designation escape sequence, indexed by
   register number: inter94 for 94-character sets, inter96 for
   96-character sets. */
4811 static CONST char inter94[] = "()*+";
4812 static CONST char inter96[] = ",-./";
4814 unsigned char final;
/* Remember what was in the register so an unchanged designation can
   be detected below, then record the new charset. */
4815 Lisp_Object old_charset = str->iso2022.charset[reg];
4817 str->iso2022.charset[reg] = charset;
4818 if (!CHARSETP (charset))
4819 /* charset might be an initial nil or t. */
4821 type = XCHARSET_TYPE (charset);
4822 final = XCHARSET_FINAL (charset);
/* Test for a register that already holds a charset with the same
   type and final byte, with no forced re-designation pending. */
4823 if (!str->iso2022.force_charset_on_output[reg] &&
4824 CHARSETP (old_charset) &&
4825 XCHARSET_TYPE (old_charset) == type &&
4826 XCHARSET_FINAL (old_charset) == final)
4829 str->iso2022.force_charset_on_output[reg] = 0;
/* Apply the coding system's output charset conversions.
   NOTE(review): TYPE and FINAL were read from CHARSET before this
   conversion and the converted CHARSET is not used by any later
   statement visible here -- confirm whether TYPE and FINAL should be
   taken from the converted charset instead. */
4832 charset_conversion_spec_dynarr *dyn =
4833 str->codesys->iso2022.output_conv;
4839 for (i = 0; i < Dynarr_length (dyn); i++)
4841 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4842 if (EQ (charset, spec->from_charset))
4843 charset = spec->to_charset;
/* Emit the designation sequence: ESC, a '$' prefix for the
   two-dimensional types, the intermediate byte selecting register
   REG, and lastly the charset's final byte. */
4848 Dynarr_add (dst, ISO_CODE_ESC);
4851 case CHARSET_TYPE_94:
4852 Dynarr_add (dst, inter94[reg]);
4854 case CHARSET_TYPE_96:
4855 Dynarr_add (dst, inter96[reg]);
4857 case CHARSET_TYPE_94X94:
4858 Dynarr_add (dst, '$');
4860 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4863 Dynarr_add (dst, inter94[reg]);
4865 case CHARSET_TYPE_96X96:
4866 Dynarr_add (dst, '$');
4867 Dynarr_add (dst, inter96[reg]);
4870 Dynarr_add (dst, final);
/* Make register 0 the one invoked on the left: if STR records a
   different left register, add ISO_CODE_SI to DST and update the
   record. */
4874 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4876 if (str->iso2022.register_left != 0)
4878 Dynarr_add (dst, ISO_CODE_SI);
4879 str->iso2022.register_left = 0;
/* Make register 1 the one invoked on the left: if STR records a
   different left register, add ISO_CODE_SO to DST and update the
   record. */
4884 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4886 if (str->iso2022.register_left != 1)
4888 Dynarr_add (dst, ISO_CODE_SO);
4889 str->iso2022.register_left = 1;
4893 /* Convert internally-formatted data to ISO2022 format. */
/* ENCODING is the lstream whose encoding_stream data (fetched with
   ENCODING_STREAM_DATA) holds the conversion state: flags, the pending
   byte CH, the coding system, and the iso2022 sub-state.
   SRC is the input and N its length in bytes; DST is the dynarr that
   receives the encoded output.

   The conversion is resumable: the character-boundary flag, the current
   charset and the current "half" are loaded from STR->iso2022 on entry
   and stored back before returning, so a multi-byte character may be
   split across calls. */
4896 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4897 unsigned_char_dynarr *dst, unsigned int n)
4899 unsigned char charmask, c;
4900 unsigned char char_boundary;
4901 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4902 unsigned int flags = str->flags;
4903 unsigned int ch = str->ch;
4904 Lisp_Coding_System *codesys = str->codesys;
4905 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4907 Lisp_Object charset;
4910 #ifdef ENABLE_COMPOSITE_CHARS
4911 /* flags for handling composite chars. We do a little switcharoo
4912 on the source while we're outputting the composite char. */
4913 unsigned int saved_n = 0;
4914 CONST unsigned char *saved_src = NULL;
4915 int in_composite = 0;
4916 #endif /* ENABLE_COMPOSITE_CHARS */
/* Pick up where the previous call left off. */
4918 char_boundary = str->iso2022.current_char_boundary;
4919 charset = str->iso2022.current_charset;
4920 half = str->iso2022.current_half;
4922 #ifdef ENABLE_COMPOSITE_CHARS
4929 if (BYTE_ASCII_P (c))
4930 { /* Processing ASCII character */
4933 restore_left_to_right_direction (codesys, dst, &flags, 0);
4935 /* Make sure G0 contains ASCII */
4936 if ((c > ' ' && c < ISO_CODE_DEL) ||
4937 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4939 ensure_normal_shift (str, dst);
4940 iso2022_designate (Vcharset_ascii, 0, str, dst);
4943 /* If necessary, restore everything to the default state
4946 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4948 restore_left_to_right_direction (codesys, dst, &flags, 0);
4950 ensure_normal_shift (str, dst);
4952 for (i = 0; i < 4; i++)
4954 Lisp_Object initial_charset =
4955 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4956 iso2022_designate (initial_charset, i, str, dst);
4961 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4962 Dynarr_add (dst, '\r');
4963 if (eol_type != EOL_CR)
4964 Dynarr_add (dst, c);
4968 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4969 && fit_to_be_escape_quoted (c))
4970 Dynarr_add (dst, ISO_CODE_ESC);
4971 Dynarr_add (dst, c);
4976 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4977 { /* Processing Leading Byte */
4979 charset = CHARSET_BY_LEADING_BYTE (c);
4980 if (LEADING_BYTE_PREFIX_P(c))
4982 else if (!EQ (charset, Vcharset_control_1)
4983 #ifdef ENABLE_COMPOSITE_CHARS
4984 && !EQ (charset, Vcharset_composite)
4990 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4991 codesys, dst, &flags, 0);
4993 /* Now determine which register to use. */
4995 for (i = 0; i < 4; i++)
4997 if (EQ (charset, str->iso2022.charset[i]) ||
4999 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5008 if (XCHARSET_GRAPHIC (charset) != 0)
5010 if (!NILP (str->iso2022.charset[1]) &&
5011 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5012 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5014 else if (!NILP (str->iso2022.charset[2]))
5016 else if (!NILP (str->iso2022.charset[3]))
5025 iso2022_designate (charset, reg, str, dst);
5027 /* Now invoke that register. */
5031 ensure_normal_shift (str, dst);
5036 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5038 ensure_shift_out (str, dst);
5046 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5048 Dynarr_add (dst, ISO_CODE_ESC);
5049 Dynarr_add (dst, 'N');
5054 Dynarr_add (dst, ISO_CODE_SS2);
5060 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5062 Dynarr_add (dst, ISO_CODE_ESC);
5063 Dynarr_add (dst, 'O');
5068 Dynarr_add (dst, ISO_CODE_SS3);
5080 { /* Processing Non-ASCII character */
/* With half == 0 the high bit is masked off every output byte;
   otherwise bytes are emitted with all eight bits intact. */
5081 charmask = (half == 0 ? 0x7F : 0xFF);
5083 if (EQ (charset, Vcharset_control_1))
5085 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5086 && fit_to_be_escape_quoted (c))
5087 Dynarr_add (dst, ISO_CODE_ESC);
5088 /* you asked for it ... */
5089 Dynarr_add (dst, c - 0x20);
5093 switch (XCHARSET_REP_BYTES (charset))
5096 Dynarr_add (dst, c & charmask);
5099 if (XCHARSET_PRIVATE_P (charset))
5101 Dynarr_add (dst, c & charmask);
5106 #ifdef ENABLE_COMPOSITE_CHARS
5107 if (EQ (charset, Vcharset_composite))
5111 /* #### Bother! We don't know how to
5113 Dynarr_add (dst, '~');
5117 Emchar emch = MAKE_CHAR (Vcharset_composite,
5118 ch & 0x7F, c & 0x7F);
5119 Lisp_Object lstr = composite_char_string (emch);
5123 src = XSTRING_DATA (lstr);
5124 n = XSTRING_LENGTH (lstr);
5125 Dynarr_add (dst, ISO_CODE_ESC);
5126 Dynarr_add (dst, '0'); /* start composing */
5130 #endif /* ENABLE_COMPOSITE_CHARS */
5132 Dynarr_add (dst, ch & charmask);
5133 Dynarr_add (dst, c & charmask);
5146 Dynarr_add (dst, ch & charmask);
5147 Dynarr_add (dst, c & charmask);
5163 #ifdef ENABLE_COMPOSITE_CHARS
5169 Dynarr_add (dst, ISO_CODE_ESC);
5170 Dynarr_add (dst, '1'); /* end composing */
5171 goto back_to_square_n; /* Wheeeeeeeee ..... */
5173 #endif /* ENABLE_COMPOSITE_CHARS */
/* At end of stream, and only on a character boundary, put the output
   back into its initial state: left-to-right direction, normal shift,
   and the coding system's initial charset in each of the four
   registers. */
5175 if (char_boundary && flags & CODING_STATE_END)
5177 restore_left_to_right_direction (codesys, dst, &flags, 0);
5178 ensure_normal_shift (str, dst);
5179 for (i = 0; i < 4; i++)
5181 Lisp_Object initial_charset =
5182 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5183 iso2022_designate (initial_charset, i, str, dst);
/* Save the resumable state for the next call. */
5189 str->iso2022.current_char_boundary = char_boundary;
5190 str->iso2022.current_charset = charset;
5191 str->iso2022.current_half = half;
5193 /* Verbum caro factum est! ("The Word was made flesh.") */
5197 /************************************************************************/
5198 /* No-conversion methods */
5199 /************************************************************************/
5201 /* This is used when reading in "binary" files -- i.e. files that may
5202 contain all 256 possible byte values and that are not to be
5203 interpreted as being in any particular decoding. */
/* DECODING is the lstream whose decoding_stream data supplies the saved
   flags, the pending byte CH and the end-of-line type.  SRC/N describe
   the input and DST receives the decoded output.  A byte C is first
   given to DECODE_HANDLE_EOL_TYPE and then added to DST with
   DECODE_ADD_BINARY_CHAR; DECODE_HANDLE_END_OF_CONVERSION runs after
   the input has been consumed. */
5205 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5206 unsigned_char_dynarr *dst, unsigned int n)
5209 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5210 unsigned int flags = str->flags;
5211 unsigned int ch = str->ch;
5212 eol_type_t eol_type = str->eol_type;
5218 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5219 DECODE_ADD_BINARY_CHAR (c, dst);
/* NOTE(review): nothing visible here jumps to this label; presumably
   DECODE_HANDLE_EOL_TYPE does -- confirm against the macro. */
5220 label_continue_loop:;
5223 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encode internally-formatted data from SRC (N bytes) into DST without
   charset conversion.  ENCODING is the lstream whose encoding_stream
   data supplies the saved flags, the pending leading byte CH, and the
   coding system from which the end-of-line type is taken.

   Only ASCII, Latin-1 and Control-1 characters are written through;
   the leading byte of any other charset produces a single '~' and its
   remaining bytes are dropped. */
5230 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5231 unsigned_char_dynarr *dst, unsigned int n)
5234 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5235 unsigned int flags = str->flags;
5236 unsigned int ch = str->ch;
5237 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* End of line: '\r' is written unless the type is EOL_LF or
   EOL_AUTODETECT, and '\n' is written unless the type is EOL_CR. */
5244 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5245 Dynarr_add (dst, '\r');
5246 if (eol_type != EOL_CR)
5247 Dynarr_add (dst, '\n');
5250 else if (BYTE_ASCII_P (c))
5253 Dynarr_add (dst, c);
5255 else if (BUFBYTE_LEADING_BYTE_P (c))
5258 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5259 c == LEADING_BYTE_CONTROL_1)
5262 Dynarr_add (dst, '~'); /* untranslatable character */
/* Trailing byte: CH is the leading byte seen earlier.  Latin-1 bytes
   are written unchanged; Control-1 bytes are written minus 0x20. */
5266 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5267 Dynarr_add (dst, c);
5268 else if (ch == LEADING_BYTE_CONTROL_1)
5271 Dynarr_add (dst, c - 0x20);
5273 /* else it should be the second or third byte of an
5274 untranslatable character, so ignore it */
5284 /************************************************************************/
5285 /* Simple internal/external functions */
5286 /************************************************************************/
/* Scratch buffers shared by all calls to convert_to_external_format
   and convert_from_external_format respectively.  Each is created on
   first use and reset at the start of every call, so a result returned
   from one call is only valid until the next call of the same
   function. */
5288 static Extbyte_dynarr *conversion_out_dynarr;
5289 static Bufbyte_dynarr *conversion_in_dynarr;
5291 /* Determine coding system from coding format */
5293 /* #### not correct for all values of `fmt'! */
/* For FORMAT_FILENAME and FORMAT_TERMINAL the answer is derived from
   Vfile_name_coding_system, with nil and `binary' handled as a special
   case; the remaining path resolves the `ctext' coding system.
   NOTE(review): both converters below test the result with NILP and
   then do a plain byte-wise conversion, so the special case presumably
   returns Qnil -- confirm. */
5295 external_data_format_to_coding_system (enum external_data_format fmt)
5299 case FORMAT_FILENAME:
5300 case FORMAT_TERMINAL:
5301 if (EQ (Vfile_name_coding_system, Qnil) ||
5302 EQ (Vfile_name_coding_system, Qbinary))
5305 return Fget_coding_system (Vfile_name_coding_system);
5308 return Fget_coding_system (Qctext);
/* Convert LEN bytes of internally-formatted text at PTR to the external
   representation selected by FMT.

   The length of the result, not counting the terminating zero byte, is
   stored in *LEN_OUT.  The return value points into the static
   conversion_out_dynarr, which is reset on every call, so the caller
   must use or copy the result before calling this function again. */
5316 convert_to_external_format (CONST Bufbyte *ptr,
5319 enum external_data_format fmt)
5321 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5323 if (!conversion_out_dynarr)
5324 conversion_out_dynarr = Dynarr_new (Extbyte);
5326 Dynarr_reset (conversion_out_dynarr);
5328 if (NILP (coding_system))
/* No coding system: convert byte-wise.  ASCII bytes are copied; after a
   Control-1 leading byte the following byte minus 0x20 is used; after
   a Latin-1 leading byte the following byte is used as is. */
5330 CONST Bufbyte *end = ptr + len;
5335 (BYTE_ASCII_P (*ptr)) ? *ptr :
5336 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5337 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5340 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5344 #ifdef ERROR_CHECK_BUFPOS
5345 assert (ptr == end);
/* Otherwise pump the text through an encoding stream: a fixed-buffer
   input stream over PTR feeds, via TEMPBUF, an encoding output stream
   that writes into a dynarr output stream over conversion_out_dynarr. */
5350 Lisp_Object instream, outstream, da_outstream;
5351 Lstream *istr, *ostr;
5352 struct gcpro gcpro1, gcpro2, gcpro3;
5353 char tempbuf[1024]; /* some random amount */
5355 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5356 da_outstream = make_dynarr_output_stream
5357 ((unsigned_char_dynarr *) conversion_out_dynarr);
5359 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5360 istr = XLSTREAM (instream);
5361 ostr = XLSTREAM (outstream);
5362 GCPRO3 (instream, outstream, da_outstream);
5365 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5368 Lstream_write (ostr, tempbuf, size_in_bytes);
5370 Lstream_close (istr);
5371 Lstream_close (ostr);
5373 Lstream_delete (istr);
5374 Lstream_delete (ostr);
5375 Lstream_delete (XLSTREAM (da_outstream));
/* Record the length first so that it excludes the terminator. */
5378 *len_out = Dynarr_length (conversion_out_dynarr);
5379 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5380 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert LEN bytes of external data at PTR, in the representation
   selected by FMT, to internally-formatted text.

   The length of the result, not counting the terminating zero byte, is
   stored in *LEN_OUT.  The return value points into the static
   conversion_in_dynarr, which is reset on every call, so the caller
   must use or copy the result before calling this function again. */
5384 convert_from_external_format (CONST Extbyte *ptr,
5387 enum external_data_format fmt)
5389 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5391 if (!conversion_in_dynarr)
5392 conversion_in_dynarr = Dynarr_new (Bufbyte);
5394 Dynarr_reset (conversion_in_dynarr);
5396 if (NILP (coding_system))
/* No coding system: add every input byte with DECODE_ADD_BINARY_CHAR. */
5398 CONST Extbyte *end = ptr + len;
5399 for (; ptr < end; ptr++)
5402 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
/* Otherwise pump the data through a decoding stream: a fixed-buffer
   input stream over PTR feeds, via TEMPBUF, a decoding output stream
   that writes into a dynarr output stream over conversion_in_dynarr. */
5407 Lisp_Object instream, outstream, da_outstream;
5408 Lstream *istr, *ostr;
5409 struct gcpro gcpro1, gcpro2, gcpro3;
5410 char tempbuf[1024]; /* some random amount */
5412 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5413 da_outstream = make_dynarr_output_stream
5414 ((unsigned_char_dynarr *) conversion_in_dynarr);
5416 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5417 istr = XLSTREAM (instream);
5418 ostr = XLSTREAM (outstream);
5419 GCPRO3 (instream, outstream, da_outstream);
5422 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5425 Lstream_write (ostr, tempbuf, size_in_bytes);
5427 Lstream_close (istr);
5428 Lstream_close (ostr);
5430 Lstream_delete (istr);
5431 Lstream_delete (ostr);
5432 Lstream_delete (XLSTREAM (da_outstream));
/* Record the length first so that it excludes the terminator. */
5435 *len_out = Dynarr_length (conversion_in_dynarr);
5436 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5437 return Dynarr_atp (conversion_in_dynarr, 0);
5441 /************************************************************************/
5442 /* Initialization */
5443 /************************************************************************/
/* Register this file's Lisp-visible names: the coding-system-error
   error (declared with Qio_error as its parent), every primitive
   defined here (DEFSUBR), the Q... symbols used as coding-system types
   and property names, and one symbol per coding category stored in
   coding_category_symbol[]. */
5446 syms_of_file_coding (void)
5448 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5449 deferror (&Qcoding_system_error, "coding-system-error",
5450 "Coding-system error", Qio_error);
/* Primitives. */
5452 DEFSUBR (Fcoding_system_p);
5453 DEFSUBR (Ffind_coding_system);
5454 DEFSUBR (Fget_coding_system);
5455 DEFSUBR (Fcoding_system_list);
5456 DEFSUBR (Fcoding_system_name);
5457 DEFSUBR (Fmake_coding_system);
5458 DEFSUBR (Fcopy_coding_system);
5459 DEFSUBR (Fdefine_coding_system_alias);
5460 DEFSUBR (Fsubsidiary_coding_system);
5462 DEFSUBR (Fcoding_system_type);
5463 DEFSUBR (Fcoding_system_doc_string);
5465 DEFSUBR (Fcoding_system_charset);
5467 DEFSUBR (Fcoding_system_property);
5469 DEFSUBR (Fcoding_category_list);
5470 DEFSUBR (Fset_coding_priority_list);
5471 DEFSUBR (Fcoding_priority_list);
5472 DEFSUBR (Fset_coding_category_system);
5473 DEFSUBR (Fcoding_category_system);
5475 DEFSUBR (Fdetect_coding_region);
5476 DEFSUBR (Fdecode_coding_region);
5477 DEFSUBR (Fencode_coding_region);
5479 DEFSUBR (Fdecode_shift_jis_char);
5480 DEFSUBR (Fencode_shift_jis_char);
5481 DEFSUBR (Fdecode_big5_char);
5482 DEFSUBR (Fencode_big5_char);
5483 DEFSUBR (Fset_ucs_char);
5484 DEFSUBR (Fucs_char);
5485 DEFSUBR (Fset_char_ucs);
5486 DEFSUBR (Fchar_ucs);
/* Coding-system type symbols. */
5488 defsymbol (&Qcoding_system_p, "coding-system-p");
5489 defsymbol (&Qno_conversion, "no-conversion");
5490 defsymbol (&Qraw_text, "raw-text");
5492 defsymbol (&Qbig5, "big5");
5493 defsymbol (&Qshift_jis, "shift-jis");
5494 defsymbol (&Qucs4, "ucs-4");
5495 defsymbol (&Qutf8, "utf-8");
5496 defsymbol (&Qccl, "ccl");
5497 defsymbol (&Qiso2022, "iso2022");
/* Property names and end-of-line symbols. */
5499 defsymbol (&Qmnemonic, "mnemonic");
5500 defsymbol (&Qeol_type, "eol-type");
5501 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5502 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5504 defsymbol (&Qcr, "cr");
5505 defsymbol (&Qlf, "lf");
5506 defsymbol (&Qcrlf, "crlf");
5507 defsymbol (&Qeol_cr, "eol-cr");
5508 defsymbol (&Qeol_lf, "eol-lf");
5509 defsymbol (&Qeol_crlf, "eol-crlf");
/* Further property names; most of these are registered as
   CODESYS_PROP_ISO2022 in complex_vars_of_file_coding. */
5511 defsymbol (&Qcharset_g0, "charset-g0");
5512 defsymbol (&Qcharset_g1, "charset-g1");
5513 defsymbol (&Qcharset_g2, "charset-g2");
5514 defsymbol (&Qcharset_g3, "charset-g3");
5515 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5516 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5517 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5518 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5519 defsymbol (&Qno_iso6429, "no-iso6429");
5520 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5521 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5523 defsymbol (&Qshort, "short");
5524 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5525 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5526 defsymbol (&Qseven, "seven");
5527 defsymbol (&Qlock_shift, "lock-shift");
5528 defsymbol (&Qescape_quoted, "escape-quoted");
5530 defsymbol (&Qencode, "encode");
5531 defsymbol (&Qdecode, "decode");
/* One symbol per coding category, stored in coding_category_symbol[]. */
5534 defsymbol (&Qctext, "ctext");
5535 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5537 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5539 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5541 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5543 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5545 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5547 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5549 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5551 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5554 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare, with LSTREAM_HAS_METHOD, the methods implemented by the two
   lstream types defined in this file.  Both `decoding' and `encoding'
   provide the same set: reader, writer, rewinder, seekable_p, flusher,
   closer and marker. */
5559 lstream_type_create_file_coding (void)
5561 LSTREAM_HAS_METHOD (decoding, reader);
5562 LSTREAM_HAS_METHOD (decoding, writer);
5563 LSTREAM_HAS_METHOD (decoding, rewinder);
5564 LSTREAM_HAS_METHOD (decoding, seekable_p);
5565 LSTREAM_HAS_METHOD (decoding, flusher);
5566 LSTREAM_HAS_METHOD (decoding, closer);
5567 LSTREAM_HAS_METHOD (decoding, marker);
5569 LSTREAM_HAS_METHOD (encoding, reader);
5570 LSTREAM_HAS_METHOD (encoding, writer);
5571 LSTREAM_HAS_METHOD (encoding, rewinder);
5572 LSTREAM_HAS_METHOD (encoding, seekable_p);
5573 LSTREAM_HAS_METHOD (encoding, flusher);
5574 LSTREAM_HAS_METHOD (encoding, closer);
5575 LSTREAM_HAS_METHOD (encoding, marker);
/* Initialize the C-level tables and the Lisp variables of this file.
   Every coding category starts with no associated coding system (Qnil)
   and the priority table starts as the identity permutation.  The
   `file-coding' feature is provided, the coding-system variables are
   declared and set to nil, and enable-multibyte-characters is set
   to 1. */
5579 vars_of_file_coding (void)
5583 /* Initialize to something reasonable ... */
5584 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5586 coding_category_system[i] = Qnil;
5587 coding_category_by_priority[i] = i;
5590 Fprovide (intern ("file-coding"));
5592 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5593 Coding system used for TTY keyboard input.
5594 Not used under a windowing system.
5596 Vkeyboard_coding_system = Qnil;
5598 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5599 Coding system used for TTY display output.
5600 Not used under a windowing system.
5602 Vterminal_coding_system = Qnil;
5604 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5605 Overriding coding system used when reading a file or process.
5606 You should *bind* this, not set it. If this is non-nil, it specifies
5607 the coding system that will be used when a file or process is read
5608 in, and overrides `buffer-file-coding-system-for-read',
5609 `insert-file-contents-pre-hook', etc. Use those variables instead of
5610 this one for permanent changes to the environment.
5612 Vcoding_system_for_read = Qnil;
5614 DEFVAR_LISP ("coding-system-for-write",
5615 &Vcoding_system_for_write /*
5616 Overriding coding system used when writing a file or process.
5617 You should *bind* this, not set it. If this is non-nil, it specifies
5618 the coding system that will be used when a file or process is written
5619 to, and overrides `buffer-file-coding-system',
5620 `write-region-pre-hook', etc. Use those variables instead of this one
5621 for permanent changes to the environment.
5623 Vcoding_system_for_write = Qnil;
5625 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5626 Coding system used to convert pathnames when accessing files.
5628 Vfile_name_coding_system = Qnil;
5630 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5631 Non-nil means the buffer contents are regarded as multi-byte form
5632 of characters, not a binary code. This affects the display, file I/O,
5633 and behaviors of various editing commands.
5635 Setting this to nil does not do anything.
5637 enable_multibyte_characters = 1;
5641 complex_vars_of_file_coding (void)
5643 staticpro (&Vcoding_system_hash_table);
5644 Vcoding_system_hash_table =
5645 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5647 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5649 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5651 struct codesys_prop csp; \
5653 csp.prop_type = (Prop_Type); \
5654 Dynarr_add (the_codesys_prop_dynarr, csp); \
5657 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5658 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5659 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5660 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5661 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5662 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5663 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5665 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5666 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5667 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5668 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5669 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5670 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5671 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5672 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5673 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5674 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5675 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5676 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5677 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5678 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5679 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5680 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5681 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5683 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5684 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5686 /* Need to create this here or we're really screwed. */
5688 (Qraw_text, Qno_conversion,
5689 build_string ("Raw text, which means it converts only line-break-codes."),
5690 list2 (Qmnemonic, build_string ("Raw")));
5693 (Qbinary, Qno_conversion,
5694 build_string ("Binary, which means it does not convert anything."),
5695 list4 (Qeol_type, Qlf,
5696 Qmnemonic, build_string ("Binary")));
5698 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5700 /* Need this for bootstrapping */
5701 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5702 Fget_coding_system (Qraw_text);
5708 for (i = 0; i < 65536; i++)
5709 ucs_to_mule_table[i] = Qnil;
5711 staticpro (&mule_to_ucs_table);
5712 mule_to_ucs_table = Fmake_char_table(Qgeneric);