1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
50 /* Coding system currently associated with each coding category. */
51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
53 /* Table of all coding categories in decreasing order of priority.
54 This describes a permutation of the possible coding categories. */
55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
57 Lisp_Object Qcoding_system_p;
59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
60 /* Qinternal in general.c */
62 Lisp_Object Qmnemonic, Qeol_type;
63 Lisp_Object Qcr, Qcrlf, Qlf;
64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
65 Lisp_Object Qpost_read_conversion;
66 Lisp_Object Qpre_write_conversion;
69 Lisp_Object Qucs4, Qutf8;
70 Lisp_Object Qbig5, Qshift_jis;
71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
74 Lisp_Object Qno_iso6429;
75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
76 Lisp_Object Qctext, Qescape_quoted;
77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
79 Lisp_Object Qencode, Qdecode;
81 Lisp_Object Vcoding_system_hash_table;
83 int enable_multibyte_characters;
86 /* Additional information used by the ISO2022 decoder and detector. */
87 struct iso2022_decoder
89 /* CHARSET holds the character sets currently assigned to the G0
90 through G3 variables. It is initialized from the array
91 INITIAL_CHARSET in CODESYS. */
92 Lisp_Object charset[4];
94 /* Which registers are currently invoked into the left (GL) and
95 right (GR) halves of the 8-bit encoding space? */
96 int register_left, register_right;
98 /* ISO_ESC holds a value indicating part of an escape sequence
99 that has already been seen. */
100 enum iso_esc_flag esc;
102 /* This records the bytes we've seen so far in an escape sequence,
103 in case the sequence is invalid (we spit out the bytes unchanged). */
104 unsigned char esc_bytes[8];
106 /* Index for next byte to store in ISO escape sequence. */
109 #ifdef ENABLE_COMPOSITE_CHARS
110 /* Stuff seen so far when composing a string. */
111 unsigned_char_dynarr *composite_chars;
114 /* If we saw an invalid designation sequence for a particular
115 register, we flag it here and switch to ASCII. The next time we
116 see a valid designation for this register, we turn off the flag
117 and do the designation normally, but pretend the sequence was
118 invalid. The effect of all this is that (most of the time) the
119 escape sequences for both the switch to the unknown charset, and
120 the switch back to the known charset, get inserted literally into
121 the buffer and saved out as such. The hope is that we can
122 preserve the escape sequences so that the resulting written out
123 file makes sense. If we don't do any of this, the designation
124 to the invalid charset will be preserved but that switch back
125 to the known charset will probably get eaten because it was
126 the same charset that was already present in the register. */
127 unsigned char invalid_designated[4];
129 /* We try to do similar things as above for direction-switching
130 sequences. If we encountered a direction switch while an
131 invalid designation was present, or an invalid designation
132 just after a direction switch (i.e. no valid designation
133 encountered yet), we insert the direction-switch escape
134 sequence literally into the output stream, and later on
135 insert the corresponding direction-restoring escape sequence
137 unsigned int switched_dir_and_no_valid_charset_yet :1;
138 unsigned int invalid_switch_dir :1;
140 /* Tells the decoder to output the escape sequence literally
141 even though it was valid. Used in the games we play to
142 avoid lossage when we encounter invalid designations. */
143 unsigned int output_literally :1;
144 /* We encountered a direction switch followed by an invalid
145 designation. We didn't output the direction switch
146 literally because we didn't know about the invalid designation;
147 but we have to do so now. */
148 unsigned int output_direction_sequence :1;
151 EXFUN (Fcopy_coding_system, 2);
153 struct detection_state;
154 static int detect_coding_sjis (struct detection_state *st,
155 CONST unsigned char *src,
157 static void decode_coding_sjis (Lstream *decoding,
158 CONST unsigned char *src,
159 unsigned_char_dynarr *dst,
161 static void encode_coding_sjis (Lstream *encoding,
162 CONST unsigned char *src,
163 unsigned_char_dynarr *dst,
165 static int detect_coding_big5 (struct detection_state *st,
166 CONST unsigned char *src,
168 static void decode_coding_big5 (Lstream *decoding,
169 CONST unsigned char *src,
170 unsigned_char_dynarr *dst, unsigned int n);
171 static void encode_coding_big5 (Lstream *encoding,
172 CONST unsigned char *src,
173 unsigned_char_dynarr *dst, unsigned int n);
174 static int detect_coding_ucs4 (struct detection_state *st,
175 CONST unsigned char *src,
177 static void decode_coding_ucs4 (Lstream *decoding,
178 CONST unsigned char *src,
179 unsigned_char_dynarr *dst, unsigned int n);
180 static void encode_coding_ucs4 (Lstream *encoding,
181 CONST unsigned char *src,
182 unsigned_char_dynarr *dst, unsigned int n);
183 static int detect_coding_utf8 (struct detection_state *st,
184 CONST unsigned char *src,
186 static void decode_coding_utf8 (Lstream *decoding,
187 CONST unsigned char *src,
188 unsigned_char_dynarr *dst, unsigned int n);
189 static void encode_coding_utf8 (Lstream *encoding,
190 CONST unsigned char *src,
191 unsigned_char_dynarr *dst, unsigned int n);
192 static int postprocess_iso2022_mask (int mask);
193 static void reset_iso2022 (Lisp_Object coding_system,
194 struct iso2022_decoder *iso);
195 static int detect_coding_iso2022 (struct detection_state *st,
196 CONST unsigned char *src,
198 static void decode_coding_iso2022 (Lstream *decoding,
199 CONST unsigned char *src,
200 unsigned_char_dynarr *dst, unsigned int n);
201 static void encode_coding_iso2022 (Lstream *encoding,
202 CONST unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
205 static void decode_coding_no_conversion (Lstream *decoding,
206 CONST unsigned char *src,
207 unsigned_char_dynarr *dst,
209 static void encode_coding_no_conversion (Lstream *encoding,
210 CONST unsigned char *src,
211 unsigned_char_dynarr *dst,
213 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
214 unsigned_char_dynarr *dst, unsigned int n);
215 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
216 unsigned_char_dynarr *dst, unsigned int n);
218 typedef struct codesys_prop codesys_prop;
227 Dynarr_declare (codesys_prop);
228 } codesys_prop_dynarr;
230 codesys_prop_dynarr *the_codesys_prop_dynarr;
232 enum codesys_prop_enum
235 CODESYS_PROP_ISO2022,
240 /************************************************************************/
241 /* Coding system functions */
242 /************************************************************************/
244 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
245 static void print_coding_system (Lisp_Object, Lisp_Object, int);
246 static void finalize_coding_system (void *header, int for_disksave);
249 static const struct lrecord_description ccs_description_1[] = {
250 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
254 static const struct struct_description ccs_description = {
255 sizeof(charset_conversion_spec),
259 static const struct lrecord_description ccsd_description_1[] = {
260 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
264 static const struct struct_description ccsd_description = {
265 sizeof(charset_conversion_spec_dynarr),
270 static const struct lrecord_description coding_system_description[] = {
271 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
272 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
273 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
275 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
276 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
277 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
278 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
283 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
284 mark_coding_system, print_coding_system,
285 finalize_coding_system,
286 0, 0, coding_system_description,
287 struct Lisp_Coding_System);
290 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
292 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
294 markobj (CODING_SYSTEM_NAME (codesys));
295 markobj (CODING_SYSTEM_DOC_STRING (codesys));
296 markobj (CODING_SYSTEM_MNEMONIC (codesys));
297 markobj (CODING_SYSTEM_EOL_LF (codesys));
298 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
299 markobj (CODING_SYSTEM_EOL_CR (codesys));
301 switch (CODING_SYSTEM_TYPE (codesys))
305 case CODESYS_ISO2022:
306 for (i = 0; i < 4; i++)
307 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
308 if (codesys->iso2022.input_conv)
310 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
312 struct charset_conversion_spec *ccs =
313 Dynarr_atp (codesys->iso2022.input_conv, i);
314 markobj (ccs->from_charset);
315 markobj (ccs->to_charset);
318 if (codesys->iso2022.output_conv)
320 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
322 struct charset_conversion_spec *ccs =
323 Dynarr_atp (codesys->iso2022.output_conv, i);
324 markobj (ccs->from_charset);
325 markobj (ccs->to_charset);
331 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
332 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
339 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
340 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
344 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
347 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
349 error ("printing unreadable object #<coding_system 0x%x>",
352 write_c_string ("#<coding_system ", printcharfun);
353 print_internal (c->name, printcharfun, 1);
354 write_c_string (">", printcharfun);
358 finalize_coding_system (void *header, int for_disksave)
360 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
361 /* Since coding systems never go away, this function is not
362 necessary. But it would be necessary if we changed things
363 so that coding systems could go away. */
364 if (!for_disksave) /* see comment in lstream.c */
366 switch (CODING_SYSTEM_TYPE (c))
369 case CODESYS_ISO2022:
370 if (c->iso2022.input_conv)
372 Dynarr_free (c->iso2022.input_conv);
373 c->iso2022.input_conv = 0;
375 if (c->iso2022.output_conv)
377 Dynarr_free (c->iso2022.output_conv);
378 c->iso2022.output_conv = 0;
389 symbol_to_eol_type (Lisp_Object symbol)
391 CHECK_SYMBOL (symbol);
392 if (NILP (symbol)) return EOL_AUTODETECT;
393 if (EQ (symbol, Qlf)) return EOL_LF;
394 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
395 if (EQ (symbol, Qcr)) return EOL_CR;
397 signal_simple_error ("Unrecognized eol type", symbol);
398 return EOL_AUTODETECT; /* not reached */
402 eol_type_to_symbol (enum eol_type type)
407 case EOL_LF: return Qlf;
408 case EOL_CRLF: return Qcrlf;
409 case EOL_CR: return Qcr;
410 case EOL_AUTODETECT: return Qnil;
415 setup_eol_coding_systems (Lisp_Coding_System *codesys)
417 Lisp_Object codesys_obj;
418 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
419 char *codesys_name = (char *) alloca (len + 7);
421 char *codesys_mnemonic=0;
423 Lisp_Object codesys_name_sym, sub_codesys_obj;
427 XSETCODING_SYSTEM (codesys_obj, codesys);
429 memcpy (codesys_name,
430 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
432 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
434 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
435 codesys_mnemonic = (char *) alloca (mlen + 7);
436 memcpy (codesys_mnemonic,
437 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
440 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
441 strcpy (codesys_name + len, "-" op_sys); \
443 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
444 codesys_name_sym = intern (codesys_name); \
445 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
446 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
448 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
449 build_string (codesys_mnemonic); \
450 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
453 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
454 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
455 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
458 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
459 Return t if OBJECT is a coding system.
460 A coding system is an object that defines how text containing multiple
461 character sets is encoded into a stream of (typically 8-bit) bytes.
462 The coding system is used to decode the stream into a series of
463 characters (which may be from multiple charsets) when the text is read
464 from a file or process, and is used to encode the text back into the
465 same format when it is written out to a file or process.
467 For example, many ISO2022-compliant coding systems (such as Compound
468 Text, which is used for inter-client data under the X Window System)
469 use escape sequences to switch between different charsets -- Japanese
470 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
471 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
472 `make-coding-system' for more information.
474 Coding systems are normally identified using a symbol, and the
475 symbol is accepted in place of the actual coding system object whenever
476 a coding system is called for. (This is similar to how faces work.)
480 return CODING_SYSTEMP (object) ? Qt : Qnil;
483 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
484 Retrieve the coding system of the given name.
486 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
487 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
488 If there is no such coding system, nil is returned. Otherwise the
489 associated coding system object is returned.
491 (coding_system_or_name))
493 if (CODING_SYSTEMP (coding_system_or_name))
494 return coding_system_or_name;
496 if (NILP (coding_system_or_name))
497 coding_system_or_name = Qbinary;
499 CHECK_SYMBOL (coding_system_or_name);
501 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
504 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
505 Retrieve the coding system of the given name.
506 Same as `find-coding-system' except that if there is no such
507 coding system, an error is signaled instead of returning nil.
511 Lisp_Object coding_system = Ffind_coding_system (name);
513 if (NILP (coding_system))
514 signal_simple_error ("No such coding system", name);
515 return coding_system;
518 /* We store the coding systems in hash tables with the names as the key and the
519 actual coding system object as the value. Occasionally we need to use them
520 in a list format. These routines provide us with that. */
521 struct coding_system_list_closure
523 Lisp_Object *coding_system_list;
527 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
528 void *coding_system_list_closure)
530 /* This function can GC */
531 struct coding_system_list_closure *cscl =
532 (struct coding_system_list_closure *) coding_system_list_closure;
533 Lisp_Object *coding_system_list = cscl->coding_system_list;
535 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
536 *coding_system_list);
540 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
541 Return a list of the names of all defined coding systems.
545 Lisp_Object coding_system_list = Qnil;
547 struct coding_system_list_closure coding_system_list_closure;
549 GCPRO1 (coding_system_list);
550 coding_system_list_closure.coding_system_list = &coding_system_list;
551 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
552 &coding_system_list_closure);
555 return coding_system_list;
558 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
559 Return the name of the given coding system.
563 coding_system = Fget_coding_system (coding_system);
564 return XCODING_SYSTEM_NAME (coding_system);
567 static Lisp_Coding_System *
568 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
570 Lisp_Coding_System *codesys =
571 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
573 zero_lcrecord (codesys);
574 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
575 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
576 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
577 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
578 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
579 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
580 CODING_SYSTEM_TYPE (codesys) = type;
581 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
583 if (type == CODESYS_ISO2022)
586 for (i = 0; i < 4; i++)
587 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
589 else if (type == CODESYS_CCL)
591 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
592 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
595 CODING_SYSTEM_NAME (codesys) = name;
601 /* Given a list of charset conversion specs as specified in a Lisp
602 program, parse it into STORE_HERE. */
605 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
606 Lisp_Object spec_list)
610 EXTERNAL_LIST_LOOP (rest, spec_list)
612 Lisp_Object car = XCAR (rest);
613 Lisp_Object from, to;
614 struct charset_conversion_spec spec;
616 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
617 signal_simple_error ("Invalid charset conversion spec", car);
618 from = Fget_charset (XCAR (car));
619 to = Fget_charset (XCAR (XCDR (car)));
620 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
621 signal_simple_error_2
622 ("Attempted conversion between different charset types",
624 spec.from_charset = from;
625 spec.to_charset = to;
627 Dynarr_add (store_here, spec);
631 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
632 specs, return the equivalent as the Lisp programmer would see it.
634 If LOAD_HERE is 0, return Qnil. */
637 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
644 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
646 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
647 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
650 return Fnreverse (result);
655 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
656 Register symbol NAME as a coding system.
658 TYPE describes the conversion method used and should be one of
661 Automatic conversion. XEmacs attempts to detect the coding system
664 No conversion. Use this for binary files and such. On output,
665 graphic characters that are not in ASCII or Latin-1 will be
666 replaced by a ?. (For a no-conversion-encoded buffer, these
667 characters will only be present if you explicitly insert them.)
669 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
671 ISO 10646 UCS-4 encoding.
673 ISO 10646 UTF-8 encoding.
675 Any ISO2022-compliant encoding. Among other things, this includes
676 JIS (the Japanese encoding commonly used for e-mail), EUC (the
677 standard Unix encoding for Japanese and other languages), and
678 Compound Text (the encoding used in X11). You can specify more
679 specific information about the conversion with the FLAGS argument.
681 Big5 (the encoding commonly used for Taiwanese).
683 The conversion is performed using a user-written pseudo-code
684 program. CCL (Code Conversion Language) is the name of this
687 Write out or read in the raw contents of the memory representing
688 the buffer's text. This is primarily useful for debugging
689 purposes, and is only enabled when XEmacs has been compiled with
690 DEBUG_XEMACS defined (via the --debug configure option).
691 WARNING: Reading in a file using 'internal conversion can result
692 in an internal inconsistency in the memory representing a
693 buffer's text, which will produce unpredictable results and may
694 cause XEmacs to crash. Under normal circumstances you should
695 never use 'internal conversion.
697 DOC-STRING is a string describing the coding system.
699 PROPS is a property list, describing the specific nature of the
700 character set. Recognized properties are:
703 String to be displayed in the modeline when this coding system is
707 End-of-line conversion to be used. It should be one of
710 Automatically detect the end-of-line type (LF, CRLF,
711 or CR). Also generate subsidiary coding systems named
712 `NAME-unix', `NAME-dos', and `NAME-mac', that are
713 identical to this coding system but have an EOL-TYPE
714 value of 'lf, 'crlf, and 'cr, respectively.
716 The end of a line is marked externally using ASCII LF.
717 Since this is also the way that XEmacs represents an
718 end-of-line internally, specifying this option results
719 in no end-of-line conversion. This is the standard
720 format for Unix text files.
722 The end of a line is marked externally using ASCII
723 CRLF. This is the standard format for MS-DOS text
726 The end of a line is marked externally using ASCII CR.
727 This is the standard format for Macintosh text files.
729 Automatically detect the end-of-line type but do not
730 generate subsidiary coding systems. (This value is
731 converted to nil when stored internally, and
732 `coding-system-property' will return nil.)
734 'post-read-conversion
735 Function called after a file has been read in, to perform the
736 decoding. Called with two arguments, BEG and END, denoting
737 a region of the current buffer to be decoded.
739 'pre-write-conversion
740 Function called before a file is written out, to perform the
741 encoding. Called with two arguments, BEG and END, denoting
742 a region of the current buffer to be encoded.
745 The following additional properties are recognized if TYPE is 'iso2022:
751 The character set initially designated to the G0 - G3 registers.
752 The value should be one of
754 -- A charset object (designate that character set)
755 -- nil (do not ever use this register)
756 -- t (no character set is initially designated to
757 the register, but may be later on; this automatically
758 sets the corresponding `force-g*-on-output' property)
764 If non-nil, send an explicit designation sequence on output before
765 using the specified register.
768 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
769 "ESC $ B" on output in place of the full designation sequences
770 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
773 If non-nil, don't designate ASCII to G0 at each end of line on output.
774 Setting this to non-nil also suppresses other state-resetting that
775 normally happens at the end of a line.
778 If non-nil, don't designate ASCII to G0 before control chars on output.
781 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
785 If non-nil, use locking-shift (SO/SI) instead of single-shift
786 or designation by escape sequence.
789 If non-nil, don't use ISO6429's direction specification.
792 If non-nil, literal control characters that are the same as
793 the beginning of a recognized ISO2022 or ISO6429 escape sequence
794 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
795 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
796 so that they can be properly distinguished from an escape sequence.
797 (Note that doing this results in a non-portable encoding.) This
798 encoding flag is used for byte-compiled files. Note that ESC
799 is a good choice for a quoting character because there are no
800 escape sequences whose second byte is a character from the Control-0
801 or Control-1 character sets; this is explicitly disallowed by the
804 'input-charset-conversion
805 A list of conversion specifications, specifying conversion of
806 characters in one charset to another when decoding is performed.
807 Each specification is a list of two elements: the source charset,
808 and the destination charset.
810 'output-charset-conversion
811 A list of conversion specifications, specifying conversion of
812 characters in one charset to another when encoding is performed.
813 The form of each specification is the same as for
814 'input-charset-conversion.
817 The following additional properties are recognized (and required)
821 CCL program used for decoding (converting to internal format).
824 CCL program used for encoding (converting to external format).
826 (name, type, doc_string, props))
828 Lisp_Coding_System *codesys;
829 Lisp_Object rest, key, value;
830 enum coding_system_type ty;
831 int need_to_setup_eol_systems = 1;
833 /* Convert type to constant */
834 if (NILP (type) || EQ (type, Qundecided))
835 { ty = CODESYS_AUTODETECT; }
837 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
838 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
839 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
840 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
841 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
842 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
844 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
846 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
849 signal_simple_error ("Invalid coding system type", type);
853 codesys = allocate_coding_system (ty, name);
855 if (NILP (doc_string))
856 doc_string = build_string ("");
858 CHECK_STRING (doc_string);
859 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
861 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
863 if (EQ (key, Qmnemonic))
866 CHECK_STRING (value);
867 CODING_SYSTEM_MNEMONIC (codesys) = value;
870 else if (EQ (key, Qeol_type))
872 need_to_setup_eol_systems = NILP (value);
875 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
878 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
879 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
881 else if (ty == CODESYS_ISO2022)
883 #define FROB_INITIAL_CHARSET(charset_num) \
884 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
885 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
887 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
888 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
889 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
890 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
892 #define FROB_FORCE_CHARSET(charset_num) \
893 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
895 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
896 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
897 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
898 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
900 #define FROB_BOOLEAN_PROPERTY(prop) \
901 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
903 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
904 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
905 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
906 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
907 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
908 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
909 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
911 else if (EQ (key, Qinput_charset_conversion))
913 codesys->iso2022.input_conv =
914 Dynarr_new (charset_conversion_spec);
915 parse_charset_conversion_specs (codesys->iso2022.input_conv,
918 else if (EQ (key, Qoutput_charset_conversion))
920 codesys->iso2022.output_conv =
921 Dynarr_new (charset_conversion_spec);
922 parse_charset_conversion_specs (codesys->iso2022.output_conv,
926 signal_simple_error ("Unrecognized property", key);
928 else if (EQ (type, Qccl))
930 if (EQ (key, Qdecode))
932 CHECK_VECTOR (value);
933 CODING_SYSTEM_CCL_DECODE (codesys) = value;
935 else if (EQ (key, Qencode))
937 CHECK_VECTOR (value);
938 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
941 signal_simple_error ("Unrecognized property", key);
945 signal_simple_error ("Unrecognized property", key);
948 if (need_to_setup_eol_systems)
949 setup_eol_coding_systems (codesys);
952 Lisp_Object codesys_obj;
953 XSETCODING_SYSTEM (codesys_obj, codesys);
954 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
959 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
960 Copy OLD-CODING-SYSTEM to NEW-NAME.
961 If NEW-NAME does not name an existing coding system, a new one will
964 (old_coding_system, new_name))
966 Lisp_Object new_coding_system;
967 old_coding_system = Fget_coding_system (old_coding_system);
968 new_coding_system = Ffind_coding_system (new_name);
969 if (NILP (new_coding_system))
971 XSETCODING_SYSTEM (new_coding_system,
972 allocate_coding_system
973 (XCODING_SYSTEM_TYPE (old_coding_system),
975 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
979 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
980 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
981 memcpy (((char *) to ) + sizeof (to->header),
982 ((char *) from) + sizeof (from->header),
983 sizeof (*from) - sizeof (from->header));
986 return new_coding_system;
989 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
990 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
992 (alias, coding_system))
994 CHECK_SYMBOL (alias);
995 if (!NILP (Ffind_coding_system (alias)))
996 signal_simple_error ("Symbol already names a coding system", alias);
997 coding_system = Fget_coding_system (coding_system);
998 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1000 /* Set up aliases for subsidiaries. */
1001 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1004 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1005 #define FROB(type, name) \
1007 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1008 if (!NILP (subsidiary)) \
1009 Fdefine_coding_system_alias \
1010 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1013 FROB (CRLF, "-dos");
1017 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1018 but it doesn't look intentional, so I'd rather return something
1019 meaningful or nothing at all. */
1024 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1026 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1027 Lisp_Object new_coding_system;
1029 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1030 return coding_system;
1034 case EOL_AUTODETECT: return coding_system;
1035 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1036 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1037 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1041 return NILP (new_coding_system) ? coding_system : new_coding_system;
1044 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1045 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1047 (coding_system, eol_type))
1049 coding_system = Fget_coding_system (coding_system);
1051 return subsidiary_coding_system (coding_system,
1052 symbol_to_eol_type (eol_type));
1056 /************************************************************************/
1057 /* Coding system accessors */
1058 /************************************************************************/
1060 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1061 Return the doc string for CODING-SYSTEM.
1065 coding_system = Fget_coding_system (coding_system);
1066 return XCODING_SYSTEM_DOC_STRING (coding_system);
1069 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1070 Return the type of CODING-SYSTEM.
1074 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1077 case CODESYS_AUTODETECT: return Qundecided;
1079 case CODESYS_SHIFT_JIS: return Qshift_jis;
1080 case CODESYS_ISO2022: return Qiso2022;
1081 case CODESYS_BIG5: return Qbig5;
1082 case CODESYS_UCS4: return Qucs4;
1083 case CODESYS_UTF8: return Qutf8;
1084 case CODESYS_CCL: return Qccl;
1086 case CODESYS_NO_CONVERSION: return Qno_conversion;
1088 case CODESYS_INTERNAL: return Qinternal;
1095 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1098 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1100 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1103 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1104 Return initial charset of CODING-SYSTEM designated to GNUM.
1107 (coding_system, gnum))
1109 coding_system = Fget_coding_system (coding_system);
1112 return coding_system_charset (coding_system, XINT (gnum));
1116 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1117 Return the PROP property of CODING-SYSTEM.
1119 (coding_system, prop))
1122 enum coding_system_type type;
1124 coding_system = Fget_coding_system (coding_system);
1125 CHECK_SYMBOL (prop);
1126 type = XCODING_SYSTEM_TYPE (coding_system);
1128 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1129 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1132 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1134 case CODESYS_PROP_ALL_OK:
1137 case CODESYS_PROP_ISO2022:
1138 if (type != CODESYS_ISO2022)
1140 ("Property only valid in ISO2022 coding systems",
1144 case CODESYS_PROP_CCL:
1145 if (type != CODESYS_CCL)
1147 ("Property only valid in CCL coding systems",
1157 signal_simple_error ("Unrecognized property", prop);
1159 if (EQ (prop, Qname))
1160 return XCODING_SYSTEM_NAME (coding_system);
1161 else if (EQ (prop, Qtype))
1162 return Fcoding_system_type (coding_system);
1163 else if (EQ (prop, Qdoc_string))
1164 return XCODING_SYSTEM_DOC_STRING (coding_system);
1165 else if (EQ (prop, Qmnemonic))
1166 return XCODING_SYSTEM_MNEMONIC (coding_system);
1167 else if (EQ (prop, Qeol_type))
1168 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1169 else if (EQ (prop, Qeol_lf))
1170 return XCODING_SYSTEM_EOL_LF (coding_system);
1171 else if (EQ (prop, Qeol_crlf))
1172 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1173 else if (EQ (prop, Qeol_cr))
1174 return XCODING_SYSTEM_EOL_CR (coding_system);
1175 else if (EQ (prop, Qpost_read_conversion))
1176 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1177 else if (EQ (prop, Qpre_write_conversion))
1178 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1180 else if (type == CODESYS_ISO2022)
1182 if (EQ (prop, Qcharset_g0))
1183 return coding_system_charset (coding_system, 0);
1184 else if (EQ (prop, Qcharset_g1))
1185 return coding_system_charset (coding_system, 1);
1186 else if (EQ (prop, Qcharset_g2))
1187 return coding_system_charset (coding_system, 2);
1188 else if (EQ (prop, Qcharset_g3))
1189 return coding_system_charset (coding_system, 3);
1191 #define FORCE_CHARSET(charset_num) \
1192 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1193 (coding_system, charset_num) ? Qt : Qnil)
1195 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1196 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1197 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1198 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1200 #define LISP_BOOLEAN(prop) \
1201 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1203 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1204 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1205 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1206 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1207 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1208 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1209 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1211 else if (EQ (prop, Qinput_charset_conversion))
1213 unparse_charset_conversion_specs
1214 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1215 else if (EQ (prop, Qoutput_charset_conversion))
1217 unparse_charset_conversion_specs
1218 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1222 else if (type == CODESYS_CCL)
1224 if (EQ (prop, Qdecode))
1225 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1226 else if (EQ (prop, Qencode))
1227 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1235 return Qnil; /* not reached */
1239 /************************************************************************/
1240 /* Coding category functions */
1241 /************************************************************************/
1244 decode_coding_category (Lisp_Object symbol)
1248 CHECK_SYMBOL (symbol);
1249 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1250 if (EQ (coding_category_symbol[i], symbol))
1253 signal_simple_error ("Unrecognized coding category", symbol);
1254 return 0; /* not reached */
1257 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1258 Return a list of all recognized coding categories.
1263 Lisp_Object list = Qnil;
1265 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1266 list = Fcons (coding_category_symbol[i], list);
1270 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1271 Change the priority order of the coding categories.
1272 LIST should be list of coding categories, in descending order of
1273 priority. Unspecified coding categories will be lower in priority
1274 than all specified ones, in the same relative order they were in
1279 int category_to_priority[CODING_CATEGORY_LAST + 1];
1283 /* First generate a list that maps coding categories to priorities. */
1285 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1286 category_to_priority[i] = -1;
1288 /* Highest priority comes from the specified list. */
1290 EXTERNAL_LIST_LOOP (rest, list)
1292 int cat = decode_coding_category (XCAR (rest));
1294 if (category_to_priority[cat] >= 0)
1295 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1296 category_to_priority[cat] = i++;
1299 /* Now go through the existing categories by priority to retrieve
1300 the categories not yet specified and preserve their priority
1302 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1304 int cat = coding_category_by_priority[j];
1305 if (category_to_priority[cat] < 0)
1306 category_to_priority[cat] = i++;
1309 /* Now we need to construct the inverse of the mapping we just
1312 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1313 coding_category_by_priority[category_to_priority[i]] = i;
1315 /* Phew! That was confusing. */
1319 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1320 Return a list of coding categories in descending order of priority.
1325 Lisp_Object list = Qnil;
1327 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1328 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1333 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1334 Change the coding system associated with a coding category.
1336 (coding_category, coding_system))
1338 int cat = decode_coding_category (coding_category);
1340 coding_system = Fget_coding_system (coding_system);
1341 coding_category_system[cat] = coding_system;
1345 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1346 Return the coding system associated with a coding category.
1350 int cat = decode_coding_category (coding_category);
1351 Lisp_Object sys = coding_category_system[cat];
1354 return XCODING_SYSTEM_NAME (sys);
1359 /************************************************************************/
1360 /* Detecting the encoding of data */
1361 /************************************************************************/
1363 struct detection_state
1365 enum eol_type eol_type;
1401 struct iso2022_decoder iso;
1403 int high_byte_count;
1404 unsigned int saw_single_shift:1;
1417 acceptable_control_char_p (int c)
1421 /* Allow and ignore control characters that you might
1422 reasonably see in a text file */
1427 case 8: /* backspace */
1428 case 11: /* vertical tab */
1429 case 12: /* form feed */
1430 case 26: /* MS-DOS C-z junk */
1431 case 31: /* '^_' -- for info */
1439 mask_has_at_most_one_bit_p (int mask)
1441 /* Perhaps the only thing useful you learn from intensive Microsoft
1442 technical interviews */
1443 return (mask & (mask - 1)) == 0;
1446 static enum eol_type
1447 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1456 st->eol.just_saw_cr = 1;
1461 if (st->eol.just_saw_cr)
1463 else if (st->eol.seen_anything)
1466 else if (st->eol.just_saw_cr)
1468 st->eol.just_saw_cr = 0;
1470 st->eol.seen_anything = 1;
1473 return EOL_AUTODETECT;
1476 /* Attempt to determine the encoding and EOL type of the given text.
1477 Before calling this function for the first type, you must initialize
1478 st->eol_type as appropriate and initialize st->mask to ~0.
1480 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1483 st->mask holds the determined coding category mask, or ~0 if only
1484 ASCII has been seen so far.
1488 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1489 is present in st->mask
1490 1 == definitive answers are here for both st->eol_type and st->mask
1494 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1495 unsigned int n, int just_do_eol)
1499 if (st->eol_type == EOL_AUTODETECT)
1500 st->eol_type = detect_eol_type (st, src, n);
1503 return st->eol_type != EOL_AUTODETECT;
1505 if (!st->seen_non_ascii)
1507 for (; n; n--, src++)
1510 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1512 st->seen_non_ascii = 1;
1514 st->shift_jis.mask = ~0;
1518 st->iso2022.mask = ~0;
1528 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1529 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1530 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1531 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1532 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1533 st->big5.mask = detect_coding_big5 (st, src, n);
1534 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1535 st->utf8.mask = detect_coding_utf8 (st, src, n);
1536 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1537 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1540 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1541 | st->utf8.mask | st->ucs4.mask;
1544 int retval = mask_has_at_most_one_bit_p (st->mask);
1545 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1546 return retval && st->eol_type != EOL_AUTODETECT;
1551 coding_system_from_mask (int mask)
1555 /* If the file was entirely or basically ASCII, use the
1556 default value of `buffer-file-coding-system'. */
1557 Lisp_Object retval =
1558 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1561 retval = Ffind_coding_system (retval);
1565 (Qbad_variable, Qwarning,
1566 "Invalid `default-buffer-file-coding-system', set to nil");
1567 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1571 retval = Fget_coding_system (Qraw_text);
1579 mask = postprocess_iso2022_mask (mask);
1581 /* Look through the coding categories by priority and find
1582 the first one that is allowed. */
1583 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1585 cat = coding_category_by_priority[i];
1586 if ((mask & (1 << cat)) &&
1587 !NILP (coding_category_system[cat]))
1591 return coding_category_system[cat];
1593 return Fget_coding_system (Qraw_text);
1597 /* Given a seekable read stream and potential coding system and EOL type
1598 as specified, do any autodetection that is called for. If the
1599 coding system and/or EOL type are not autodetect, they will be left
1600 alone; but this function will never return an autodetect coding system
1603 This function does not automatically fetch subsidiary coding systems;
1604 that should be unnecessary with the explicit eol-type argument. */
1607 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1608 enum eol_type *eol_type_in_out)
1610 struct detection_state decst;
1612 if (*eol_type_in_out == EOL_AUTODETECT)
1613 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1616 decst.eol_type = *eol_type_in_out;
1619 /* If autodetection is called for, do it now. */
1620 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1621 *eol_type_in_out == EOL_AUTODETECT)
1626 unsigned char random_buffer[4096];
1629 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1632 if (detect_coding_type (&decst, random_buffer, nread,
1633 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1634 CODESYS_AUTODETECT))
1638 *eol_type_in_out = decst.eol_type;
1639 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1640 *codesys_in_out = coding_system_from_mask (decst.mask);
1643 /* If we absolutely can't determine the EOL type, just assume LF. */
1644 if (*eol_type_in_out == EOL_AUTODETECT)
1645 *eol_type_in_out = EOL_LF;
1647 Lstream_rewind (stream);
1650 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1651 Detect coding system of the text in the region between START and END.
1652 Returned a list of possible coding systems ordered by priority.
1653 If only ASCII characters are found, it returns 'undecided or one of
1654 its subsidiary coding systems according to a detected end-of-line
1655 type. Optional arg BUFFER defaults to the current buffer.
1657 (start, end, buffer))
1659 Lisp_Object val = Qnil;
1660 struct buffer *buf = decode_buffer (buffer, 0);
1662 Lisp_Object instream, lb_instream;
1663 Lstream *istr, *lb_istr;
1664 struct detection_state decst;
1665 struct gcpro gcpro1, gcpro2;
1667 get_buffer_range_char (buf, start, end, &b, &e, 0);
1668 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1669 lb_istr = XLSTREAM (lb_instream);
1670 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1671 istr = XLSTREAM (instream);
1672 GCPRO2 (instream, lb_instream);
1674 decst.eol_type = EOL_AUTODETECT;
1678 unsigned char random_buffer[4096];
1679 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1683 if (detect_coding_type (&decst, random_buffer, nread, 0))
1687 if (decst.mask == ~0)
1688 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1696 decst.mask = postprocess_iso2022_mask (decst.mask);
1698 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1700 int sys = coding_category_by_priority[i];
1701 if (decst.mask & (1 << sys))
1703 Lisp_Object codesys = coding_category_system[sys];
1704 if (!NILP (codesys))
1705 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1706 val = Fcons (codesys, val);
1710 Lstream_close (istr);
1712 Lstream_delete (istr);
1713 Lstream_delete (lb_istr);
1718 /************************************************************************/
1719 /* Converting to internal Mule format ("decoding") */
1720 /************************************************************************/
1722 /* A decoding stream is a stream used for decoding text (i.e.
1723 converting from some external format to internal format).
1724 The decoding-stream object keeps track of the actual coding
1725 stream, the stream that is at the other end, and data that
1726 needs to be persistent across the lifetime of the stream. */
1728 /* Handle the EOL stuff related to just-read-in character C.
1729 EOL_TYPE is the EOL type of the coding stream.
1730 FLAGS is the current value of FLAGS in the coding stream, and may
1731 be modified by this macro. (The macro only looks at the
1732 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1733 bytes are to be written. You need to also define a local goto
1734 label "label_continue_loop" that is at the end of the main
1735 character-reading loop.
1737 If C is a CR character, then this macro handles it entirely and
1738 jumps to label_continue_loop. Otherwise, this macro does not add
1739 anything to DST, and continues normally. You should continue
1740 processing C normally after this macro. */
1742 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1746 if (eol_type == EOL_CR) \
1747 Dynarr_add (dst, '\n'); \
1748 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1749 Dynarr_add (dst, c); \
1751 flags |= CODING_STATE_CR; \
1752 goto label_continue_loop; \
1754 else if (flags & CODING_STATE_CR) \
1755 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1757 Dynarr_add (dst, '\r'); \
1758 flags &= ~CODING_STATE_CR; \
1762 /* C should be a binary character in the range 0 - 255; convert
1763 to internal format and add to Dynarr DST. */
1765 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1767 if (BYTE_ASCII_P (c)) \
1768 Dynarr_add (dst, c); \
1769 else if (BYTE_C1_P (c)) \
1771 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1772 Dynarr_add (dst, c + 0x20); \
1776 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1777 Dynarr_add (dst, c); \
1781 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1785 DECODE_ADD_BINARY_CHAR (ch, dst); \
1790 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1792 if (flags & CODING_STATE_END) \
1794 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1795 if (flags & CODING_STATE_CR) \
1796 Dynarr_add (dst, '\r'); \
1800 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1802 struct decoding_stream
1804 /* Coding system that governs the conversion. */
1805 Lisp_Coding_System *codesys;
1807 /* Stream that we read the encoded data from or
1808 write the decoded data to. */
1811 /* If we are reading, then we can return only a fixed amount of
1812 data, so if the conversion resulted in too much data, we store it
1813 here for retrieval the next time around. */
1814 unsigned_char_dynarr *runoff;
1816 /* FLAGS holds flags indicating the current state of the decoding.
1817 Some of these flags are dependent on the coding system. */
1820 /* CH holds a partially built-up character. Since we only deal
1821 with one- and two-byte characters at the moment, we only use
1822 this to store the first byte of a two-byte character. */
1825 /* EOL_TYPE specifies the type of end-of-line conversion that
1826 currently applies. We need to keep this separate from the
1827 EOL type stored in CODESYS because the latter might indicate
1828 automatic EOL-type detection while the former will always
1829 indicate a particular EOL type. */
1830 enum eol_type eol_type;
1832 /* Additional ISO2022 information. We define the structure above
1833 because it's also needed by the detection routines. */
1834 struct iso2022_decoder iso2022;
1836 /* Additional information (the state of the running CCL program)
1837 used by the CCL decoder. */
1838 struct ccl_program ccl;
1840 struct detection_state decst;
1843 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1844 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1845 static int decoding_rewinder (Lstream *stream);
1846 static int decoding_seekable_p (Lstream *stream);
1847 static int decoding_flusher (Lstream *stream);
1848 static int decoding_closer (Lstream *stream);
1850 static Lisp_Object decoding_marker (Lisp_Object stream,
1851 void (*markobj) (Lisp_Object));
1853 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1854 sizeof (struct decoding_stream));
1857 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1859 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1860 Lisp_Object str_obj;
1862 /* We do not need to mark the coding systems or charsets stored
1863 within the stream because they are stored in a global list
1864 and automatically marked. */
1866 XSETLSTREAM (str_obj, str);
1868 if (str->imp->marker)
1869 return (str->imp->marker) (str_obj, markobj);
1874 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1875 so we read data from the other end, decode it, and store it into DATA. */
1878 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1880 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1881 unsigned char *orig_data = data;
1883 int error_occurred = 0;
1885 /* We need to interface to mule_decode(), which expects to take some
1886 amount of data and store the result into a Dynarr. We have
1887 mule_decode() store into str->runoff, and take data from there
1890 /* We loop until we have enough data, reading chunks from the other
1891 end and decoding it. */
1894 /* Take data from the runoff if we can. Make sure to take at
1895 most SIZE bytes, and delete the data from the runoff. */
1896 if (Dynarr_length (str->runoff) > 0)
1898 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1899 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1900 Dynarr_delete_many (str->runoff, 0, chunk);
1906 break; /* No more room for data */
1908 if (str->flags & CODING_STATE_END)
1909 /* This means that on the previous iteration, we hit the EOF on
1910 the other end. We loop once more so that mule_decode() can
1911 output any final stuff it may be holding, or any "go back
1912 to a sane state" escape sequences. (This latter makes sense
1913 during encoding.) */
1916 /* Exhausted the runoff, so get some more. DATA has at least
1917 SIZE bytes left of storage in it, so it's OK to read directly
1918 into it. (We'll be overwriting above, after we've decoded it
1919 into the runoff.) */
1920 read_size = Lstream_read (str->other_end, data, size);
1927 /* There might be some more end data produced in the translation.
1928 See the comment above. */
1929 str->flags |= CODING_STATE_END;
1930 mule_decode (stream, data, str->runoff, read_size);
1933 if (data - orig_data == 0)
1934 return error_occurred ? -1 : 0;
1936 return data - orig_data;
1940 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1942 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1945 /* Decode all our data into the runoff, and then attempt to write
1946 it all out to the other end. Remove whatever chunk we succeeded
1948 mule_decode (stream, data, str->runoff, size);
1949 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1950 Dynarr_length (str->runoff));
1952 Dynarr_delete_many (str->runoff, 0, retval);
1953 /* Do NOT return retval. The return value indicates how much
1954 of the incoming data was written, not how many bytes were
1960 reset_decoding_stream (struct decoding_stream *str)
1963 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1965 Lisp_Object coding_system;
1966 XSETCODING_SYSTEM (coding_system, str->codesys);
1967 reset_iso2022 (coding_system, &str->iso2022);
1969 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1971 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1974 str->flags = str->ch = 0;
1978 decoding_rewinder (Lstream *stream)
1980 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1981 reset_decoding_stream (str);
1982 Dynarr_reset (str->runoff);
1983 return Lstream_rewind (str->other_end);
1987 decoding_seekable_p (Lstream *stream)
1989 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1990 return Lstream_seekable_p (str->other_end);
1994 decoding_flusher (Lstream *stream)
1996 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1997 return Lstream_flush (str->other_end);
2001 decoding_closer (Lstream *stream)
2003 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2004 if (stream->flags & LSTREAM_FL_WRITE)
2006 str->flags |= CODING_STATE_END;
2007 decoding_writer (stream, 0, 0);
2009 Dynarr_free (str->runoff);
2011 #ifdef ENABLE_COMPOSITE_CHARS
2012 if (str->iso2022.composite_chars)
2013 Dynarr_free (str->iso2022.composite_chars);
2016 return Lstream_close (str->other_end);
2020 decoding_stream_coding_system (Lstream *stream)
2022 Lisp_Object coding_system;
2023 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2025 XSETCODING_SYSTEM (coding_system, str->codesys);
2026 return subsidiary_coding_system (coding_system, str->eol_type);
2030 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2032 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2033 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2035 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2036 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2037 reset_decoding_stream (str);
2040 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2041 stream for writing, no automatic code detection will be performed.
2042 The reason for this is that automatic code detection requires a
2043 seekable input. Things will also fail if you open a decoding
2044 stream for reading using a non-fully-specified coding system and
2045 a non-seekable input stream. */
2048 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2051 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2052 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2056 str->other_end = stream;
2057 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2058 str->eol_type = EOL_AUTODETECT;
2059 if (!strcmp (mode, "r")
2060 && Lstream_seekable_p (stream))
2061 /* We can determine the coding system now. */
2062 determine_real_coding_system (stream, &codesys, &str->eol_type);
2063 set_decoding_stream_coding_system (lstr, codesys);
2064 str->decst.eol_type = str->eol_type;
2065 str->decst.mask = ~0;
2066 XSETLSTREAM (obj, lstr);
2071 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2073 return make_decoding_stream_1 (stream, codesys, "r");
2077 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2079 return make_decoding_stream_1 (stream, codesys, "w");
2082 /* Note: the decode_coding_* functions all take the same
2083 arguments as mule_decode(), which is to say some SRC data of
2084 size N, which is to be stored into dynamic array DST.
2085 DECODING is the stream within which the decoding is
2086 taking place, but no data is actually read from or
2087 written to that stream; that is handled in decoding_reader()
2088 or decoding_writer(). This allows the same functions to
2089 be used for both reading and writing. */
2092 mule_decode (Lstream *decoding, CONST unsigned char *src,
2093 unsigned_char_dynarr *dst, unsigned int n)
2095 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2097 /* If necessary, do encoding-detection now. We do this when
2098 we're a writing stream or a non-seekable reading stream,
2099 meaning that we can't just process the whole input,
2100 rewind, and start over. */
2102 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2103 str->eol_type == EOL_AUTODETECT)
2105 Lisp_Object codesys;
2107 XSETCODING_SYSTEM (codesys, str->codesys);
2108 detect_coding_type (&str->decst, src, n,
2109 CODING_SYSTEM_TYPE (str->codesys) !=
2110 CODESYS_AUTODETECT);
2111 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2112 str->decst.mask != ~0)
2113 /* #### This is cheesy. What we really ought to do is
2114 buffer up a certain amount of data so as to get a
2115 less random result. */
2116 codesys = coding_system_from_mask (str->decst.mask);
2117 str->eol_type = str->decst.eol_type;
2118 if (XCODING_SYSTEM (codesys) != str->codesys)
2120 /* Preserve the CODING_STATE_END flag in case it was set.
2121 If we erase it, bad things might happen. */
2122 int was_end = str->flags & CODING_STATE_END;
2123 set_decoding_stream_coding_system (decoding, codesys);
2125 str->flags |= CODING_STATE_END;
2129 switch (CODING_SYSTEM_TYPE (str->codesys))
2132 case CODESYS_INTERNAL:
2133 Dynarr_add_many (dst, src, n);
2136 case CODESYS_AUTODETECT:
2137 /* If we got this far and still haven't decided on the coding
2138 system, then do no conversion. */
2139 case CODESYS_NO_CONVERSION:
2140 decode_coding_no_conversion (decoding, src, dst, n);
2143 case CODESYS_SHIFT_JIS:
2144 decode_coding_sjis (decoding, src, dst, n);
2147 decode_coding_big5 (decoding, src, dst, n);
2150 decode_coding_ucs4 (decoding, src, dst, n);
2153 decode_coding_utf8 (decoding, src, dst, n);
2156 str->ccl.last_block = str->flags & CODING_STATE_END;
2157 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2159 case CODESYS_ISO2022:
2160 decode_coding_iso2022 (decoding, src, dst, n);
2168 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2169 Decode the text between START and END which is encoded in CODING-SYSTEM.
2170 This is useful if you've read in encoded text from a file without decoding
2171 it (e.g. you read in a JIS-formatted file but used the `binary' or
2172 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2173 Return length of decoded text.
2174 BUFFER defaults to the current buffer if unspecified.
2176 (start, end, coding_system, buffer))
2179 struct buffer *buf = decode_buffer (buffer, 0);
2180 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2181 Lstream *istr, *ostr;
2182 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2184 get_buffer_range_char (buf, start, end, &b, &e, 0);
2186 barf_if_buffer_read_only (buf, b, e);
2188 coding_system = Fget_coding_system (coding_system);
2189 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2190 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2191 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2193 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2194 Fget_coding_system (Qbinary));
2195 istr = XLSTREAM (instream);
2196 ostr = XLSTREAM (outstream);
2197 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2199 /* The chain of streams looks like this:
2201 [BUFFER] <----- send through
2202 ------> [ENCODE AS BINARY]
2203 ------> [DECODE AS SPECIFIED]
2209 char tempbuf[1024]; /* some random amount */
2210 Bufpos newpos, even_newer_pos;
2211 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2212 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2216 newpos = lisp_buffer_stream_startpos (istr);
2217 Lstream_write (ostr, tempbuf, size_in_bytes);
2218 even_newer_pos = lisp_buffer_stream_startpos (istr);
2219 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2222 Lstream_close (istr);
2223 Lstream_close (ostr);
2225 Lstream_delete (istr);
2226 Lstream_delete (ostr);
2227 Lstream_delete (XLSTREAM (de_outstream));
2228 Lstream_delete (XLSTREAM (lb_outstream));
2233 /************************************************************************/
2234 /* Converting to an external encoding ("encoding") */
2235 /************************************************************************/
2237 /* An encoding stream is an output stream. When you create the
2238 stream, you specify the coding system that governs the encoding
2239 and another stream that the resulting encoded data is to be
2240 sent to, and then start sending data to it. */
2242 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2244 struct encoding_stream
2246 /* Coding system that governs the conversion. */
2247 Lisp_Coding_System *codesys;
2249 /* Stream that we read the encoded data from or
2250 write the decoded data to. */
2253 /* If we are reading, then we can return only a fixed amount of
2254 data, so if the conversion resulted in too much data, we store it
2255 here for retrieval the next time around. */
2256 unsigned_char_dynarr *runoff;
2258 /* FLAGS holds flags indicating the current state of the encoding.
2259 Some of these flags are dependent on the coding system. */
2262 /* CH holds a partially built-up character. Since we only deal
2263 with one- and two-byte characters at the moment, we only use
2264 this to store the first byte of a two-byte character. */
2267 /* Additional information used by the ISO2022 encoder. */
2270 /* CHARSET holds the character sets currently assigned to the G0
2271 through G3 registers. It is initialized from the array
2272 INITIAL_CHARSET in CODESYS. */
2273 Lisp_Object charset[4];
2275 /* Which registers are currently invoked into the left (GL) and
2276 right (GR) halves of the 8-bit encoding space? */
2277 int register_left, register_right;
2279 /* Whether we need to explicitly designate the charset in the
2280 G? register before using it. It is initialized from the
2281 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2282 unsigned char force_charset_on_output[4];
2284 /* Other state variables that need to be preserved across
2286 Lisp_Object current_charset;
2288 int current_char_boundary;
2291 /* Additional information (the state of the running CCL program)
2292 used by the CCL encoder. */
2293 struct ccl_program ccl;
2297 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2298 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2300 static int encoding_rewinder (Lstream *stream);
2301 static int encoding_seekable_p (Lstream *stream);
2302 static int encoding_flusher (Lstream *stream);
2303 static int encoding_closer (Lstream *stream);
2305 static Lisp_Object encoding_marker (Lisp_Object stream,
2306 void (*markobj) (Lisp_Object));
2308 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2309 sizeof (struct encoding_stream));
2312 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2314 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2315 Lisp_Object str_obj;
2317 /* We do not need to mark the coding systems or charsets stored
2318 within the stream because they are stored in a global list
2319 and automatically marked. */
2321 XSETLSTREAM (str_obj, str);
2323 if (str->imp->marker)
2324 return (str->imp->marker) (str_obj, markobj);
2329 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2330 so we read data from the other end, encode it, and store it into DATA. */
2333 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2335 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2336 unsigned char *orig_data = data;
2338 int error_occurred = 0;
2340 /* We need to interface to mule_encode(), which expects to take some
2341 amount of data and store the result into a Dynarr. We have
2342 mule_encode() store into str->runoff, and take data from there
2345 /* We loop until we have enough data, reading chunks from the other
2346 end and encoding it. */
2349 /* Take data from the runoff if we can. Make sure to take at
2350 most SIZE bytes, and delete the data from the runoff. */
2351 if (Dynarr_length (str->runoff) > 0)
2353 int chunk = min ((int) size, Dynarr_length (str->runoff));
2354 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2355 Dynarr_delete_many (str->runoff, 0, chunk);
2361 break; /* No more room for data */
2363 if (str->flags & CODING_STATE_END)
2364 /* This means that on the previous iteration, we hit the EOF on
2365 the other end. We loop once more so that mule_encode() can
2366 output any final stuff it may be holding, or any "go back
2367 to a sane state" escape sequences. (This latter makes sense
2368 during encoding.) */
2371 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2372 left of storage in it, so it's OK to read directly into it.
2373 (We'll be overwriting above, after we've encoded it into the
2375 read_size = Lstream_read (str->other_end, data, size);
2382 /* There might be some more end data produced in the translation.
2383 See the comment above. */
2384 str->flags |= CODING_STATE_END;
2385 mule_encode (stream, data, str->runoff, read_size);
2388 if (data == orig_data)
2389 return error_occurred ? -1 : 0;
2391 return data - orig_data;
2395 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2397 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2400 /* Encode all our data into the runoff, and then attempt to write
2401 it all out to the other end. Remove whatever chunk we succeeded
2403 mule_encode (stream, data, str->runoff, size);
2404 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2405 Dynarr_length (str->runoff));
2407 Dynarr_delete_many (str->runoff, 0, retval);
2408 /* Do NOT return retval. The return value indicates how much
2409 of the incoming data was written, not how many bytes were
2415 reset_encoding_stream (struct encoding_stream *str)
2418 switch (CODING_SYSTEM_TYPE (str->codesys))
2420 case CODESYS_ISO2022:
2424 for (i = 0; i < 4; i++)
2426 str->iso2022.charset[i] =
2427 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2428 str->iso2022.force_charset_on_output[i] =
2429 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2431 str->iso2022.register_left = 0;
2432 str->iso2022.register_right = 1;
2433 str->iso2022.current_charset = Qnil;
2434 str->iso2022.current_half = 0;
2435 str->iso2022.current_char_boundary = 1;
2439 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2446 str->flags = str->ch = 0;
2450 encoding_rewinder (Lstream *stream)
2452 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2453 reset_encoding_stream (str);
2454 Dynarr_reset (str->runoff);
2455 return Lstream_rewind (str->other_end);
2459 encoding_seekable_p (Lstream *stream)
2461 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2462 return Lstream_seekable_p (str->other_end);
2466 encoding_flusher (Lstream *stream)
2468 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2469 return Lstream_flush (str->other_end);
2473 encoding_closer (Lstream *stream)
2475 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2476 if (stream->flags & LSTREAM_FL_WRITE)
2478 str->flags |= CODING_STATE_END;
2479 encoding_writer (stream, 0, 0);
2481 Dynarr_free (str->runoff);
2482 return Lstream_close (str->other_end);
2486 encoding_stream_coding_system (Lstream *stream)
2488 Lisp_Object coding_system;
2489 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2491 XSETCODING_SYSTEM (coding_system, str->codesys);
2492 return coding_system;
2496 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2498 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2499 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2501 reset_encoding_stream (str);
2505 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2508 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2509 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2513 str->runoff = Dynarr_new (unsigned_char);
2514 str->other_end = stream;
2515 set_encoding_stream_coding_system (lstr, codesys);
2516 XSETLSTREAM (obj, lstr);
2521 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2523 return make_encoding_stream_1 (stream, codesys, "r");
2527 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2529 return make_encoding_stream_1 (stream, codesys, "w");
2532 /* Convert N bytes of internally-formatted data stored in SRC to an
2533 external format, according to the encoding stream ENCODING.
2534 Store the encoded data into DST. */
2537 mule_encode (Lstream *encoding, CONST unsigned char *src,
2538 unsigned_char_dynarr *dst, unsigned int n)
2540 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2542 switch (CODING_SYSTEM_TYPE (str->codesys))
2545 case CODESYS_INTERNAL:
2546 Dynarr_add_many (dst, src, n);
2549 case CODESYS_AUTODETECT:
2550 /* If we got this far and still haven't decided on the coding
2551 system, then do no conversion. */
2552 case CODESYS_NO_CONVERSION:
2553 encode_coding_no_conversion (encoding, src, dst, n);
2556 case CODESYS_SHIFT_JIS:
2557 encode_coding_sjis (encoding, src, dst, n);
2560 encode_coding_big5 (encoding, src, dst, n);
2563 encode_coding_ucs4 (encoding, src, dst, n);
2566 encode_coding_utf8 (encoding, src, dst, n);
2569 str->ccl.last_block = str->flags & CODING_STATE_END;
2570 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2572 case CODESYS_ISO2022:
2573 encode_coding_iso2022 (encoding, src, dst, n);
2581 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2582 Encode the text between START and END using CODING-SYSTEM.
2583 This will, for example, convert Japanese characters into stuff such as
2584 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2585 text. BUFFER defaults to the current buffer if unspecified.
2587 (start, end, coding_system, buffer))
2590 struct buffer *buf = decode_buffer (buffer, 0);
2591 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2592 Lstream *istr, *ostr;
2593 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2595 get_buffer_range_char (buf, start, end, &b, &e, 0);
2597 barf_if_buffer_read_only (buf, b, e);
2599 coding_system = Fget_coding_system (coding_system);
2600 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2601 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2602 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2603 Fget_coding_system (Qbinary));
2604 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2606 istr = XLSTREAM (instream);
2607 ostr = XLSTREAM (outstream);
2608 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2609 /* The chain of streams looks like this:
2611 [BUFFER] <----- send through
2612 ------> [ENCODE AS SPECIFIED]
2613 ------> [DECODE AS BINARY]
2618 char tempbuf[1024]; /* some random amount */
2619 Bufpos newpos, even_newer_pos;
2620 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2621 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2625 newpos = lisp_buffer_stream_startpos (istr);
2626 Lstream_write (ostr, tempbuf, size_in_bytes);
2627 even_newer_pos = lisp_buffer_stream_startpos (istr);
2628 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2634 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2635 Lstream_close (istr);
2636 Lstream_close (ostr);
2638 Lstream_delete (istr);
2639 Lstream_delete (ostr);
2640 Lstream_delete (XLSTREAM (de_outstream));
2641 Lstream_delete (XLSTREAM (lb_outstream));
2642 return make_int (retlen);
2648 /************************************************************************/
2649 /* Shift-JIS methods */
2650 /************************************************************************/
2652 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2653 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2654 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2655 encoded by "position-code + 0x80". A character of JISX0208
2656 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2657 position-codes are divided and shifted so that it fit in the range
2660 --- CODE RANGE of Shift-JIS ---
2661 (character set) (range)
2663 JISX0201-Kana 0xA0 .. 0xDF
2664 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2665 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2666 -------------------------------
2670 /* Is this the first byte of a Shift-JIS two-byte char? */
2672 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2673 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2675 /* Is this the second byte of a Shift-JIS two-byte char? */
2677 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2678 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2680 #define BYTE_SJIS_KATAKANA_P(c) \
2681 ((c) >= 0xA1 && (c) <= 0xDF)
2684 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2692 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2694 if (st->shift_jis.in_second_byte)
2696 st->shift_jis.in_second_byte = 0;
2700 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2701 st->shift_jis.in_second_byte = 1;
2703 return CODING_CATEGORY_SHIFT_JIS_MASK;
2706 /* Convert Shift-JIS data to internal format. */
2709 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2710 unsigned_char_dynarr *dst, unsigned int n)
2713 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2714 unsigned int flags = str->flags;
2715 unsigned int ch = str->ch;
2716 eol_type_t eol_type = str->eol_type;
2724 /* Previous character was first byte of Shift-JIS Kanji char. */
2725 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2727 unsigned char e1, e2;
2729 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2730 DECODE_SJIS (ch, c, e1, e2);
2731 Dynarr_add (dst, e1);
2732 Dynarr_add (dst, e2);
2736 DECODE_ADD_BINARY_CHAR (ch, dst);
2737 DECODE_ADD_BINARY_CHAR (c, dst);
2743 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2744 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2746 else if (BYTE_SJIS_KATAKANA_P (c))
2748 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2749 Dynarr_add (dst, c);
2752 DECODE_ADD_BINARY_CHAR (c, dst);
2754 label_continue_loop:;
2757 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2763 /* Convert internally-formatted data to Shift-JIS. */
2766 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2767 unsigned_char_dynarr *dst, unsigned int n)
2770 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2771 unsigned int flags = str->flags;
2772 unsigned int ch = str->ch;
2773 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2780 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2781 Dynarr_add (dst, '\r');
2782 if (eol_type != EOL_CR)
2783 Dynarr_add (dst, '\n');
2786 else if (BYTE_ASCII_P (c))
2788 Dynarr_add (dst, c);
2791 else if (BUFBYTE_LEADING_BYTE_P (c))
2792 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2793 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2794 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2797 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2799 Dynarr_add (dst, c);
2802 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2803 ch == LEADING_BYTE_JAPANESE_JISX0208)
2807 unsigned char j1, j2;
2808 ENCODE_SJIS (ch, c, j1, j2);
2809 Dynarr_add (dst, j1);
2810 Dynarr_add (dst, j2);
2820 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2821 Decode a JISX0208 character of Shift-JIS coding-system.
2822 CODE is the character code in Shift-JIS as a cons of type bytes.
2823 Return the corresponding character.
2827 unsigned char c1, c2, s1, s2;
2830 CHECK_INT (XCAR (code));
2831 CHECK_INT (XCDR (code));
2832 s1 = XINT (XCAR (code));
2833 s2 = XINT (XCDR (code));
2834 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2835 BYTE_SJIS_TWO_BYTE_2_P (s2))
2837 DECODE_SJIS (s1, s2, c1, c2);
2838 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2839 c1 & 0x7F, c2 & 0x7F));
2845 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2846 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2847 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2851 Lisp_Object charset;
2854 CHECK_CHAR_COERCE_INT (ch);
2855 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2856 if (EQ (charset, Vcharset_japanese_jisx0208))
2858 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2859 return Fcons (make_int (s1), make_int (s2));
2866 /************************************************************************/
2868 /************************************************************************/
2870 /* BIG5 is a coding system encoding two character sets: ASCII and
2871 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2872 character set and is encoded in two-byte.
2874 --- CODE RANGE of BIG5 ---
2875 (character set) (range)
2877 Big5 (1st byte) 0xA1 .. 0xFE
2878 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2879 --------------------------
2881 Since the number of characters in Big5 is larger than maximum
2882 characters in Emacs' charset (96x96), it can't be handled as one
2883 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2884 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2885 contains frequently used characters and the latter contains less
2886 frequently used characters. */
2888 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2889 ((c) >= 0xA1 && (c) <= 0xFE)
2891 /* Is this the second byte of a Shift-JIS two-byte char? */
2893 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2894 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2896 /* Number of Big5 characters which have the same code in 1st byte. */
2898 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2900 /* Code conversion macros. These are macros because they are used in
2901 inner loops during code conversion.
2903 Note that temporary variables in macros introduce the classic
2904 dynamic-scoping problems with variable names. We use capital-
2905 lettered variables in the assumption that XEmacs does not use
2906 capital letters in variables except in a very formalized way
2909 /* Convert Big5 code (b1, b2) into its internal string representation
2912 /* There is a much simpler way to split the Big5 charset into two.
2913 For the moment I'm going to leave the algorithm as-is because it
2914 claims to separate out the most-used characters into a single
2915 charset, which perhaps will lead to optimizations in various
2918 The way the algorithm works is something like this:
2920 Big5 can be viewed as a 94x157 charset, where the row is
2921 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2922 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2923 the split between low and high column numbers is apparently
2924 meaningless; ascending rows produce less and less frequent chars.
2925 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2926 the first charset, and the upper half (0xC9 .. 0xFE) to the
2927 second. To do the conversion, we convert the character into
2928 a single number where 0 .. 156 is the first row, 157 .. 313
2929 is the second, etc. That way, the characters are ordered by
2930 decreasing frequency. Then we just chop the space in two
2931 and coerce the result into a 94x94 space.
2934 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2936 int B1 = b1, B2 = b2; \
2938 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2942 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2946 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2947 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2949 c1 = I / (0xFF - 0xA1) + 0xA1; \
2950 c2 = I % (0xFF - 0xA1) + 0xA1; \
2953 /* Convert the internal string representation of a Big5 character
2954 (lb, c1, c2) into Big5 code (b1, b2). */
2956 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2958 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2960 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2962 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2964 b1 = I / BIG5_SAME_ROW + 0xA1; \
2965 b2 = I % BIG5_SAME_ROW; \
2966 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2970 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2978 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2979 (c >= 0x80 && c <= 0xA0))
2981 if (st->big5.in_second_byte)
2983 st->big5.in_second_byte = 0;
2984 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2988 st->big5.in_second_byte = 1;
2990 return CODING_CATEGORY_BIG5_MASK;
2993 /* Convert Big5 data to internal format. */
2996 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2997 unsigned_char_dynarr *dst, unsigned int n)
3000 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3001 unsigned int flags = str->flags;
3002 unsigned int ch = str->ch;
3003 eol_type_t eol_type = str->eol_type;
3010 /* Previous character was first byte of Big5 char. */
3011 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3013 unsigned char b1, b2, b3;
3014 DECODE_BIG5 (ch, c, b1, b2, b3);
3015 Dynarr_add (dst, b1);
3016 Dynarr_add (dst, b2);
3017 Dynarr_add (dst, b3);
3021 DECODE_ADD_BINARY_CHAR (ch, dst);
3022 DECODE_ADD_BINARY_CHAR (c, dst);
3028 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3029 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3032 DECODE_ADD_BINARY_CHAR (c, dst);
3034 label_continue_loop:;
3037 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3043 /* Convert internally-formatted data to Big5. */
3046 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3047 unsigned_char_dynarr *dst, unsigned int n)
3050 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3051 unsigned int flags = str->flags;
3052 unsigned int ch = str->ch;
3053 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3060 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3061 Dynarr_add (dst, '\r');
3062 if (eol_type != EOL_CR)
3063 Dynarr_add (dst, '\n');
3065 else if (BYTE_ASCII_P (c))
3068 Dynarr_add (dst, c);
3070 else if (BUFBYTE_LEADING_BYTE_P (c))
3072 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3073 c == LEADING_BYTE_CHINESE_BIG5_2)
3075 /* A recognized leading byte. */
3077 continue; /* not done with this character. */
3079 /* otherwise just ignore this character. */
3081 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3082 ch == LEADING_BYTE_CHINESE_BIG5_2)
3084 /* Previous char was a recognized leading byte. */
3086 continue; /* not done with this character. */
3090 /* Encountering second byte of a Big5 character. */
3091 unsigned char b1, b2;
3093 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3094 Dynarr_add (dst, b1);
3095 Dynarr_add (dst, b2);
3106 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3107 Decode a Big5 character CODE of BIG5 coding-system.
3108 CODE is the character code in BIG5, a cons of two integers.
3109 Return the corresponding character.
3113 unsigned char c1, c2, b1, b2;
3116 CHECK_INT (XCAR (code));
3117 CHECK_INT (XCDR (code));
3118 b1 = XINT (XCAR (code));
3119 b2 = XINT (XCDR (code));
3120 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3121 BYTE_BIG5_TWO_BYTE_2_P (b2))
3124 Lisp_Object charset;
3125 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3126 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3127 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3133 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3134 Encode the Big5 character CH to BIG5 coding-system.
3135 Return the corresponding character code in Big5.
3139 Lisp_Object charset;
3142 CHECK_CHAR_COERCE_INT (ch);
3143 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3144 if (EQ (charset, Vcharset_chinese_big5_1) ||
3145 EQ (charset, Vcharset_chinese_big5_2))
3147 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3149 return Fcons (make_int (b1), make_int (b2));
3156 /************************************************************************/
3159 /* UCS-4 character codes are implemented as nonnegative integers. */
3161 /************************************************************************/
3163 Lisp_Object ucs_to_mule_table[65536];
3164 Lisp_Object mule_to_ucs_table;
3166 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3167 Map UCS-4 code CODE to Mule character CHARACTER.
3169 Return T on success, NIL on failure.
3175 CHECK_CHAR (character);
3179 if (c < sizeof (ucs_to_mule_table))
3181 ucs_to_mule_table[c] = character;
3189 ucs_to_char (unsigned long code)
3191 if (code < sizeof (ucs_to_mule_table))
3193 return ucs_to_mule_table[code];
3195 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3200 c = code % (94 * 94);
3202 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3203 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3204 CHARSET_LEFT_TO_RIGHT),
3205 c / 94 + 33, c % 94 + 33));
3211 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3212 Return Mule character corresponding to UCS code CODE (a positive integer).
3216 CHECK_NATNUM (code);
3217 return ucs_to_char (XINT (code));
3220 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3221 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3225 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3226 Fset_char_ucs is more restrictive on index arg, but should
3227 check code arg in a char_table method. */
3228 CHECK_CHAR (character);
3229 CHECK_NATNUM (code);
3230 return Fput_char_table (character, code, mule_to_ucs_table);
3233 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3234 Return the UCS code (a positive integer) corresponding to CHARACTER.
3238 return Fget_char_table (character, mule_to_ucs_table);
3241 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3242 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3243 is not found, instead.
3244 #### do something more appropriate (use blob?)
3245 Danger, Will Robinson! Data loss. Should we signal user? */
3247 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3249 Lisp_Object chr = ucs_to_char (ch);
3253 Bufbyte work[MAX_EMCHAR_LEN];
3258 simple_set_charptr_emchar (work, ch) :
3259 non_ascii_set_charptr_emchar (work, ch);
3260 Dynarr_add_many (dst, work, len);
3264 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3265 Dynarr_add (dst, 34 + 128);
3266 Dynarr_add (dst, 46 + 128);
3270 static unsigned long
3271 mule_char_to_ucs4 (Lisp_Object charset,
3272 unsigned char h, unsigned char l)
3275 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3282 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3283 (XCHARSET_CHARS (charset) == 94) )
3285 unsigned char final = XCHARSET_FINAL (charset);
3287 if ( ('@' <= final) && (final < 0x7f) )
3289 return 0xe00000 + (final - '@') * 94 * 94
3290 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3304 encode_ucs4 (Lisp_Object charset,
3305 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3307 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3308 Dynarr_add (dst, code >> 24);
3309 Dynarr_add (dst, (code >> 16) & 255);
3310 Dynarr_add (dst, (code >> 8) & 255);
3311 Dynarr_add (dst, code & 255);
3315 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3321 switch (st->ucs4.in_byte)
3330 st->ucs4.in_byte = 0;
3336 return CODING_CATEGORY_UCS4_MASK;
3340 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3341 unsigned_char_dynarr *dst, unsigned int n)
3343 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3344 unsigned int flags = str->flags;
3345 unsigned int ch = str->ch;
3349 unsigned char c = *src++;
3357 decode_ucs4 ( ( ch << 8 ) | c, dst);
3362 ch = ( ch << 8 ) | c;
3366 if (flags & CODING_STATE_END)
3367 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3374 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3375 unsigned_char_dynarr *dst, unsigned int n)
3377 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3378 unsigned int flags = str->flags;
3379 unsigned int ch = str->ch;
3380 unsigned char char_boundary = str->iso2022.current_char_boundary;
3381 Lisp_Object charset = str->iso2022.current_charset;
3383 #ifdef ENABLE_COMPOSITE_CHARS
3384 /* flags for handling composite chars. We do a little switcharoo
3385 on the source while we're outputting the composite char. */
3386 unsigned int saved_n = 0;
3387 CONST unsigned char *saved_src = NULL;
3388 int in_composite = 0;
3395 unsigned char c = *src++;
3397 if (BYTE_ASCII_P (c))
3398 { /* Processing ASCII character */
3400 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3403 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3404 { /* Processing Leading Byte */
3406 charset = CHARSET_BY_LEADING_BYTE (c);
3407 if (LEADING_BYTE_PREFIX_P(c))
3412 { /* Processing Non-ASCII character */
3414 if (EQ (charset, Vcharset_control_1))
3416 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3420 switch (XCHARSET_REP_BYTES (charset))
3423 encode_ucs4 (charset, c, 0, dst);
3426 if (XCHARSET_PRIVATE_P (charset))
3428 encode_ucs4 (charset, c, 0, dst);
3433 #ifdef ENABLE_COMPOSITE_CHARS
3434 if (EQ (charset, Vcharset_composite))
3438 /* #### Bother! We don't know how to
3440 Dynarr_add (dst, 0);
3441 Dynarr_add (dst, 0);
3442 Dynarr_add (dst, 0);
3443 Dynarr_add (dst, '~');
3447 Emchar emch = MAKE_CHAR (Vcharset_composite,
3448 ch & 0x7F, c & 0x7F);
3449 Lisp_Object lstr = composite_char_string (emch);
3453 src = XSTRING_DATA (lstr);
3454 n = XSTRING_LENGTH (lstr);
3458 #endif /* ENABLE_COMPOSITE_CHARS */
3460 encode_ucs4(charset, ch, c, dst);
3473 encode_ucs4 (charset, ch, c, dst);
3489 #ifdef ENABLE_COMPOSITE_CHARS
3495 goto back_to_square_n; /* Wheeeeeeeee ..... */
3497 #endif /* ENABLE_COMPOSITE_CHARS */
3501 str->iso2022.current_char_boundary = char_boundary;
3502 str->iso2022.current_charset = charset;
3504 /* Verbum caro factum est! */
3508 /************************************************************************/
3510 /************************************************************************/
3513 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3518 unsigned char c = *src++;
3519 switch (st->utf8.in_byte)
3522 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3525 st->utf8.in_byte = 5;
3527 st->utf8.in_byte = 4;
3529 st->utf8.in_byte = 3;
3531 st->utf8.in_byte = 2;
3533 st->utf8.in_byte = 1;
3538 if ((c & 0xc0) != 0x80)
3544 return CODING_CATEGORY_UTF8_MASK;
3548 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3549 unsigned_char_dynarr *dst, unsigned int n)
3551 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3552 unsigned int flags = str->flags;
3553 unsigned int ch = str->ch;
3554 eol_type_t eol_type = str->eol_type;
3558 unsigned char c = *src++;
3567 else if ( c >= 0xf8 )
3572 else if ( c >= 0xf0 )
3577 else if ( c >= 0xe0 )
3582 else if ( c >= 0xc0 )
3589 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3590 decode_ucs4 (c, dst);
3594 ch = ( ch << 6 ) | ( c & 0x3f );
3595 decode_ucs4 (ch, dst);
3600 ch = ( ch << 6 ) | ( c & 0x3f );
3603 label_continue_loop:;
3606 if (flags & CODING_STATE_END)
3607 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3614 encode_utf8 (Lisp_Object charset,
3615 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3617 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3620 Dynarr_add (dst, code);
3622 else if ( code <= 0x7ff )
3624 Dynarr_add (dst, (code >> 6) | 0xc0);
3625 Dynarr_add (dst, (code & 0x3f) | 0x80);
3627 else if ( code <= 0xffff )
3629 Dynarr_add (dst, (code >> 12) | 0xe0);
3630 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3631 Dynarr_add (dst, (code & 0x3f) | 0x80);
3633 else if ( code <= 0x1fffff )
3635 Dynarr_add (dst, (code >> 18) | 0xf0);
3636 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3637 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3638 Dynarr_add (dst, (code & 0x3f) | 0x80);
3640 else if ( code <= 0x3ffffff )
3642 Dynarr_add (dst, (code >> 24) | 0xf8);
3643 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3644 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3645 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3646 Dynarr_add (dst, (code & 0x3f) | 0x80);
3650 Dynarr_add (dst, (code >> 30) | 0xfc);
3651 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3652 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3653 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3654 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3655 Dynarr_add (dst, (code & 0x3f) | 0x80);
3660 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3661 unsigned_char_dynarr *dst, unsigned int n)
3663 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3664 unsigned int flags = str->flags;
3665 unsigned int ch = str->ch;
3666 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3667 unsigned char char_boundary = str->iso2022.current_char_boundary;
3668 Lisp_Object charset = str->iso2022.current_charset;
3670 #ifdef ENABLE_COMPOSITE_CHARS
3671 /* flags for handling composite chars. We do a little switcharoo
3672 on the source while we're outputting the composite char. */
3673 unsigned int saved_n = 0;
3674 CONST unsigned char *saved_src = NULL;
3675 int in_composite = 0;
3678 #endif /* ENABLE_COMPOSITE_CHARS */
3682 unsigned char c = *src++;
3684 if (BYTE_ASCII_P (c))
3685 { /* Processing ASCII character */
3689 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3690 Dynarr_add (dst, '\r');
3691 if (eol_type != EOL_CR)
3692 Dynarr_add (dst, c);
3695 encode_utf8 (Vcharset_ascii, c, 0, dst);
3698 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3699 { /* Processing Leading Byte */
3701 charset = CHARSET_BY_LEADING_BYTE (c);
3702 if (LEADING_BYTE_PREFIX_P(c))
3707 { /* Processing Non-ASCII character */
3709 if (EQ (charset, Vcharset_control_1))
3711 encode_utf8 (Vcharset_control_1, c, 0, dst);
3715 switch (XCHARSET_REP_BYTES (charset))
3718 encode_utf8 (charset, c, 0, dst);
3721 if (XCHARSET_PRIVATE_P (charset))
3723 encode_utf8 (charset, c, 0, dst);
3728 #ifdef ENABLE_COMPOSITE_CHARS
3729 if (EQ (charset, Vcharset_composite))
3733 /* #### Bother! We don't know how to
3735 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3739 Emchar emch = MAKE_CHAR (Vcharset_composite,
3740 ch & 0x7F, c & 0x7F);
3741 Lisp_Object lstr = composite_char_string (emch);
3745 src = XSTRING_DATA (lstr);
3746 n = XSTRING_LENGTH (lstr);
3750 #endif /* ENABLE_COMPOSITE_CHARS */
3752 encode_utf8 (charset, ch, c, dst);
3765 encode_utf8 (charset, ch, c, dst);
3781 #ifdef ENABLE_COMPOSITE_CHARS
3787 goto back_to_square_n; /* Wheeeeeeeee ..... */
3793 str->iso2022.current_char_boundary = char_boundary;
3794 str->iso2022.current_charset = charset;
3796 /* Verbum caro factum est! */
3800 /************************************************************************/
3801 /* ISO2022 methods */
3802 /************************************************************************/
3804 /* The following note describes the coding system ISO2022 briefly.
3805 Since the intention of this note is to help understand the
3806 functions in this file, some parts are NOT ACCURATE or OVERLY
3807 SIMPLIFIED. For thorough understanding, please refer to the
3808 original document of ISO2022.
3810 ISO2022 provides many mechanisms to encode several character sets
3811 in 7-bit and 8-bit environments. For 7-bit environments, all text
3812 is encoded using bytes less than 128. This may make the encoded
3813 text a little bit longer, but the text passes more easily through
3814 several gateways, some of which strip off MSB (Most Signigant Bit).
3816 There are two kinds of character sets: control character set and
3817 graphic character set. The former contains control characters such
3818 as `newline' and `escape' to provide control functions (control
3819 functions are also provided by escape sequences). The latter
3820 contains graphic characters such as 'A' and '-'. Emacs recognizes
3821 two control character sets and many graphic character sets.
3823 Graphic character sets are classified into one of the following
3824 four classes, according to the number of bytes (DIMENSION) and
3825 number of characters in one dimension (CHARS) of the set:
3826 - DIMENSION1_CHARS94
3827 - DIMENSION1_CHARS96
3828 - DIMENSION2_CHARS94
3829 - DIMENSION2_CHARS96
3831 In addition, each character set is assigned an identification tag,
3832 unique for each set, called "final character" (denoted as <F>
3833 hereafter). The <F> of each character set is decided by ECMA(*)
3834 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3835 (0x30..0x3F are for private use only).
3837 Note (*): ECMA = European Computer Manufacturers Association
3839 Here are examples of graphic character set [NAME(<F>)]:
3840 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3841 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3842 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3843 o DIMENSION2_CHARS96 -- none for the moment
3845 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3846 C0 [0x00..0x1F] -- control character plane 0
3847 GL [0x20..0x7F] -- graphic character plane 0
3848 C1 [0x80..0x9F] -- control character plane 1
3849 GR [0xA0..0xFF] -- graphic character plane 1
3851 A control character set is directly designated and invoked to C0 or
3852 C1 by an escape sequence. The most common case is that:
3853 - ISO646's control character set is designated/invoked to C0, and
3854 - ISO6429's control character set is designated/invoked to C1,
3855 and usually these designations/invocations are omitted in encoded
3856 text. In a 7-bit environment, only C0 can be used, and a control
3857 character for C1 is encoded by an appropriate escape sequence to
3858 fit into the environment. All control characters for C1 are
3859 defined to have corresponding escape sequences.
3861 A graphic character set is at first designated to one of four
3862 graphic registers (G0 through G3), then these graphic registers are
3863 invoked to GL or GR. These designations and invocations can be
3864 done independently. The most common case is that G0 is invoked to
3865 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3866 these invocations and designations are omitted in encoded text.
3867 In a 7-bit environment, only GL can be used.
3869 When a graphic character set of CHARS94 is invoked to GL, codes
3870 0x20 and 0x7F of the GL area work as control characters SPACE and
3871 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3874 There are two ways of invocation: locking-shift and single-shift.
3875 With locking-shift, the invocation lasts until the next different
3876 invocation, whereas with single-shift, the invocation affects the
3877 following character only and doesn't affect the locking-shift
3878 state. Invocations are done by the following control characters or
3881 ----------------------------------------------------------------------
3882 abbrev function cntrl escape seq description
3883 ----------------------------------------------------------------------
3884 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3885 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3886 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3887 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3888 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3889 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3890 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3891 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3892 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3893 ----------------------------------------------------------------------
3894 (*) These are not used by any known coding system.
3896 Control characters for these functions are defined by macros
3897 ISO_CODE_XXX in `coding.h'.
3899 Designations are done by the following escape sequences:
3900 ----------------------------------------------------------------------
3901 escape sequence description
3902 ----------------------------------------------------------------------
3903 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3904 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3905 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3906 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3907 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3908 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3909 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3910 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3911 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3912 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3913 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3914 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3915 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3916 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3917 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3918 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3919 ----------------------------------------------------------------------
3921 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3922 of dimension 1, chars 94, and final character <F>, etc...
3924 Note (*): Although these designations are not allowed in ISO2022,
3925 Emacs accepts them on decoding, and produces them on encoding
3926 CHARS96 character sets in a coding system which is characterized as
3927 7-bit environment, non-locking-shift, and non-single-shift.
3929 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3930 '(' can be omitted. We refer to this as "short-form" hereafter.
3932 Now you may notice that there are a lot of ways for encoding the
3933 same multilingual text in ISO2022. Actually, there exist many
3934 coding systems such as Compound Text (used in X11's inter client
3935 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3936 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3937 localized platforms), and all of these are variants of ISO2022.
3939 In addition to the above, Emacs handles two more kinds of escape
3940 sequences: ISO6429's direction specification and Emacs' private
3941 sequence for specifying character composition.
3943 ISO6429's direction specification takes the following form:
3944 o CSI ']' -- end of the current direction
3945 o CSI '0' ']' -- end of the current direction
3946 o CSI '1' ']' -- start of left-to-right text
3947 o CSI '2' ']' -- start of right-to-left text
3948 The control character CSI (0x9B: control sequence introducer) is
3949 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3951 Character composition specification takes the following form:
3952 o ESC '0' -- start character composition
3953 o ESC '1' -- end character composition
3954 Since these are not standard escape sequences of any ISO standard,
3955 their use with these meanings is restricted to Emacs only. */
3958 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3962 for (i = 0; i < 4; i++)
3964 if (!NILP (coding_system))
3966 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3968 iso->charset[i] = Qt;
3969 iso->invalid_designated[i] = 0;
3971 iso->esc = ISO_ESC_NOTHING;
3972 iso->esc_bytes_index = 0;
3973 iso->register_left = 0;
3974 iso->register_right = 1;
3975 iso->switched_dir_and_no_valid_charset_yet = 0;
3976 iso->invalid_switch_dir = 0;
3977 iso->output_direction_sequence = 0;
3978 iso->output_literally = 0;
3979 #ifdef ENABLE_COMPOSITE_CHARS
3980 if (iso->composite_chars)
3981 Dynarr_reset (iso->composite_chars);
3986 fit_to_be_escape_quoted (unsigned char c)
4003 /* Parse one byte of an ISO2022 escape sequence.
4004 If the result is an invalid escape sequence, return 0 and
4005 do not change anything in STR. Otherwise, if the result is
4006 an incomplete escape sequence, update ISO2022.ESC and
4007 ISO2022.ESC_BYTES and return -1. Otherwise, update
4008 all the state variables (but not ISO2022.ESC_BYTES) and
4011 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4012 or invocation of an invalid character set and treat that as
4013 an unrecognized escape sequence. */
4016 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4017 unsigned char c, unsigned int *flags,
4018 int check_invalid_charsets)
4020 /* (1) If we're at the end of a designation sequence, CS is the
4021 charset being designated and REG is the register to designate
4024 (2) If we're at the end of a locking-shift sequence, REG is
4025 the register to invoke and HALF (0 == left, 1 == right) is
4026 the half to invoke it into.
4028 (3) If we're at the end of a single-shift sequence, REG is
4029 the register to invoke. */
4030 Lisp_Object cs = Qnil;
4033 /* NOTE: This code does goto's all over the fucking place.
4034 The reason for this is that we're basically implementing
4035 a state machine here, and hierarchical languages like C
4036 don't really provide a clean way of doing this. */
4038 if (! (*flags & CODING_STATE_ESCAPE))
4039 /* At beginning of escape sequence; we need to reset our
4040 escape-state variables. */
4041 iso->esc = ISO_ESC_NOTHING;
4043 iso->output_literally = 0;
4044 iso->output_direction_sequence = 0;
4048 case ISO_ESC_NOTHING:
4049 iso->esc_bytes_index = 0;
4052 case ISO_CODE_ESC: /* Start escape sequence */
4053 *flags |= CODING_STATE_ESCAPE;
4057 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4058 *flags |= CODING_STATE_ESCAPE;
4059 iso->esc = ISO_ESC_5_11;
4062 case ISO_CODE_SO: /* locking shift 1 */
4065 case ISO_CODE_SI: /* locking shift 0 */
4069 case ISO_CODE_SS2: /* single shift */
4072 case ISO_CODE_SS3: /* single shift */
4076 default: /* Other control characters */
4083 /**** single shift ****/
4085 case 'N': /* single shift 2 */
4088 case 'O': /* single shift 3 */
4092 /**** locking shift ****/
4094 case '~': /* locking shift 1 right */
4097 case 'n': /* locking shift 2 */
4100 case '}': /* locking shift 2 right */
4103 case 'o': /* locking shift 3 */
4106 case '|': /* locking shift 3 right */
4110 #ifdef ENABLE_COMPOSITE_CHARS
4111 /**** composite ****/
4114 iso->esc = ISO_ESC_START_COMPOSITE;
4115 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4116 CODING_STATE_COMPOSITE;
4120 iso->esc = ISO_ESC_END_COMPOSITE;
4121 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4122 ~CODING_STATE_COMPOSITE;
4124 #endif /* ENABLE_COMPOSITE_CHARS */
4126 /**** directionality ****/
4129 iso->esc = ISO_ESC_5_11;
4132 /**** designation ****/
4134 case '$': /* multibyte charset prefix */
4135 iso->esc = ISO_ESC_2_4;
4139 if (0x28 <= c && c <= 0x2F)
4141 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4145 /* This function is called with CODESYS equal to nil when
4146 doing coding-system detection. */
4148 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4149 && fit_to_be_escape_quoted (c))
4151 iso->esc = ISO_ESC_LITERAL;
4152 *flags &= CODING_STATE_ISO2022_LOCK;
4162 /**** directionality ****/
4164 case ISO_ESC_5_11: /* ISO6429 direction control */
4167 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4168 goto directionality;
4170 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4171 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4172 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4176 case ISO_ESC_5_11_0:
4179 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4180 goto directionality;
4184 case ISO_ESC_5_11_1:
4187 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4188 goto directionality;
4192 case ISO_ESC_5_11_2:
4195 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4196 goto directionality;
4201 iso->esc = ISO_ESC_DIRECTIONALITY;
4202 /* Various junk here to attempt to preserve the direction sequences
4203 literally in the text if they would otherwise be swallowed due
4204 to invalid designations that don't show up as actual charset
4205 changes in the text. */
4206 if (iso->invalid_switch_dir)
4208 /* We already inserted a direction switch literally into the
4209 text. We assume (#### this may not be right) that the
4210 next direction switch is the one going the other way,
4211 and we need to output that literally as well. */
4212 iso->output_literally = 1;
4213 iso->invalid_switch_dir = 0;
4219 /* If we are in the thrall of an invalid designation,
4220 then stick the directionality sequence literally into the
4221 output stream so it ends up in the original text again. */
4222 for (jj = 0; jj < 4; jj++)
4223 if (iso->invalid_designated[jj])
4227 iso->output_literally = 1;
4228 iso->invalid_switch_dir = 1;
4231 /* Indicate that we haven't yet seen a valid designation,
4232 so that if a switch-dir is directly followed by an
4233 invalid designation, both get inserted literally. */
4234 iso->switched_dir_and_no_valid_charset_yet = 1;
4239 /**** designation ****/
4242 if (0x28 <= c && c <= 0x2F)
4244 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4247 if (0x40 <= c && c <= 0x42)
4249 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4250 *flags & CODING_STATE_R2L ?
4251 CHARSET_RIGHT_TO_LEFT :
4252 CHARSET_LEFT_TO_RIGHT);
4262 if (c < '0' || c > '~')
4263 return 0; /* bad final byte */
4265 if (iso->esc >= ISO_ESC_2_8 &&
4266 iso->esc <= ISO_ESC_2_15)
4268 type = ((iso->esc >= ISO_ESC_2_12) ?
4269 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4270 reg = (iso->esc - ISO_ESC_2_8) & 3;
4272 else if (iso->esc >= ISO_ESC_2_4_8 &&
4273 iso->esc <= ISO_ESC_2_4_15)
4275 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4276 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4277 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4281 /* Can this ever be reached? -slb */
4285 cs = CHARSET_BY_ATTRIBUTES (type, c,
4286 *flags & CODING_STATE_R2L ?
4287 CHARSET_RIGHT_TO_LEFT :
4288 CHARSET_LEFT_TO_RIGHT);
4294 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4298 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4299 /* can't invoke something that ain't there. */
4301 iso->esc = ISO_ESC_SINGLE_SHIFT;
4302 *flags &= CODING_STATE_ISO2022_LOCK;
4304 *flags |= CODING_STATE_SS2;
4306 *flags |= CODING_STATE_SS3;
4310 if (check_invalid_charsets &&
4311 !CHARSETP (iso->charset[reg]))
4312 /* can't invoke something that ain't there. */
4315 iso->register_right = reg;
4317 iso->register_left = reg;
4318 *flags &= CODING_STATE_ISO2022_LOCK;
4319 iso->esc = ISO_ESC_LOCKING_SHIFT;
4323 if (NILP (cs) && check_invalid_charsets)
4325 iso->invalid_designated[reg] = 1;
4326 iso->charset[reg] = Vcharset_ascii;
4327 iso->esc = ISO_ESC_DESIGNATE;
4328 *flags &= CODING_STATE_ISO2022_LOCK;
4329 iso->output_literally = 1;
4330 if (iso->switched_dir_and_no_valid_charset_yet)
4332 /* We encountered a switch-direction followed by an
4333 invalid designation. Ensure that the switch-direction
4334 gets outputted; otherwise it will probably get eaten
4335 when the text is written out again. */
4336 iso->switched_dir_and_no_valid_charset_yet = 0;
4337 iso->output_direction_sequence = 1;
4338 /* And make sure that the switch-dir going the other
4339 way gets outputted, as well. */
4340 iso->invalid_switch_dir = 1;
4344 /* This function is called with CODESYS equal to nil when
4345 doing coding-system detection. */
4346 if (!NILP (codesys))
4348 charset_conversion_spec_dynarr *dyn =
4349 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4355 for (i = 0; i < Dynarr_length (dyn); i++)
4357 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4358 if (EQ (cs, spec->from_charset))
4359 cs = spec->to_charset;
4364 iso->charset[reg] = cs;
4365 iso->esc = ISO_ESC_DESIGNATE;
4366 *flags &= CODING_STATE_ISO2022_LOCK;
4367 if (iso->invalid_designated[reg])
4369 iso->invalid_designated[reg] = 0;
4370 iso->output_literally = 1;
4372 if (iso->switched_dir_and_no_valid_charset_yet)
4373 iso->switched_dir_and_no_valid_charset_yet = 0;
4378 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4383 /* #### There are serious deficiencies in the recognition mechanism
4384 here. This needs to be much smarter if it's going to cut it.
4385 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4386 it should be detected as Latin-1.
4387 All the ISO2022 stuff in this file should be synced up with the
4388 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4389 Perhaps we should wait till R2L works in FSF Emacs? */
4391 if (!st->iso2022.initted)
4393 reset_iso2022 (Qnil, &st->iso2022.iso);
4394 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4395 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4396 CODING_CATEGORY_ISO_8_1_MASK |
4397 CODING_CATEGORY_ISO_8_2_MASK |
4398 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4399 st->iso2022.flags = 0;
4400 st->iso2022.high_byte_count = 0;
4401 st->iso2022.saw_single_shift = 0;
4402 st->iso2022.initted = 1;
4405 mask = st->iso2022.mask;
4412 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4413 st->iso2022.high_byte_count++;
4417 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4419 if (st->iso2022.high_byte_count & 1)
4420 /* odd number of high bytes; assume not iso-8-2 */
4421 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4423 st->iso2022.high_byte_count = 0;
4424 st->iso2022.saw_single_shift = 0;
4426 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4428 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4429 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4430 { /* control chars */
4433 /* Allow and ignore control characters that you might
4434 reasonably see in a text file */
4439 case 8: /* backspace */
4440 case 11: /* vertical tab */
4441 case 12: /* form feed */
4442 case 26: /* MS-DOS C-z junk */
4443 case 31: /* '^_' -- for info */
4444 goto label_continue_loop;
4451 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4454 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4455 &st->iso2022.flags, 0))
4457 switch (st->iso2022.iso.esc)
4459 case ISO_ESC_DESIGNATE:
4460 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4461 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4463 case ISO_ESC_LOCKING_SHIFT:
4464 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4465 goto ran_out_of_chars;
4466 case ISO_ESC_SINGLE_SHIFT:
4467 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4468 st->iso2022.saw_single_shift = 1;
4477 goto ran_out_of_chars;
4480 label_continue_loop:;
4489 postprocess_iso2022_mask (int mask)
4491 /* #### kind of cheesy */
4492 /* If seven-bit ISO is allowed, then assume that the encoding is
4493 entirely seven-bit and turn off the eight-bit ones. */
4494 if (mask & CODING_CATEGORY_ISO_7_MASK)
4495 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4496 CODING_CATEGORY_ISO_8_1_MASK |
4497 CODING_CATEGORY_ISO_8_2_MASK);
4501 /* If FLAGS is a null pointer or specifies right-to-left motion,
4502 output a switch-dir-to-left-to-right sequence to DST.
4503 Also update FLAGS if it is not a null pointer.
4504 If INTERNAL_P is set, we are outputting in internal format and
4505 need to handle the CSI differently. */
4508 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4509 unsigned_char_dynarr *dst,
4510 unsigned int *flags,
4513 if (!flags || (*flags & CODING_STATE_R2L))
4515 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4517 Dynarr_add (dst, ISO_CODE_ESC);
4518 Dynarr_add (dst, '[');
4520 else if (internal_p)
4521 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4523 Dynarr_add (dst, ISO_CODE_CSI);
4524 Dynarr_add (dst, '0');
4525 Dynarr_add (dst, ']');
4527 *flags &= ~CODING_STATE_R2L;
4531 /* If FLAGS is a null pointer or specifies a direction different from
4532 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4533 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4534 sequence to DST. Also update FLAGS if it is not a null pointer.
4535 If INTERNAL_P is set, we are outputting in internal format and
4536 need to handle the CSI differently. */
4539 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4540 unsigned_char_dynarr *dst, unsigned int *flags,
4543 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4544 direction == CHARSET_LEFT_TO_RIGHT)
4545 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4546 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4547 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4548 direction == CHARSET_RIGHT_TO_LEFT)
4550 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4552 Dynarr_add (dst, ISO_CODE_ESC);
4553 Dynarr_add (dst, '[');
4555 else if (internal_p)
4556 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4558 Dynarr_add (dst, ISO_CODE_CSI);
4559 Dynarr_add (dst, '2');
4560 Dynarr_add (dst, ']');
4562 *flags |= CODING_STATE_R2L;
4566 /* Convert ISO2022-format data to internal format. */
4569 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4570 unsigned_char_dynarr *dst, unsigned int n)
4572 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4573 unsigned int flags = str->flags;
4574 unsigned int ch = str->ch;
4575 eol_type_t eol_type = str->eol_type;
4576 #ifdef ENABLE_COMPOSITE_CHARS
4577 unsigned_char_dynarr *real_dst = dst;
4579 Lisp_Object coding_system;
4581 XSETCODING_SYSTEM (coding_system, str->codesys);
4583 #ifdef ENABLE_COMPOSITE_CHARS
4584 if (flags & CODING_STATE_COMPOSITE)
4585 dst = str->iso2022.composite_chars;
4586 #endif /* ENABLE_COMPOSITE_CHARS */
4590 unsigned char c = *src++;
4591 if (flags & CODING_STATE_ESCAPE)
4592 { /* Within ESC sequence */
4593 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4598 switch (str->iso2022.esc)
4600 #ifdef ENABLE_COMPOSITE_CHARS
4601 case ISO_ESC_START_COMPOSITE:
4602 if (str->iso2022.composite_chars)
4603 Dynarr_reset (str->iso2022.composite_chars);
4605 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4606 dst = str->iso2022.composite_chars;
4608 case ISO_ESC_END_COMPOSITE:
4610 Bufbyte comstr[MAX_EMCHAR_LEN];
4612 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4613 Dynarr_length (dst));
4615 len = set_charptr_emchar (comstr, emch);
4616 Dynarr_add_many (dst, comstr, len);
4619 #endif /* ENABLE_COMPOSITE_CHARS */
4621 case ISO_ESC_LITERAL:
4622 DECODE_ADD_BINARY_CHAR (c, dst);
4626 /* Everything else handled already */
4631 /* Attempted error recovery. */
4632 if (str->iso2022.output_direction_sequence)
4633 ensure_correct_direction (flags & CODING_STATE_R2L ?
4634 CHARSET_RIGHT_TO_LEFT :
4635 CHARSET_LEFT_TO_RIGHT,
4636 str->codesys, dst, 0, 1);
4637 /* More error recovery. */
4638 if (!retval || str->iso2022.output_literally)
4640 /* Output the (possibly invalid) sequence */
4642 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4643 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4644 flags &= CODING_STATE_ISO2022_LOCK;
4646 n++, src--;/* Repeat the loop with the same character. */
4649 /* No sense in reprocessing the final byte of the
4650 escape sequence; it could mess things up anyway.
4652 DECODE_ADD_BINARY_CHAR (c, dst);
4657 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4658 { /* Control characters */
4660 /***** Error-handling *****/
4662 /* If we were in the middle of a character, dump out the
4663 partial character. */
4664 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4666 /* If we just saw a single-shift character, dump it out.
4667 This may dump out the wrong sort of single-shift character,
4668 but least it will give an indication that something went
4670 if (flags & CODING_STATE_SS2)
4672 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4673 flags &= ~CODING_STATE_SS2;
4675 if (flags & CODING_STATE_SS3)
4677 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4678 flags &= ~CODING_STATE_SS3;
4681 /***** Now handle the control characters. *****/
4684 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4686 flags &= CODING_STATE_ISO2022_LOCK;
4688 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4689 DECODE_ADD_BINARY_CHAR (c, dst);
4692 { /* Graphic characters */
4693 Lisp_Object charset;
4697 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4699 /* Now determine the charset. */
4700 reg = ((flags & CODING_STATE_SS2) ? 2
4701 : (flags & CODING_STATE_SS3) ? 3
4702 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4703 : str->iso2022.register_left);
4704 charset = str->iso2022.charset[reg];
4706 /* Error checking: */
4707 if (! CHARSETP (charset)
4708 || str->iso2022.invalid_designated[reg]
4709 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4710 && XCHARSET_CHARS (charset) == 94))
4711 /* Mrmph. We are trying to invoke a register that has no
4712 or an invalid charset in it, or trying to add a character
4713 outside the range of the charset. Insert that char literally
4714 to preserve it for the output. */
4716 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4717 DECODE_ADD_BINARY_CHAR (c, dst);
4722 /* Things are probably hunky-dorey. */
4724 /* Fetch reverse charset, maybe. */
4725 if (((flags & CODING_STATE_R2L) &&
4726 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4728 (!(flags & CODING_STATE_R2L) &&
4729 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4731 Lisp_Object new_charset =
4732 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4733 if (!NILP (new_charset))
4734 charset = new_charset;
4737 lb = XCHARSET_LEADING_BYTE (charset);
4738 switch (XCHARSET_REP_BYTES (charset))
4741 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4742 Dynarr_add (dst, c & 0x7F);
4745 case 2: /* one-byte official */
4746 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4747 Dynarr_add (dst, lb);
4748 Dynarr_add (dst, c | 0x80);
4751 case 3: /* one-byte private or two-byte official */
4752 if (XCHARSET_PRIVATE_P (charset))
4754 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4755 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4756 Dynarr_add (dst, lb);
4757 Dynarr_add (dst, c | 0x80);
4763 Dynarr_add (dst, lb);
4764 Dynarr_add (dst, ch | 0x80);
4765 Dynarr_add (dst, c | 0x80);
4773 default: /* two-byte private */
4776 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4777 Dynarr_add (dst, lb);
4778 Dynarr_add (dst, ch | 0x80);
4779 Dynarr_add (dst, c | 0x80);
4788 flags &= CODING_STATE_ISO2022_LOCK;
4791 label_continue_loop:;
4794 if (flags & CODING_STATE_END)
4795 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4802 /***** ISO2022 encoder *****/
4804 /* Designate CHARSET into register REG. */
4807 iso2022_designate (Lisp_Object charset, unsigned char reg,
4808 struct encoding_stream *str, unsigned_char_dynarr *dst)
4810 static CONST char inter94[] = "()*+";
4811 static CONST char inter96[] = ",-./";
4813 unsigned char final;
4814 Lisp_Object old_charset = str->iso2022.charset[reg];
4816 str->iso2022.charset[reg] = charset;
4817 if (!CHARSETP (charset))
4818 /* charset might be an initial nil or t. */
4820 type = XCHARSET_TYPE (charset);
4821 final = XCHARSET_FINAL (charset);
4822 if (!str->iso2022.force_charset_on_output[reg] &&
4823 CHARSETP (old_charset) &&
4824 XCHARSET_TYPE (old_charset) == type &&
4825 XCHARSET_FINAL (old_charset) == final)
4828 str->iso2022.force_charset_on_output[reg] = 0;
4831 charset_conversion_spec_dynarr *dyn =
4832 str->codesys->iso2022.output_conv;
4838 for (i = 0; i < Dynarr_length (dyn); i++)
4840 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4841 if (EQ (charset, spec->from_charset))
4842 charset = spec->to_charset;
4847 Dynarr_add (dst, ISO_CODE_ESC);
4850 case CHARSET_TYPE_94:
4851 Dynarr_add (dst, inter94[reg]);
4853 case CHARSET_TYPE_96:
4854 Dynarr_add (dst, inter96[reg]);
4856 case CHARSET_TYPE_94X94:
4857 Dynarr_add (dst, '$');
4859 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4862 Dynarr_add (dst, inter94[reg]);
4864 case CHARSET_TYPE_96X96:
4865 Dynarr_add (dst, '$');
4866 Dynarr_add (dst, inter96[reg]);
4869 Dynarr_add (dst, final);
4873 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4875 if (str->iso2022.register_left != 0)
4877 Dynarr_add (dst, ISO_CODE_SI);
4878 str->iso2022.register_left = 0;
4883 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4885 if (str->iso2022.register_left != 1)
4887 Dynarr_add (dst, ISO_CODE_SO);
4888 str->iso2022.register_left = 1;
4892 /* Convert internally-formatted data to ISO2022 format. */
4895 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4896 unsigned_char_dynarr *dst, unsigned int n)
4898 unsigned char charmask, c;
4899 unsigned char char_boundary;
4900 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4901 unsigned int flags = str->flags;
4902 unsigned int ch = str->ch;
4903 Lisp_Coding_System *codesys = str->codesys;
4904 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4906 Lisp_Object charset;
4909 #ifdef ENABLE_COMPOSITE_CHARS
4910 /* flags for handling composite chars. We do a little switcharoo
4911 on the source while we're outputting the composite char. */
4912 unsigned int saved_n = 0;
4913 CONST unsigned char *saved_src = NULL;
4914 int in_composite = 0;
4915 #endif /* ENABLE_COMPOSITE_CHARS */
4917 char_boundary = str->iso2022.current_char_boundary;
4918 charset = str->iso2022.current_charset;
4919 half = str->iso2022.current_half;
4921 #ifdef ENABLE_COMPOSITE_CHARS
4928 if (BYTE_ASCII_P (c))
4929 { /* Processing ASCII character */
4932 restore_left_to_right_direction (codesys, dst, &flags, 0);
4934 /* Make sure G0 contains ASCII */
4935 if ((c > ' ' && c < ISO_CODE_DEL) ||
4936 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4938 ensure_normal_shift (str, dst);
4939 iso2022_designate (Vcharset_ascii, 0, str, dst);
4942 /* If necessary, restore everything to the default state
4945 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4947 restore_left_to_right_direction (codesys, dst, &flags, 0);
4949 ensure_normal_shift (str, dst);
4951 for (i = 0; i < 4; i++)
4953 Lisp_Object initial_charset =
4954 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4955 iso2022_designate (initial_charset, i, str, dst);
4960 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4961 Dynarr_add (dst, '\r');
4962 if (eol_type != EOL_CR)
4963 Dynarr_add (dst, c);
4967 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4968 && fit_to_be_escape_quoted (c))
4969 Dynarr_add (dst, ISO_CODE_ESC);
4970 Dynarr_add (dst, c);
4975 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4976 { /* Processing Leading Byte */
4978 charset = CHARSET_BY_LEADING_BYTE (c);
4979 if (LEADING_BYTE_PREFIX_P(c))
4981 else if (!EQ (charset, Vcharset_control_1)
4982 #ifdef ENABLE_COMPOSITE_CHARS
4983 && !EQ (charset, Vcharset_composite)
4989 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4990 codesys, dst, &flags, 0);
4992 /* Now determine which register to use. */
4994 for (i = 0; i < 4; i++)
4996 if (EQ (charset, str->iso2022.charset[i]) ||
4998 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5007 if (XCHARSET_GRAPHIC (charset) != 0)
5009 if (!NILP (str->iso2022.charset[1]) &&
5010 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5011 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5013 else if (!NILP (str->iso2022.charset[2]))
5015 else if (!NILP (str->iso2022.charset[3]))
5024 iso2022_designate (charset, reg, str, dst);
5026 /* Now invoke that register. */
5030 ensure_normal_shift (str, dst);
5035 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5037 ensure_shift_out (str, dst);
5045 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5047 Dynarr_add (dst, ISO_CODE_ESC);
5048 Dynarr_add (dst, 'N');
5053 Dynarr_add (dst, ISO_CODE_SS2);
5059 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5061 Dynarr_add (dst, ISO_CODE_ESC);
5062 Dynarr_add (dst, 'O');
5067 Dynarr_add (dst, ISO_CODE_SS3);
5079 { /* Processing Non-ASCII character */
5080 charmask = (half == 0 ? 0x7F : 0xFF);
5082 if (EQ (charset, Vcharset_control_1))
5084 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5085 && fit_to_be_escape_quoted (c))
5086 Dynarr_add (dst, ISO_CODE_ESC);
5087 /* you asked for it ... */
5088 Dynarr_add (dst, c - 0x20);
5092 switch (XCHARSET_REP_BYTES (charset))
5095 Dynarr_add (dst, c & charmask);
5098 if (XCHARSET_PRIVATE_P (charset))
5100 Dynarr_add (dst, c & charmask);
5105 #ifdef ENABLE_COMPOSITE_CHARS
5106 if (EQ (charset, Vcharset_composite))
5110 /* #### Bother! We don't know how to
5112 Dynarr_add (dst, '~');
5116 Emchar emch = MAKE_CHAR (Vcharset_composite,
5117 ch & 0x7F, c & 0x7F);
5118 Lisp_Object lstr = composite_char_string (emch);
5122 src = XSTRING_DATA (lstr);
5123 n = XSTRING_LENGTH (lstr);
5124 Dynarr_add (dst, ISO_CODE_ESC);
5125 Dynarr_add (dst, '0'); /* start composing */
5129 #endif /* ENABLE_COMPOSITE_CHARS */
5131 Dynarr_add (dst, ch & charmask);
5132 Dynarr_add (dst, c & charmask);
5145 Dynarr_add (dst, ch & charmask);
5146 Dynarr_add (dst, c & charmask);
5162 #ifdef ENABLE_COMPOSITE_CHARS
5168 Dynarr_add (dst, ISO_CODE_ESC);
5169 Dynarr_add (dst, '1'); /* end composing */
5170 goto back_to_square_n; /* Wheeeeeeeee ..... */
5172 #endif /* ENABLE_COMPOSITE_CHARS */
5174 if (char_boundary && flags & CODING_STATE_END)
5176 restore_left_to_right_direction (codesys, dst, &flags, 0);
5177 ensure_normal_shift (str, dst);
5178 for (i = 0; i < 4; i++)
5180 Lisp_Object initial_charset =
5181 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5182 iso2022_designate (initial_charset, i, str, dst);
5188 str->iso2022.current_char_boundary = char_boundary;
5189 str->iso2022.current_charset = charset;
5190 str->iso2022.current_half = half;
5192 /* Verbum caro factum est! */
5196 /************************************************************************/
5197 /* No-conversion methods */
5198 /************************************************************************/
5200 /* This is used when reading in "binary" files -- i.e. files that may
5201 contain all 256 possible byte values and that are not to be
5202 interpreted as being in any particular decoding. */
5204 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5205 unsigned_char_dynarr *dst, unsigned int n)
5208 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5209 unsigned int flags = str->flags;
5210 unsigned int ch = str->ch;
5211 eol_type_t eol_type = str->eol_type;
5217 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5218 DECODE_ADD_BINARY_CHAR (c, dst);
5219 label_continue_loop:;
5222 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5229 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5230 unsigned_char_dynarr *dst, unsigned int n)
5233 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5234 unsigned int flags = str->flags;
5235 unsigned int ch = str->ch;
5236 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5243 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5244 Dynarr_add (dst, '\r');
5245 if (eol_type != EOL_CR)
5246 Dynarr_add (dst, '\n');
5249 else if (BYTE_ASCII_P (c))
5252 Dynarr_add (dst, c);
5254 else if (BUFBYTE_LEADING_BYTE_P (c))
5257 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5258 c == LEADING_BYTE_CONTROL_1)
5261 Dynarr_add (dst, '~'); /* untranslatable character */
5265 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5266 Dynarr_add (dst, c);
5267 else if (ch == LEADING_BYTE_CONTROL_1)
5270 Dynarr_add (dst, c - 0x20);
5272 /* else it should be the second or third byte of an
5273 untranslatable character, so ignore it */
5283 /************************************************************************/
5284 /* Simple internal/external functions */
5285 /************************************************************************/
5287 static Extbyte_dynarr *conversion_out_dynarr;
5288 static Bufbyte_dynarr *conversion_in_dynarr;
5290 /* Determine coding system from coding format */
5292 /* #### not correct for all values of `fmt'! */
5294 external_data_format_to_coding_system (enum external_data_format fmt)
5298 case FORMAT_FILENAME:
5299 case FORMAT_TERMINAL:
5300 if (EQ (Vfile_name_coding_system, Qnil) ||
5301 EQ (Vfile_name_coding_system, Qbinary))
5304 return Fget_coding_system (Vfile_name_coding_system);
5307 return Fget_coding_system (Qctext);
5315 convert_to_external_format (CONST Bufbyte *ptr,
5318 enum external_data_format fmt)
5320 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5322 if (!conversion_out_dynarr)
5323 conversion_out_dynarr = Dynarr_new (Extbyte);
5325 Dynarr_reset (conversion_out_dynarr);
5327 if (NILP (coding_system))
5329 CONST Bufbyte *end = ptr + len;
5334 (BYTE_ASCII_P (*ptr)) ? *ptr :
5335 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5336 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5339 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5343 #ifdef ERROR_CHECK_BUFPOS
5344 assert (ptr == end);
5349 Lisp_Object instream, outstream, da_outstream;
5350 Lstream *istr, *ostr;
5351 struct gcpro gcpro1, gcpro2, gcpro3;
5352 char tempbuf[1024]; /* some random amount */
5354 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5355 da_outstream = make_dynarr_output_stream
5356 ((unsigned_char_dynarr *) conversion_out_dynarr);
5358 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5359 istr = XLSTREAM (instream);
5360 ostr = XLSTREAM (outstream);
5361 GCPRO3 (instream, outstream, da_outstream);
5364 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5367 Lstream_write (ostr, tempbuf, size_in_bytes);
5369 Lstream_close (istr);
5370 Lstream_close (ostr);
5372 Lstream_delete (istr);
5373 Lstream_delete (ostr);
5374 Lstream_delete (XLSTREAM (da_outstream));
5377 *len_out = Dynarr_length (conversion_out_dynarr);
5378 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5379 return Dynarr_atp (conversion_out_dynarr, 0);
5383 convert_from_external_format (CONST Extbyte *ptr,
5386 enum external_data_format fmt)
5388 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5390 if (!conversion_in_dynarr)
5391 conversion_in_dynarr = Dynarr_new (Bufbyte);
5393 Dynarr_reset (conversion_in_dynarr);
5395 if (NILP (coding_system))
5397 CONST Extbyte *end = ptr + len;
5398 for (; ptr < end; ptr++)
5401 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5406 Lisp_Object instream, outstream, da_outstream;
5407 Lstream *istr, *ostr;
5408 struct gcpro gcpro1, gcpro2, gcpro3;
5409 char tempbuf[1024]; /* some random amount */
5411 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5412 da_outstream = make_dynarr_output_stream
5413 ((unsigned_char_dynarr *) conversion_in_dynarr);
5415 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5416 istr = XLSTREAM (instream);
5417 ostr = XLSTREAM (outstream);
5418 GCPRO3 (instream, outstream, da_outstream);
5421 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5424 Lstream_write (ostr, tempbuf, size_in_bytes);
5426 Lstream_close (istr);
5427 Lstream_close (ostr);
5429 Lstream_delete (istr);
5430 Lstream_delete (ostr);
5431 Lstream_delete (XLSTREAM (da_outstream));
5434 *len_out = Dynarr_length (conversion_in_dynarr);
5435 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5436 return Dynarr_atp (conversion_in_dynarr, 0);
5440 /************************************************************************/
5441 /* Initialization */
5442 /************************************************************************/
5445 syms_of_file_coding (void)
5447 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5448 deferror (&Qcoding_system_error, "coding-system-error",
5449 "Coding-system error", Qio_error);
5451 DEFSUBR (Fcoding_system_p);
5452 DEFSUBR (Ffind_coding_system);
5453 DEFSUBR (Fget_coding_system);
5454 DEFSUBR (Fcoding_system_list);
5455 DEFSUBR (Fcoding_system_name);
5456 DEFSUBR (Fmake_coding_system);
5457 DEFSUBR (Fcopy_coding_system);
5458 DEFSUBR (Fdefine_coding_system_alias);
5459 DEFSUBR (Fsubsidiary_coding_system);
5461 DEFSUBR (Fcoding_system_type);
5462 DEFSUBR (Fcoding_system_doc_string);
5464 DEFSUBR (Fcoding_system_charset);
5466 DEFSUBR (Fcoding_system_property);
5468 DEFSUBR (Fcoding_category_list);
5469 DEFSUBR (Fset_coding_priority_list);
5470 DEFSUBR (Fcoding_priority_list);
5471 DEFSUBR (Fset_coding_category_system);
5472 DEFSUBR (Fcoding_category_system);
5474 DEFSUBR (Fdetect_coding_region);
5475 DEFSUBR (Fdecode_coding_region);
5476 DEFSUBR (Fencode_coding_region);
5478 DEFSUBR (Fdecode_shift_jis_char);
5479 DEFSUBR (Fencode_shift_jis_char);
5480 DEFSUBR (Fdecode_big5_char);
5481 DEFSUBR (Fencode_big5_char);
5482 DEFSUBR (Fset_ucs_char);
5483 DEFSUBR (Fucs_char);
5484 DEFSUBR (Fset_char_ucs);
5485 DEFSUBR (Fchar_ucs);
5487 defsymbol (&Qcoding_system_p, "coding-system-p");
5488 defsymbol (&Qno_conversion, "no-conversion");
5489 defsymbol (&Qraw_text, "raw-text");
5491 defsymbol (&Qbig5, "big5");
5492 defsymbol (&Qshift_jis, "shift-jis");
5493 defsymbol (&Qucs4, "ucs-4");
5494 defsymbol (&Qutf8, "utf-8");
5495 defsymbol (&Qccl, "ccl");
5496 defsymbol (&Qiso2022, "iso2022");
5498 defsymbol (&Qmnemonic, "mnemonic");
5499 defsymbol (&Qeol_type, "eol-type");
5500 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5501 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5503 defsymbol (&Qcr, "cr");
5504 defsymbol (&Qlf, "lf");
5505 defsymbol (&Qcrlf, "crlf");
5506 defsymbol (&Qeol_cr, "eol-cr");
5507 defsymbol (&Qeol_lf, "eol-lf");
5508 defsymbol (&Qeol_crlf, "eol-crlf");
5510 defsymbol (&Qcharset_g0, "charset-g0");
5511 defsymbol (&Qcharset_g1, "charset-g1");
5512 defsymbol (&Qcharset_g2, "charset-g2");
5513 defsymbol (&Qcharset_g3, "charset-g3");
5514 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5515 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5516 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5517 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5518 defsymbol (&Qno_iso6429, "no-iso6429");
5519 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5520 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5522 defsymbol (&Qshort, "short");
5523 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5524 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5525 defsymbol (&Qseven, "seven");
5526 defsymbol (&Qlock_shift, "lock-shift");
5527 defsymbol (&Qescape_quoted, "escape-quoted");
5529 defsymbol (&Qencode, "encode");
5530 defsymbol (&Qdecode, "decode");
5533 defsymbol (&Qctext, "ctext");
5534 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5536 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5538 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5540 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5542 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5544 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5546 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5548 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5550 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5553 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5558 lstream_type_create_file_coding (void)
5560 LSTREAM_HAS_METHOD (decoding, reader);
5561 LSTREAM_HAS_METHOD (decoding, writer);
5562 LSTREAM_HAS_METHOD (decoding, rewinder);
5563 LSTREAM_HAS_METHOD (decoding, seekable_p);
5564 LSTREAM_HAS_METHOD (decoding, flusher);
5565 LSTREAM_HAS_METHOD (decoding, closer);
5566 LSTREAM_HAS_METHOD (decoding, marker);
5568 LSTREAM_HAS_METHOD (encoding, reader);
5569 LSTREAM_HAS_METHOD (encoding, writer);
5570 LSTREAM_HAS_METHOD (encoding, rewinder);
5571 LSTREAM_HAS_METHOD (encoding, seekable_p);
5572 LSTREAM_HAS_METHOD (encoding, flusher);
5573 LSTREAM_HAS_METHOD (encoding, closer);
5574 LSTREAM_HAS_METHOD (encoding, marker);
5578 vars_of_file_coding (void)
5582 /* Initialize to something reasonable ... */
5583 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5585 coding_category_system[i] = Qnil;
5586 coding_category_by_priority[i] = i;
5589 Fprovide (intern ("file-coding"));
5591 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5592 Coding system used for TTY keyboard input.
5593 Not used under a windowing system.
5595 Vkeyboard_coding_system = Qnil;
5597 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5598 Coding system used for TTY display output.
5599 Not used under a windowing system.
5601 Vterminal_coding_system = Qnil;
5603 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5604 Overriding coding system used when writing a file or process.
5605 You should *bind* this, not set it. If this is non-nil, it specifies
5606 the coding system that will be used when a file or process is read
5607 in, and overrides `buffer-file-coding-system-for-read',
5608 `insert-file-contents-pre-hook', etc. Use those variables instead of
5609 this one for permanent changes to the environment.
5611 Vcoding_system_for_read = Qnil;
5613 DEFVAR_LISP ("coding-system-for-write",
5614 &Vcoding_system_for_write /*
5615 Overriding coding system used when writing a file or process.
5616 You should *bind* this, not set it. If this is non-nil, it specifies
5617 the coding system that will be used when a file or process is wrote
5618 in, and overrides `buffer-file-coding-system',
5619 `write-region-pre-hook', etc. Use those variables instead of this one
5620 for permanent changes to the environment.
5622 Vcoding_system_for_write = Qnil;
5624 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5625 Coding system used to convert pathnames when accessing files.
5627 Vfile_name_coding_system = Qnil;
5629 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5630 Non-nil means the buffer contents are regarded as multi-byte form
5631 of characters, not a binary code. This affects the display, file I/O,
5632 and behaviors of various editing commands.
5634 Setting this to nil does not do anything.
5636 enable_multibyte_characters = 1;
5640 complex_vars_of_file_coding (void)
5642 staticpro (&Vcoding_system_hash_table);
5643 Vcoding_system_hash_table =
5644 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5646 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5648 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5650 struct codesys_prop csp; \
5652 csp.prop_type = (Prop_Type); \
5653 Dynarr_add (the_codesys_prop_dynarr, csp); \
5656 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5657 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5658 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5659 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5660 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5661 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5662 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5664 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5665 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5666 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5667 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5668 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5669 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5670 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5671 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5672 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5673 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5674 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5675 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5676 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5677 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5678 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5679 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5680 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5682 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5683 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5685 /* Need to create this here or we're really screwed. */
5687 (Qraw_text, Qno_conversion,
5688 build_string ("Raw text, which means it converts only line-break-codes."),
5689 list2 (Qmnemonic, build_string ("Raw")));
5692 (Qbinary, Qno_conversion,
5693 build_string ("Binary, which means it does not convert anything."),
5694 list4 (Qeol_type, Qlf,
5695 Qmnemonic, build_string ("Binary")));
5697 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5699 /* Need this for bootstrapping */
5700 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5701 Fget_coding_system (Qraw_text);
5707 for (i = 0; i < 65536; i++)
5708 ucs_to_mule_table[i] = Qnil;
5710 staticpro (&mule_to_ucs_table);
5711 mule_to_ucs_table = Fmake_char_table(Qgeneric);