1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
50 /* Coding system currently associated with each coding category. */
51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
53 /* Table of all coding categories in decreasing order of priority.
54 This describes a permutation of the possible coding categories. */
55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
57 Lisp_Object Qcoding_system_p;
59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
60 /* Qinternal in general.c */
62 Lisp_Object Qmnemonic, Qeol_type;
63 Lisp_Object Qcr, Qcrlf, Qlf;
64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
65 Lisp_Object Qpost_read_conversion;
66 Lisp_Object Qpre_write_conversion;
69 Lisp_Object Qucs4, Qutf8;
70 Lisp_Object Qbig5, Qshift_jis;
71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
74 Lisp_Object Qno_iso6429;
75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
76 Lisp_Object Qctext, Qescape_quoted;
77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
79 Lisp_Object Qencode, Qdecode;
81 Lisp_Object Vcoding_system_hash_table;
83 int enable_multibyte_characters;
86 /* Additional information used by the ISO2022 decoder and detector. */
87 struct iso2022_decoder
89 /* CHARSET holds the character sets currently assigned to the G0
90 through G3 variables. It is initialized from the array
91 INITIAL_CHARSET in CODESYS. */
92 Lisp_Object charset[4];
94 /* Which registers are currently invoked into the left (GL) and
95 right (GR) halves of the 8-bit encoding space? */
96 int register_left, register_right;
98 /* ISO_ESC holds a value indicating part of an escape sequence
99 that has already been seen. */
100 enum iso_esc_flag esc;
102 /* This records the bytes we've seen so far in an escape sequence,
103 in case the sequence is invalid (we spit out the bytes unchanged). */
104 unsigned char esc_bytes[8];
106 /* Index for next byte to store in ISO escape sequence. */
109 #ifdef ENABLE_COMPOSITE_CHARS
110 /* Stuff seen so far when composing a string. */
111 unsigned_char_dynarr *composite_chars;
114 /* If we saw an invalid designation sequence for a particular
115 register, we flag it here and switch to ASCII. The next time we
116 see a valid designation for this register, we turn off the flag
117 and do the designation normally, but pretend the sequence was
118 invalid. The effect of all this is that (most of the time) the
119 escape sequences for both the switch to the unknown charset, and
120 the switch back to the known charset, get inserted literally into
121 the buffer and saved out as such. The hope is that we can
122 preserve the escape sequences so that the resulting written out
123 file makes sense. If we don't do any of this, the designation
124 to the invalid charset will be preserved but that switch back
125 to the known charset will probably get eaten because it was
126 the same charset that was already present in the register. */
127 unsigned char invalid_designated[4];
129 /* We try to do similar things as above for direction-switching
130 sequences. If we encountered a direction switch while an
131 invalid designation was present, or an invalid designation
132 just after a direction switch (i.e. no valid designation
133 encountered yet), we insert the direction-switch escape
134 sequence literally into the output stream, and later on
135 insert the corresponding direction-restoring escape sequence
137 unsigned int switched_dir_and_no_valid_charset_yet :1;
138 unsigned int invalid_switch_dir :1;
140 /* Tells the decoder to output the escape sequence literally
141 even though it was valid. Used in the games we play to
142 avoid lossage when we encounter invalid designations. */
143 unsigned int output_literally :1;
144 /* We encountered a direction switch followed by an invalid
145 designation. We didn't output the direction switch
146 literally because we didn't know about the invalid designation;
147 but we have to do so now. */
148 unsigned int output_direction_sequence :1;
151 EXFUN (Fcopy_coding_system, 2);
153 struct detection_state;
154 static int detect_coding_sjis (struct detection_state *st,
155 CONST unsigned char *src,
157 static void decode_coding_sjis (Lstream *decoding,
158 CONST unsigned char *src,
159 unsigned_char_dynarr *dst,
161 static void encode_coding_sjis (Lstream *encoding,
162 CONST unsigned char *src,
163 unsigned_char_dynarr *dst,
165 static int detect_coding_big5 (struct detection_state *st,
166 CONST unsigned char *src,
168 static void decode_coding_big5 (Lstream *decoding,
169 CONST unsigned char *src,
170 unsigned_char_dynarr *dst, unsigned int n);
171 static void encode_coding_big5 (Lstream *encoding,
172 CONST unsigned char *src,
173 unsigned_char_dynarr *dst, unsigned int n);
174 static int detect_coding_ucs4 (struct detection_state *st,
175 CONST unsigned char *src,
177 static void decode_coding_ucs4 (Lstream *decoding,
178 CONST unsigned char *src,
179 unsigned_char_dynarr *dst, unsigned int n);
180 static void encode_coding_ucs4 (Lstream *encoding,
181 CONST unsigned char *src,
182 unsigned_char_dynarr *dst, unsigned int n);
183 static int detect_coding_utf8 (struct detection_state *st,
184 CONST unsigned char *src,
186 static void decode_coding_utf8 (Lstream *decoding,
187 CONST unsigned char *src,
188 unsigned_char_dynarr *dst, unsigned int n);
189 static void encode_coding_utf8 (Lstream *encoding,
190 CONST unsigned char *src,
191 unsigned_char_dynarr *dst, unsigned int n);
192 static int postprocess_iso2022_mask (int mask);
193 static void reset_iso2022 (Lisp_Object coding_system,
194 struct iso2022_decoder *iso);
195 static int detect_coding_iso2022 (struct detection_state *st,
196 CONST unsigned char *src,
198 static void decode_coding_iso2022 (Lstream *decoding,
199 CONST unsigned char *src,
200 unsigned_char_dynarr *dst, unsigned int n);
201 static void encode_coding_iso2022 (Lstream *encoding,
202 CONST unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
205 static void decode_coding_no_conversion (Lstream *decoding,
206 CONST unsigned char *src,
207 unsigned_char_dynarr *dst,
209 static void encode_coding_no_conversion (Lstream *encoding,
210 CONST unsigned char *src,
211 unsigned_char_dynarr *dst,
213 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
214 unsigned_char_dynarr *dst, unsigned int n);
215 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
216 unsigned_char_dynarr *dst, unsigned int n);
218 typedef struct codesys_prop codesys_prop;
227 Dynarr_declare (codesys_prop);
228 } codesys_prop_dynarr;
230 codesys_prop_dynarr *the_codesys_prop_dynarr;
232 enum codesys_prop_enum
235 CODESYS_PROP_ISO2022,
240 /************************************************************************/
241 /* Coding system functions */
242 /************************************************************************/
244 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
245 static void print_coding_system (Lisp_Object, Lisp_Object, int);
246 static void finalize_coding_system (void *header, int for_disksave);
249 static const struct lrecord_description ccs_description_1[] = {
250 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
254 static const struct struct_description ccs_description = {
255 sizeof(charset_conversion_spec),
259 static const struct lrecord_description ccsd_description_1[] = {
260 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
264 static const struct struct_description ccsd_description = {
265 sizeof(charset_conversion_spec_dynarr),
270 static const struct lrecord_description coding_system_description[] = {
271 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
272 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
273 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
275 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
276 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
277 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
278 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
283 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
284 mark_coding_system, print_coding_system,
285 finalize_coding_system,
286 0, 0, coding_system_description,
287 struct Lisp_Coding_System);
290 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
292 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
294 markobj (CODING_SYSTEM_NAME (codesys));
295 markobj (CODING_SYSTEM_DOC_STRING (codesys));
296 markobj (CODING_SYSTEM_MNEMONIC (codesys));
297 markobj (CODING_SYSTEM_EOL_LF (codesys));
298 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
299 markobj (CODING_SYSTEM_EOL_CR (codesys));
301 switch (CODING_SYSTEM_TYPE (codesys))
305 case CODESYS_ISO2022:
306 for (i = 0; i < 4; i++)
307 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
308 if (codesys->iso2022.input_conv)
310 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
312 struct charset_conversion_spec *ccs =
313 Dynarr_atp (codesys->iso2022.input_conv, i);
314 markobj (ccs->from_charset);
315 markobj (ccs->to_charset);
318 if (codesys->iso2022.output_conv)
320 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
322 struct charset_conversion_spec *ccs =
323 Dynarr_atp (codesys->iso2022.output_conv, i);
324 markobj (ccs->from_charset);
325 markobj (ccs->to_charset);
331 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
332 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
339 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
340 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
344 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
347 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
349 error ("printing unreadable object #<coding_system 0x%x>",
352 write_c_string ("#<coding_system ", printcharfun);
353 print_internal (c->name, printcharfun, 1);
354 write_c_string (">", printcharfun);
358 finalize_coding_system (void *header, int for_disksave)
360 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
361 /* Since coding systems never go away, this function is not
362 necessary. But it would be necessary if we changed things
363 so that coding systems could go away. */
364 if (!for_disksave) /* see comment in lstream.c */
366 switch (CODING_SYSTEM_TYPE (c))
369 case CODESYS_ISO2022:
370 if (c->iso2022.input_conv)
372 Dynarr_free (c->iso2022.input_conv);
373 c->iso2022.input_conv = 0;
375 if (c->iso2022.output_conv)
377 Dynarr_free (c->iso2022.output_conv);
378 c->iso2022.output_conv = 0;
389 symbol_to_eol_type (Lisp_Object symbol)
391 CHECK_SYMBOL (symbol);
392 if (NILP (symbol)) return EOL_AUTODETECT;
393 if (EQ (symbol, Qlf)) return EOL_LF;
394 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
395 if (EQ (symbol, Qcr)) return EOL_CR;
397 signal_simple_error ("Unrecognized eol type", symbol);
398 return EOL_AUTODETECT; /* not reached */
402 eol_type_to_symbol (enum eol_type type)
407 case EOL_LF: return Qlf;
408 case EOL_CRLF: return Qcrlf;
409 case EOL_CR: return Qcr;
410 case EOL_AUTODETECT: return Qnil;
415 setup_eol_coding_systems (Lisp_Coding_System *codesys)
417 Lisp_Object codesys_obj;
418 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
419 char *codesys_name = (char *) alloca (len + 7);
421 char *codesys_mnemonic=0;
423 Lisp_Object codesys_name_sym, sub_codesys_obj;
427 XSETCODING_SYSTEM (codesys_obj, codesys);
429 memcpy (codesys_name,
430 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
432 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
434 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
435 codesys_mnemonic = (char *) alloca (mlen + 7);
436 memcpy (codesys_mnemonic,
437 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
440 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
441 strcpy (codesys_name + len, "-" op_sys); \
443 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
444 codesys_name_sym = intern (codesys_name); \
445 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
446 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
448 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
449 build_string (codesys_mnemonic); \
450 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
453 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
454 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
455 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
458 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
459 Return t if OBJECT is a coding system.
460 A coding system is an object that defines how text containing multiple
461 character sets is encoded into a stream of (typically 8-bit) bytes.
462 The coding system is used to decode the stream into a series of
463 characters (which may be from multiple charsets) when the text is read
464 from a file or process, and is used to encode the text back into the
465 same format when it is written out to a file or process.
467 For example, many ISO2022-compliant coding systems (such as Compound
468 Text, which is used for inter-client data under the X Window System)
469 use escape sequences to switch between different charsets -- Japanese
470 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
471 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
472 `make-coding-system' for more information.
474 Coding systems are normally identified using a symbol, and the
475 symbol is accepted in place of the actual coding system object whenever
476 a coding system is called for. (This is similar to how faces work.)
480 return CODING_SYSTEMP (object) ? Qt : Qnil;
483 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
484 Retrieve the coding system of the given name.
486 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
487 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
488 If there is no such coding system, nil is returned. Otherwise the
489 associated coding system object is returned.
491 (coding_system_or_name))
493 if (CODING_SYSTEMP (coding_system_or_name))
494 return coding_system_or_name;
496 if (NILP (coding_system_or_name))
497 coding_system_or_name = Qbinary;
499 CHECK_SYMBOL (coding_system_or_name);
501 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
504 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
505 Retrieve the coding system of the given name.
506 Same as `find-coding-system' except that if there is no such
507 coding system, an error is signaled instead of returning nil.
511 Lisp_Object coding_system = Ffind_coding_system (name);
513 if (NILP (coding_system))
514 signal_simple_error ("No such coding system", name);
515 return coding_system;
518 /* We store the coding systems in hash tables with the names as the key and the
519 actual coding system object as the value. Occasionally we need to use them
520 in a list format. These routines provide us with that. */
521 struct coding_system_list_closure
523 Lisp_Object *coding_system_list;
527 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
528 void *coding_system_list_closure)
530 /* This function can GC */
531 struct coding_system_list_closure *cscl =
532 (struct coding_system_list_closure *) coding_system_list_closure;
533 Lisp_Object *coding_system_list = cscl->coding_system_list;
535 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
536 *coding_system_list);
540 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
541 Return a list of the names of all defined coding systems.
545 Lisp_Object coding_system_list = Qnil;
547 struct coding_system_list_closure coding_system_list_closure;
549 GCPRO1 (coding_system_list);
550 coding_system_list_closure.coding_system_list = &coding_system_list;
551 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
552 &coding_system_list_closure);
555 return coding_system_list;
558 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
559 Return the name of the given coding system.
563 coding_system = Fget_coding_system (coding_system);
564 return XCODING_SYSTEM_NAME (coding_system);
567 static Lisp_Coding_System *
568 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
570 Lisp_Coding_System *codesys =
571 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
573 zero_lcrecord (codesys);
574 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
575 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
576 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
577 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
578 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
579 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
580 CODING_SYSTEM_TYPE (codesys) = type;
581 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
583 if (type == CODESYS_ISO2022)
586 for (i = 0; i < 4; i++)
587 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
589 else if (type == CODESYS_CCL)
591 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
592 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
595 CODING_SYSTEM_NAME (codesys) = name;
601 /* Given a list of charset conversion specs as specified in a Lisp
602 program, parse it into STORE_HERE. */
605 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
606 Lisp_Object spec_list)
610 EXTERNAL_LIST_LOOP (rest, spec_list)
612 Lisp_Object car = XCAR (rest);
613 Lisp_Object from, to;
614 struct charset_conversion_spec spec;
616 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
617 signal_simple_error ("Invalid charset conversion spec", car);
618 from = Fget_charset (XCAR (car));
619 to = Fget_charset (XCAR (XCDR (car)));
620 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
621 signal_simple_error_2
622 ("Attempted conversion between different charset types",
624 spec.from_charset = from;
625 spec.to_charset = to;
627 Dynarr_add (store_here, spec);
631 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
632 specs, return the equivalent as the Lisp programmer would see it.
634 If LOAD_HERE is 0, return Qnil. */
637 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
644 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
646 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
647 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
650 return Fnreverse (result);
655 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
656 Register symbol NAME as a coding system.
658 TYPE describes the conversion method used and should be one of
661 Automatic conversion. XEmacs attempts to detect the coding system
664 No conversion. Use this for binary files and such. On output,
665 graphic characters that are not in ASCII or Latin-1 will be
666 replaced by a ?. (For a no-conversion-encoded buffer, these
667 characters will only be present if you explicitly insert them.)
669 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
671 ISO 10646 UCS-4 encoding.
673 ISO 10646 UTF-8 encoding.
675 Any ISO2022-compliant encoding. Among other things, this includes
676 JIS (the Japanese encoding commonly used for e-mail), EUC (the
677 standard Unix encoding for Japanese and other languages), and
678 Compound Text (the encoding used in X11). You can specify more
679 specific information about the conversion with the FLAGS argument.
681 Big5 (the encoding commonly used for Taiwanese).
683 The conversion is performed using a user-written pseudo-code
684 program. CCL (Code Conversion Language) is the name of this
687 Write out or read in the raw contents of the memory representing
688 the buffer's text. This is primarily useful for debugging
689 purposes, and is only enabled when XEmacs has been compiled with
690 DEBUG_XEMACS defined (via the --debug configure option).
691 WARNING: Reading in a file using 'internal conversion can result
692 in an internal inconsistency in the memory representing a
693 buffer's text, which will produce unpredictable results and may
694 cause XEmacs to crash. Under normal circumstances you should
695 never use 'internal conversion.
697 DOC-STRING is a string describing the coding system.
699 PROPS is a property list, describing the specific nature of the
700 character set. Recognized properties are:
703 String to be displayed in the modeline when this coding system is
707 End-of-line conversion to be used. It should be one of
710 Automatically detect the end-of-line type (LF, CRLF,
711 or CR). Also generate subsidiary coding systems named
712 `NAME-unix', `NAME-dos', and `NAME-mac', that are
713 identical to this coding system but have an EOL-TYPE
714 value of 'lf, 'crlf, and 'cr, respectively.
716 The end of a line is marked externally using ASCII LF.
717 Since this is also the way that XEmacs represents an
718 end-of-line internally, specifying this option results
719 in no end-of-line conversion. This is the standard
720 format for Unix text files.
722 The end of a line is marked externally using ASCII
723 CRLF. This is the standard format for MS-DOS text
726 The end of a line is marked externally using ASCII CR.
727 This is the standard format for Macintosh text files.
729 Automatically detect the end-of-line type but do not
730 generate subsidiary coding systems. (This value is
731 converted to nil when stored internally, and
732 `coding-system-property' will return nil.)
734 'post-read-conversion
735 Function called after a file has been read in, to perform the
736 decoding. Called with two arguments, BEG and END, denoting
737 a region of the current buffer to be decoded.
739 'pre-write-conversion
740 Function called before a file is written out, to perform the
741 encoding. Called with two arguments, BEG and END, denoting
742 a region of the current buffer to be encoded.
745 The following additional properties are recognized if TYPE is 'iso2022:
751 The character set initially designated to the G0 - G3 registers.
752 The value should be one of
754 -- A charset object (designate that character set)
755 -- nil (do not ever use this register)
756 -- t (no character set is initially designated to
757 the register, but may be later on; this automatically
758 sets the corresponding `force-g*-on-output' property)
764 If non-nil, send an explicit designation sequence on output before
765 using the specified register.
768 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
769 "ESC $ B" on output in place of the full designation sequences
770 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
773 If non-nil, don't designate ASCII to G0 at each end of line on output.
774 Setting this to non-nil also suppresses other state-resetting that
775 normally happens at the end of a line.
778 If non-nil, don't designate ASCII to G0 before control chars on output.
781 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
785 If non-nil, use locking-shift (SO/SI) instead of single-shift
786 or designation by escape sequence.
789 If non-nil, don't use ISO6429's direction specification.
792 If non-nil, literal control characters that are the same as
793 the beginning of a recognized ISO2022 or ISO6429 escape sequence
794 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
795 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
796 so that they can be properly distinguished from an escape sequence.
797 (Note that doing this results in a non-portable encoding.) This
798 encoding flag is used for byte-compiled files. Note that ESC
799 is a good choice for a quoting character because there are no
800 escape sequences whose second byte is a character from the Control-0
801 or Control-1 character sets; this is explicitly disallowed by the
804 'input-charset-conversion
805 A list of conversion specifications, specifying conversion of
806 characters in one charset to another when decoding is performed.
807 Each specification is a list of two elements: the source charset,
808 and the destination charset.
810 'output-charset-conversion
811 A list of conversion specifications, specifying conversion of
812 characters in one charset to another when encoding is performed.
813 The form of each specification is the same as for
814 'input-charset-conversion.
817 The following additional properties are recognized (and required)
821 CCL program used for decoding (converting to internal format).
824 CCL program used for encoding (converting to external format).
826 (name, type, doc_string, props))
828 Lisp_Coding_System *codesys;
829 Lisp_Object rest, key, value;
830 enum coding_system_type ty;
831 int need_to_setup_eol_systems = 1;
833 /* Convert type to constant */
834 if (NILP (type) || EQ (type, Qundecided))
835 { ty = CODESYS_AUTODETECT; }
837 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
838 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
839 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
840 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
841 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
842 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
844 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
846 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
849 signal_simple_error ("Invalid coding system type", type);
853 codesys = allocate_coding_system (ty, name);
855 if (NILP (doc_string))
856 doc_string = build_string ("");
858 CHECK_STRING (doc_string);
859 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
862 if (ty == CODESYS_NO_CONVERSION)
863 codesys->fixed.size = 1;
865 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
867 if (EQ (key, Qmnemonic))
870 CHECK_STRING (value);
871 CODING_SYSTEM_MNEMONIC (codesys) = value;
874 else if (EQ (key, Qeol_type))
876 need_to_setup_eol_systems = NILP (value);
879 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
882 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
883 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
885 else if (ty == CODESYS_ISO2022)
887 #define FROB_INITIAL_CHARSET(charset_num) \
888 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
889 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
891 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
892 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
893 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
894 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
896 #define FROB_FORCE_CHARSET(charset_num) \
897 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
899 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
900 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
901 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
902 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
904 #define FROB_BOOLEAN_PROPERTY(prop) \
905 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
907 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
908 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
909 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
910 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
911 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
912 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
913 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
915 else if (EQ (key, Qinput_charset_conversion))
917 codesys->iso2022.input_conv =
918 Dynarr_new (charset_conversion_spec);
919 parse_charset_conversion_specs (codesys->iso2022.input_conv,
922 else if (EQ (key, Qoutput_charset_conversion))
924 codesys->iso2022.output_conv =
925 Dynarr_new (charset_conversion_spec);
926 parse_charset_conversion_specs (codesys->iso2022.output_conv,
930 signal_simple_error ("Unrecognized property", key);
932 else if (EQ (type, Qccl))
934 if (EQ (key, Qdecode))
936 CHECK_VECTOR (value);
937 CODING_SYSTEM_CCL_DECODE (codesys) = value;
939 else if (EQ (key, Qencode))
941 CHECK_VECTOR (value);
942 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
945 signal_simple_error ("Unrecognized property", key);
949 signal_simple_error ("Unrecognized property", key);
952 if (need_to_setup_eol_systems)
953 setup_eol_coding_systems (codesys);
956 Lisp_Object codesys_obj;
957 XSETCODING_SYSTEM (codesys_obj, codesys);
958 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
963 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
964 Copy OLD-CODING-SYSTEM to NEW-NAME.
965 If NEW-NAME does not name an existing coding system, a new one will
968 (old_coding_system, new_name))
970 Lisp_Object new_coding_system;
971 old_coding_system = Fget_coding_system (old_coding_system);
972 new_coding_system = Ffind_coding_system (new_name);
973 if (NILP (new_coding_system))
975 XSETCODING_SYSTEM (new_coding_system,
976 allocate_coding_system
977 (XCODING_SYSTEM_TYPE (old_coding_system),
979 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
983 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
984 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
985 memcpy (((char *) to ) + sizeof (to->header),
986 ((char *) from) + sizeof (from->header),
987 sizeof (*from) - sizeof (from->header));
990 return new_coding_system;
993 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
994 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
996 (alias, coding_system))
998 CHECK_SYMBOL (alias);
999 if (!NILP (Ffind_coding_system (alias)))
1000 signal_simple_error ("Symbol already names a coding system", alias);
1001 coding_system = Fget_coding_system (coding_system);
1002 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1004 /* Set up aliases for subsidiaries. */
1005 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1008 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1009 #define FROB(type, name) \
1011 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1012 if (!NILP (subsidiary)) \
1013 Fdefine_coding_system_alias \
1014 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1017 FROB (CRLF, "-dos");
1021 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1022 but it doesn't look intentional, so I'd rather return something
1023 meaningful or nothing at all. */
1028 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1030 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1031 Lisp_Object new_coding_system;
1033 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1034 return coding_system;
1038 case EOL_AUTODETECT: return coding_system;
1039 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1040 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1041 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1045 return NILP (new_coding_system) ? coding_system : new_coding_system;
1048 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1049 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1051 (coding_system, eol_type))
1053 coding_system = Fget_coding_system (coding_system);
1055 return subsidiary_coding_system (coding_system,
1056 symbol_to_eol_type (eol_type));
1060 /************************************************************************/
1061 /* Coding system accessors */
1062 /************************************************************************/
1064 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1065 Return the doc string for CODING-SYSTEM.
1069 coding_system = Fget_coding_system (coding_system);
1070 return XCODING_SYSTEM_DOC_STRING (coding_system);
1073 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1074 Return the type of CODING-SYSTEM.
1078 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1081 case CODESYS_AUTODETECT: return Qundecided;
1083 case CODESYS_SHIFT_JIS: return Qshift_jis;
1084 case CODESYS_ISO2022: return Qiso2022;
1085 case CODESYS_BIG5: return Qbig5;
1086 case CODESYS_UCS4: return Qucs4;
1087 case CODESYS_UTF8: return Qutf8;
1088 case CODESYS_CCL: return Qccl;
1090 case CODESYS_NO_CONVERSION: return Qno_conversion;
1092 case CODESYS_INTERNAL: return Qinternal;
1099 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1102 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1104 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1107 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1108 Return initial charset of CODING-SYSTEM designated to GNUM.
1111 (coding_system, gnum))
1113 coding_system = Fget_coding_system (coding_system);
1116 return coding_system_charset (coding_system, XINT (gnum));
1120 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1121 Return the PROP property of CODING-SYSTEM.
1123 (coding_system, prop))
1126 enum coding_system_type type;
1128 coding_system = Fget_coding_system (coding_system);
1129 CHECK_SYMBOL (prop);
1130 type = XCODING_SYSTEM_TYPE (coding_system);
1132 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1133 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1136 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1138 case CODESYS_PROP_ALL_OK:
1141 case CODESYS_PROP_ISO2022:
1142 if (type != CODESYS_ISO2022)
1144 ("Property only valid in ISO2022 coding systems",
1148 case CODESYS_PROP_CCL:
1149 if (type != CODESYS_CCL)
1151 ("Property only valid in CCL coding systems",
1161 signal_simple_error ("Unrecognized property", prop);
1163 if (EQ (prop, Qname))
1164 return XCODING_SYSTEM_NAME (coding_system);
1165 else if (EQ (prop, Qtype))
1166 return Fcoding_system_type (coding_system);
1167 else if (EQ (prop, Qdoc_string))
1168 return XCODING_SYSTEM_DOC_STRING (coding_system);
1169 else if (EQ (prop, Qmnemonic))
1170 return XCODING_SYSTEM_MNEMONIC (coding_system);
1171 else if (EQ (prop, Qeol_type))
1172 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1173 else if (EQ (prop, Qeol_lf))
1174 return XCODING_SYSTEM_EOL_LF (coding_system);
1175 else if (EQ (prop, Qeol_crlf))
1176 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1177 else if (EQ (prop, Qeol_cr))
1178 return XCODING_SYSTEM_EOL_CR (coding_system);
1179 else if (EQ (prop, Qpost_read_conversion))
1180 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1181 else if (EQ (prop, Qpre_write_conversion))
1182 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1184 else if (type == CODESYS_ISO2022)
1186 if (EQ (prop, Qcharset_g0))
1187 return coding_system_charset (coding_system, 0);
1188 else if (EQ (prop, Qcharset_g1))
1189 return coding_system_charset (coding_system, 1);
1190 else if (EQ (prop, Qcharset_g2))
1191 return coding_system_charset (coding_system, 2);
1192 else if (EQ (prop, Qcharset_g3))
1193 return coding_system_charset (coding_system, 3);
1195 #define FORCE_CHARSET(charset_num) \
1196 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1197 (coding_system, charset_num) ? Qt : Qnil)
1199 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1200 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1201 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1202 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1204 #define LISP_BOOLEAN(prop) \
1205 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1207 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1208 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1209 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1210 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1211 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1212 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1213 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1215 else if (EQ (prop, Qinput_charset_conversion))
1217 unparse_charset_conversion_specs
1218 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1219 else if (EQ (prop, Qoutput_charset_conversion))
1221 unparse_charset_conversion_specs
1222 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1226 else if (type == CODESYS_CCL)
1228 if (EQ (prop, Qdecode))
1229 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1230 else if (EQ (prop, Qencode))
1231 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1239 return Qnil; /* not reached */
1243 /************************************************************************/
1244 /* Coding category functions */
1245 /************************************************************************/
1248 decode_coding_category (Lisp_Object symbol)
1252 CHECK_SYMBOL (symbol);
1253 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1254 if (EQ (coding_category_symbol[i], symbol))
1257 signal_simple_error ("Unrecognized coding category", symbol);
1258 return 0; /* not reached */
1261 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1262 Return a list of all recognized coding categories.
1267 Lisp_Object list = Qnil;
1269 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1270 list = Fcons (coding_category_symbol[i], list);
1274 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1275 Change the priority order of the coding categories.
1276 LIST should be list of coding categories, in descending order of
1277 priority. Unspecified coding categories will be lower in priority
1278 than all specified ones, in the same relative order they were in
1283 int category_to_priority[CODING_CATEGORY_LAST + 1];
1287 /* First generate a list that maps coding categories to priorities. */
1289 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1290 category_to_priority[i] = -1;
1292 /* Highest priority comes from the specified list. */
1294 EXTERNAL_LIST_LOOP (rest, list)
1296 int cat = decode_coding_category (XCAR (rest));
1298 if (category_to_priority[cat] >= 0)
1299 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1300 category_to_priority[cat] = i++;
1303 /* Now go through the existing categories by priority to retrieve
1304 the categories not yet specified and preserve their priority
1306 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1308 int cat = coding_category_by_priority[j];
1309 if (category_to_priority[cat] < 0)
1310 category_to_priority[cat] = i++;
1313 /* Now we need to construct the inverse of the mapping we just
1316 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1317 coding_category_by_priority[category_to_priority[i]] = i;
1319 /* Phew! That was confusing. */
1323 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1324 Return a list of coding categories in descending order of priority.
1329 Lisp_Object list = Qnil;
1331 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1332 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1337 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1338 Change the coding system associated with a coding category.
1340 (coding_category, coding_system))
1342 int cat = decode_coding_category (coding_category);
1344 coding_system = Fget_coding_system (coding_system);
1345 coding_category_system[cat] = coding_system;
1349 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1350 Return the coding system associated with a coding category.
1354 int cat = decode_coding_category (coding_category);
1355 Lisp_Object sys = coding_category_system[cat];
1358 return XCODING_SYSTEM_NAME (sys);
1363 /************************************************************************/
1364 /* Detecting the encoding of data */
1365 /************************************************************************/
1367 struct detection_state
1369 enum eol_type eol_type;
1405 struct iso2022_decoder iso;
1407 int high_byte_count;
1408 unsigned int saw_single_shift:1;
1421 acceptable_control_char_p (int c)
1425 /* Allow and ignore control characters that you might
1426 reasonably see in a text file */
1431 case 8: /* backspace */
1432 case 11: /* vertical tab */
1433 case 12: /* form feed */
1434 case 26: /* MS-DOS C-z junk */
1435 case 31: /* '^_' -- for info */
1443 mask_has_at_most_one_bit_p (int mask)
1445 /* Perhaps the only thing useful you learn from intensive Microsoft
1446 technical interviews */
1447 return (mask & (mask - 1)) == 0;
1450 static enum eol_type
1451 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1460 st->eol.just_saw_cr = 1;
1465 if (st->eol.just_saw_cr)
1467 else if (st->eol.seen_anything)
1470 else if (st->eol.just_saw_cr)
1472 st->eol.just_saw_cr = 0;
1474 st->eol.seen_anything = 1;
1477 return EOL_AUTODETECT;
1480 /* Attempt to determine the encoding and EOL type of the given text.
1481 Before calling this function for the first type, you must initialize
1482 st->eol_type as appropriate and initialize st->mask to ~0.
1484 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1487 st->mask holds the determined coding category mask, or ~0 if only
1488 ASCII has been seen so far.
1492 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1493 is present in st->mask
1494 1 == definitive answers are here for both st->eol_type and st->mask
1498 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1499 unsigned int n, int just_do_eol)
1503 if (st->eol_type == EOL_AUTODETECT)
1504 st->eol_type = detect_eol_type (st, src, n);
1507 return st->eol_type != EOL_AUTODETECT;
1509 if (!st->seen_non_ascii)
1511 for (; n; n--, src++)
1514 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1516 st->seen_non_ascii = 1;
1518 st->shift_jis.mask = ~0;
1522 st->iso2022.mask = ~0;
1532 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1533 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1534 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1535 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1536 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1537 st->big5.mask = detect_coding_big5 (st, src, n);
1538 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1539 st->utf8.mask = detect_coding_utf8 (st, src, n);
1540 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1541 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1544 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1545 | st->utf8.mask | st->ucs4.mask;
1548 int retval = mask_has_at_most_one_bit_p (st->mask);
1549 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1550 return retval && st->eol_type != EOL_AUTODETECT;
1555 coding_system_from_mask (int mask)
1559 /* If the file was entirely or basically ASCII, use the
1560 default value of `buffer-file-coding-system'. */
1561 Lisp_Object retval =
1562 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1565 retval = Ffind_coding_system (retval);
1569 (Qbad_variable, Qwarning,
1570 "Invalid `default-buffer-file-coding-system', set to nil");
1571 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1575 retval = Fget_coding_system (Qraw_text);
1583 mask = postprocess_iso2022_mask (mask);
1585 /* Look through the coding categories by priority and find
1586 the first one that is allowed. */
1587 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1589 cat = coding_category_by_priority[i];
1590 if ((mask & (1 << cat)) &&
1591 !NILP (coding_category_system[cat]))
1595 return coding_category_system[cat];
1597 return Fget_coding_system (Qraw_text);
1601 /* Given a seekable read stream and potential coding system and EOL type
1602 as specified, do any autodetection that is called for. If the
1603 coding system and/or EOL type are not autodetect, they will be left
1604 alone; but this function will never return an autodetect coding system
1607 This function does not automatically fetch subsidiary coding systems;
1608 that should be unnecessary with the explicit eol-type argument. */
1611 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1612 enum eol_type *eol_type_in_out)
1614 struct detection_state decst;
1616 if (*eol_type_in_out == EOL_AUTODETECT)
1617 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1620 decst.eol_type = *eol_type_in_out;
1623 /* If autodetection is called for, do it now. */
1624 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1625 *eol_type_in_out == EOL_AUTODETECT)
1627 unsigned char random_buffer[4096];
1629 Lisp_Object coding_system = Qnil;
1631 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1634 unsigned char *cp = random_buffer;
1636 while (cp < random_buffer + nread)
1638 if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1639 (*cp++ == 'o') && (cp < random_buffer + nread) &&
1640 (*cp++ == 'd') && (cp < random_buffer + nread) &&
1641 (*cp++ == 'i') && (cp < random_buffer + nread) &&
1642 (*cp++ == 'n') && (cp < random_buffer + nread) &&
1643 (*cp++ == 'g') && (cp < random_buffer + nread) &&
1644 (*cp++ == ':') && (cp < random_buffer + nread))
1646 unsigned char coding_system_name[4096 - 6];
1647 unsigned char *np = coding_system_name;
1649 while ( (cp < random_buffer + nread)
1650 && ((*cp == ' ') || (*cp == '\t')) )
1654 while ( (cp < random_buffer + nread) &&
1655 (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1661 = Ffind_coding_system (intern (coding_system_name));
1665 if (EQ(coding_system, Qnil))
1667 if (detect_coding_type (&decst, random_buffer, nread,
1668 XCODING_SYSTEM_TYPE (*codesys_in_out)
1669 != CODESYS_AUTODETECT))
1671 nread = Lstream_read (stream,
1672 random_buffer, sizeof (random_buffer));
1677 *eol_type_in_out = decst.eol_type;
1678 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1680 if (EQ(coding_system, Qnil))
1681 *codesys_in_out = coding_system_from_mask (decst.mask);
1683 *codesys_in_out = coding_system;
1686 /* If we absolutely can't determine the EOL type, just assume LF. */
1687 if (*eol_type_in_out == EOL_AUTODETECT)
1688 *eol_type_in_out = EOL_LF;
1690 Lstream_rewind (stream);
1693 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1694 Detect coding system of the text in the region between START and END.
1695 Returned a list of possible coding systems ordered by priority.
1696 If only ASCII characters are found, it returns 'undecided or one of
1697 its subsidiary coding systems according to a detected end-of-line
1698 type. Optional arg BUFFER defaults to the current buffer.
1700 (start, end, buffer))
1702 Lisp_Object val = Qnil;
1703 struct buffer *buf = decode_buffer (buffer, 0);
1705 Lisp_Object instream, lb_instream;
1706 Lstream *istr, *lb_istr;
1707 struct detection_state decst;
1708 struct gcpro gcpro1, gcpro2;
1710 get_buffer_range_char (buf, start, end, &b, &e, 0);
1711 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1712 lb_istr = XLSTREAM (lb_instream);
1713 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1714 istr = XLSTREAM (instream);
1715 GCPRO2 (instream, lb_instream);
1717 decst.eol_type = EOL_AUTODETECT;
1721 unsigned char random_buffer[4096];
1722 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1726 if (detect_coding_type (&decst, random_buffer, nread, 0))
1730 if (decst.mask == ~0)
1731 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1739 decst.mask = postprocess_iso2022_mask (decst.mask);
1741 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1743 int sys = coding_category_by_priority[i];
1744 if (decst.mask & (1 << sys))
1746 Lisp_Object codesys = coding_category_system[sys];
1747 if (!NILP (codesys))
1748 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1749 val = Fcons (codesys, val);
1753 Lstream_close (istr);
1755 Lstream_delete (istr);
1756 Lstream_delete (lb_istr);
1761 /************************************************************************/
1762 /* Converting to internal Mule format ("decoding") */
1763 /************************************************************************/
1765 /* A decoding stream is a stream used for decoding text (i.e.
1766 converting from some external format to internal format).
1767 The decoding-stream object keeps track of the actual coding
1768 stream, the stream that is at the other end, and data that
1769 needs to be persistent across the lifetime of the stream. */
1771 /* Handle the EOL stuff related to just-read-in character C.
1772 EOL_TYPE is the EOL type of the coding stream.
1773 FLAGS is the current value of FLAGS in the coding stream, and may
1774 be modified by this macro. (The macro only looks at the
1775 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1776 bytes are to be written. You need to also define a local goto
1777 label "label_continue_loop" that is at the end of the main
1778 character-reading loop.
1780 If C is a CR character, then this macro handles it entirely and
1781 jumps to label_continue_loop. Otherwise, this macro does not add
1782 anything to DST, and continues normally. You should continue
1783 processing C normally after this macro. */
1785 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1789 if (eol_type == EOL_CR) \
1790 Dynarr_add (dst, '\n'); \
1791 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1792 Dynarr_add (dst, c); \
1794 flags |= CODING_STATE_CR; \
1795 goto label_continue_loop; \
1797 else if (flags & CODING_STATE_CR) \
1798 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1800 Dynarr_add (dst, '\r'); \
1801 flags &= ~CODING_STATE_CR; \
1805 /* C should be a binary character in the range 0 - 255; convert
1806 to internal format and add to Dynarr DST. */
1809 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1811 if (BYTE_ASCII_P (c)) \
1812 Dynarr_add (dst, c); \
1815 Dynarr_add (dst, (c >> 6) | 0xc0); \
1816 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1821 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1825 Dynarr_add (dst, c);
1827 else if ( c <= 0x7ff )
1829 Dynarr_add (dst, (c >> 6) | 0xc0);
1830 Dynarr_add (dst, (c & 0x3f) | 0x80);
1832 else if ( c <= 0xffff )
1834 Dynarr_add (dst, (c >> 12) | 0xe0);
1835 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1836 Dynarr_add (dst, (c & 0x3f) | 0x80);
1838 else if ( c <= 0x1fffff )
1840 Dynarr_add (dst, (c >> 18) | 0xf0);
1841 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1842 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1843 Dynarr_add (dst, (c & 0x3f) | 0x80);
1845 else if ( c <= 0x3ffffff )
1847 Dynarr_add (dst, (c >> 24) | 0xf8);
1848 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1849 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1850 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1851 Dynarr_add (dst, (c & 0x3f) | 0x80);
1855 Dynarr_add (dst, (c >> 30) | 0xfc);
1856 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1857 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1858 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1859 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1860 Dynarr_add (dst, (c & 0x3f) | 0x80);
1864 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1866 if (BYTE_ASCII_P (c)) \
1867 Dynarr_add (dst, c); \
1868 else if (BYTE_C1_P (c)) \
1870 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1871 Dynarr_add (dst, c + 0x20); \
1875 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1876 Dynarr_add (dst, c); \
1881 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1885 DECODE_ADD_BINARY_CHAR (ch, dst); \
1890 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1892 if (flags & CODING_STATE_END) \
1894 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1895 if (flags & CODING_STATE_CR) \
1896 Dynarr_add (dst, '\r'); \
1900 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1902 struct decoding_stream
1904 /* Coding system that governs the conversion. */
1905 Lisp_Coding_System *codesys;
1907 /* Stream that we read the encoded data from or
1908 write the decoded data to. */
1911 /* If we are reading, then we can return only a fixed amount of
1912 data, so if the conversion resulted in too much data, we store it
1913 here for retrieval the next time around. */
1914 unsigned_char_dynarr *runoff;
1916 /* FLAGS holds flags indicating the current state of the decoding.
1917 Some of these flags are dependent on the coding system. */
1920 /* CH holds a partially built-up character. Since we only deal
1921 with one- and two-byte characters at the moment, we only use
1922 this to store the first byte of a two-byte character. */
1925 /* EOL_TYPE specifies the type of end-of-line conversion that
1926 currently applies. We need to keep this separate from the
1927 EOL type stored in CODESYS because the latter might indicate
1928 automatic EOL-type detection while the former will always
1929 indicate a particular EOL type. */
1930 enum eol_type eol_type;
1932 /* Additional ISO2022 information. We define the structure above
1933 because it's also needed by the detection routines. */
1934 struct iso2022_decoder iso2022;
1936 /* Additional information (the state of the running CCL program)
1937 used by the CCL decoder. */
1938 struct ccl_program ccl;
1940 /* counter for UTF-8 or UCS-4 */
1941 unsigned char counter;
1943 struct detection_state decst;
1946 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1947 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1948 static int decoding_rewinder (Lstream *stream);
1949 static int decoding_seekable_p (Lstream *stream);
1950 static int decoding_flusher (Lstream *stream);
1951 static int decoding_closer (Lstream *stream);
1953 static Lisp_Object decoding_marker (Lisp_Object stream,
1954 void (*markobj) (Lisp_Object));
1956 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1957 sizeof (struct decoding_stream));
1960 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1962 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1963 Lisp_Object str_obj;
1965 /* We do not need to mark the coding systems or charsets stored
1966 within the stream because they are stored in a global list
1967 and automatically marked. */
1969 XSETLSTREAM (str_obj, str);
1971 if (str->imp->marker)
1972 return (str->imp->marker) (str_obj, markobj);
1977 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1978 so we read data from the other end, decode it, and store it into DATA. */
1981 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1983 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1984 unsigned char *orig_data = data;
1986 int error_occurred = 0;
1988 /* We need to interface to mule_decode(), which expects to take some
1989 amount of data and store the result into a Dynarr. We have
1990 mule_decode() store into str->runoff, and take data from there
1993 /* We loop until we have enough data, reading chunks from the other
1994 end and decoding it. */
1997 /* Take data from the runoff if we can. Make sure to take at
1998 most SIZE bytes, and delete the data from the runoff. */
1999 if (Dynarr_length (str->runoff) > 0)
2001 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2002 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2003 Dynarr_delete_many (str->runoff, 0, chunk);
2009 break; /* No more room for data */
2011 if (str->flags & CODING_STATE_END)
2012 /* This means that on the previous iteration, we hit the EOF on
2013 the other end. We loop once more so that mule_decode() can
2014 output any final stuff it may be holding, or any "go back
2015 to a sane state" escape sequences. (This latter makes sense
2016 during encoding.) */
2019 /* Exhausted the runoff, so get some more. DATA has at least
2020 SIZE bytes left of storage in it, so it's OK to read directly
2021 into it. (We'll be overwriting above, after we've decoded it
2022 into the runoff.) */
2023 read_size = Lstream_read (str->other_end, data, size);
2030 /* There might be some more end data produced in the translation.
2031 See the comment above. */
2032 str->flags |= CODING_STATE_END;
2033 mule_decode (stream, data, str->runoff, read_size);
2036 if (data - orig_data == 0)
2037 return error_occurred ? -1 : 0;
2039 return data - orig_data;
2043 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2045 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2048 /* Decode all our data into the runoff, and then attempt to write
2049 it all out to the other end. Remove whatever chunk we succeeded
2051 mule_decode (stream, data, str->runoff, size);
2052 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2053 Dynarr_length (str->runoff));
2055 Dynarr_delete_many (str->runoff, 0, retval);
2056 /* Do NOT return retval. The return value indicates how much
2057 of the incoming data was written, not how many bytes were
2063 reset_decoding_stream (struct decoding_stream *str)
2066 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2068 Lisp_Object coding_system;
2069 XSETCODING_SYSTEM (coding_system, str->codesys);
2070 reset_iso2022 (coding_system, &str->iso2022);
2072 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2074 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2078 str->flags = str->ch = 0;
2082 decoding_rewinder (Lstream *stream)
2084 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2085 reset_decoding_stream (str);
2086 Dynarr_reset (str->runoff);
2087 return Lstream_rewind (str->other_end);
2091 decoding_seekable_p (Lstream *stream)
2093 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2094 return Lstream_seekable_p (str->other_end);
2098 decoding_flusher (Lstream *stream)
2100 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2101 return Lstream_flush (str->other_end);
2105 decoding_closer (Lstream *stream)
2107 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2108 if (stream->flags & LSTREAM_FL_WRITE)
2110 str->flags |= CODING_STATE_END;
2111 decoding_writer (stream, 0, 0);
2113 Dynarr_free (str->runoff);
2115 #ifdef ENABLE_COMPOSITE_CHARS
2116 if (str->iso2022.composite_chars)
2117 Dynarr_free (str->iso2022.composite_chars);
2120 return Lstream_close (str->other_end);
2124 decoding_stream_coding_system (Lstream *stream)
2126 Lisp_Object coding_system;
2127 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2129 XSETCODING_SYSTEM (coding_system, str->codesys);
2130 return subsidiary_coding_system (coding_system, str->eol_type);
2134 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2136 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2137 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2139 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2140 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2141 reset_decoding_stream (str);
2144 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2145 stream for writing, no automatic code detection will be performed.
2146 The reason for this is that automatic code detection requires a
2147 seekable input. Things will also fail if you open a decoding
2148 stream for reading using a non-fully-specified coding system and
2149 a non-seekable input stream. */
2152 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2155 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2156 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2160 str->other_end = stream;
2161 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2162 str->eol_type = EOL_AUTODETECT;
2163 if (!strcmp (mode, "r")
2164 && Lstream_seekable_p (stream))
2165 /* We can determine the coding system now. */
2166 determine_real_coding_system (stream, &codesys, &str->eol_type);
2167 set_decoding_stream_coding_system (lstr, codesys);
2168 str->decst.eol_type = str->eol_type;
2169 str->decst.mask = ~0;
2170 XSETLSTREAM (obj, lstr);
2175 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2177 return make_decoding_stream_1 (stream, codesys, "r");
2181 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2183 return make_decoding_stream_1 (stream, codesys, "w");
2186 /* Note: the decode_coding_* functions all take the same
2187 arguments as mule_decode(), which is to say some SRC data of
2188 size N, which is to be stored into dynamic array DST.
2189 DECODING is the stream within which the decoding is
2190 taking place, but no data is actually read from or
2191 written to that stream; that is handled in decoding_reader()
2192 or decoding_writer(). This allows the same functions to
2193 be used for both reading and writing. */
2196 mule_decode (Lstream *decoding, CONST unsigned char *src,
2197 unsigned_char_dynarr *dst, unsigned int n)
2199 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2201 /* If necessary, do encoding-detection now. We do this when
2202 we're a writing stream or a non-seekable reading stream,
2203 meaning that we can't just process the whole input,
2204 rewind, and start over. */
2206 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2207 str->eol_type == EOL_AUTODETECT)
2209 Lisp_Object codesys;
2211 XSETCODING_SYSTEM (codesys, str->codesys);
2212 detect_coding_type (&str->decst, src, n,
2213 CODING_SYSTEM_TYPE (str->codesys) !=
2214 CODESYS_AUTODETECT);
2215 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2216 str->decst.mask != ~0)
2217 /* #### This is cheesy. What we really ought to do is
2218 buffer up a certain amount of data so as to get a
2219 less random result. */
2220 codesys = coding_system_from_mask (str->decst.mask);
2221 str->eol_type = str->decst.eol_type;
2222 if (XCODING_SYSTEM (codesys) != str->codesys)
2224 /* Preserve the CODING_STATE_END flag in case it was set.
2225 If we erase it, bad things might happen. */
2226 int was_end = str->flags & CODING_STATE_END;
2227 set_decoding_stream_coding_system (decoding, codesys);
2229 str->flags |= CODING_STATE_END;
2233 switch (CODING_SYSTEM_TYPE (str->codesys))
2236 case CODESYS_INTERNAL:
2237 Dynarr_add_many (dst, src, n);
2240 case CODESYS_AUTODETECT:
2241 /* If we got this far and still haven't decided on the coding
2242 system, then do no conversion. */
2243 case CODESYS_NO_CONVERSION:
2244 decode_coding_no_conversion (decoding, src, dst, n);
2247 case CODESYS_SHIFT_JIS:
2248 decode_coding_sjis (decoding, src, dst, n);
2251 decode_coding_big5 (decoding, src, dst, n);
2254 decode_coding_ucs4 (decoding, src, dst, n);
2257 decode_coding_utf8 (decoding, src, dst, n);
2260 str->ccl.last_block = str->flags & CODING_STATE_END;
2261 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2263 case CODESYS_ISO2022:
2264 decode_coding_iso2022 (decoding, src, dst, n);
2272 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2273 Decode the text between START and END which is encoded in CODING-SYSTEM.
2274 This is useful if you've read in encoded text from a file without decoding
2275 it (e.g. you read in a JIS-formatted file but used the `binary' or
2276 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2277 Return length of decoded text.
2278 BUFFER defaults to the current buffer if unspecified.
2280 (start, end, coding_system, buffer))
2283 struct buffer *buf = decode_buffer (buffer, 0);
2284 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2285 Lstream *istr, *ostr;
2286 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2288 get_buffer_range_char (buf, start, end, &b, &e, 0);
2290 barf_if_buffer_read_only (buf, b, e);
2292 coding_system = Fget_coding_system (coding_system);
2293 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2294 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2295 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2297 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2298 Fget_coding_system (Qbinary));
2299 istr = XLSTREAM (instream);
2300 ostr = XLSTREAM (outstream);
2301 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2303 /* The chain of streams looks like this:
2305 [BUFFER] <----- send through
2306 ------> [ENCODE AS BINARY]
2307 ------> [DECODE AS SPECIFIED]
2313 char tempbuf[1024]; /* some random amount */
2314 Bufpos newpos, even_newer_pos;
2315 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2316 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2320 newpos = lisp_buffer_stream_startpos (istr);
2321 Lstream_write (ostr, tempbuf, size_in_bytes);
2322 even_newer_pos = lisp_buffer_stream_startpos (istr);
2323 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2326 Lstream_close (istr);
2327 Lstream_close (ostr);
2329 Lstream_delete (istr);
2330 Lstream_delete (ostr);
2331 Lstream_delete (XLSTREAM (de_outstream));
2332 Lstream_delete (XLSTREAM (lb_outstream));
2337 /************************************************************************/
2338 /* Converting to an external encoding ("encoding") */
2339 /************************************************************************/
2341 /* An encoding stream is an output stream. When you create the
2342 stream, you specify the coding system that governs the encoding
2343 and another stream that the resulting encoded data is to be
2344 sent to, and then start sending data to it. */
2346 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2348 struct encoding_stream
2350 /* Coding system that governs the conversion. */
2351 Lisp_Coding_System *codesys;
2353 /* Stream that we read the encoded data from or
2354 write the decoded data to. */
2357 /* If we are reading, then we can return only a fixed amount of
2358 data, so if the conversion resulted in too much data, we store it
2359 here for retrieval the next time around. */
2360 unsigned_char_dynarr *runoff;
2362 /* FLAGS holds flags indicating the current state of the encoding.
2363 Some of these flags are dependent on the coding system. */
2366 /* CH holds a partially built-up character. Since we only deal
2367 with one- and two-byte characters at the moment, we only use
2368 this to store the first byte of a two-byte character. */
2371 /* Additional information used by the ISO2022 encoder. */
2374 /* CHARSET holds the character sets currently assigned to the G0
2375 through G3 registers. It is initialized from the array
2376 INITIAL_CHARSET in CODESYS. */
2377 Lisp_Object charset[4];
2379 /* Which registers are currently invoked into the left (GL) and
2380 right (GR) halves of the 8-bit encoding space? */
2381 int register_left, register_right;
2383 /* Whether we need to explicitly designate the charset in the
2384 G? register before using it. It is initialized from the
2385 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2386 unsigned char force_charset_on_output[4];
2388 /* Other state variables that need to be preserved across
2390 Lisp_Object current_charset;
2392 int current_char_boundary;
2395 /* Additional information (the state of the running CCL program)
2396 used by the CCL encoder. */
2397 struct ccl_program ccl;
2401 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2402 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2404 static int encoding_rewinder (Lstream *stream);
2405 static int encoding_seekable_p (Lstream *stream);
2406 static int encoding_flusher (Lstream *stream);
2407 static int encoding_closer (Lstream *stream);
2409 static Lisp_Object encoding_marker (Lisp_Object stream,
2410 void (*markobj) (Lisp_Object));
2412 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2413 sizeof (struct encoding_stream));
2416 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2418 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2419 Lisp_Object str_obj;
2421 /* We do not need to mark the coding systems or charsets stored
2422 within the stream because they are stored in a global list
2423 and automatically marked. */
2425 XSETLSTREAM (str_obj, str);
2427 if (str->imp->marker)
2428 return (str->imp->marker) (str_obj, markobj);
2433 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2434 so we read data from the other end, encode it, and store it into DATA. */
2437 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2439 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2440 unsigned char *orig_data = data;
2442 int error_occurred = 0;
2444 /* We need to interface to mule_encode(), which expects to take some
2445 amount of data and store the result into a Dynarr. We have
2446 mule_encode() store into str->runoff, and take data from there
2449 /* We loop until we have enough data, reading chunks from the other
2450 end and encoding it. */
2453 /* Take data from the runoff if we can. Make sure to take at
2454 most SIZE bytes, and delete the data from the runoff. */
2455 if (Dynarr_length (str->runoff) > 0)
2457 int chunk = min ((int) size, Dynarr_length (str->runoff));
2458 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2459 Dynarr_delete_many (str->runoff, 0, chunk);
2465 break; /* No more room for data */
2467 if (str->flags & CODING_STATE_END)
2468 /* This means that on the previous iteration, we hit the EOF on
2469 the other end. We loop once more so that mule_encode() can
2470 output any final stuff it may be holding, or any "go back
2471 to a sane state" escape sequences. (This latter makes sense
2472 during encoding.) */
2475 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2476 left of storage in it, so it's OK to read directly into it.
2477 (We'll be overwriting above, after we've encoded it into the
2479 read_size = Lstream_read (str->other_end, data, size);
2486 /* There might be some more end data produced in the translation.
2487 See the comment above. */
2488 str->flags |= CODING_STATE_END;
2489 mule_encode (stream, data, str->runoff, read_size);
2492 if (data == orig_data)
2493 return error_occurred ? -1 : 0;
2495 return data - orig_data;
2499 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2501 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2504 /* Encode all our data into the runoff, and then attempt to write
2505 it all out to the other end. Remove whatever chunk we succeeded
2507 mule_encode (stream, data, str->runoff, size);
2508 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2509 Dynarr_length (str->runoff));
2511 Dynarr_delete_many (str->runoff, 0, retval);
2512 /* Do NOT return retval. The return value indicates how much
2513 of the incoming data was written, not how many bytes were
2519 reset_encoding_stream (struct encoding_stream *str)
2522 switch (CODING_SYSTEM_TYPE (str->codesys))
2524 case CODESYS_ISO2022:
2528 for (i = 0; i < 4; i++)
2530 str->iso2022.charset[i] =
2531 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2532 str->iso2022.force_charset_on_output[i] =
2533 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2535 str->iso2022.register_left = 0;
2536 str->iso2022.register_right = 1;
2537 str->iso2022.current_charset = Qnil;
2538 str->iso2022.current_half = 0;
2540 str->iso2022.current_char_boundary = 0;
2542 str->iso2022.current_char_boundary = 1;
2547 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2554 str->flags = str->ch = 0;
2558 encoding_rewinder (Lstream *stream)
2560 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2561 reset_encoding_stream (str);
2562 Dynarr_reset (str->runoff);
2563 return Lstream_rewind (str->other_end);
2567 encoding_seekable_p (Lstream *stream)
2569 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2570 return Lstream_seekable_p (str->other_end);
2574 encoding_flusher (Lstream *stream)
2576 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2577 return Lstream_flush (str->other_end);
2581 encoding_closer (Lstream *stream)
2583 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2584 if (stream->flags & LSTREAM_FL_WRITE)
2586 str->flags |= CODING_STATE_END;
2587 encoding_writer (stream, 0, 0);
2589 Dynarr_free (str->runoff);
2590 return Lstream_close (str->other_end);
2594 encoding_stream_coding_system (Lstream *stream)
2596 Lisp_Object coding_system;
2597 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2599 XSETCODING_SYSTEM (coding_system, str->codesys);
2600 return coding_system;
2604 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2606 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2607 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2609 reset_encoding_stream (str);
2613 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2616 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2617 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2621 str->runoff = Dynarr_new (unsigned_char);
2622 str->other_end = stream;
2623 set_encoding_stream_coding_system (lstr, codesys);
2624 XSETLSTREAM (obj, lstr);
2629 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2631 return make_encoding_stream_1 (stream, codesys, "r");
2635 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2637 return make_encoding_stream_1 (stream, codesys, "w");
2640 /* Convert N bytes of internally-formatted data stored in SRC to an
2641 external format, according to the encoding stream ENCODING.
2642 Store the encoded data into DST. */
2645 mule_encode (Lstream *encoding, CONST unsigned char *src,
2646 unsigned_char_dynarr *dst, unsigned int n)
2648 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2650 switch (CODING_SYSTEM_TYPE (str->codesys))
2653 case CODESYS_INTERNAL:
2654 Dynarr_add_many (dst, src, n);
2657 case CODESYS_AUTODETECT:
2658 /* If we got this far and still haven't decided on the coding
2659 system, then do no conversion. */
2660 case CODESYS_NO_CONVERSION:
2661 encode_coding_no_conversion (encoding, src, dst, n);
2664 case CODESYS_SHIFT_JIS:
2665 encode_coding_sjis (encoding, src, dst, n);
2668 encode_coding_big5 (encoding, src, dst, n);
2671 encode_coding_ucs4 (encoding, src, dst, n);
2674 encode_coding_utf8 (encoding, src, dst, n);
2677 str->ccl.last_block = str->flags & CODING_STATE_END;
2678 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2680 case CODESYS_ISO2022:
2681 encode_coding_iso2022 (encoding, src, dst, n);
2689 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2690 Encode the text between START and END using CODING-SYSTEM.
2691 This will, for example, convert Japanese characters into stuff such as
2692 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2693 text. BUFFER defaults to the current buffer if unspecified.
2695 (start, end, coding_system, buffer))
2698 struct buffer *buf = decode_buffer (buffer, 0);
2699 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2700 Lstream *istr, *ostr;
2701 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2703 get_buffer_range_char (buf, start, end, &b, &e, 0);
2705 barf_if_buffer_read_only (buf, b, e);
2707 coding_system = Fget_coding_system (coding_system);
2708 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2709 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2710 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2711 Fget_coding_system (Qbinary));
2712 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2714 istr = XLSTREAM (instream);
2715 ostr = XLSTREAM (outstream);
2716 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2717 /* The chain of streams looks like this:
2719 [BUFFER] <----- send through
2720 ------> [ENCODE AS SPECIFIED]
2721 ------> [DECODE AS BINARY]
2726 char tempbuf[1024]; /* some random amount */
2727 Bufpos newpos, even_newer_pos;
2728 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2729 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2733 newpos = lisp_buffer_stream_startpos (istr);
2734 Lstream_write (ostr, tempbuf, size_in_bytes);
2735 even_newer_pos = lisp_buffer_stream_startpos (istr);
2736 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2742 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2743 Lstream_close (istr);
2744 Lstream_close (ostr);
2746 Lstream_delete (istr);
2747 Lstream_delete (ostr);
2748 Lstream_delete (XLSTREAM (de_outstream));
2749 Lstream_delete (XLSTREAM (lb_outstream));
2750 return make_int (retlen);
2756 /************************************************************************/
2757 /* Shift-JIS methods */
2758 /************************************************************************/
2760 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2761 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2762 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2763 encoded by "position-code + 0x80". A character of JISX0208
2764 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2765 position-codes are divided and shifted so that it fit in the range
2768 --- CODE RANGE of Shift-JIS ---
2769 (character set) (range)
2771 JISX0201-Kana 0xA0 .. 0xDF
2772 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2773 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2774 -------------------------------
2778 /* Is this the first byte of a Shift-JIS two-byte char? */
2780 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2781 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2783 /* Is this the second byte of a Shift-JIS two-byte char? */
2785 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2786 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2788 #define BYTE_SJIS_KATAKANA_P(c) \
2789 ((c) >= 0xA1 && (c) <= 0xDF)
2792 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2800 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2802 if (st->shift_jis.in_second_byte)
2804 st->shift_jis.in_second_byte = 0;
2808 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2809 st->shift_jis.in_second_byte = 1;
2811 return CODING_CATEGORY_SHIFT_JIS_MASK;
2814 /* Convert Shift-JIS data to internal format. */
2817 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2818 unsigned_char_dynarr *dst, unsigned int n)
2821 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2822 unsigned int flags = str->flags;
2823 unsigned int ch = str->ch;
2824 eol_type_t eol_type = str->eol_type;
2832 /* Previous character was first byte of Shift-JIS Kanji char. */
2833 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2835 unsigned char e1, e2;
2837 DECODE_SJIS (ch, c, e1, e2);
2839 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2843 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2844 Dynarr_add (dst, e1);
2845 Dynarr_add (dst, e2);
2850 DECODE_ADD_BINARY_CHAR (ch, dst);
2851 DECODE_ADD_BINARY_CHAR (c, dst);
2857 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2858 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2860 else if (BYTE_SJIS_KATAKANA_P (c))
2863 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2866 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2867 Dynarr_add (dst, c);
2871 DECODE_ADD_BINARY_CHAR (c, dst);
2873 label_continue_loop:;
2876 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2882 /* Convert internally-formatted data to Shift-JIS. */
2885 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2886 unsigned_char_dynarr *dst, unsigned int n)
2889 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2890 unsigned int flags = str->flags;
2891 unsigned int ch = str->ch;
2892 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2894 unsigned char char_boundary = str->iso2022.current_char_boundary;
2901 switch (char_boundary)
2909 else if ( c >= 0xf8 )
2914 else if ( c >= 0xf0 )
2919 else if ( c >= 0xe0 )
2924 else if ( c >= 0xc0 )
2934 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2935 Dynarr_add (dst, '\r');
2936 if (eol_type != EOL_CR)
2937 Dynarr_add (dst, c);
2940 Dynarr_add (dst, c);
2945 ch = ( ch << 6 ) | ( c & 0x3f );
2947 Lisp_Object charset;
2948 unsigned int c1, c2, s1, s2;
2950 BREAKUP_CHAR (ch, charset, c1, c2);
2951 if (EQ(charset, Vcharset_katakana_jisx0201))
2953 Dynarr_add (dst, c1 | 0x80);
2955 else if (EQ(charset, Vcharset_japanese_jisx0208))
2957 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2958 Dynarr_add (dst, s1);
2959 Dynarr_add (dst, s2);
2965 ch = ( ch << 6 ) | ( c & 0x3f );
2971 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2972 Dynarr_add (dst, '\r');
2973 if (eol_type != EOL_CR)
2974 Dynarr_add (dst, '\n');
2977 else if (BYTE_ASCII_P (c))
2979 Dynarr_add (dst, c);
2982 else if (BUFBYTE_LEADING_BYTE_P (c))
2983 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2984 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2985 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2988 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2990 Dynarr_add (dst, c);
2993 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2994 ch == LEADING_BYTE_JAPANESE_JISX0208)
2998 unsigned char j1, j2;
2999 ENCODE_SJIS (ch, c, j1, j2);
3000 Dynarr_add (dst, j1);
3001 Dynarr_add (dst, j2);
3011 str->iso2022.current_char_boundary = char_boundary;
3015 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3016 Decode a JISX0208 character of Shift-JIS coding-system.
3017 CODE is the character code in Shift-JIS as a cons of type bytes.
3018 Return the corresponding character.
3022 unsigned char c1, c2, s1, s2;
3025 CHECK_INT (XCAR (code));
3026 CHECK_INT (XCDR (code));
3027 s1 = XINT (XCAR (code));
3028 s2 = XINT (XCDR (code));
3029 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3030 BYTE_SJIS_TWO_BYTE_2_P (s2))
3032 DECODE_SJIS (s1, s2, c1, c2);
3033 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3034 c1 & 0x7F, c2 & 0x7F));
3040 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3041 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3042 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3046 Lisp_Object charset;
3049 CHECK_CHAR_COERCE_INT (ch);
3050 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3051 if (EQ (charset, Vcharset_japanese_jisx0208))
3053 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3054 return Fcons (make_int (s1), make_int (s2));
3061 /************************************************************************/
3063 /************************************************************************/
3065 /* BIG5 is a coding system encoding two character sets: ASCII and
3066 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3067 character set and is encoded in two-byte.
3069 --- CODE RANGE of BIG5 ---
3070 (character set) (range)
3072 Big5 (1st byte) 0xA1 .. 0xFE
3073 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3074 --------------------------
3076 Since the number of characters in Big5 is larger than maximum
3077 characters in Emacs' charset (96x96), it can't be handled as one
3078 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3079 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3080 contains frequently used characters and the latter contains less
3081 frequently used characters. */
3083 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3084 ((c) >= 0xA1 && (c) <= 0xFE)
3086 /* Is this the second byte of a Shift-JIS two-byte char? */
3088 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3089 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3091 /* Number of Big5 characters which have the same code in 1st byte. */
3093 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3095 /* Code conversion macros. These are macros because they are used in
3096 inner loops during code conversion.
3098 Note that temporary variables in macros introduce the classic
3099 dynamic-scoping problems with variable names. We use capital-
3100 lettered variables in the assumption that XEmacs does not use
3101 capital letters in variables except in a very formalized way
3104 /* Convert Big5 code (b1, b2) into its internal string representation
3107 /* There is a much simpler way to split the Big5 charset into two.
3108 For the moment I'm going to leave the algorithm as-is because it
3109 claims to separate out the most-used characters into a single
3110 charset, which perhaps will lead to optimizations in various
3113 The way the algorithm works is something like this:
3115 Big5 can be viewed as a 94x157 charset, where the row is
3116 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3117 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3118 the split between low and high column numbers is apparently
3119 meaningless; ascending rows produce less and less frequent chars.
3120 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3121 the first charset, and the upper half (0xC9 .. 0xFE) to the
3122 second. To do the conversion, we convert the character into
3123 a single number where 0 .. 156 is the first row, 157 .. 313
3124 is the second, etc. That way, the characters are ordered by
3125 decreasing frequency. Then we just chop the space in two
3126 and coerce the result into a 94x94 space.
3129 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3131 int B1 = b1, B2 = b2; \
3133 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3137 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3141 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3142 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3144 c1 = I / (0xFF - 0xA1) + 0xA1; \
3145 c2 = I % (0xFF - 0xA1) + 0xA1; \
3148 /* Convert the internal string representation of a Big5 character
3149 (lb, c1, c2) into Big5 code (b1, b2). */
3151 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3153 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3155 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3157 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3159 b1 = I / BIG5_SAME_ROW + 0xA1; \
3160 b2 = I % BIG5_SAME_ROW; \
3161 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3165 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3173 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3174 (c >= 0x80 && c <= 0xA0))
3176 if (st->big5.in_second_byte)
3178 st->big5.in_second_byte = 0;
3179 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3183 st->big5.in_second_byte = 1;
3185 return CODING_CATEGORY_BIG5_MASK;
3188 /* Convert Big5 data to internal format. */
3191 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3192 unsigned_char_dynarr *dst, unsigned int n)
3195 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3196 unsigned int flags = str->flags;
3197 unsigned int ch = str->ch;
3198 eol_type_t eol_type = str->eol_type;
3205 /* Previous character was first byte of Big5 char. */
3206 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3208 unsigned char b1, b2, b3;
3209 DECODE_BIG5 (ch, c, b1, b2, b3);
3210 Dynarr_add (dst, b1);
3211 Dynarr_add (dst, b2);
3212 Dynarr_add (dst, b3);
3216 DECODE_ADD_BINARY_CHAR (ch, dst);
3217 DECODE_ADD_BINARY_CHAR (c, dst);
3223 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3224 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3227 DECODE_ADD_BINARY_CHAR (c, dst);
3229 label_continue_loop:;
3232 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3238 /* Convert internally-formatted data to Big5. */
3241 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3242 unsigned_char_dynarr *dst, unsigned int n)
3245 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3246 unsigned int flags = str->flags;
3247 unsigned int ch = str->ch;
3248 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3255 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3256 Dynarr_add (dst, '\r');
3257 if (eol_type != EOL_CR)
3258 Dynarr_add (dst, '\n');
3260 else if (BYTE_ASCII_P (c))
3263 Dynarr_add (dst, c);
3265 else if (BUFBYTE_LEADING_BYTE_P (c))
3267 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3268 c == LEADING_BYTE_CHINESE_BIG5_2)
3270 /* A recognized leading byte. */
3272 continue; /* not done with this character. */
3274 /* otherwise just ignore this character. */
3276 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3277 ch == LEADING_BYTE_CHINESE_BIG5_2)
3279 /* Previous char was a recognized leading byte. */
3281 continue; /* not done with this character. */
3285 /* Encountering second byte of a Big5 character. */
3286 unsigned char b1, b2;
3288 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3289 Dynarr_add (dst, b1);
3290 Dynarr_add (dst, b2);
3301 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3302 Decode a Big5 character CODE of BIG5 coding-system.
3303 CODE is the character code in BIG5, a cons of two integers.
3304 Return the corresponding character.
3308 unsigned char c1, c2, b1, b2;
3311 CHECK_INT (XCAR (code));
3312 CHECK_INT (XCDR (code));
3313 b1 = XINT (XCAR (code));
3314 b2 = XINT (XCDR (code));
3315 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3316 BYTE_BIG5_TWO_BYTE_2_P (b2))
3319 Lisp_Object charset;
3320 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3321 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3322 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3328 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3329 Encode the Big5 character CH to BIG5 coding-system.
3330 Return the corresponding character code in Big5.
3334 Lisp_Object charset;
3337 CHECK_CHAR_COERCE_INT (ch);
3338 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3339 if (EQ (charset, Vcharset_chinese_big5_1) ||
3340 EQ (charset, Vcharset_chinese_big5_2))
3342 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3344 return Fcons (make_int (b1), make_int (b2));
3351 /************************************************************************/
3354 /* UCS-4 character codes are implemented as nonnegative integers. */
3356 /************************************************************************/
3358 Lisp_Object ucs_to_mule_table[65536];
3359 Lisp_Object mule_to_ucs_table;
3361 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3362 Map UCS-4 code CODE to Mule character CHARACTER.
3364 Return T on success, NIL on failure.
3370 CHECK_CHAR (character);
3374 if (c < sizeof (ucs_to_mule_table))
3376 ucs_to_mule_table[c] = character;
3384 ucs_to_char (unsigned long code)
3386 if (code < sizeof (ucs_to_mule_table))
3388 return ucs_to_mule_table[code];
3390 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3395 c = code % (94 * 94);
3397 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3398 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3399 CHARSET_LEFT_TO_RIGHT),
3400 c / 94 + 33, c % 94 + 33));
3406 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3407 Return Mule character corresponding to UCS code CODE (a positive integer).
3411 CHECK_NATNUM (code);
3412 return ucs_to_char (XINT (code));
3415 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3416 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3420 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3421 Fset_char_ucs is more restrictive on index arg, but should
3422 check code arg in a char_table method. */
3423 CHECK_CHAR (character);
3424 CHECK_NATNUM (code);
3425 return Fput_char_table (character, code, mule_to_ucs_table);
3428 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3429 Return the UCS code (a positive integer) corresponding to CHARACTER.
3433 return Fget_char_table (character, mule_to_ucs_table);
3437 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3439 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3440 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3441 is not found, instead.
3442 #### do something more appropriate (use blob?)
3443 Danger, Will Robinson! Data loss. Should we signal user? */
3445 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3447 Lisp_Object chr = ucs_to_char (ch);
3451 Bufbyte work[MAX_EMCHAR_LEN];
3456 simple_set_charptr_emchar (work, ch) :
3457 non_ascii_set_charptr_emchar (work, ch);
3458 Dynarr_add_many (dst, work, len);
3462 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3463 Dynarr_add (dst, 34 + 128);
3464 Dynarr_add (dst, 46 + 128);
3469 static unsigned long
3470 mule_char_to_ucs4 (Lisp_Object charset,
3471 unsigned char h, unsigned char l)
3474 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3481 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3482 (XCHARSET_CHARS (charset) == 94) )
3484 unsigned char final = XCHARSET_FINAL (charset);
3486 if ( ('@' <= final) && (final < 0x7f) )
3488 return 0xe00000 + (final - '@') * 94 * 94
3489 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3503 encode_ucs4 (Lisp_Object charset,
3504 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3506 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3507 Dynarr_add (dst, code >> 24);
3508 Dynarr_add (dst, (code >> 16) & 255);
3509 Dynarr_add (dst, (code >> 8) & 255);
3510 Dynarr_add (dst, code & 255);
3514 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3520 switch (st->ucs4.in_byte)
3529 st->ucs4.in_byte = 0;
3535 return CODING_CATEGORY_UCS4_MASK;
3539 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3540 unsigned_char_dynarr *dst, unsigned int n)
3542 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3543 unsigned int flags = str->flags;
3544 unsigned int ch = str->ch;
3545 unsigned char counter = str->counter;
3549 unsigned char c = *src++;
3557 decode_ucs4 ( ( ch << 8 ) | c, dst);
3562 ch = ( ch << 8 ) | c;
3566 if (counter & CODING_STATE_END)
3567 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3571 str->counter = counter;
3575 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3576 unsigned_char_dynarr *dst, unsigned int n)
3579 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3580 unsigned int flags = str->flags;
3581 unsigned int ch = str->ch;
3582 unsigned char char_boundary = str->iso2022.current_char_boundary;
3583 Lisp_Object charset = str->iso2022.current_charset;
3585 #ifdef ENABLE_COMPOSITE_CHARS
3586 /* flags for handling composite chars. We do a little switcharoo
3587 on the source while we're outputting the composite char. */
3588 unsigned int saved_n = 0;
3589 CONST unsigned char *saved_src = NULL;
3590 int in_composite = 0;
3597 unsigned char c = *src++;
3599 if (BYTE_ASCII_P (c))
3600 { /* Processing ASCII character */
3602 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3605 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3606 { /* Processing Leading Byte */
3608 charset = CHARSET_BY_LEADING_BYTE (c);
3609 if (LEADING_BYTE_PREFIX_P(c))
3614 { /* Processing Non-ASCII character */
3616 if (EQ (charset, Vcharset_control_1))
3618 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3622 switch (XCHARSET_REP_BYTES (charset))
3625 encode_ucs4 (charset, c, 0, dst);
3628 if (XCHARSET_PRIVATE_P (charset))
3630 encode_ucs4 (charset, c, 0, dst);
3635 #ifdef ENABLE_COMPOSITE_CHARS
3636 if (EQ (charset, Vcharset_composite))
3640 /* #### Bother! We don't know how to
3642 Dynarr_add (dst, 0);
3643 Dynarr_add (dst, 0);
3644 Dynarr_add (dst, 0);
3645 Dynarr_add (dst, '~');
3649 Emchar emch = MAKE_CHAR (Vcharset_composite,
3650 ch & 0x7F, c & 0x7F);
3651 Lisp_Object lstr = composite_char_string (emch);
3655 src = XSTRING_DATA (lstr);
3656 n = XSTRING_LENGTH (lstr);
3660 #endif /* ENABLE_COMPOSITE_CHARS */
3662 encode_ucs4(charset, ch, c, dst);
3675 encode_ucs4 (charset, ch, c, dst);
3691 #ifdef ENABLE_COMPOSITE_CHARS
3697 goto back_to_square_n; /* Wheeeeeeeee ..... */
3699 #endif /* ENABLE_COMPOSITE_CHARS */
3703 str->iso2022.current_char_boundary = char_boundary;
3704 str->iso2022.current_charset = charset;
3706 /* Verbum caro factum est! */
3711 /************************************************************************/
3713 /************************************************************************/
3716 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3721 unsigned char c = *src++;
3722 switch (st->utf8.in_byte)
3725 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3728 st->utf8.in_byte = 5;
3730 st->utf8.in_byte = 4;
3732 st->utf8.in_byte = 3;
3734 st->utf8.in_byte = 2;
3736 st->utf8.in_byte = 1;
3741 if ((c & 0xc0) != 0x80)
3747 return CODING_CATEGORY_UTF8_MASK;
3751 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3752 unsigned_char_dynarr *dst, unsigned int n)
3754 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3755 unsigned int flags = str->flags;
3756 unsigned int ch = str->ch;
3757 eol_type_t eol_type = str->eol_type;
3758 unsigned char counter = str->counter;
3762 unsigned char c = *src++;
3771 else if ( c >= 0xf8 )
3776 else if ( c >= 0xf0 )
3781 else if ( c >= 0xe0 )
3786 else if ( c >= 0xc0 )
3793 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3794 decode_ucs4 (c, dst);
3798 ch = ( ch << 6 ) | ( c & 0x3f );
3799 decode_ucs4 (ch, dst);
3804 ch = ( ch << 6 ) | ( c & 0x3f );
3807 label_continue_loop:;
3810 if (flags & CODING_STATE_END)
3811 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3815 str->counter = counter;
3820 encode_utf8 (Lisp_Object charset,
3821 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3823 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3826 Dynarr_add (dst, code);
3828 else if ( code <= 0x7ff )
3830 Dynarr_add (dst, (code >> 6) | 0xc0);
3831 Dynarr_add (dst, (code & 0x3f) | 0x80);
3833 else if ( code <= 0xffff )
3835 Dynarr_add (dst, (code >> 12) | 0xe0);
3836 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3837 Dynarr_add (dst, (code & 0x3f) | 0x80);
3839 else if ( code <= 0x1fffff )
3841 Dynarr_add (dst, (code >> 18) | 0xf0);
3842 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3843 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3844 Dynarr_add (dst, (code & 0x3f) | 0x80);
3846 else if ( code <= 0x3ffffff )
3848 Dynarr_add (dst, (code >> 24) | 0xf8);
3849 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3850 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3851 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3852 Dynarr_add (dst, (code & 0x3f) | 0x80);
3856 Dynarr_add (dst, (code >> 30) | 0xfc);
3857 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3858 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3859 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3860 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3861 Dynarr_add (dst, (code & 0x3f) | 0x80);
3867 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3868 unsigned_char_dynarr *dst, unsigned int n)
3870 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3871 unsigned int flags = str->flags;
3872 unsigned int ch = str->ch;
3873 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3874 unsigned char char_boundary = str->iso2022.current_char_boundary;
3879 unsigned char c = *src++;
3880 switch (char_boundary)
3885 Dynarr_add (dst, c);
3888 else if ( c >= 0xf8 )
3890 Dynarr_add (dst, c);
3893 else if ( c >= 0xf0 )
3895 Dynarr_add (dst, c);
3898 else if ( c >= 0xe0 )
3900 Dynarr_add (dst, c);
3903 else if ( c >= 0xc0 )
3905 Dynarr_add (dst, c);
3912 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3913 Dynarr_add (dst, '\r');
3914 if (eol_type != EOL_CR)
3915 Dynarr_add (dst, c);
3918 Dynarr_add (dst, c);
3923 Dynarr_add (dst, c);
3927 Dynarr_add (dst, c);
3931 #else /* not UTF2000 */
3932 Lisp_Object charset = str->iso2022.current_charset;
3934 #ifdef ENABLE_COMPOSITE_CHARS
3935 /* flags for handling composite chars. We do a little switcharoo
3936 on the source while we're outputting the composite char. */
3937 unsigned int saved_n = 0;
3938 CONST unsigned char *saved_src = NULL;
3939 int in_composite = 0;
3942 #endif /* ENABLE_COMPOSITE_CHARS */
3946 unsigned char c = *src++;
3948 if (BYTE_ASCII_P (c))
3949 { /* Processing ASCII character */
3953 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3954 Dynarr_add (dst, '\r');
3955 if (eol_type != EOL_CR)
3956 Dynarr_add (dst, c);
3959 encode_utf8 (Vcharset_ascii, c, 0, dst);
3962 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3963 { /* Processing Leading Byte */
3965 charset = CHARSET_BY_LEADING_BYTE (c);
3966 if (LEADING_BYTE_PREFIX_P(c))
3971 { /* Processing Non-ASCII character */
3973 if (EQ (charset, Vcharset_control_1))
3975 encode_utf8 (Vcharset_control_1, c, 0, dst);
3979 switch (XCHARSET_REP_BYTES (charset))
3982 encode_utf8 (charset, c, 0, dst);
3985 if (XCHARSET_PRIVATE_P (charset))
3987 encode_utf8 (charset, c, 0, dst);
3992 #ifdef ENABLE_COMPOSITE_CHARS
3993 if (EQ (charset, Vcharset_composite))
3997 /* #### Bother! We don't know how to
3999 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4003 Emchar emch = MAKE_CHAR (Vcharset_composite,
4004 ch & 0x7F, c & 0x7F);
4005 Lisp_Object lstr = composite_char_string (emch);
4009 src = XSTRING_DATA (lstr);
4010 n = XSTRING_LENGTH (lstr);
4014 #endif /* ENABLE_COMPOSITE_CHARS */
4016 encode_utf8 (charset, ch, c, dst);
4029 encode_utf8 (charset, ch, c, dst);
4045 #ifdef ENABLE_COMPOSITE_CHARS
4051 goto back_to_square_n; /* Wheeeeeeeee ..... */
4055 #endif /* not UTF2000 */
4058 str->iso2022.current_char_boundary = char_boundary;
4060 str->iso2022.current_charset = charset;
4063 /* Verbum caro factum est! */
4067 /************************************************************************/
4068 /* ISO2022 methods */
4069 /************************************************************************/
4071 /* The following note describes the coding system ISO2022 briefly.
4072 Since the intention of this note is to help understand the
4073 functions in this file, some parts are NOT ACCURATE or OVERLY
4074 SIMPLIFIED. For thorough understanding, please refer to the
4075 original document of ISO2022.
4077 ISO2022 provides many mechanisms to encode several character sets
4078 in 7-bit and 8-bit environments. For 7-bit environments, all text
4079 is encoded using bytes less than 128. This may make the encoded
4080 text a little bit longer, but the text passes more easily through
4081 several gateways, some of which strip off MSB (Most Signigant Bit).
4083 There are two kinds of character sets: control character set and
4084 graphic character set. The former contains control characters such
4085 as `newline' and `escape' to provide control functions (control
4086 functions are also provided by escape sequences). The latter
4087 contains graphic characters such as 'A' and '-'. Emacs recognizes
4088 two control character sets and many graphic character sets.
4090 Graphic character sets are classified into one of the following
4091 four classes, according to the number of bytes (DIMENSION) and
4092 number of characters in one dimension (CHARS) of the set:
4093 - DIMENSION1_CHARS94
4094 - DIMENSION1_CHARS96
4095 - DIMENSION2_CHARS94
4096 - DIMENSION2_CHARS96
4098 In addition, each character set is assigned an identification tag,
4099 unique for each set, called "final character" (denoted as <F>
4100 hereafter). The <F> of each character set is decided by ECMA(*)
4101 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4102 (0x30..0x3F are for private use only).
4104 Note (*): ECMA = European Computer Manufacturers Association
4106 Here are examples of graphic character set [NAME(<F>)]:
4107 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4108 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4109 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4110 o DIMENSION2_CHARS96 -- none for the moment
4112 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4113 C0 [0x00..0x1F] -- control character plane 0
4114 GL [0x20..0x7F] -- graphic character plane 0
4115 C1 [0x80..0x9F] -- control character plane 1
4116 GR [0xA0..0xFF] -- graphic character plane 1
4118 A control character set is directly designated and invoked to C0 or
4119 C1 by an escape sequence. The most common case is that:
4120 - ISO646's control character set is designated/invoked to C0, and
4121 - ISO6429's control character set is designated/invoked to C1,
4122 and usually these designations/invocations are omitted in encoded
4123 text. In a 7-bit environment, only C0 can be used, and a control
4124 character for C1 is encoded by an appropriate escape sequence to
4125 fit into the environment. All control characters for C1 are
4126 defined to have corresponding escape sequences.
4128 A graphic character set is at first designated to one of four
4129 graphic registers (G0 through G3), then these graphic registers are
4130 invoked to GL or GR. These designations and invocations can be
4131 done independently. The most common case is that G0 is invoked to
4132 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4133 these invocations and designations are omitted in encoded text.
4134 In a 7-bit environment, only GL can be used.
4136 When a graphic character set of CHARS94 is invoked to GL, codes
4137 0x20 and 0x7F of the GL area work as control characters SPACE and
4138 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4141 There are two ways of invocation: locking-shift and single-shift.
4142 With locking-shift, the invocation lasts until the next different
4143 invocation, whereas with single-shift, the invocation affects the
4144 following character only and doesn't affect the locking-shift
4145 state. Invocations are done by the following control characters or
4148 ----------------------------------------------------------------------
4149 abbrev function cntrl escape seq description
4150 ----------------------------------------------------------------------
4151 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4152 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4153 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4154 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4155 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4156 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4157 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4158 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4159 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4160 ----------------------------------------------------------------------
4161 (*) These are not used by any known coding system.
4163 Control characters for these functions are defined by macros
4164 ISO_CODE_XXX in `coding.h'.
4166 Designations are done by the following escape sequences:
4167 ----------------------------------------------------------------------
4168 escape sequence description
4169 ----------------------------------------------------------------------
4170 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4171 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4172 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4173 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4174 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4175 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4176 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4177 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4178 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4179 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4180 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4181 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4182 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4183 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4184 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4185 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4186 ----------------------------------------------------------------------
4188 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4189 of dimension 1, chars 94, and final character <F>, etc...
4191 Note (*): Although these designations are not allowed in ISO2022,
4192 Emacs accepts them on decoding, and produces them on encoding
4193 CHARS96 character sets in a coding system which is characterized as
4194 7-bit environment, non-locking-shift, and non-single-shift.
4196 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4197 '(' can be omitted. We refer to this as "short-form" hereafter.
4199 Now you may notice that there are a lot of ways for encoding the
4200 same multilingual text in ISO2022. Actually, there exist many
4201 coding systems such as Compound Text (used in X11's inter client
4202 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4203 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4204 localized platforms), and all of these are variants of ISO2022.
4206 In addition to the above, Emacs handles two more kinds of escape
4207 sequences: ISO6429's direction specification and Emacs' private
4208 sequence for specifying character composition.
4210 ISO6429's direction specification takes the following form:
4211 o CSI ']' -- end of the current direction
4212 o CSI '0' ']' -- end of the current direction
4213 o CSI '1' ']' -- start of left-to-right text
4214 o CSI '2' ']' -- start of right-to-left text
4215 The control character CSI (0x9B: control sequence introducer) is
4216 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4218 Character composition specification takes the following form:
4219 o ESC '0' -- start character composition
4220 o ESC '1' -- end character composition
4221 Since these are not standard escape sequences of any ISO standard,
4222 their use with these meanings is restricted to Emacs only. */
4225 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4229 for (i = 0; i < 4; i++)
4231 if (!NILP (coding_system))
4233 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4235 iso->charset[i] = Qt;
4236 iso->invalid_designated[i] = 0;
4238 iso->esc = ISO_ESC_NOTHING;
4239 iso->esc_bytes_index = 0;
4240 iso->register_left = 0;
4241 iso->register_right = 1;
4242 iso->switched_dir_and_no_valid_charset_yet = 0;
4243 iso->invalid_switch_dir = 0;
4244 iso->output_direction_sequence = 0;
4245 iso->output_literally = 0;
4246 #ifdef ENABLE_COMPOSITE_CHARS
4247 if (iso->composite_chars)
4248 Dynarr_reset (iso->composite_chars);
4253 fit_to_be_escape_quoted (unsigned char c)
4270 /* Parse one byte of an ISO2022 escape sequence.
4271 If the result is an invalid escape sequence, return 0 and
4272 do not change anything in STR. Otherwise, if the result is
4273 an incomplete escape sequence, update ISO2022.ESC and
4274 ISO2022.ESC_BYTES and return -1. Otherwise, update
4275 all the state variables (but not ISO2022.ESC_BYTES) and
4278 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4279 or invocation of an invalid character set and treat that as
4280 an unrecognized escape sequence. */
4283 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4284 unsigned char c, unsigned int *flags,
4285 int check_invalid_charsets)
4287 /* (1) If we're at the end of a designation sequence, CS is the
4288 charset being designated and REG is the register to designate
4291 (2) If we're at the end of a locking-shift sequence, REG is
4292 the register to invoke and HALF (0 == left, 1 == right) is
4293 the half to invoke it into.
4295 (3) If we're at the end of a single-shift sequence, REG is
4296 the register to invoke. */
4297 Lisp_Object cs = Qnil;
4300 /* NOTE: This code does goto's all over the fucking place.
4301 The reason for this is that we're basically implementing
4302 a state machine here, and hierarchical languages like C
4303 don't really provide a clean way of doing this. */
4305 if (! (*flags & CODING_STATE_ESCAPE))
4306 /* At beginning of escape sequence; we need to reset our
4307 escape-state variables. */
4308 iso->esc = ISO_ESC_NOTHING;
4310 iso->output_literally = 0;
4311 iso->output_direction_sequence = 0;
4315 case ISO_ESC_NOTHING:
4316 iso->esc_bytes_index = 0;
4319 case ISO_CODE_ESC: /* Start escape sequence */
4320 *flags |= CODING_STATE_ESCAPE;
4324 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4325 *flags |= CODING_STATE_ESCAPE;
4326 iso->esc = ISO_ESC_5_11;
4329 case ISO_CODE_SO: /* locking shift 1 */
4332 case ISO_CODE_SI: /* locking shift 0 */
4336 case ISO_CODE_SS2: /* single shift */
4339 case ISO_CODE_SS3: /* single shift */
4343 default: /* Other control characters */
4350 /**** single shift ****/
4352 case 'N': /* single shift 2 */
4355 case 'O': /* single shift 3 */
4359 /**** locking shift ****/
4361 case '~': /* locking shift 1 right */
4364 case 'n': /* locking shift 2 */
4367 case '}': /* locking shift 2 right */
4370 case 'o': /* locking shift 3 */
4373 case '|': /* locking shift 3 right */
4377 #ifdef ENABLE_COMPOSITE_CHARS
4378 /**** composite ****/
4381 iso->esc = ISO_ESC_START_COMPOSITE;
4382 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4383 CODING_STATE_COMPOSITE;
4387 iso->esc = ISO_ESC_END_COMPOSITE;
4388 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4389 ~CODING_STATE_COMPOSITE;
4391 #endif /* ENABLE_COMPOSITE_CHARS */
4393 /**** directionality ****/
4396 iso->esc = ISO_ESC_5_11;
4399 /**** designation ****/
4401 case '$': /* multibyte charset prefix */
4402 iso->esc = ISO_ESC_2_4;
4406 if (0x28 <= c && c <= 0x2F)
4408 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4412 /* This function is called with CODESYS equal to nil when
4413 doing coding-system detection. */
4415 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4416 && fit_to_be_escape_quoted (c))
4418 iso->esc = ISO_ESC_LITERAL;
4419 *flags &= CODING_STATE_ISO2022_LOCK;
4429 /**** directionality ****/
4431 case ISO_ESC_5_11: /* ISO6429 direction control */
4434 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4435 goto directionality;
4437 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4438 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4439 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4443 case ISO_ESC_5_11_0:
4446 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4447 goto directionality;
4451 case ISO_ESC_5_11_1:
4454 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4455 goto directionality;
4459 case ISO_ESC_5_11_2:
4462 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4463 goto directionality;
4468 iso->esc = ISO_ESC_DIRECTIONALITY;
4469 /* Various junk here to attempt to preserve the direction sequences
4470 literally in the text if they would otherwise be swallowed due
4471 to invalid designations that don't show up as actual charset
4472 changes in the text. */
4473 if (iso->invalid_switch_dir)
4475 /* We already inserted a direction switch literally into the
4476 text. We assume (#### this may not be right) that the
4477 next direction switch is the one going the other way,
4478 and we need to output that literally as well. */
4479 iso->output_literally = 1;
4480 iso->invalid_switch_dir = 0;
4486 /* If we are in the thrall of an invalid designation,
4487 then stick the directionality sequence literally into the
4488 output stream so it ends up in the original text again. */
4489 for (jj = 0; jj < 4; jj++)
4490 if (iso->invalid_designated[jj])
4494 iso->output_literally = 1;
4495 iso->invalid_switch_dir = 1;
4498 /* Indicate that we haven't yet seen a valid designation,
4499 so that if a switch-dir is directly followed by an
4500 invalid designation, both get inserted literally. */
4501 iso->switched_dir_and_no_valid_charset_yet = 1;
4506 /**** designation ****/
4509 if (0x28 <= c && c <= 0x2F)
4511 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4514 if (0x40 <= c && c <= 0x42)
4516 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4517 *flags & CODING_STATE_R2L ?
4518 CHARSET_RIGHT_TO_LEFT :
4519 CHARSET_LEFT_TO_RIGHT);
4529 if (c < '0' || c > '~')
4530 return 0; /* bad final byte */
4532 if (iso->esc >= ISO_ESC_2_8 &&
4533 iso->esc <= ISO_ESC_2_15)
4535 type = ((iso->esc >= ISO_ESC_2_12) ?
4536 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4537 reg = (iso->esc - ISO_ESC_2_8) & 3;
4539 else if (iso->esc >= ISO_ESC_2_4_8 &&
4540 iso->esc <= ISO_ESC_2_4_15)
4542 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4543 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4544 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4548 /* Can this ever be reached? -slb */
4552 cs = CHARSET_BY_ATTRIBUTES (type, c,
4553 *flags & CODING_STATE_R2L ?
4554 CHARSET_RIGHT_TO_LEFT :
4555 CHARSET_LEFT_TO_RIGHT);
4561 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4565 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4566 /* can't invoke something that ain't there. */
4568 iso->esc = ISO_ESC_SINGLE_SHIFT;
4569 *flags &= CODING_STATE_ISO2022_LOCK;
4571 *flags |= CODING_STATE_SS2;
4573 *flags |= CODING_STATE_SS3;
4577 if (check_invalid_charsets &&
4578 !CHARSETP (iso->charset[reg]))
4579 /* can't invoke something that ain't there. */
4582 iso->register_right = reg;
4584 iso->register_left = reg;
4585 *flags &= CODING_STATE_ISO2022_LOCK;
4586 iso->esc = ISO_ESC_LOCKING_SHIFT;
4590 if (NILP (cs) && check_invalid_charsets)
4592 iso->invalid_designated[reg] = 1;
4593 iso->charset[reg] = Vcharset_ascii;
4594 iso->esc = ISO_ESC_DESIGNATE;
4595 *flags &= CODING_STATE_ISO2022_LOCK;
4596 iso->output_literally = 1;
4597 if (iso->switched_dir_and_no_valid_charset_yet)
4599 /* We encountered a switch-direction followed by an
4600 invalid designation. Ensure that the switch-direction
4601 gets outputted; otherwise it will probably get eaten
4602 when the text is written out again. */
4603 iso->switched_dir_and_no_valid_charset_yet = 0;
4604 iso->output_direction_sequence = 1;
4605 /* And make sure that the switch-dir going the other
4606 way gets outputted, as well. */
4607 iso->invalid_switch_dir = 1;
4611 /* This function is called with CODESYS equal to nil when
4612 doing coding-system detection. */
4613 if (!NILP (codesys))
4615 charset_conversion_spec_dynarr *dyn =
4616 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4622 for (i = 0; i < Dynarr_length (dyn); i++)
4624 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4625 if (EQ (cs, spec->from_charset))
4626 cs = spec->to_charset;
4631 iso->charset[reg] = cs;
4632 iso->esc = ISO_ESC_DESIGNATE;
4633 *flags &= CODING_STATE_ISO2022_LOCK;
4634 if (iso->invalid_designated[reg])
4636 iso->invalid_designated[reg] = 0;
4637 iso->output_literally = 1;
4639 if (iso->switched_dir_and_no_valid_charset_yet)
4640 iso->switched_dir_and_no_valid_charset_yet = 0;
4645 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4650 /* #### There are serious deficiencies in the recognition mechanism
4651 here. This needs to be much smarter if it's going to cut it.
4652 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4653 it should be detected as Latin-1.
4654 All the ISO2022 stuff in this file should be synced up with the
4655 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4656 Perhaps we should wait till R2L works in FSF Emacs? */
4658 if (!st->iso2022.initted)
4660 reset_iso2022 (Qnil, &st->iso2022.iso);
4661 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4662 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4663 CODING_CATEGORY_ISO_8_1_MASK |
4664 CODING_CATEGORY_ISO_8_2_MASK |
4665 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4666 st->iso2022.flags = 0;
4667 st->iso2022.high_byte_count = 0;
4668 st->iso2022.saw_single_shift = 0;
4669 st->iso2022.initted = 1;
4672 mask = st->iso2022.mask;
4679 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4680 st->iso2022.high_byte_count++;
4684 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4686 if (st->iso2022.high_byte_count & 1)
4687 /* odd number of high bytes; assume not iso-8-2 */
4688 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4690 st->iso2022.high_byte_count = 0;
4691 st->iso2022.saw_single_shift = 0;
4693 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4695 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4696 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4697 { /* control chars */
4700 /* Allow and ignore control characters that you might
4701 reasonably see in a text file */
4706 case 8: /* backspace */
4707 case 11: /* vertical tab */
4708 case 12: /* form feed */
4709 case 26: /* MS-DOS C-z junk */
4710 case 31: /* '^_' -- for info */
4711 goto label_continue_loop;
4718 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4721 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4722 &st->iso2022.flags, 0))
4724 switch (st->iso2022.iso.esc)
4726 case ISO_ESC_DESIGNATE:
4727 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4728 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4730 case ISO_ESC_LOCKING_SHIFT:
4731 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4732 goto ran_out_of_chars;
4733 case ISO_ESC_SINGLE_SHIFT:
4734 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4735 st->iso2022.saw_single_shift = 1;
4744 goto ran_out_of_chars;
4747 label_continue_loop:;
4756 postprocess_iso2022_mask (int mask)
4758 /* #### kind of cheesy */
4759 /* If seven-bit ISO is allowed, then assume that the encoding is
4760 entirely seven-bit and turn off the eight-bit ones. */
4761 if (mask & CODING_CATEGORY_ISO_7_MASK)
4762 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4763 CODING_CATEGORY_ISO_8_1_MASK |
4764 CODING_CATEGORY_ISO_8_2_MASK);
4768 /* If FLAGS is a null pointer or specifies right-to-left motion,
4769 output a switch-dir-to-left-to-right sequence to DST.
4770 Also update FLAGS if it is not a null pointer.
4771 If INTERNAL_P is set, we are outputting in internal format and
4772 need to handle the CSI differently. */
4775 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4776 unsigned_char_dynarr *dst,
4777 unsigned int *flags,
4780 if (!flags || (*flags & CODING_STATE_R2L))
4782 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4784 Dynarr_add (dst, ISO_CODE_ESC);
4785 Dynarr_add (dst, '[');
4787 else if (internal_p)
4788 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4790 Dynarr_add (dst, ISO_CODE_CSI);
4791 Dynarr_add (dst, '0');
4792 Dynarr_add (dst, ']');
4794 *flags &= ~CODING_STATE_R2L;
4798 /* If FLAGS is a null pointer or specifies a direction different from
4799 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4800 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4801 sequence to DST. Also update FLAGS if it is not a null pointer.
4802 If INTERNAL_P is set, we are outputting in internal format and
4803 need to handle the CSI differently. */
4806 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4807 unsigned_char_dynarr *dst, unsigned int *flags,
4810 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4811 direction == CHARSET_LEFT_TO_RIGHT)
4812 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4813 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4814 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4815 direction == CHARSET_RIGHT_TO_LEFT)
4817 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4819 Dynarr_add (dst, ISO_CODE_ESC);
4820 Dynarr_add (dst, '[');
4822 else if (internal_p)
4823 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4825 Dynarr_add (dst, ISO_CODE_CSI);
4826 Dynarr_add (dst, '2');
4827 Dynarr_add (dst, ']');
4829 *flags |= CODING_STATE_R2L;
4833 /* Convert ISO2022-format data to internal format. */
4836 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4837 unsigned_char_dynarr *dst, unsigned int n)
4839 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4840 unsigned int flags = str->flags;
4841 unsigned int ch = str->ch;
4842 eol_type_t eol_type = str->eol_type;
4843 #ifdef ENABLE_COMPOSITE_CHARS
4844 unsigned_char_dynarr *real_dst = dst;
4846 Lisp_Object coding_system;
4848 XSETCODING_SYSTEM (coding_system, str->codesys);
4850 #ifdef ENABLE_COMPOSITE_CHARS
4851 if (flags & CODING_STATE_COMPOSITE)
4852 dst = str->iso2022.composite_chars;
4853 #endif /* ENABLE_COMPOSITE_CHARS */
4857 unsigned char c = *src++;
4858 if (flags & CODING_STATE_ESCAPE)
4859 { /* Within ESC sequence */
4860 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4865 switch (str->iso2022.esc)
4867 #ifdef ENABLE_COMPOSITE_CHARS
4868 case ISO_ESC_START_COMPOSITE:
4869 if (str->iso2022.composite_chars)
4870 Dynarr_reset (str->iso2022.composite_chars);
4872 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4873 dst = str->iso2022.composite_chars;
4875 case ISO_ESC_END_COMPOSITE:
4877 Bufbyte comstr[MAX_EMCHAR_LEN];
4879 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4880 Dynarr_length (dst));
4882 len = set_charptr_emchar (comstr, emch);
4883 Dynarr_add_many (dst, comstr, len);
4886 #endif /* ENABLE_COMPOSITE_CHARS */
4888 case ISO_ESC_LITERAL:
4889 DECODE_ADD_BINARY_CHAR (c, dst);
4893 /* Everything else handled already */
4898 /* Attempted error recovery. */
4899 if (str->iso2022.output_direction_sequence)
4900 ensure_correct_direction (flags & CODING_STATE_R2L ?
4901 CHARSET_RIGHT_TO_LEFT :
4902 CHARSET_LEFT_TO_RIGHT,
4903 str->codesys, dst, 0, 1);
4904 /* More error recovery. */
4905 if (!retval || str->iso2022.output_literally)
4907 /* Output the (possibly invalid) sequence */
4909 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4910 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4911 flags &= CODING_STATE_ISO2022_LOCK;
4913 n++, src--;/* Repeat the loop with the same character. */
4916 /* No sense in reprocessing the final byte of the
4917 escape sequence; it could mess things up anyway.
4919 DECODE_ADD_BINARY_CHAR (c, dst);
4924 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4925 { /* Control characters */
4927 /***** Error-handling *****/
4929 /* If we were in the middle of a character, dump out the
4930 partial character. */
4931 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4933 /* If we just saw a single-shift character, dump it out.
4934 This may dump out the wrong sort of single-shift character,
4935 but least it will give an indication that something went
4937 if (flags & CODING_STATE_SS2)
4939 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4940 flags &= ~CODING_STATE_SS2;
4942 if (flags & CODING_STATE_SS3)
4944 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4945 flags &= ~CODING_STATE_SS3;
4948 /***** Now handle the control characters. *****/
4951 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4953 flags &= CODING_STATE_ISO2022_LOCK;
4955 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4956 DECODE_ADD_BINARY_CHAR (c, dst);
4959 { /* Graphic characters */
4960 Lisp_Object charset;
4966 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4968 /* Now determine the charset. */
4969 reg = ((flags & CODING_STATE_SS2) ? 2
4970 : (flags & CODING_STATE_SS3) ? 3
4971 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4972 : str->iso2022.register_left);
4973 charset = str->iso2022.charset[reg];
4975 /* Error checking: */
4976 if (! CHARSETP (charset)
4977 || str->iso2022.invalid_designated[reg]
4978 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4979 && XCHARSET_CHARS (charset) == 94))
4980 /* Mrmph. We are trying to invoke a register that has no
4981 or an invalid charset in it, or trying to add a character
4982 outside the range of the charset. Insert that char literally
4983 to preserve it for the output. */
4985 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4986 DECODE_ADD_BINARY_CHAR (c, dst);
4991 /* Things are probably hunky-dorey. */
4993 /* Fetch reverse charset, maybe. */
4994 if (((flags & CODING_STATE_R2L) &&
4995 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4997 (!(flags & CODING_STATE_R2L) &&
4998 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5000 Lisp_Object new_charset =
5001 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5002 if (!NILP (new_charset))
5003 charset = new_charset;
5007 lb = XCHARSET_LEADING_BYTE (charset);
5009 switch (XCHARSET_REP_BYTES (charset))
5012 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5013 Dynarr_add (dst, c & 0x7F);
5016 case 2: /* one-byte official */
5017 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5019 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0), dst);
5021 Dynarr_add (dst, lb);
5022 Dynarr_add (dst, c | 0x80);
5026 case 3: /* one-byte private or two-byte official */
5028 if (XCHARSET_DIMENSION (charset) == 1)
5030 if (XCHARSET_PRIVATE_P (charset))
5033 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5035 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0),
5038 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5039 Dynarr_add (dst, lb);
5040 Dynarr_add (dst, c | 0x80);
5048 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
5052 Dynarr_add (dst, lb);
5053 Dynarr_add (dst, ch | 0x80);
5054 Dynarr_add (dst, c | 0x80);
5063 default: /* two-byte private */
5067 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
5071 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5072 Dynarr_add (dst, lb);
5073 Dynarr_add (dst, ch | 0x80);
5074 Dynarr_add (dst, c | 0x80);
5084 flags &= CODING_STATE_ISO2022_LOCK;
5087 label_continue_loop:;
5090 if (flags & CODING_STATE_END)
5091 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5098 /***** ISO2022 encoder *****/
5100 /* Designate CHARSET into register REG. */
5103 iso2022_designate (Lisp_Object charset, unsigned char reg,
5104 struct encoding_stream *str, unsigned_char_dynarr *dst)
5106 static CONST char inter94[] = "()*+";
5107 static CONST char inter96[] = ",-./";
5109 unsigned char final;
5110 Lisp_Object old_charset = str->iso2022.charset[reg];
5112 str->iso2022.charset[reg] = charset;
5113 if (!CHARSETP (charset))
5114 /* charset might be an initial nil or t. */
5116 type = XCHARSET_TYPE (charset);
5117 final = XCHARSET_FINAL (charset);
5118 if (!str->iso2022.force_charset_on_output[reg] &&
5119 CHARSETP (old_charset) &&
5120 XCHARSET_TYPE (old_charset) == type &&
5121 XCHARSET_FINAL (old_charset) == final)
5124 str->iso2022.force_charset_on_output[reg] = 0;
5127 charset_conversion_spec_dynarr *dyn =
5128 str->codesys->iso2022.output_conv;
5134 for (i = 0; i < Dynarr_length (dyn); i++)
5136 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5137 if (EQ (charset, spec->from_charset))
5138 charset = spec->to_charset;
5143 Dynarr_add (dst, ISO_CODE_ESC);
5146 case CHARSET_TYPE_94:
5147 Dynarr_add (dst, inter94[reg]);
5149 case CHARSET_TYPE_96:
5150 Dynarr_add (dst, inter96[reg]);
5152 case CHARSET_TYPE_94X94:
5153 Dynarr_add (dst, '$');
5155 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5158 Dynarr_add (dst, inter94[reg]);
5160 case CHARSET_TYPE_96X96:
5161 Dynarr_add (dst, '$');
5162 Dynarr_add (dst, inter96[reg]);
5165 Dynarr_add (dst, final);
5169 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5171 if (str->iso2022.register_left != 0)
5173 Dynarr_add (dst, ISO_CODE_SI);
5174 str->iso2022.register_left = 0;
5179 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5181 if (str->iso2022.register_left != 1)
5183 Dynarr_add (dst, ISO_CODE_SO);
5184 str->iso2022.register_left = 1;
5188 /* Convert internally-formatted data to ISO2022 format. */
5191 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5192 unsigned_char_dynarr *dst, unsigned int n)
5194 unsigned char charmask, c;
5195 unsigned char char_boundary;
5196 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5197 unsigned int flags = str->flags;
5198 Emchar ch = str->ch;
5199 Lisp_Coding_System *codesys = str->codesys;
5200 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5202 Lisp_Object charset;
5205 unsigned int byte1, byte2;
5208 #ifdef ENABLE_COMPOSITE_CHARS
5209 /* flags for handling composite chars. We do a little switcharoo
5210 on the source while we're outputting the composite char. */
5211 unsigned int saved_n = 0;
5212 CONST unsigned char *saved_src = NULL;
5213 int in_composite = 0;
5214 #endif /* ENABLE_COMPOSITE_CHARS */
5216 char_boundary = str->iso2022.current_char_boundary;
5217 charset = str->iso2022.current_charset;
5218 half = str->iso2022.current_half;
5220 #ifdef ENABLE_COMPOSITE_CHARS
5228 switch (char_boundary)
5236 else if ( c >= 0xf8 )
5241 else if ( c >= 0xf0 )
5246 else if ( c >= 0xe0 )
5251 else if ( c >= 0xc0 )
5260 restore_left_to_right_direction (codesys, dst, &flags, 0);
5262 /* Make sure G0 contains ASCII */
5263 if ((c > ' ' && c < ISO_CODE_DEL) ||
5264 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5266 ensure_normal_shift (str, dst);
5267 iso2022_designate (Vcharset_ascii, 0, str, dst);
5270 /* If necessary, restore everything to the default state
5273 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5275 restore_left_to_right_direction (codesys, dst, &flags, 0);
5277 ensure_normal_shift (str, dst);
5279 for (i = 0; i < 4; i++)
5281 Lisp_Object initial_charset =
5282 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5283 iso2022_designate (initial_charset, i, str, dst);
5288 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5289 Dynarr_add (dst, '\r');
5290 if (eol_type != EOL_CR)
5291 Dynarr_add (dst, c);
5295 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5296 && fit_to_be_escape_quoted (c))
5297 Dynarr_add (dst, ISO_CODE_ESC);
5298 Dynarr_add (dst, c);
5304 ch = ( ch << 6 ) | ( c & 0x3f );
5307 if ( (0x80 <= ch) && (ch <= 0x9f) )
5309 charmask = (half == 0 ? 0x00 : 0x80);
5311 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5312 && fit_to_be_escape_quoted (ch))
5313 Dynarr_add (dst, ISO_CODE_ESC);
5314 /* you asked for it ... */
5315 Dynarr_add (dst, ch);
5321 BREAKUP_CHAR (ch, charset, byte1, byte2);
5322 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5323 codesys, dst, &flags, 0);
5325 /* Now determine which register to use. */
5327 for (i = 0; i < 4; i++)
5329 if (EQ (charset, str->iso2022.charset[i]) ||
5331 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5340 if (XCHARSET_GRAPHIC (charset) != 0)
5342 if (!NILP (str->iso2022.charset[1]) &&
5343 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5344 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5346 else if (!NILP (str->iso2022.charset[2]))
5348 else if (!NILP (str->iso2022.charset[3]))
5357 iso2022_designate (charset, reg, str, dst);
5359 /* Now invoke that register. */
5363 ensure_normal_shift (str, dst);
5368 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5370 ensure_shift_out (str, dst);
5378 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5380 Dynarr_add (dst, ISO_CODE_ESC);
5381 Dynarr_add (dst, 'N');
5386 Dynarr_add (dst, ISO_CODE_SS2);
5392 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5394 Dynarr_add (dst, ISO_CODE_ESC);
5395 Dynarr_add (dst, 'O');
5400 Dynarr_add (dst, ISO_CODE_SS3);
5409 charmask = (half == 0 ? 0x00 : 0x80);
5411 switch (XCHARSET_DIMENSION (charset))
5414 Dynarr_add (dst, byte1 | charmask);
5417 Dynarr_add (dst, byte1 | charmask);
5418 Dynarr_add (dst, byte2 | charmask);
5427 ch = ( ch << 6 ) | ( c & 0x3f );
5431 #else /* not UTF2000 */
5437 if (BYTE_ASCII_P (c))
5438 { /* Processing ASCII character */
5441 restore_left_to_right_direction (codesys, dst, &flags, 0);
5443 /* Make sure G0 contains ASCII */
5444 if ((c > ' ' && c < ISO_CODE_DEL) ||
5445 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5447 ensure_normal_shift (str, dst);
5448 iso2022_designate (Vcharset_ascii, 0, str, dst);
5451 /* If necessary, restore everything to the default state
5454 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5456 restore_left_to_right_direction (codesys, dst, &flags, 0);
5458 ensure_normal_shift (str, dst);
5460 for (i = 0; i < 4; i++)
5462 Lisp_Object initial_charset =
5463 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5464 iso2022_designate (initial_charset, i, str, dst);
5469 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5470 Dynarr_add (dst, '\r');
5471 if (eol_type != EOL_CR)
5472 Dynarr_add (dst, c);
5476 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5477 && fit_to_be_escape_quoted (c))
5478 Dynarr_add (dst, ISO_CODE_ESC);
5479 Dynarr_add (dst, c);
5484 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5485 { /* Processing Leading Byte */
5487 charset = CHARSET_BY_LEADING_BYTE (c);
5488 if (LEADING_BYTE_PREFIX_P(c))
5490 else if (!EQ (charset, Vcharset_control_1)
5491 #ifdef ENABLE_COMPOSITE_CHARS
5492 && !EQ (charset, Vcharset_composite)
5498 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5499 codesys, dst, &flags, 0);
5501 /* Now determine which register to use. */
5503 for (i = 0; i < 4; i++)
5505 if (EQ (charset, str->iso2022.charset[i]) ||
5507 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5516 if (XCHARSET_GRAPHIC (charset) != 0)
5518 if (!NILP (str->iso2022.charset[1]) &&
5519 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5520 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5522 else if (!NILP (str->iso2022.charset[2]))
5524 else if (!NILP (str->iso2022.charset[3]))
5533 iso2022_designate (charset, reg, str, dst);
5535 /* Now invoke that register. */
5539 ensure_normal_shift (str, dst);
5544 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5546 ensure_shift_out (str, dst);
5554 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5556 Dynarr_add (dst, ISO_CODE_ESC);
5557 Dynarr_add (dst, 'N');
5562 Dynarr_add (dst, ISO_CODE_SS2);
5568 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5570 Dynarr_add (dst, ISO_CODE_ESC);
5571 Dynarr_add (dst, 'O');
5576 Dynarr_add (dst, ISO_CODE_SS3);
5588 { /* Processing Non-ASCII character */
5589 charmask = (half == 0 ? 0x7F : 0xFF);
5591 if (EQ (charset, Vcharset_control_1))
5593 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5594 && fit_to_be_escape_quoted (c))
5595 Dynarr_add (dst, ISO_CODE_ESC);
5596 /* you asked for it ... */
5597 Dynarr_add (dst, c - 0x20);
5601 switch (XCHARSET_REP_BYTES (charset))
5604 Dynarr_add (dst, c & charmask);
5607 if (XCHARSET_PRIVATE_P (charset))
5609 Dynarr_add (dst, c & charmask);
5614 #ifdef ENABLE_COMPOSITE_CHARS
5615 if (EQ (charset, Vcharset_composite))
5619 /* #### Bother! We don't know how to
5621 Dynarr_add (dst, '~');
5625 Emchar emch = MAKE_CHAR (Vcharset_composite,
5626 ch & 0x7F, c & 0x7F);
5627 Lisp_Object lstr = composite_char_string (emch);
5631 src = XSTRING_DATA (lstr);
5632 n = XSTRING_LENGTH (lstr);
5633 Dynarr_add (dst, ISO_CODE_ESC);
5634 Dynarr_add (dst, '0'); /* start composing */
5638 #endif /* ENABLE_COMPOSITE_CHARS */
5640 Dynarr_add (dst, ch & charmask);
5641 Dynarr_add (dst, c & charmask);
5654 Dynarr_add (dst, ch & charmask);
5655 Dynarr_add (dst, c & charmask);
5670 #endif /* not UTF2000 */
5672 #ifdef ENABLE_COMPOSITE_CHARS
5678 Dynarr_add (dst, ISO_CODE_ESC);
5679 Dynarr_add (dst, '1'); /* end composing */
5680 goto back_to_square_n; /* Wheeeeeeeee ..... */
5682 #endif /* ENABLE_COMPOSITE_CHARS */
5685 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5687 if (char_boundary && flags & CODING_STATE_END)
5690 restore_left_to_right_direction (codesys, dst, &flags, 0);
5691 ensure_normal_shift (str, dst);
5692 for (i = 0; i < 4; i++)
5694 Lisp_Object initial_charset =
5695 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5696 iso2022_designate (initial_charset, i, str, dst);
5702 str->iso2022.current_char_boundary = char_boundary;
5703 str->iso2022.current_charset = charset;
5704 str->iso2022.current_half = half;
5706 /* Verbum caro factum est! */
5710 /************************************************************************/
5711 /* No-conversion methods */
5712 /************************************************************************/
5714 /* This is used when reading in "binary" files -- i.e. files that may
5715 contain all 256 possible byte values and that are not to be
5716 interpreted as being in any particular decoding. */
5718 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5719 unsigned_char_dynarr *dst, unsigned int n)
5722 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5723 unsigned int flags = str->flags;
5724 unsigned int ch = str->ch;
5725 eol_type_t eol_type = str->eol_type;
5731 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5732 DECODE_ADD_BINARY_CHAR (c, dst);
5733 label_continue_loop:;
5736 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5743 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5744 unsigned_char_dynarr *dst, unsigned int n)
5747 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5748 unsigned int flags = str->flags;
5749 unsigned int ch = str->ch;
5750 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5752 unsigned char char_boundary = str->iso2022.current_char_boundary;
5759 switch (char_boundary)
5767 else if ( c >= 0xf8 )
5772 else if ( c >= 0xf0 )
5777 else if ( c >= 0xe0 )
5782 else if ( c >= 0xc0 )
5793 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5794 Dynarr_add (dst, '\r');
5795 if (eol_type != EOL_CR)
5796 Dynarr_add (dst, c);
5799 Dynarr_add (dst, c);
5804 ch = ( ch << 6 ) | ( c & 0x3f );
5805 switch ( str->codesys->fixed.size )
5808 Dynarr_add (dst, ch & 0xff);
5811 Dynarr_add (dst, (ch >> 8) & 0xff);
5812 Dynarr_add (dst, ch & 0xff);
5815 Dynarr_add (dst, (ch >> 16) & 0xff);
5816 Dynarr_add (dst, (ch >> 8) & 0xff);
5817 Dynarr_add (dst, ch & 0xff);
5820 Dynarr_add (dst, (ch >> 24) & 0xff);
5821 Dynarr_add (dst, (ch >> 16) & 0xff);
5822 Dynarr_add (dst, (ch >> 8) & 0xff);
5823 Dynarr_add (dst, ch & 0xff);
5826 fprintf(stderr, "It seems %d bytes stream.\n",
5827 str->codesys->fixed.size);
5833 ch = ( ch << 6 ) | ( c & 0x3f );
5836 #else /* not UTF2000 */
5839 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5840 Dynarr_add (dst, '\r');
5841 if (eol_type != EOL_CR)
5842 Dynarr_add (dst, '\n');
5845 else if (BYTE_ASCII_P (c))
5848 Dynarr_add (dst, c);
5850 else if (BUFBYTE_LEADING_BYTE_P (c))
5853 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5854 c == LEADING_BYTE_CONTROL_1)
5857 Dynarr_add (dst, '~'); /* untranslatable character */
5861 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5862 Dynarr_add (dst, c);
5863 else if (ch == LEADING_BYTE_CONTROL_1)
5866 Dynarr_add (dst, c - 0x20);
5868 /* else it should be the second or third byte of an
5869 untranslatable character, so ignore it */
5872 #endif /* not UTF2000 */
5878 str->iso2022.current_char_boundary = char_boundary;
5883 /************************************************************************/
5884 /* Simple internal/external functions */
5885 /************************************************************************/
5887 static Extbyte_dynarr *conversion_out_dynarr;
5888 static Bufbyte_dynarr *conversion_in_dynarr;
5890 /* Determine coding system from coding format */
5892 /* #### not correct for all values of `fmt'! */
5894 external_data_format_to_coding_system (enum external_data_format fmt)
5898 case FORMAT_FILENAME:
5899 case FORMAT_TERMINAL:
5900 if (EQ (Vfile_name_coding_system, Qnil) ||
5901 EQ (Vfile_name_coding_system, Qbinary))
5904 return Fget_coding_system (Vfile_name_coding_system);
5907 return Fget_coding_system (Qctext);
5915 convert_to_external_format (CONST Bufbyte *ptr,
5918 enum external_data_format fmt)
5920 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5922 if (!conversion_out_dynarr)
5923 conversion_out_dynarr = Dynarr_new (Extbyte);
5925 Dynarr_reset (conversion_out_dynarr);
5927 if (NILP (coding_system))
5929 CONST Bufbyte *end = ptr + len;
5935 (*ptr < 0xc0) ? *ptr :
5936 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5939 (BYTE_ASCII_P (*ptr)) ? *ptr :
5940 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5941 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5944 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5948 #ifdef ERROR_CHECK_BUFPOS
5949 assert (ptr == end);
5954 Lisp_Object instream, outstream, da_outstream;
5955 Lstream *istr, *ostr;
5956 struct gcpro gcpro1, gcpro2, gcpro3;
5957 char tempbuf[1024]; /* some random amount */
5959 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5960 da_outstream = make_dynarr_output_stream
5961 ((unsigned_char_dynarr *) conversion_out_dynarr);
5963 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5964 istr = XLSTREAM (instream);
5965 ostr = XLSTREAM (outstream);
5966 GCPRO3 (instream, outstream, da_outstream);
5969 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5972 Lstream_write (ostr, tempbuf, size_in_bytes);
5974 Lstream_close (istr);
5975 Lstream_close (ostr);
5977 Lstream_delete (istr);
5978 Lstream_delete (ostr);
5979 Lstream_delete (XLSTREAM (da_outstream));
5982 *len_out = Dynarr_length (conversion_out_dynarr);
5983 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5984 return Dynarr_atp (conversion_out_dynarr, 0);
5988 convert_from_external_format (CONST Extbyte *ptr,
5991 enum external_data_format fmt)
5993 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5995 if (!conversion_in_dynarr)
5996 conversion_in_dynarr = Dynarr_new (Bufbyte);
5998 Dynarr_reset (conversion_in_dynarr);
6000 if (NILP (coding_system))
6002 CONST Extbyte *end = ptr + len;
6003 for (; ptr < end; ptr++)
6006 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
6011 Lisp_Object instream, outstream, da_outstream;
6012 Lstream *istr, *ostr;
6013 struct gcpro gcpro1, gcpro2, gcpro3;
6014 char tempbuf[1024]; /* some random amount */
6016 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
6017 da_outstream = make_dynarr_output_stream
6018 ((unsigned_char_dynarr *) conversion_in_dynarr);
6020 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
6021 istr = XLSTREAM (instream);
6022 ostr = XLSTREAM (outstream);
6023 GCPRO3 (instream, outstream, da_outstream);
6026 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
6029 Lstream_write (ostr, tempbuf, size_in_bytes);
6031 Lstream_close (istr);
6032 Lstream_close (ostr);
6034 Lstream_delete (istr);
6035 Lstream_delete (ostr);
6036 Lstream_delete (XLSTREAM (da_outstream));
6039 *len_out = Dynarr_length (conversion_in_dynarr);
6040 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
6041 return Dynarr_atp (conversion_in_dynarr, 0);
6045 /************************************************************************/
6046 /* Initialization */
6047 /************************************************************************/
6050 syms_of_file_coding (void)
6052 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
6053 deferror (&Qcoding_system_error, "coding-system-error",
6054 "Coding-system error", Qio_error);
6056 DEFSUBR (Fcoding_system_p);
6057 DEFSUBR (Ffind_coding_system);
6058 DEFSUBR (Fget_coding_system);
6059 DEFSUBR (Fcoding_system_list);
6060 DEFSUBR (Fcoding_system_name);
6061 DEFSUBR (Fmake_coding_system);
6062 DEFSUBR (Fcopy_coding_system);
6063 DEFSUBR (Fdefine_coding_system_alias);
6064 DEFSUBR (Fsubsidiary_coding_system);
6066 DEFSUBR (Fcoding_system_type);
6067 DEFSUBR (Fcoding_system_doc_string);
6069 DEFSUBR (Fcoding_system_charset);
6071 DEFSUBR (Fcoding_system_property);
6073 DEFSUBR (Fcoding_category_list);
6074 DEFSUBR (Fset_coding_priority_list);
6075 DEFSUBR (Fcoding_priority_list);
6076 DEFSUBR (Fset_coding_category_system);
6077 DEFSUBR (Fcoding_category_system);
6079 DEFSUBR (Fdetect_coding_region);
6080 DEFSUBR (Fdecode_coding_region);
6081 DEFSUBR (Fencode_coding_region);
6083 DEFSUBR (Fdecode_shift_jis_char);
6084 DEFSUBR (Fencode_shift_jis_char);
6085 DEFSUBR (Fdecode_big5_char);
6086 DEFSUBR (Fencode_big5_char);
6087 DEFSUBR (Fset_ucs_char);
6088 DEFSUBR (Fucs_char);
6089 DEFSUBR (Fset_char_ucs);
6090 DEFSUBR (Fchar_ucs);
6092 defsymbol (&Qcoding_system_p, "coding-system-p");
6093 defsymbol (&Qno_conversion, "no-conversion");
6094 defsymbol (&Qraw_text, "raw-text");
6096 defsymbol (&Qbig5, "big5");
6097 defsymbol (&Qshift_jis, "shift-jis");
6098 defsymbol (&Qucs4, "ucs-4");
6099 defsymbol (&Qutf8, "utf-8");
6100 defsymbol (&Qccl, "ccl");
6101 defsymbol (&Qiso2022, "iso2022");
6103 defsymbol (&Qmnemonic, "mnemonic");
6104 defsymbol (&Qeol_type, "eol-type");
6105 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6106 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6108 defsymbol (&Qcr, "cr");
6109 defsymbol (&Qlf, "lf");
6110 defsymbol (&Qcrlf, "crlf");
6111 defsymbol (&Qeol_cr, "eol-cr");
6112 defsymbol (&Qeol_lf, "eol-lf");
6113 defsymbol (&Qeol_crlf, "eol-crlf");
6115 defsymbol (&Qcharset_g0, "charset-g0");
6116 defsymbol (&Qcharset_g1, "charset-g1");
6117 defsymbol (&Qcharset_g2, "charset-g2");
6118 defsymbol (&Qcharset_g3, "charset-g3");
6119 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6120 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6121 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6122 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6123 defsymbol (&Qno_iso6429, "no-iso6429");
6124 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6125 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6127 defsymbol (&Qshort, "short");
6128 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6129 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6130 defsymbol (&Qseven, "seven");
6131 defsymbol (&Qlock_shift, "lock-shift");
6132 defsymbol (&Qescape_quoted, "escape-quoted");
6134 defsymbol (&Qencode, "encode");
6135 defsymbol (&Qdecode, "decode");
6138 defsymbol (&Qctext, "ctext");
6139 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6141 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6143 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6145 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6147 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6149 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6151 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6153 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6155 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6158 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6163 lstream_type_create_file_coding (void)
6165 LSTREAM_HAS_METHOD (decoding, reader);
6166 LSTREAM_HAS_METHOD (decoding, writer);
6167 LSTREAM_HAS_METHOD (decoding, rewinder);
6168 LSTREAM_HAS_METHOD (decoding, seekable_p);
6169 LSTREAM_HAS_METHOD (decoding, flusher);
6170 LSTREAM_HAS_METHOD (decoding, closer);
6171 LSTREAM_HAS_METHOD (decoding, marker);
6173 LSTREAM_HAS_METHOD (encoding, reader);
6174 LSTREAM_HAS_METHOD (encoding, writer);
6175 LSTREAM_HAS_METHOD (encoding, rewinder);
6176 LSTREAM_HAS_METHOD (encoding, seekable_p);
6177 LSTREAM_HAS_METHOD (encoding, flusher);
6178 LSTREAM_HAS_METHOD (encoding, closer);
6179 LSTREAM_HAS_METHOD (encoding, marker);
6183 vars_of_file_coding (void)
6187 /* Initialize to something reasonable ... */
6188 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6190 coding_category_system[i] = Qnil;
6191 coding_category_by_priority[i] = i;
6194 Fprovide (intern ("file-coding"));
6196 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6197 Coding system used for TTY keyboard input.
6198 Not used under a windowing system.
6200 Vkeyboard_coding_system = Qnil;
6202 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6203 Coding system used for TTY display output.
6204 Not used under a windowing system.
6206 Vterminal_coding_system = Qnil;
6208 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6209 Overriding coding system used when writing a file or process.
6210 You should *bind* this, not set it. If this is non-nil, it specifies
6211 the coding system that will be used when a file or process is read
6212 in, and overrides `buffer-file-coding-system-for-read',
6213 `insert-file-contents-pre-hook', etc. Use those variables instead of
6214 this one for permanent changes to the environment.
6216 Vcoding_system_for_read = Qnil;
6218 DEFVAR_LISP ("coding-system-for-write",
6219 &Vcoding_system_for_write /*
6220 Overriding coding system used when writing a file or process.
6221 You should *bind* this, not set it. If this is non-nil, it specifies
6222 the coding system that will be used when a file or process is wrote
6223 in, and overrides `buffer-file-coding-system',
6224 `write-region-pre-hook', etc. Use those variables instead of this one
6225 for permanent changes to the environment.
6227 Vcoding_system_for_write = Qnil;
6229 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6230 Coding system used to convert pathnames when accessing files.
6232 Vfile_name_coding_system = Qnil;
6234 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6235 Non-nil means the buffer contents are regarded as multi-byte form
6236 of characters, not a binary code. This affects the display, file I/O,
6237 and behaviors of various editing commands.
6239 Setting this to nil does not do anything.
6241 enable_multibyte_characters = 1;
6245 complex_vars_of_file_coding (void)
6247 staticpro (&Vcoding_system_hash_table);
6248 Vcoding_system_hash_table =
6249 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6251 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6253 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6255 struct codesys_prop csp; \
6257 csp.prop_type = (Prop_Type); \
6258 Dynarr_add (the_codesys_prop_dynarr, csp); \
6261 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6262 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6263 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6264 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6265 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6266 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6267 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6269 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6270 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6271 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6272 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6273 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6274 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6275 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6276 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6277 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6278 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6279 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6280 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6281 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6282 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6283 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6284 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6285 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6287 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6288 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6290 /* Need to create this here or we're really screwed. */
6292 (Qraw_text, Qno_conversion,
6293 build_string ("Raw text, which means it converts only line-break-codes."),
6294 list2 (Qmnemonic, build_string ("Raw")));
6297 (Qbinary, Qno_conversion,
6298 build_string ("Binary, which means it does not convert anything."),
6299 list4 (Qeol_type, Qlf,
6300 Qmnemonic, build_string ("Binary")));
6305 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6306 list2 (Qmnemonic, build_string ("UTF8")));
6309 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6311 /* Need this for bootstrapping */
6312 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6313 Fget_coding_system (Qraw_text);
6316 coding_category_system[CODING_CATEGORY_UTF8]
6317 = Fget_coding_system (Qutf8);
6324 for (i = 0; i < 65536; i++)
6325 ucs_to_mule_table[i] = Qnil;
6327 staticpro (&mule_to_ucs_table);
6328 mule_to_ucs_table = Fmake_char_table(Qgeneric);