1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
50 /* Coding system currently associated with each coding category. */
51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
53 /* Table of all coding categories in decreasing order of priority.
54 This describes a permutation of the possible coding categories. */
55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
57 Lisp_Object Qcoding_system_p;
59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
60 /* Qinternal in general.c */
62 Lisp_Object Qmnemonic, Qeol_type;
63 Lisp_Object Qcr, Qcrlf, Qlf;
64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
65 Lisp_Object Qpost_read_conversion;
66 Lisp_Object Qpre_write_conversion;
69 Lisp_Object Qucs4, Qutf8;
70 Lisp_Object Qbig5, Qshift_jis;
71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
74 Lisp_Object Qno_iso6429;
75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
76 Lisp_Object Qctext, Qescape_quoted;
77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
79 Lisp_Object Qencode, Qdecode;
81 Lisp_Object Vcoding_system_hash_table;
83 int enable_multibyte_characters;
86 /* Additional information used by the ISO2022 decoder and detector. */
87 struct iso2022_decoder
89 /* CHARSET holds the character sets currently assigned to the G0
90 through G3 variables. It is initialized from the array
91 INITIAL_CHARSET in CODESYS. */
92 Lisp_Object charset[4];
94 /* Which registers are currently invoked into the left (GL) and
95 right (GR) halves of the 8-bit encoding space? */
96 int register_left, register_right;
98 /* ISO_ESC holds a value indicating part of an escape sequence
99 that has already been seen. */
100 enum iso_esc_flag esc;
102 /* This records the bytes we've seen so far in an escape sequence,
103 in case the sequence is invalid (we spit out the bytes unchanged). */
104 unsigned char esc_bytes[8];
106 /* Index for next byte to store in ISO escape sequence. */
109 #ifdef ENABLE_COMPOSITE_CHARS
110 /* Stuff seen so far when composing a string. */
111 unsigned_char_dynarr *composite_chars;
114 /* If we saw an invalid designation sequence for a particular
115 register, we flag it here and switch to ASCII. The next time we
116 see a valid designation for this register, we turn off the flag
117 and do the designation normally, but pretend the sequence was
118 invalid. The effect of all this is that (most of the time) the
119 escape sequences for both the switch to the unknown charset, and
120 the switch back to the known charset, get inserted literally into
121 the buffer and saved out as such. The hope is that we can
122 preserve the escape sequences so that the resulting written out
123 file makes sense. If we don't do any of this, the designation
124 to the invalid charset will be preserved but that switch back
125 to the known charset will probably get eaten because it was
126 the same charset that was already present in the register. */
127 unsigned char invalid_designated[4];
129 /* We try to do similar things as above for direction-switching
130 sequences. If we encountered a direction switch while an
131 invalid designation was present, or an invalid designation
132 just after a direction switch (i.e. no valid designation
133 encountered yet), we insert the direction-switch escape
134 sequence literally into the output stream, and later on
135 insert the corresponding direction-restoring escape sequence
137 unsigned int switched_dir_and_no_valid_charset_yet :1;
138 unsigned int invalid_switch_dir :1;
140 /* Tells the decoder to output the escape sequence literally
141 even though it was valid. Used in the games we play to
142 avoid lossage when we encounter invalid designations. */
143 unsigned int output_literally :1;
144 /* We encountered a direction switch followed by an invalid
145 designation. We didn't output the direction switch
146 literally because we didn't know about the invalid designation;
147 but we have to do so now. */
148 unsigned int output_direction_sequence :1;
151 EXFUN (Fcopy_coding_system, 2);
153 struct detection_state;
154 static int detect_coding_sjis (struct detection_state *st,
155 CONST unsigned char *src,
157 static void decode_coding_sjis (Lstream *decoding,
158 CONST unsigned char *src,
159 unsigned_char_dynarr *dst,
161 static void encode_coding_sjis (Lstream *encoding,
162 CONST unsigned char *src,
163 unsigned_char_dynarr *dst,
165 static int detect_coding_big5 (struct detection_state *st,
166 CONST unsigned char *src,
168 static void decode_coding_big5 (Lstream *decoding,
169 CONST unsigned char *src,
170 unsigned_char_dynarr *dst, unsigned int n);
171 static void encode_coding_big5 (Lstream *encoding,
172 CONST unsigned char *src,
173 unsigned_char_dynarr *dst, unsigned int n);
174 static int detect_coding_ucs4 (struct detection_state *st,
175 CONST unsigned char *src,
177 static void decode_coding_ucs4 (Lstream *decoding,
178 CONST unsigned char *src,
179 unsigned_char_dynarr *dst, unsigned int n);
180 static void encode_coding_ucs4 (Lstream *encoding,
181 CONST unsigned char *src,
182 unsigned_char_dynarr *dst, unsigned int n);
183 static int detect_coding_utf8 (struct detection_state *st,
184 CONST unsigned char *src,
186 static void decode_coding_utf8 (Lstream *decoding,
187 CONST unsigned char *src,
188 unsigned_char_dynarr *dst, unsigned int n);
189 static void encode_coding_utf8 (Lstream *encoding,
190 CONST unsigned char *src,
191 unsigned_char_dynarr *dst, unsigned int n);
192 static int postprocess_iso2022_mask (int mask);
193 static void reset_iso2022 (Lisp_Object coding_system,
194 struct iso2022_decoder *iso);
195 static int detect_coding_iso2022 (struct detection_state *st,
196 CONST unsigned char *src,
198 static void decode_coding_iso2022 (Lstream *decoding,
199 CONST unsigned char *src,
200 unsigned_char_dynarr *dst, unsigned int n);
201 static void encode_coding_iso2022 (Lstream *encoding,
202 CONST unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
205 static void decode_coding_no_conversion (Lstream *decoding,
206 CONST unsigned char *src,
207 unsigned_char_dynarr *dst,
209 static void encode_coding_no_conversion (Lstream *encoding,
210 CONST unsigned char *src,
211 unsigned_char_dynarr *dst,
213 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
214 unsigned_char_dynarr *dst, unsigned int n);
215 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
216 unsigned_char_dynarr *dst, unsigned int n);
218 typedef struct codesys_prop codesys_prop;
227 Dynarr_declare (codesys_prop);
228 } codesys_prop_dynarr;
230 codesys_prop_dynarr *the_codesys_prop_dynarr;
232 enum codesys_prop_enum
235 CODESYS_PROP_ISO2022,
240 /************************************************************************/
241 /* Coding system functions */
242 /************************************************************************/
244 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
245 static void print_coding_system (Lisp_Object, Lisp_Object, int);
246 static void finalize_coding_system (void *header, int for_disksave);
249 static const struct lrecord_description ccs_description_1[] = {
250 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
254 static const struct struct_description ccs_description = {
255 sizeof(charset_conversion_spec),
259 static const struct lrecord_description ccsd_description_1[] = {
260 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
264 static const struct struct_description ccsd_description = {
265 sizeof(charset_conversion_spec_dynarr),
270 static const struct lrecord_description coding_system_description[] = {
271 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
272 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
273 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
275 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
276 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
277 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
278 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
283 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
284 mark_coding_system, print_coding_system,
285 finalize_coding_system,
286 0, 0, coding_system_description,
287 struct Lisp_Coding_System);
290 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
292 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
294 markobj (CODING_SYSTEM_NAME (codesys));
295 markobj (CODING_SYSTEM_DOC_STRING (codesys));
296 markobj (CODING_SYSTEM_MNEMONIC (codesys));
297 markobj (CODING_SYSTEM_EOL_LF (codesys));
298 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
299 markobj (CODING_SYSTEM_EOL_CR (codesys));
301 switch (CODING_SYSTEM_TYPE (codesys))
305 case CODESYS_ISO2022:
306 for (i = 0; i < 4; i++)
307 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
308 if (codesys->iso2022.input_conv)
310 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
312 struct charset_conversion_spec *ccs =
313 Dynarr_atp (codesys->iso2022.input_conv, i);
314 markobj (ccs->from_charset);
315 markobj (ccs->to_charset);
318 if (codesys->iso2022.output_conv)
320 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
322 struct charset_conversion_spec *ccs =
323 Dynarr_atp (codesys->iso2022.output_conv, i);
324 markobj (ccs->from_charset);
325 markobj (ccs->to_charset);
331 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
332 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
339 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
340 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
344 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
347 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
349 error ("printing unreadable object #<coding_system 0x%x>",
352 write_c_string ("#<coding_system ", printcharfun);
353 print_internal (c->name, printcharfun, 1);
354 write_c_string (">", printcharfun);
358 finalize_coding_system (void *header, int for_disksave)
360 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
361 /* Since coding systems never go away, this function is not
362 necessary. But it would be necessary if we changed things
363 so that coding systems could go away. */
364 if (!for_disksave) /* see comment in lstream.c */
366 switch (CODING_SYSTEM_TYPE (c))
369 case CODESYS_ISO2022:
370 if (c->iso2022.input_conv)
372 Dynarr_free (c->iso2022.input_conv);
373 c->iso2022.input_conv = 0;
375 if (c->iso2022.output_conv)
377 Dynarr_free (c->iso2022.output_conv);
378 c->iso2022.output_conv = 0;
389 symbol_to_eol_type (Lisp_Object symbol)
391 CHECK_SYMBOL (symbol);
392 if (NILP (symbol)) return EOL_AUTODETECT;
393 if (EQ (symbol, Qlf)) return EOL_LF;
394 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
395 if (EQ (symbol, Qcr)) return EOL_CR;
397 signal_simple_error ("Unrecognized eol type", symbol);
398 return EOL_AUTODETECT; /* not reached */
402 eol_type_to_symbol (enum eol_type type)
407 case EOL_LF: return Qlf;
408 case EOL_CRLF: return Qcrlf;
409 case EOL_CR: return Qcr;
410 case EOL_AUTODETECT: return Qnil;
415 setup_eol_coding_systems (Lisp_Coding_System *codesys)
417 Lisp_Object codesys_obj;
418 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
419 char *codesys_name = (char *) alloca (len + 7);
421 char *codesys_mnemonic=0;
423 Lisp_Object codesys_name_sym, sub_codesys_obj;
427 XSETCODING_SYSTEM (codesys_obj, codesys);
429 memcpy (codesys_name,
430 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
432 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
434 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
435 codesys_mnemonic = (char *) alloca (mlen + 7);
436 memcpy (codesys_mnemonic,
437 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
440 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
441 strcpy (codesys_name + len, "-" op_sys); \
443 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
444 codesys_name_sym = intern (codesys_name); \
445 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
446 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
448 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
449 build_string (codesys_mnemonic); \
450 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
453 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
454 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
455 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
458 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
459 Return t if OBJECT is a coding system.
460 A coding system is an object that defines how text containing multiple
461 character sets is encoded into a stream of (typically 8-bit) bytes.
462 The coding system is used to decode the stream into a series of
463 characters (which may be from multiple charsets) when the text is read
464 from a file or process, and is used to encode the text back into the
465 same format when it is written out to a file or process.
467 For example, many ISO2022-compliant coding systems (such as Compound
468 Text, which is used for inter-client data under the X Window System)
469 use escape sequences to switch between different charsets -- Japanese
470 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
471 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
472 `make-coding-system' for more information.
474 Coding systems are normally identified using a symbol, and the
475 symbol is accepted in place of the actual coding system object whenever
476 a coding system is called for. (This is similar to how faces work.)
480 return CODING_SYSTEMP (object) ? Qt : Qnil;
483 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
484 Retrieve the coding system of the given name.
486 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
487 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
488 If there is no such coding system, nil is returned. Otherwise the
489 associated coding system object is returned.
491 (coding_system_or_name))
493 if (CODING_SYSTEMP (coding_system_or_name))
494 return coding_system_or_name;
496 if (NILP (coding_system_or_name))
497 coding_system_or_name = Qbinary;
499 CHECK_SYMBOL (coding_system_or_name);
501 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
504 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
505 Retrieve the coding system of the given name.
506 Same as `find-coding-system' except that if there is no such
507 coding system, an error is signaled instead of returning nil.
511 Lisp_Object coding_system = Ffind_coding_system (name);
513 if (NILP (coding_system))
514 signal_simple_error ("No such coding system", name);
515 return coding_system;
518 /* We store the coding systems in hash tables with the names as the key and the
519 actual coding system object as the value. Occasionally we need to use them
520 in a list format. These routines provide us with that. */
521 struct coding_system_list_closure
523 Lisp_Object *coding_system_list;
527 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
528 void *coding_system_list_closure)
530 /* This function can GC */
531 struct coding_system_list_closure *cscl =
532 (struct coding_system_list_closure *) coding_system_list_closure;
533 Lisp_Object *coding_system_list = cscl->coding_system_list;
535 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
536 *coding_system_list);
540 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
541 Return a list of the names of all defined coding systems.
545 Lisp_Object coding_system_list = Qnil;
547 struct coding_system_list_closure coding_system_list_closure;
549 GCPRO1 (coding_system_list);
550 coding_system_list_closure.coding_system_list = &coding_system_list;
551 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
552 &coding_system_list_closure);
555 return coding_system_list;
558 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
559 Return the name of the given coding system.
563 coding_system = Fget_coding_system (coding_system);
564 return XCODING_SYSTEM_NAME (coding_system);
567 static Lisp_Coding_System *
568 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
570 Lisp_Coding_System *codesys =
571 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
573 zero_lcrecord (codesys);
574 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
575 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
576 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
577 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
578 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
579 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
580 CODING_SYSTEM_TYPE (codesys) = type;
581 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
583 if (type == CODESYS_ISO2022)
586 for (i = 0; i < 4; i++)
587 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
589 else if (type == CODESYS_CCL)
591 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
592 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
595 CODING_SYSTEM_NAME (codesys) = name;
601 /* Given a list of charset conversion specs as specified in a Lisp
602 program, parse it into STORE_HERE. */
605 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
606 Lisp_Object spec_list)
610 EXTERNAL_LIST_LOOP (rest, spec_list)
612 Lisp_Object car = XCAR (rest);
613 Lisp_Object from, to;
614 struct charset_conversion_spec spec;
616 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
617 signal_simple_error ("Invalid charset conversion spec", car);
618 from = Fget_charset (XCAR (car));
619 to = Fget_charset (XCAR (XCDR (car)));
620 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
621 signal_simple_error_2
622 ("Attempted conversion between different charset types",
624 spec.from_charset = from;
625 spec.to_charset = to;
627 Dynarr_add (store_here, spec);
631 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
632 specs, return the equivalent as the Lisp programmer would see it.
634 If LOAD_HERE is 0, return Qnil. */
637 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
644 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
646 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
647 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
650 return Fnreverse (result);
655 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
656 Register symbol NAME as a coding system.
658 TYPE describes the conversion method used and should be one of
661 Automatic conversion. XEmacs attempts to detect the coding system
664 No conversion. Use this for binary files and such. On output,
665 graphic characters that are not in ASCII or Latin-1 will be
666 replaced by a ?. (For a no-conversion-encoded buffer, these
667 characters will only be present if you explicitly insert them.)
669 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
671 ISO 10646 UCS-4 encoding.
673 ISO 10646 UTF-8 encoding.
675 Any ISO2022-compliant encoding. Among other things, this includes
676 JIS (the Japanese encoding commonly used for e-mail), EUC (the
677 standard Unix encoding for Japanese and other languages), and
678 Compound Text (the encoding used in X11). You can specify more
679 specific information about the conversion with the FLAGS argument.
681 Big5 (the encoding commonly used for Taiwanese).
683 The conversion is performed using a user-written pseudo-code
684 program. CCL (Code Conversion Language) is the name of this
687 Write out or read in the raw contents of the memory representing
688 the buffer's text. This is primarily useful for debugging
689 purposes, and is only enabled when XEmacs has been compiled with
690 DEBUG_XEMACS defined (via the --debug configure option).
691 WARNING: Reading in a file using 'internal conversion can result
692 in an internal inconsistency in the memory representing a
693 buffer's text, which will produce unpredictable results and may
694 cause XEmacs to crash. Under normal circumstances you should
695 never use 'internal conversion.
697 DOC-STRING is a string describing the coding system.
699 PROPS is a property list, describing the specific nature of the
700 character set. Recognized properties are:
703 String to be displayed in the modeline when this coding system is
707 End-of-line conversion to be used. It should be one of
710 Automatically detect the end-of-line type (LF, CRLF,
711 or CR). Also generate subsidiary coding systems named
712 `NAME-unix', `NAME-dos', and `NAME-mac', that are
713 identical to this coding system but have an EOL-TYPE
714 value of 'lf, 'crlf, and 'cr, respectively.
716 The end of a line is marked externally using ASCII LF.
717 Since this is also the way that XEmacs represents an
718 end-of-line internally, specifying this option results
719 in no end-of-line conversion. This is the standard
720 format for Unix text files.
722 The end of a line is marked externally using ASCII
723 CRLF. This is the standard format for MS-DOS text
726 The end of a line is marked externally using ASCII CR.
727 This is the standard format for Macintosh text files.
729 Automatically detect the end-of-line type but do not
730 generate subsidiary coding systems. (This value is
731 converted to nil when stored internally, and
732 `coding-system-property' will return nil.)
734 'post-read-conversion
735 Function called after a file has been read in, to perform the
736 decoding. Called with two arguments, BEG and END, denoting
737 a region of the current buffer to be decoded.
739 'pre-write-conversion
740 Function called before a file is written out, to perform the
741 encoding. Called with two arguments, BEG and END, denoting
742 a region of the current buffer to be encoded.
745 The following additional properties are recognized if TYPE is 'iso2022:
751 The character set initially designated to the G0 - G3 registers.
752 The value should be one of
754 -- A charset object (designate that character set)
755 -- nil (do not ever use this register)
756 -- t (no character set is initially designated to
757 the register, but may be later on; this automatically
758 sets the corresponding `force-g*-on-output' property)
764 If non-nil, send an explicit designation sequence on output before
765 using the specified register.
768 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
769 "ESC $ B" on output in place of the full designation sequences
770 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
773 If non-nil, don't designate ASCII to G0 at each end of line on output.
774 Setting this to non-nil also suppresses other state-resetting that
775 normally happens at the end of a line.
778 If non-nil, don't designate ASCII to G0 before control chars on output.
781 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
785 If non-nil, use locking-shift (SO/SI) instead of single-shift
786 or designation by escape sequence.
789 If non-nil, don't use ISO6429's direction specification.
792 If non-nil, literal control characters that are the same as
793 the beginning of a recognized ISO2022 or ISO6429 escape sequence
794 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
795 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
796 so that they can be properly distinguished from an escape sequence.
797 (Note that doing this results in a non-portable encoding.) This
798 encoding flag is used for byte-compiled files. Note that ESC
799 is a good choice for a quoting character because there are no
800 escape sequences whose second byte is a character from the Control-0
801 or Control-1 character sets; this is explicitly disallowed by the
804 'input-charset-conversion
805 A list of conversion specifications, specifying conversion of
806 characters in one charset to another when decoding is performed.
807 Each specification is a list of two elements: the source charset,
808 and the destination charset.
810 'output-charset-conversion
811 A list of conversion specifications, specifying conversion of
812 characters in one charset to another when encoding is performed.
813 The form of each specification is the same as for
814 'input-charset-conversion.
817 The following additional properties are recognized (and required)
821 CCL program used for decoding (converting to internal format).
824 CCL program used for encoding (converting to external format).
826 (name, type, doc_string, props))
828 Lisp_Coding_System *codesys;
829 Lisp_Object rest, key, value;
830 enum coding_system_type ty;
831 int need_to_setup_eol_systems = 1;
833 /* Convert type to constant */
834 if (NILP (type) || EQ (type, Qundecided))
835 { ty = CODESYS_AUTODETECT; }
837 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
838 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
839 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
840 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
841 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
842 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
844 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
846 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
849 signal_simple_error ("Invalid coding system type", type);
853 codesys = allocate_coding_system (ty, name);
855 if (NILP (doc_string))
856 doc_string = build_string ("");
858 CHECK_STRING (doc_string);
859 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
861 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
863 if (EQ (key, Qmnemonic))
866 CHECK_STRING (value);
867 CODING_SYSTEM_MNEMONIC (codesys) = value;
870 else if (EQ (key, Qeol_type))
872 need_to_setup_eol_systems = NILP (value);
875 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
878 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
879 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
881 else if (ty == CODESYS_ISO2022)
883 #define FROB_INITIAL_CHARSET(charset_num) \
884 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
885 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
887 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
888 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
889 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
890 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
892 #define FROB_FORCE_CHARSET(charset_num) \
893 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
895 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
896 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
897 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
898 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
900 #define FROB_BOOLEAN_PROPERTY(prop) \
901 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
903 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
904 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
905 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
906 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
907 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
908 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
909 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
911 else if (EQ (key, Qinput_charset_conversion))
913 codesys->iso2022.input_conv =
914 Dynarr_new (charset_conversion_spec);
915 parse_charset_conversion_specs (codesys->iso2022.input_conv,
918 else if (EQ (key, Qoutput_charset_conversion))
920 codesys->iso2022.output_conv =
921 Dynarr_new (charset_conversion_spec);
922 parse_charset_conversion_specs (codesys->iso2022.output_conv,
926 signal_simple_error ("Unrecognized property", key);
928 else if (EQ (type, Qccl))
930 if (EQ (key, Qdecode))
932 CHECK_VECTOR (value);
933 CODING_SYSTEM_CCL_DECODE (codesys) = value;
935 else if (EQ (key, Qencode))
937 CHECK_VECTOR (value);
938 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
941 signal_simple_error ("Unrecognized property", key);
945 signal_simple_error ("Unrecognized property", key);
948 if (need_to_setup_eol_systems)
949 setup_eol_coding_systems (codesys);
952 Lisp_Object codesys_obj;
953 XSETCODING_SYSTEM (codesys_obj, codesys);
954 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
959 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
960 Copy OLD-CODING-SYSTEM to NEW-NAME.
961 If NEW-NAME does not name an existing coding system, a new one will
964 (old_coding_system, new_name))
966 Lisp_Object new_coding_system;
967 old_coding_system = Fget_coding_system (old_coding_system);
968 new_coding_system = Ffind_coding_system (new_name);
969 if (NILP (new_coding_system))
971 XSETCODING_SYSTEM (new_coding_system,
972 allocate_coding_system
973 (XCODING_SYSTEM_TYPE (old_coding_system),
975 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
979 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
980 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
981 memcpy (((char *) to ) + sizeof (to->header),
982 ((char *) from) + sizeof (from->header),
983 sizeof (*from) - sizeof (from->header));
986 return new_coding_system;
989 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
990 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
992 (alias, coding_system))
994 CHECK_SYMBOL (alias);
995 if (!NILP (Ffind_coding_system (alias)))
996 signal_simple_error ("Symbol already names a coding system", alias);
997 coding_system = Fget_coding_system (coding_system);
998 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1000 /* Set up aliases for subsidiaries. */
1001 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1004 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1005 #define FROB(type, name) \
1007 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1008 if (!NILP (subsidiary)) \
1009 Fdefine_coding_system_alias \
1010 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1013 FROB (CRLF, "-dos");
1017 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1018 but it doesn't look intentional, so I'd rather return something
1019 meaningful or nothing at all. */
1024 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1026 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1027 Lisp_Object new_coding_system;
1029 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1030 return coding_system;
1034 case EOL_AUTODETECT: return coding_system;
1035 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1036 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1037 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1041 return NILP (new_coding_system) ? coding_system : new_coding_system;
1044 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1045 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1047 (coding_system, eol_type))
1049 coding_system = Fget_coding_system (coding_system);
1051 return subsidiary_coding_system (coding_system,
1052 symbol_to_eol_type (eol_type));
1056 /************************************************************************/
1057 /* Coding system accessors */
1058 /************************************************************************/
1060 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1061 Return the doc string for CODING-SYSTEM.
1065 coding_system = Fget_coding_system (coding_system);
1066 return XCODING_SYSTEM_DOC_STRING (coding_system);
1069 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1070 Return the type of CODING-SYSTEM.
1074 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1077 case CODESYS_AUTODETECT: return Qundecided;
1079 case CODESYS_SHIFT_JIS: return Qshift_jis;
1080 case CODESYS_ISO2022: return Qiso2022;
1081 case CODESYS_BIG5: return Qbig5;
1082 case CODESYS_UCS4: return Qucs4;
1083 case CODESYS_UTF8: return Qutf8;
1084 case CODESYS_CCL: return Qccl;
1086 case CODESYS_NO_CONVERSION: return Qno_conversion;
1088 case CODESYS_INTERNAL: return Qinternal;
1095 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1098 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1100 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1103 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1104 Return initial charset of CODING-SYSTEM designated to GNUM.
1107 (coding_system, gnum))
1109 coding_system = Fget_coding_system (coding_system);
1112 return coding_system_charset (coding_system, XINT (gnum));
1116 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1117 Return the PROP property of CODING-SYSTEM.
1119 (coding_system, prop))
1122 enum coding_system_type type;
1124 coding_system = Fget_coding_system (coding_system);
1125 CHECK_SYMBOL (prop);
1126 type = XCODING_SYSTEM_TYPE (coding_system);
1128 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1129 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1132 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1134 case CODESYS_PROP_ALL_OK:
1137 case CODESYS_PROP_ISO2022:
1138 if (type != CODESYS_ISO2022)
1140 ("Property only valid in ISO2022 coding systems",
1144 case CODESYS_PROP_CCL:
1145 if (type != CODESYS_CCL)
1147 ("Property only valid in CCL coding systems",
1157 signal_simple_error ("Unrecognized property", prop);
1159 if (EQ (prop, Qname))
1160 return XCODING_SYSTEM_NAME (coding_system);
1161 else if (EQ (prop, Qtype))
1162 return Fcoding_system_type (coding_system);
1163 else if (EQ (prop, Qdoc_string))
1164 return XCODING_SYSTEM_DOC_STRING (coding_system);
1165 else if (EQ (prop, Qmnemonic))
1166 return XCODING_SYSTEM_MNEMONIC (coding_system);
1167 else if (EQ (prop, Qeol_type))
1168 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1169 else if (EQ (prop, Qeol_lf))
1170 return XCODING_SYSTEM_EOL_LF (coding_system);
1171 else if (EQ (prop, Qeol_crlf))
1172 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1173 else if (EQ (prop, Qeol_cr))
1174 return XCODING_SYSTEM_EOL_CR (coding_system);
1175 else if (EQ (prop, Qpost_read_conversion))
1176 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1177 else if (EQ (prop, Qpre_write_conversion))
1178 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1180 else if (type == CODESYS_ISO2022)
1182 if (EQ (prop, Qcharset_g0))
1183 return coding_system_charset (coding_system, 0);
1184 else if (EQ (prop, Qcharset_g1))
1185 return coding_system_charset (coding_system, 1);
1186 else if (EQ (prop, Qcharset_g2))
1187 return coding_system_charset (coding_system, 2);
1188 else if (EQ (prop, Qcharset_g3))
1189 return coding_system_charset (coding_system, 3);
1191 #define FORCE_CHARSET(charset_num) \
1192 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1193 (coding_system, charset_num) ? Qt : Qnil)
1195 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1196 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1197 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1198 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1200 #define LISP_BOOLEAN(prop) \
1201 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1203 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1204 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1205 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1206 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1207 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1208 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1209 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1211 else if (EQ (prop, Qinput_charset_conversion))
1213 unparse_charset_conversion_specs
1214 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1215 else if (EQ (prop, Qoutput_charset_conversion))
1217 unparse_charset_conversion_specs
1218 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1222 else if (type == CODESYS_CCL)
1224 if (EQ (prop, Qdecode))
1225 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1226 else if (EQ (prop, Qencode))
1227 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1235 return Qnil; /* not reached */
1239 /************************************************************************/
1240 /* Coding category functions */
1241 /************************************************************************/
1244 decode_coding_category (Lisp_Object symbol)
1248 CHECK_SYMBOL (symbol);
1249 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1250 if (EQ (coding_category_symbol[i], symbol))
1253 signal_simple_error ("Unrecognized coding category", symbol);
1254 return 0; /* not reached */
1257 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1258 Return a list of all recognized coding categories.
1263 Lisp_Object list = Qnil;
1265 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1266 list = Fcons (coding_category_symbol[i], list);
1270 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1271 Change the priority order of the coding categories.
1272 LIST should be list of coding categories, in descending order of
1273 priority. Unspecified coding categories will be lower in priority
1274 than all specified ones, in the same relative order they were in
1279 int category_to_priority[CODING_CATEGORY_LAST + 1];
1283 /* First generate a list that maps coding categories to priorities. */
1285 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1286 category_to_priority[i] = -1;
1288 /* Highest priority comes from the specified list. */
1290 EXTERNAL_LIST_LOOP (rest, list)
1292 int cat = decode_coding_category (XCAR (rest));
1294 if (category_to_priority[cat] >= 0)
1295 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1296 category_to_priority[cat] = i++;
1299 /* Now go through the existing categories by priority to retrieve
1300 the categories not yet specified and preserve their priority
1302 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1304 int cat = coding_category_by_priority[j];
1305 if (category_to_priority[cat] < 0)
1306 category_to_priority[cat] = i++;
1309 /* Now we need to construct the inverse of the mapping we just
1312 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1313 coding_category_by_priority[category_to_priority[i]] = i;
1315 /* Phew! That was confusing. */
1319 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1320 Return a list of coding categories in descending order of priority.
1325 Lisp_Object list = Qnil;
1327 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1328 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1333 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1334 Change the coding system associated with a coding category.
1336 (coding_category, coding_system))
1338 int cat = decode_coding_category (coding_category);
1340 coding_system = Fget_coding_system (coding_system);
1341 coding_category_system[cat] = coding_system;
1345 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1346 Return the coding system associated with a coding category.
1350 int cat = decode_coding_category (coding_category);
1351 Lisp_Object sys = coding_category_system[cat];
1354 return XCODING_SYSTEM_NAME (sys);
1359 /************************************************************************/
1360 /* Detecting the encoding of data */
1361 /************************************************************************/
1363 struct detection_state
1365 enum eol_type eol_type;
1401 struct iso2022_decoder iso;
1403 int high_byte_count;
1404 unsigned int saw_single_shift:1;
1417 acceptable_control_char_p (int c)
1421 /* Allow and ignore control characters that you might
1422 reasonably see in a text file */
1427 case 8: /* backspace */
1428 case 11: /* vertical tab */
1429 case 12: /* form feed */
1430 case 26: /* MS-DOS C-z junk */
1431 case 31: /* '^_' -- for info */
1439 mask_has_at_most_one_bit_p (int mask)
1441 /* Perhaps the only thing useful you learn from intensive Microsoft
1442 technical interviews */
1443 return (mask & (mask - 1)) == 0;
1446 static enum eol_type
1447 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1456 st->eol.just_saw_cr = 1;
1461 if (st->eol.just_saw_cr)
1463 else if (st->eol.seen_anything)
1466 else if (st->eol.just_saw_cr)
1468 st->eol.just_saw_cr = 0;
1470 st->eol.seen_anything = 1;
1473 return EOL_AUTODETECT;
1476 /* Attempt to determine the encoding and EOL type of the given text.
1477 Before calling this function for the first type, you must initialize
1478 st->eol_type as appropriate and initialize st->mask to ~0.
1480 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1483 st->mask holds the determined coding category mask, or ~0 if only
1484 ASCII has been seen so far.
1488 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1489 is present in st->mask
1490 1 == definitive answers are here for both st->eol_type and st->mask
1494 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1495 unsigned int n, int just_do_eol)
1499 if (st->eol_type == EOL_AUTODETECT)
1500 st->eol_type = detect_eol_type (st, src, n);
1503 return st->eol_type != EOL_AUTODETECT;
1505 if (!st->seen_non_ascii)
1507 for (; n; n--, src++)
1510 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1512 st->seen_non_ascii = 1;
1514 st->shift_jis.mask = ~0;
1518 st->iso2022.mask = ~0;
1528 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1529 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1530 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1531 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1532 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1533 st->big5.mask = detect_coding_big5 (st, src, n);
1534 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1535 st->utf8.mask = detect_coding_utf8 (st, src, n);
1536 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1537 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1540 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1541 | st->utf8.mask | st->ucs4.mask;
1544 int retval = mask_has_at_most_one_bit_p (st->mask);
1545 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1546 return retval && st->eol_type != EOL_AUTODETECT;
1551 coding_system_from_mask (int mask)
1555 /* If the file was entirely or basically ASCII, use the
1556 default value of `buffer-file-coding-system'. */
1557 Lisp_Object retval =
1558 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1561 retval = Ffind_coding_system (retval);
1565 (Qbad_variable, Qwarning,
1566 "Invalid `default-buffer-file-coding-system', set to nil");
1567 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1571 retval = Fget_coding_system (Qraw_text);
1579 mask = postprocess_iso2022_mask (mask);
1581 /* Look through the coding categories by priority and find
1582 the first one that is allowed. */
1583 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1585 cat = coding_category_by_priority[i];
1586 if ((mask & (1 << cat)) &&
1587 !NILP (coding_category_system[cat]))
1591 return coding_category_system[cat];
1593 return Fget_coding_system (Qraw_text);
1597 /* Given a seekable read stream and potential coding system and EOL type
1598 as specified, do any autodetection that is called for. If the
1599 coding system and/or EOL type are not autodetect, they will be left
1600 alone; but this function will never return an autodetect coding system
1603 This function does not automatically fetch subsidiary coding systems;
1604 that should be unnecessary with the explicit eol-type argument. */
1607 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1608 enum eol_type *eol_type_in_out)
1610 struct detection_state decst;
1612 if (*eol_type_in_out == EOL_AUTODETECT)
1613 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1616 decst.eol_type = *eol_type_in_out;
1619 /* If autodetection is called for, do it now. */
1620 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1621 *eol_type_in_out == EOL_AUTODETECT)
1623 unsigned char random_buffer[4096];
1625 Lisp_Object coding_system = Qnil;
1627 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1630 unsigned char *cp = random_buffer;
1632 while (cp < random_buffer + nread)
1634 if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1635 (*cp++ == 'o') && (cp < random_buffer + nread) &&
1636 (*cp++ == 'd') && (cp < random_buffer + nread) &&
1637 (*cp++ == 'i') && (cp < random_buffer + nread) &&
1638 (*cp++ == 'n') && (cp < random_buffer + nread) &&
1639 (*cp++ == 'g') && (cp < random_buffer + nread) &&
1640 (*cp++ == ':') && (cp < random_buffer + nread))
1642 unsigned char coding_system_name[4096 - 6];
1643 unsigned char *np = coding_system_name;
1645 while ( (cp < random_buffer + nread)
1646 && ((*cp == ' ') || (*cp == '\t')) )
1650 while ( (cp < random_buffer + nread) &&
1651 (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1657 = Ffind_coding_system (intern (coding_system_name));
1661 if (EQ(coding_system, Qnil))
1663 if (detect_coding_type (&decst, random_buffer, nread,
1664 XCODING_SYSTEM_TYPE (*codesys_in_out)
1665 != CODESYS_AUTODETECT))
1667 nread = Lstream_read (stream,
1668 random_buffer, sizeof (random_buffer));
1673 *eol_type_in_out = decst.eol_type;
1674 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1676 if (EQ(coding_system, Qnil))
1677 *codesys_in_out = coding_system_from_mask (decst.mask);
1679 *codesys_in_out = coding_system;
1682 /* If we absolutely can't determine the EOL type, just assume LF. */
1683 if (*eol_type_in_out == EOL_AUTODETECT)
1684 *eol_type_in_out = EOL_LF;
1686 Lstream_rewind (stream);
1689 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1690 Detect coding system of the text in the region between START and END.
1691 Returned a list of possible coding systems ordered by priority.
1692 If only ASCII characters are found, it returns 'undecided or one of
1693 its subsidiary coding systems according to a detected end-of-line
1694 type. Optional arg BUFFER defaults to the current buffer.
1696 (start, end, buffer))
1698 Lisp_Object val = Qnil;
1699 struct buffer *buf = decode_buffer (buffer, 0);
1701 Lisp_Object instream, lb_instream;
1702 Lstream *istr, *lb_istr;
1703 struct detection_state decst;
1704 struct gcpro gcpro1, gcpro2;
1706 get_buffer_range_char (buf, start, end, &b, &e, 0);
1707 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1708 lb_istr = XLSTREAM (lb_instream);
1709 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1710 istr = XLSTREAM (instream);
1711 GCPRO2 (instream, lb_instream);
1713 decst.eol_type = EOL_AUTODETECT;
1717 unsigned char random_buffer[4096];
1718 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1722 if (detect_coding_type (&decst, random_buffer, nread, 0))
1726 if (decst.mask == ~0)
1727 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1735 decst.mask = postprocess_iso2022_mask (decst.mask);
1737 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1739 int sys = coding_category_by_priority[i];
1740 if (decst.mask & (1 << sys))
1742 Lisp_Object codesys = coding_category_system[sys];
1743 if (!NILP (codesys))
1744 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1745 val = Fcons (codesys, val);
1749 Lstream_close (istr);
1751 Lstream_delete (istr);
1752 Lstream_delete (lb_istr);
1757 /************************************************************************/
1758 /* Converting to internal Mule format ("decoding") */
1759 /************************************************************************/
1761 /* A decoding stream is a stream used for decoding text (i.e.
1762 converting from some external format to internal format).
1763 The decoding-stream object keeps track of the actual coding
1764 stream, the stream that is at the other end, and data that
1765 needs to be persistent across the lifetime of the stream. */
1767 /* Handle the EOL stuff related to just-read-in character C.
1768 EOL_TYPE is the EOL type of the coding stream.
1769 FLAGS is the current value of FLAGS in the coding stream, and may
1770 be modified by this macro. (The macro only looks at the
1771 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1772 bytes are to be written. You need to also define a local goto
1773 label "label_continue_loop" that is at the end of the main
1774 character-reading loop.
1776 If C is a CR character, then this macro handles it entirely and
1777 jumps to label_continue_loop. Otherwise, this macro does not add
1778 anything to DST, and continues normally. You should continue
1779 processing C normally after this macro. */
1781 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1785 if (eol_type == EOL_CR) \
1786 Dynarr_add (dst, '\n'); \
1787 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1788 Dynarr_add (dst, c); \
1790 flags |= CODING_STATE_CR; \
1791 goto label_continue_loop; \
1793 else if (flags & CODING_STATE_CR) \
1794 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1796 Dynarr_add (dst, '\r'); \
1797 flags &= ~CODING_STATE_CR; \
1801 /* C should be a binary character in the range 0 - 255; convert
1802 to internal format and add to Dynarr DST. */
1805 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1807 if (BYTE_ASCII_P (c)) \
1808 Dynarr_add (dst, c); \
1811 Dynarr_add (dst, (c >> 6) | 0xc0); \
1812 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1817 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1821 Dynarr_add (dst, c);
1823 else if ( c <= 0x7ff )
1825 Dynarr_add (dst, (c >> 6) | 0xc0);
1826 Dynarr_add (dst, (c & 0x3f) | 0x80);
1828 else if ( c <= 0xffff )
1830 Dynarr_add (dst, (c >> 12) | 0xe0);
1831 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1832 Dynarr_add (dst, (c & 0x3f) | 0x80);
1834 else if ( c <= 0x1fffff )
1836 Dynarr_add (dst, (c >> 18) | 0xf0);
1837 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1838 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1839 Dynarr_add (dst, (c & 0x3f) | 0x80);
1841 else if ( c <= 0x3ffffff )
1843 Dynarr_add (dst, (c >> 24) | 0xf8);
1844 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1845 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1846 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1847 Dynarr_add (dst, (c & 0x3f) | 0x80);
1851 Dynarr_add (dst, (c >> 30) | 0xfc);
1852 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1853 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1854 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1855 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1856 Dynarr_add (dst, (c & 0x3f) | 0x80);
1860 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1862 if (BYTE_ASCII_P (c)) \
1863 Dynarr_add (dst, c); \
1864 else if (BYTE_C1_P (c)) \
1866 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1867 Dynarr_add (dst, c + 0x20); \
1871 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1872 Dynarr_add (dst, c); \
1877 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1881 DECODE_ADD_BINARY_CHAR (ch, dst); \
1886 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1888 if (flags & CODING_STATE_END) \
1890 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1891 if (flags & CODING_STATE_CR) \
1892 Dynarr_add (dst, '\r'); \
1896 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1898 struct decoding_stream
1900 /* Coding system that governs the conversion. */
1901 Lisp_Coding_System *codesys;
1903 /* Stream that we read the encoded data from or
1904 write the decoded data to. */
1907 /* If we are reading, then we can return only a fixed amount of
1908 data, so if the conversion resulted in too much data, we store it
1909 here for retrieval the next time around. */
1910 unsigned_char_dynarr *runoff;
1912 /* FLAGS holds flags indicating the current state of the decoding.
1913 Some of these flags are dependent on the coding system. */
1916 /* CH holds a partially built-up character. Since we only deal
1917 with one- and two-byte characters at the moment, we only use
1918 this to store the first byte of a two-byte character. */
1921 /* EOL_TYPE specifies the type of end-of-line conversion that
1922 currently applies. We need to keep this separate from the
1923 EOL type stored in CODESYS because the latter might indicate
1924 automatic EOL-type detection while the former will always
1925 indicate a particular EOL type. */
1926 enum eol_type eol_type;
1928 /* Additional ISO2022 information. We define the structure above
1929 because it's also needed by the detection routines. */
1930 struct iso2022_decoder iso2022;
1932 /* Additional information (the state of the running CCL program)
1933 used by the CCL decoder. */
1934 struct ccl_program ccl;
1936 /* counter for UTF-8 or UCS-4 */
1937 unsigned char counter;
1939 struct detection_state decst;
1942 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1943 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1944 static int decoding_rewinder (Lstream *stream);
1945 static int decoding_seekable_p (Lstream *stream);
1946 static int decoding_flusher (Lstream *stream);
1947 static int decoding_closer (Lstream *stream);
1949 static Lisp_Object decoding_marker (Lisp_Object stream,
1950 void (*markobj) (Lisp_Object));
1952 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1953 sizeof (struct decoding_stream));
1956 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1958 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1959 Lisp_Object str_obj;
1961 /* We do not need to mark the coding systems or charsets stored
1962 within the stream because they are stored in a global list
1963 and automatically marked. */
1965 XSETLSTREAM (str_obj, str);
1967 if (str->imp->marker)
1968 return (str->imp->marker) (str_obj, markobj);
1973 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1974 so we read data from the other end, decode it, and store it into DATA. */
1977 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1979 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1980 unsigned char *orig_data = data;
1982 int error_occurred = 0;
1984 /* We need to interface to mule_decode(), which expects to take some
1985 amount of data and store the result into a Dynarr. We have
1986 mule_decode() store into str->runoff, and take data from there
1989 /* We loop until we have enough data, reading chunks from the other
1990 end and decoding it. */
1993 /* Take data from the runoff if we can. Make sure to take at
1994 most SIZE bytes, and delete the data from the runoff. */
1995 if (Dynarr_length (str->runoff) > 0)
1997 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1998 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1999 Dynarr_delete_many (str->runoff, 0, chunk);
2005 break; /* No more room for data */
2007 if (str->flags & CODING_STATE_END)
2008 /* This means that on the previous iteration, we hit the EOF on
2009 the other end. We loop once more so that mule_decode() can
2010 output any final stuff it may be holding, or any "go back
2011 to a sane state" escape sequences. (This latter makes sense
2012 during encoding.) */
2015 /* Exhausted the runoff, so get some more. DATA has at least
2016 SIZE bytes left of storage in it, so it's OK to read directly
2017 into it. (We'll be overwriting above, after we've decoded it
2018 into the runoff.) */
2019 read_size = Lstream_read (str->other_end, data, size);
2026 /* There might be some more end data produced in the translation.
2027 See the comment above. */
2028 str->flags |= CODING_STATE_END;
2029 mule_decode (stream, data, str->runoff, read_size);
2032 if (data - orig_data == 0)
2033 return error_occurred ? -1 : 0;
2035 return data - orig_data;
2039 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2041 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2044 /* Decode all our data into the runoff, and then attempt to write
2045 it all out to the other end. Remove whatever chunk we succeeded
2047 mule_decode (stream, data, str->runoff, size);
2048 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2049 Dynarr_length (str->runoff));
2051 Dynarr_delete_many (str->runoff, 0, retval);
2052 /* Do NOT return retval. The return value indicates how much
2053 of the incoming data was written, not how many bytes were
2059 reset_decoding_stream (struct decoding_stream *str)
2062 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2064 Lisp_Object coding_system;
2065 XSETCODING_SYSTEM (coding_system, str->codesys);
2066 reset_iso2022 (coding_system, &str->iso2022);
2068 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2070 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2074 str->flags = str->ch = 0;
2078 decoding_rewinder (Lstream *stream)
2080 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2081 reset_decoding_stream (str);
2082 Dynarr_reset (str->runoff);
2083 return Lstream_rewind (str->other_end);
2087 decoding_seekable_p (Lstream *stream)
2089 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2090 return Lstream_seekable_p (str->other_end);
2094 decoding_flusher (Lstream *stream)
2096 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2097 return Lstream_flush (str->other_end);
2101 decoding_closer (Lstream *stream)
2103 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2104 if (stream->flags & LSTREAM_FL_WRITE)
2106 str->flags |= CODING_STATE_END;
2107 decoding_writer (stream, 0, 0);
2109 Dynarr_free (str->runoff);
2111 #ifdef ENABLE_COMPOSITE_CHARS
2112 if (str->iso2022.composite_chars)
2113 Dynarr_free (str->iso2022.composite_chars);
2116 return Lstream_close (str->other_end);
2120 decoding_stream_coding_system (Lstream *stream)
2122 Lisp_Object coding_system;
2123 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2125 XSETCODING_SYSTEM (coding_system, str->codesys);
2126 return subsidiary_coding_system (coding_system, str->eol_type);
2130 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2132 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2133 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2135 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2136 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2137 reset_decoding_stream (str);
2140 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2141 stream for writing, no automatic code detection will be performed.
2142 The reason for this is that automatic code detection requires a
2143 seekable input. Things will also fail if you open a decoding
2144 stream for reading using a non-fully-specified coding system and
2145 a non-seekable input stream. */
2148 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2151 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2152 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2156 str->other_end = stream;
2157 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2158 str->eol_type = EOL_AUTODETECT;
2159 if (!strcmp (mode, "r")
2160 && Lstream_seekable_p (stream))
2161 /* We can determine the coding system now. */
2162 determine_real_coding_system (stream, &codesys, &str->eol_type);
2163 set_decoding_stream_coding_system (lstr, codesys);
2164 str->decst.eol_type = str->eol_type;
2165 str->decst.mask = ~0;
2166 XSETLSTREAM (obj, lstr);
2171 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2173 return make_decoding_stream_1 (stream, codesys, "r");
2177 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2179 return make_decoding_stream_1 (stream, codesys, "w");
2182 /* Note: the decode_coding_* functions all take the same
2183 arguments as mule_decode(), which is to say some SRC data of
2184 size N, which is to be stored into dynamic array DST.
2185 DECODING is the stream within which the decoding is
2186 taking place, but no data is actually read from or
2187 written to that stream; that is handled in decoding_reader()
2188 or decoding_writer(). This allows the same functions to
2189 be used for both reading and writing. */
2192 mule_decode (Lstream *decoding, CONST unsigned char *src,
2193 unsigned_char_dynarr *dst, unsigned int n)
2195 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2197 /* If necessary, do encoding-detection now. We do this when
2198 we're a writing stream or a non-seekable reading stream,
2199 meaning that we can't just process the whole input,
2200 rewind, and start over. */
2202 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2203 str->eol_type == EOL_AUTODETECT)
2205 Lisp_Object codesys;
2207 XSETCODING_SYSTEM (codesys, str->codesys);
2208 detect_coding_type (&str->decst, src, n,
2209 CODING_SYSTEM_TYPE (str->codesys) !=
2210 CODESYS_AUTODETECT);
2211 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2212 str->decst.mask != ~0)
2213 /* #### This is cheesy. What we really ought to do is
2214 buffer up a certain amount of data so as to get a
2215 less random result. */
2216 codesys = coding_system_from_mask (str->decst.mask);
2217 str->eol_type = str->decst.eol_type;
2218 if (XCODING_SYSTEM (codesys) != str->codesys)
2220 /* Preserve the CODING_STATE_END flag in case it was set.
2221 If we erase it, bad things might happen. */
2222 int was_end = str->flags & CODING_STATE_END;
2223 set_decoding_stream_coding_system (decoding, codesys);
2225 str->flags |= CODING_STATE_END;
2229 switch (CODING_SYSTEM_TYPE (str->codesys))
2232 case CODESYS_INTERNAL:
2233 Dynarr_add_many (dst, src, n);
2236 case CODESYS_AUTODETECT:
2237 /* If we got this far and still haven't decided on the coding
2238 system, then do no conversion. */
2239 case CODESYS_NO_CONVERSION:
2240 decode_coding_no_conversion (decoding, src, dst, n);
2243 case CODESYS_SHIFT_JIS:
2244 decode_coding_sjis (decoding, src, dst, n);
2247 decode_coding_big5 (decoding, src, dst, n);
2250 decode_coding_ucs4 (decoding, src, dst, n);
2253 decode_coding_utf8 (decoding, src, dst, n);
2256 str->ccl.last_block = str->flags & CODING_STATE_END;
2257 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2259 case CODESYS_ISO2022:
2260 decode_coding_iso2022 (decoding, src, dst, n);
2268 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2269 Decode the text between START and END which is encoded in CODING-SYSTEM.
2270 This is useful if you've read in encoded text from a file without decoding
2271 it (e.g. you read in a JIS-formatted file but used the `binary' or
2272 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2273 Return length of decoded text.
2274 BUFFER defaults to the current buffer if unspecified.
2276 (start, end, coding_system, buffer))
2279 struct buffer *buf = decode_buffer (buffer, 0);
2280 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2281 Lstream *istr, *ostr;
2282 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2284 get_buffer_range_char (buf, start, end, &b, &e, 0);
2286 barf_if_buffer_read_only (buf, b, e);
2288 coding_system = Fget_coding_system (coding_system);
2289 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2290 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2291 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2293 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2294 Fget_coding_system (Qbinary));
2295 istr = XLSTREAM (instream);
2296 ostr = XLSTREAM (outstream);
2297 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2299 /* The chain of streams looks like this:
2301 [BUFFER] <----- send through
2302 ------> [ENCODE AS BINARY]
2303 ------> [DECODE AS SPECIFIED]
2309 char tempbuf[1024]; /* some random amount */
2310 Bufpos newpos, even_newer_pos;
2311 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2312 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2316 newpos = lisp_buffer_stream_startpos (istr);
2317 Lstream_write (ostr, tempbuf, size_in_bytes);
2318 even_newer_pos = lisp_buffer_stream_startpos (istr);
2319 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2322 Lstream_close (istr);
2323 Lstream_close (ostr);
2325 Lstream_delete (istr);
2326 Lstream_delete (ostr);
2327 Lstream_delete (XLSTREAM (de_outstream));
2328 Lstream_delete (XLSTREAM (lb_outstream));
2333 /************************************************************************/
2334 /* Converting to an external encoding ("encoding") */
2335 /************************************************************************/
2337 /* An encoding stream is an output stream. When you create the
2338 stream, you specify the coding system that governs the encoding
2339 and another stream that the resulting encoded data is to be
2340 sent to, and then start sending data to it. */
2342 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2344 struct encoding_stream
2346 /* Coding system that governs the conversion. */
2347 Lisp_Coding_System *codesys;
2349 /* Stream that we read the encoded data from or
2350 write the decoded data to. */
2353 /* If we are reading, then we can return only a fixed amount of
2354 data, so if the conversion resulted in too much data, we store it
2355 here for retrieval the next time around. */
2356 unsigned_char_dynarr *runoff;
2358 /* FLAGS holds flags indicating the current state of the encoding.
2359 Some of these flags are dependent on the coding system. */
2362 /* CH holds a partially built-up character. Since we only deal
2363 with one- and two-byte characters at the moment, we only use
2364 this to store the first byte of a two-byte character. */
2367 /* Additional information used by the ISO2022 encoder. */
2370 /* CHARSET holds the character sets currently assigned to the G0
2371 through G3 registers. It is initialized from the array
2372 INITIAL_CHARSET in CODESYS. */
2373 Lisp_Object charset[4];
2375 /* Which registers are currently invoked into the left (GL) and
2376 right (GR) halves of the 8-bit encoding space? */
2377 int register_left, register_right;
2379 /* Whether we need to explicitly designate the charset in the
2380 G? register before using it. It is initialized from the
2381 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2382 unsigned char force_charset_on_output[4];
2384 /* Other state variables that need to be preserved across
2386 Lisp_Object current_charset;
2388 int current_char_boundary;
2391 /* Additional information (the state of the running CCL program)
2392 used by the CCL encoder. */
2393 struct ccl_program ccl;
2397 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2398 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2400 static int encoding_rewinder (Lstream *stream);
2401 static int encoding_seekable_p (Lstream *stream);
2402 static int encoding_flusher (Lstream *stream);
2403 static int encoding_closer (Lstream *stream);
2405 static Lisp_Object encoding_marker (Lisp_Object stream,
2406 void (*markobj) (Lisp_Object));
2408 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2409 sizeof (struct encoding_stream));
2412 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2414 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2415 Lisp_Object str_obj;
2417 /* We do not need to mark the coding systems or charsets stored
2418 within the stream because they are stored in a global list
2419 and automatically marked. */
2421 XSETLSTREAM (str_obj, str);
2423 if (str->imp->marker)
2424 return (str->imp->marker) (str_obj, markobj);
2429 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2430 so we read data from the other end, encode it, and store it into DATA. */
2433 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2435 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2436 unsigned char *orig_data = data;
2438 int error_occurred = 0;
2440 /* We need to interface to mule_encode(), which expects to take some
2441 amount of data and store the result into a Dynarr. We have
2442 mule_encode() store into str->runoff, and take data from there
2445 /* We loop until we have enough data, reading chunks from the other
2446 end and encoding it. */
2449 /* Take data from the runoff if we can. Make sure to take at
2450 most SIZE bytes, and delete the data from the runoff. */
2451 if (Dynarr_length (str->runoff) > 0)
2453 int chunk = min ((int) size, Dynarr_length (str->runoff));
2454 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2455 Dynarr_delete_many (str->runoff, 0, chunk);
2461 break; /* No more room for data */
2463 if (str->flags & CODING_STATE_END)
2464 /* This means that on the previous iteration, we hit the EOF on
2465 the other end. We loop once more so that mule_encode() can
2466 output any final stuff it may be holding, or any "go back
2467 to a sane state" escape sequences. (This latter makes sense
2468 during encoding.) */
2471 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2472 left of storage in it, so it's OK to read directly into it.
2473 (We'll be overwriting above, after we've encoded it into the
2475 read_size = Lstream_read (str->other_end, data, size);
2482 /* There might be some more end data produced in the translation.
2483 See the comment above. */
2484 str->flags |= CODING_STATE_END;
2485 mule_encode (stream, data, str->runoff, read_size);
2488 if (data == orig_data)
2489 return error_occurred ? -1 : 0;
2491 return data - orig_data;
2495 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2497 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2500 /* Encode all our data into the runoff, and then attempt to write
2501 it all out to the other end. Remove whatever chunk we succeeded
2503 mule_encode (stream, data, str->runoff, size);
2504 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2505 Dynarr_length (str->runoff));
2507 Dynarr_delete_many (str->runoff, 0, retval);
2508 /* Do NOT return retval. The return value indicates how much
2509 of the incoming data was written, not how many bytes were
2515 reset_encoding_stream (struct encoding_stream *str)
2518 switch (CODING_SYSTEM_TYPE (str->codesys))
2520 case CODESYS_ISO2022:
2524 for (i = 0; i < 4; i++)
2526 str->iso2022.charset[i] =
2527 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2528 str->iso2022.force_charset_on_output[i] =
2529 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2531 str->iso2022.register_left = 0;
2532 str->iso2022.register_right = 1;
2533 str->iso2022.current_charset = Qnil;
2534 str->iso2022.current_half = 0;
2536 str->iso2022.current_char_boundary = 0;
2538 str->iso2022.current_char_boundary = 1;
2543 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2550 str->flags = str->ch = 0;
2554 encoding_rewinder (Lstream *stream)
2556 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2557 reset_encoding_stream (str);
2558 Dynarr_reset (str->runoff);
2559 return Lstream_rewind (str->other_end);
2563 encoding_seekable_p (Lstream *stream)
2565 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2566 return Lstream_seekable_p (str->other_end);
2570 encoding_flusher (Lstream *stream)
2572 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2573 return Lstream_flush (str->other_end);
2577 encoding_closer (Lstream *stream)
2579 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2580 if (stream->flags & LSTREAM_FL_WRITE)
2582 str->flags |= CODING_STATE_END;
2583 encoding_writer (stream, 0, 0);
2585 Dynarr_free (str->runoff);
2586 return Lstream_close (str->other_end);
2590 encoding_stream_coding_system (Lstream *stream)
2592 Lisp_Object coding_system;
2593 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2595 XSETCODING_SYSTEM (coding_system, str->codesys);
2596 return coding_system;
2600 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2602 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2603 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2605 reset_encoding_stream (str);
2609 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2612 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2613 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2617 str->runoff = Dynarr_new (unsigned_char);
2618 str->other_end = stream;
2619 set_encoding_stream_coding_system (lstr, codesys);
2620 XSETLSTREAM (obj, lstr);
2625 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2627 return make_encoding_stream_1 (stream, codesys, "r");
2631 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2633 return make_encoding_stream_1 (stream, codesys, "w");
2636 /* Convert N bytes of internally-formatted data stored in SRC to an
2637 external format, according to the encoding stream ENCODING.
2638 Store the encoded data into DST. */
2641 mule_encode (Lstream *encoding, CONST unsigned char *src,
2642 unsigned_char_dynarr *dst, unsigned int n)
2644 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2646 switch (CODING_SYSTEM_TYPE (str->codesys))
2649 case CODESYS_INTERNAL:
2650 Dynarr_add_many (dst, src, n);
2653 case CODESYS_AUTODETECT:
2654 /* If we got this far and still haven't decided on the coding
2655 system, then do no conversion. */
2656 case CODESYS_NO_CONVERSION:
2657 encode_coding_no_conversion (encoding, src, dst, n);
2660 case CODESYS_SHIFT_JIS:
2661 encode_coding_sjis (encoding, src, dst, n);
2664 encode_coding_big5 (encoding, src, dst, n);
2667 encode_coding_ucs4 (encoding, src, dst, n);
2670 encode_coding_utf8 (encoding, src, dst, n);
2673 str->ccl.last_block = str->flags & CODING_STATE_END;
2674 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2676 case CODESYS_ISO2022:
2677 encode_coding_iso2022 (encoding, src, dst, n);
2685 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2686 Encode the text between START and END using CODING-SYSTEM.
2687 This will, for example, convert Japanese characters into stuff such as
2688 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2689 text. BUFFER defaults to the current buffer if unspecified.
2691 (start, end, coding_system, buffer))
2694 struct buffer *buf = decode_buffer (buffer, 0);
2695 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2696 Lstream *istr, *ostr;
2697 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2699 get_buffer_range_char (buf, start, end, &b, &e, 0);
2701 barf_if_buffer_read_only (buf, b, e);
2703 coding_system = Fget_coding_system (coding_system);
2704 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2705 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2706 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2707 Fget_coding_system (Qbinary));
2708 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2710 istr = XLSTREAM (instream);
2711 ostr = XLSTREAM (outstream);
2712 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2713 /* The chain of streams looks like this:
2715 [BUFFER] <----- send through
2716 ------> [ENCODE AS SPECIFIED]
2717 ------> [DECODE AS BINARY]
2722 char tempbuf[1024]; /* some random amount */
2723 Bufpos newpos, even_newer_pos;
2724 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2725 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2729 newpos = lisp_buffer_stream_startpos (istr);
2730 Lstream_write (ostr, tempbuf, size_in_bytes);
2731 even_newer_pos = lisp_buffer_stream_startpos (istr);
2732 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2738 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2739 Lstream_close (istr);
2740 Lstream_close (ostr);
2742 Lstream_delete (istr);
2743 Lstream_delete (ostr);
2744 Lstream_delete (XLSTREAM (de_outstream));
2745 Lstream_delete (XLSTREAM (lb_outstream));
2746 return make_int (retlen);
2752 /************************************************************************/
2753 /* Shift-JIS methods */
2754 /************************************************************************/
2756 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2757 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2758 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2759 encoded by "position-code + 0x80". A character of JISX0208
2760 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2761 position-codes are divided and shifted so that it fit in the range
2764 --- CODE RANGE of Shift-JIS ---
2765 (character set) (range)
2767 JISX0201-Kana 0xA0 .. 0xDF
2768 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2769 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2770 -------------------------------
2774 /* Is this the first byte of a Shift-JIS two-byte char? */
2776 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2777 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2779 /* Is this the second byte of a Shift-JIS two-byte char? */
2781 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2782 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2784 #define BYTE_SJIS_KATAKANA_P(c) \
2785 ((c) >= 0xA1 && (c) <= 0xDF)
2788 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2796 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2798 if (st->shift_jis.in_second_byte)
2800 st->shift_jis.in_second_byte = 0;
2804 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2805 st->shift_jis.in_second_byte = 1;
2807 return CODING_CATEGORY_SHIFT_JIS_MASK;
2810 /* Convert Shift-JIS data to internal format. */
2813 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2814 unsigned_char_dynarr *dst, unsigned int n)
2817 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2818 unsigned int flags = str->flags;
2819 unsigned int ch = str->ch;
2820 eol_type_t eol_type = str->eol_type;
2828 /* Previous character was first byte of Shift-JIS Kanji char. */
2829 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2831 unsigned char e1, e2;
2833 DECODE_SJIS (ch, c, e1, e2);
2835 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2839 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2840 Dynarr_add (dst, e1);
2841 Dynarr_add (dst, e2);
2846 DECODE_ADD_BINARY_CHAR (ch, dst);
2847 DECODE_ADD_BINARY_CHAR (c, dst);
2853 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2854 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2856 else if (BYTE_SJIS_KATAKANA_P (c))
2859 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2862 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2863 Dynarr_add (dst, c);
2867 DECODE_ADD_BINARY_CHAR (c, dst);
2869 label_continue_loop:;
2872 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2878 /* Convert internally-formatted data to Shift-JIS. */
2881 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2882 unsigned_char_dynarr *dst, unsigned int n)
2885 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2886 unsigned int flags = str->flags;
2887 unsigned int ch = str->ch;
2888 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2890 unsigned char char_boundary = str->iso2022.current_char_boundary;
2897 switch (char_boundary)
2905 else if ( c >= 0xf8 )
2910 else if ( c >= 0xf0 )
2915 else if ( c >= 0xe0 )
2920 else if ( c >= 0xc0 )
2930 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2931 Dynarr_add (dst, '\r');
2932 if (eol_type != EOL_CR)
2933 Dynarr_add (dst, c);
2936 Dynarr_add (dst, c);
2941 ch = ( ch << 6 ) | ( c & 0x3f );
2943 Lisp_Object charset;
2944 unsigned int c1, c2, s1, s2;
2946 BREAKUP_CHAR (ch, charset, c1, c2);
2947 if (EQ(charset, Vcharset_katakana_jisx0201))
2949 Dynarr_add (dst, c1 | 0x80);
2951 else if (EQ(charset, Vcharset_japanese_jisx0208))
2953 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2954 Dynarr_add (dst, s1);
2955 Dynarr_add (dst, s2);
2961 ch = ( ch << 6 ) | ( c & 0x3f );
2967 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2968 Dynarr_add (dst, '\r');
2969 if (eol_type != EOL_CR)
2970 Dynarr_add (dst, '\n');
2973 else if (BYTE_ASCII_P (c))
2975 Dynarr_add (dst, c);
2978 else if (BUFBYTE_LEADING_BYTE_P (c))
2979 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2980 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2981 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2984 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2986 Dynarr_add (dst, c);
2989 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2990 ch == LEADING_BYTE_JAPANESE_JISX0208)
2994 unsigned char j1, j2;
2995 ENCODE_SJIS (ch, c, j1, j2);
2996 Dynarr_add (dst, j1);
2997 Dynarr_add (dst, j2);
3007 str->iso2022.current_char_boundary = char_boundary;
3011 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3012 Decode a JISX0208 character of Shift-JIS coding-system.
3013 CODE is the character code in Shift-JIS as a cons of type bytes.
3014 Return the corresponding character.
3018 unsigned char c1, c2, s1, s2;
3021 CHECK_INT (XCAR (code));
3022 CHECK_INT (XCDR (code));
3023 s1 = XINT (XCAR (code));
3024 s2 = XINT (XCDR (code));
3025 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3026 BYTE_SJIS_TWO_BYTE_2_P (s2))
3028 DECODE_SJIS (s1, s2, c1, c2);
3029 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3030 c1 & 0x7F, c2 & 0x7F));
3036 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3037 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3038 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3042 Lisp_Object charset;
3045 CHECK_CHAR_COERCE_INT (ch);
3046 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3047 if (EQ (charset, Vcharset_japanese_jisx0208))
3049 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3050 return Fcons (make_int (s1), make_int (s2));
3057 /************************************************************************/
3059 /************************************************************************/
3061 /* BIG5 is a coding system encoding two character sets: ASCII and
3062 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3063 character set and is encoded in two-byte.
3065 --- CODE RANGE of BIG5 ---
3066 (character set) (range)
3068 Big5 (1st byte) 0xA1 .. 0xFE
3069 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3070 --------------------------
3072 Since the number of characters in Big5 is larger than maximum
3073 characters in Emacs' charset (96x96), it can't be handled as one
3074 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3075 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3076 contains frequently used characters and the latter contains less
3077 frequently used characters. */
3079 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3080 ((c) >= 0xA1 && (c) <= 0xFE)
3082 /* Is this the second byte of a Shift-JIS two-byte char? */
3084 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3085 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3087 /* Number of Big5 characters which have the same code in 1st byte. */
3089 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3091 /* Code conversion macros. These are macros because they are used in
3092 inner loops during code conversion.
3094 Note that temporary variables in macros introduce the classic
3095 dynamic-scoping problems with variable names. We use capital-
3096 lettered variables in the assumption that XEmacs does not use
3097 capital letters in variables except in a very formalized way
3100 /* Convert Big5 code (b1, b2) into its internal string representation
3103 /* There is a much simpler way to split the Big5 charset into two.
3104 For the moment I'm going to leave the algorithm as-is because it
3105 claims to separate out the most-used characters into a single
3106 charset, which perhaps will lead to optimizations in various
3109 The way the algorithm works is something like this:
3111 Big5 can be viewed as a 94x157 charset, where the row is
3112 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3113 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3114 the split between low and high column numbers is apparently
3115 meaningless; ascending rows produce less and less frequent chars.
3116 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3117 the first charset, and the upper half (0xC9 .. 0xFE) to the
3118 second. To do the conversion, we convert the character into
3119 a single number where 0 .. 156 is the first row, 157 .. 313
3120 is the second, etc. That way, the characters are ordered by
3121 decreasing frequency. Then we just chop the space in two
3122 and coerce the result into a 94x94 space.
3125 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3127 int B1 = b1, B2 = b2; \
3129 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3133 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3137 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3138 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3140 c1 = I / (0xFF - 0xA1) + 0xA1; \
3141 c2 = I % (0xFF - 0xA1) + 0xA1; \
3144 /* Convert the internal string representation of a Big5 character
3145 (lb, c1, c2) into Big5 code (b1, b2). */
3147 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3149 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3151 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3153 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3155 b1 = I / BIG5_SAME_ROW + 0xA1; \
3156 b2 = I % BIG5_SAME_ROW; \
3157 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3161 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3169 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3170 (c >= 0x80 && c <= 0xA0))
3172 if (st->big5.in_second_byte)
3174 st->big5.in_second_byte = 0;
3175 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3179 st->big5.in_second_byte = 1;
3181 return CODING_CATEGORY_BIG5_MASK;
3184 /* Convert Big5 data to internal format. */
3187 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3188 unsigned_char_dynarr *dst, unsigned int n)
3191 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3192 unsigned int flags = str->flags;
3193 unsigned int ch = str->ch;
3194 eol_type_t eol_type = str->eol_type;
3201 /* Previous character was first byte of Big5 char. */
3202 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3204 unsigned char b1, b2, b3;
3205 DECODE_BIG5 (ch, c, b1, b2, b3);
3206 Dynarr_add (dst, b1);
3207 Dynarr_add (dst, b2);
3208 Dynarr_add (dst, b3);
3212 DECODE_ADD_BINARY_CHAR (ch, dst);
3213 DECODE_ADD_BINARY_CHAR (c, dst);
3219 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3220 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3223 DECODE_ADD_BINARY_CHAR (c, dst);
3225 label_continue_loop:;
3228 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3234 /* Convert internally-formatted data to Big5. */
3237 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3238 unsigned_char_dynarr *dst, unsigned int n)
3242 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3243 unsigned int flags = str->flags;
3244 unsigned int ch = str->ch;
3245 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3252 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3253 Dynarr_add (dst, '\r');
3254 if (eol_type != EOL_CR)
3255 Dynarr_add (dst, '\n');
3257 else if (BYTE_ASCII_P (c))
3260 Dynarr_add (dst, c);
3262 else if (BUFBYTE_LEADING_BYTE_P (c))
3264 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3265 c == LEADING_BYTE_CHINESE_BIG5_2)
3267 /* A recognized leading byte. */
3269 continue; /* not done with this character. */
3271 /* otherwise just ignore this character. */
3273 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3274 ch == LEADING_BYTE_CHINESE_BIG5_2)
3276 /* Previous char was a recognized leading byte. */
3278 continue; /* not done with this character. */
3282 /* Encountering second byte of a Big5 character. */
3283 unsigned char b1, b2;
3285 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3286 Dynarr_add (dst, b1);
3287 Dynarr_add (dst, b2);
3299 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3300 Decode a Big5 character CODE of BIG5 coding-system.
3301 CODE is the character code in BIG5, a cons of two integers.
3302 Return the corresponding character.
3306 unsigned char c1, c2, b1, b2;
3309 CHECK_INT (XCAR (code));
3310 CHECK_INT (XCDR (code));
3311 b1 = XINT (XCAR (code));
3312 b2 = XINT (XCDR (code));
3313 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3314 BYTE_BIG5_TWO_BYTE_2_P (b2))
3316 Charset_ID leading_byte;
3317 Lisp_Object charset;
3318 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3319 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3320 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3326 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3327 Encode the Big5 character CH to BIG5 coding-system.
3328 Return the corresponding character code in Big5.
3332 Lisp_Object charset;
3335 CHECK_CHAR_COERCE_INT (ch);
3336 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3337 if (EQ (charset, Vcharset_chinese_big5_1) ||
3338 EQ (charset, Vcharset_chinese_big5_2))
3340 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3342 return Fcons (make_int (b1), make_int (b2));
3349 /************************************************************************/
3352 /* UCS-4 character codes are implemented as nonnegative integers. */
3354 /************************************************************************/
3356 Lisp_Object ucs_to_mule_table[65536];
3357 Lisp_Object mule_to_ucs_table;
3359 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3360 Map UCS-4 code CODE to Mule character CHARACTER.
3362 Return T on success, NIL on failure.
3368 CHECK_CHAR (character);
3372 if (c < sizeof (ucs_to_mule_table))
3374 ucs_to_mule_table[c] = character;
3382 ucs_to_char (unsigned long code)
3384 if (code < sizeof (ucs_to_mule_table))
3386 return ucs_to_mule_table[code];
3388 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3393 c = code % (94 * 94);
3395 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3396 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3397 CHARSET_LEFT_TO_RIGHT),
3398 c / 94 + 33, c % 94 + 33));
3404 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3405 Return Mule character corresponding to UCS code CODE (a positive integer).
3409 CHECK_NATNUM (code);
3410 return ucs_to_char (XINT (code));
3413 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3414 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3418 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3419 Fset_char_ucs is more restrictive on index arg, but should
3420 check code arg in a char_table method. */
3421 CHECK_CHAR (character);
3422 CHECK_NATNUM (code);
3423 return Fput_char_table (character, code, mule_to_ucs_table);
3426 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3427 Return the UCS code (a positive integer) corresponding to CHARACTER.
3431 return Fget_char_table (character, mule_to_ucs_table);
3435 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3437 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3438 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3439 is not found, instead.
3440 #### do something more appropriate (use blob?)
3441 Danger, Will Robinson! Data loss. Should we signal user? */
3443 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3445 Lisp_Object chr = ucs_to_char (ch);
3449 Bufbyte work[MAX_EMCHAR_LEN];
3454 simple_set_charptr_emchar (work, ch) :
3455 non_ascii_set_charptr_emchar (work, ch);
3456 Dynarr_add_many (dst, work, len);
3460 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3461 Dynarr_add (dst, 34 + 128);
3462 Dynarr_add (dst, 46 + 128);
3467 static unsigned long
3468 mule_char_to_ucs4 (Lisp_Object charset,
3469 unsigned char h, unsigned char l)
3472 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3479 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3480 (XCHARSET_CHARS (charset) == 94) )
3482 unsigned char final = XCHARSET_FINAL (charset);
3484 if ( ('@' <= final) && (final < 0x7f) )
3486 return 0xe00000 + (final - '@') * 94 * 94
3487 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3501 encode_ucs4 (Lisp_Object charset,
3502 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3504 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3505 Dynarr_add (dst, code >> 24);
3506 Dynarr_add (dst, (code >> 16) & 255);
3507 Dynarr_add (dst, (code >> 8) & 255);
3508 Dynarr_add (dst, code & 255);
3512 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3518 switch (st->ucs4.in_byte)
3527 st->ucs4.in_byte = 0;
3533 return CODING_CATEGORY_UCS4_MASK;
3537 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3538 unsigned_char_dynarr *dst, unsigned int n)
3540 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3541 unsigned int flags = str->flags;
3542 unsigned int ch = str->ch;
3543 unsigned char counter = str->counter;
3547 unsigned char c = *src++;
3555 decode_ucs4 ( ( ch << 8 ) | c, dst);
3560 ch = ( ch << 8 ) | c;
3564 if (counter & CODING_STATE_END)
3565 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3569 str->counter = counter;
3573 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3574 unsigned_char_dynarr *dst, unsigned int n)
3577 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3578 unsigned int flags = str->flags;
3579 unsigned int ch = str->ch;
3580 unsigned char char_boundary = str->iso2022.current_char_boundary;
3581 Lisp_Object charset = str->iso2022.current_charset;
3583 #ifdef ENABLE_COMPOSITE_CHARS
3584 /* flags for handling composite chars. We do a little switcharoo
3585 on the source while we're outputting the composite char. */
3586 unsigned int saved_n = 0;
3587 CONST unsigned char *saved_src = NULL;
3588 int in_composite = 0;
3595 unsigned char c = *src++;
3597 if (BYTE_ASCII_P (c))
3598 { /* Processing ASCII character */
3600 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3603 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3604 { /* Processing Leading Byte */
3606 charset = CHARSET_BY_LEADING_BYTE (c);
3607 if (LEADING_BYTE_PREFIX_P(c))
3612 { /* Processing Non-ASCII character */
3614 if (EQ (charset, Vcharset_control_1))
3616 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3620 switch (XCHARSET_REP_BYTES (charset))
3623 encode_ucs4 (charset, c, 0, dst);
3626 if (XCHARSET_PRIVATE_P (charset))
3628 encode_ucs4 (charset, c, 0, dst);
3633 #ifdef ENABLE_COMPOSITE_CHARS
3634 if (EQ (charset, Vcharset_composite))
3638 /* #### Bother! We don't know how to
3640 Dynarr_add (dst, 0);
3641 Dynarr_add (dst, 0);
3642 Dynarr_add (dst, 0);
3643 Dynarr_add (dst, '~');
3647 Emchar emch = MAKE_CHAR (Vcharset_composite,
3648 ch & 0x7F, c & 0x7F);
3649 Lisp_Object lstr = composite_char_string (emch);
3653 src = XSTRING_DATA (lstr);
3654 n = XSTRING_LENGTH (lstr);
3658 #endif /* ENABLE_COMPOSITE_CHARS */
3660 encode_ucs4(charset, ch, c, dst);
3673 encode_ucs4 (charset, ch, c, dst);
3689 #ifdef ENABLE_COMPOSITE_CHARS
3695 goto back_to_square_n; /* Wheeeeeeeee ..... */
3697 #endif /* ENABLE_COMPOSITE_CHARS */
3701 str->iso2022.current_char_boundary = char_boundary;
3702 str->iso2022.current_charset = charset;
3704 /* Verbum caro factum est! */
3709 /************************************************************************/
3711 /************************************************************************/
3714 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3719 unsigned char c = *src++;
3720 switch (st->utf8.in_byte)
3723 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3726 st->utf8.in_byte = 5;
3728 st->utf8.in_byte = 4;
3730 st->utf8.in_byte = 3;
3732 st->utf8.in_byte = 2;
3734 st->utf8.in_byte = 1;
3739 if ((c & 0xc0) != 0x80)
3745 return CODING_CATEGORY_UTF8_MASK;
3749 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3750 unsigned_char_dynarr *dst, unsigned int n)
3752 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3753 unsigned int flags = str->flags;
3754 unsigned int ch = str->ch;
3755 eol_type_t eol_type = str->eol_type;
3756 unsigned char counter = str->counter;
3760 unsigned char c = *src++;
3769 else if ( c >= 0xf8 )
3774 else if ( c >= 0xf0 )
3779 else if ( c >= 0xe0 )
3784 else if ( c >= 0xc0 )
3791 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3792 decode_ucs4 (c, dst);
3796 ch = ( ch << 6 ) | ( c & 0x3f );
3797 decode_ucs4 (ch, dst);
3802 ch = ( ch << 6 ) | ( c & 0x3f );
3805 label_continue_loop:;
3808 if (flags & CODING_STATE_END)
3809 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3813 str->counter = counter;
3818 encode_utf8 (Lisp_Object charset,
3819 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3821 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3824 Dynarr_add (dst, code);
3826 else if ( code <= 0x7ff )
3828 Dynarr_add (dst, (code >> 6) | 0xc0);
3829 Dynarr_add (dst, (code & 0x3f) | 0x80);
3831 else if ( code <= 0xffff )
3833 Dynarr_add (dst, (code >> 12) | 0xe0);
3834 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3835 Dynarr_add (dst, (code & 0x3f) | 0x80);
3837 else if ( code <= 0x1fffff )
3839 Dynarr_add (dst, (code >> 18) | 0xf0);
3840 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3841 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3842 Dynarr_add (dst, (code & 0x3f) | 0x80);
3844 else if ( code <= 0x3ffffff )
3846 Dynarr_add (dst, (code >> 24) | 0xf8);
3847 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3848 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3849 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3850 Dynarr_add (dst, (code & 0x3f) | 0x80);
3854 Dynarr_add (dst, (code >> 30) | 0xfc);
3855 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3856 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3857 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3858 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3859 Dynarr_add (dst, (code & 0x3f) | 0x80);
3865 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3866 unsigned_char_dynarr *dst, unsigned int n)
3868 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3869 unsigned int flags = str->flags;
3870 unsigned int ch = str->ch;
3871 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3872 unsigned char char_boundary = str->iso2022.current_char_boundary;
3877 unsigned char c = *src++;
3878 switch (char_boundary)
3883 Dynarr_add (dst, c);
3886 else if ( c >= 0xf8 )
3888 Dynarr_add (dst, c);
3891 else if ( c >= 0xf0 )
3893 Dynarr_add (dst, c);
3896 else if ( c >= 0xe0 )
3898 Dynarr_add (dst, c);
3901 else if ( c >= 0xc0 )
3903 Dynarr_add (dst, c);
3910 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3911 Dynarr_add (dst, '\r');
3912 if (eol_type != EOL_CR)
3913 Dynarr_add (dst, c);
3916 Dynarr_add (dst, c);
3921 Dynarr_add (dst, c);
3925 Dynarr_add (dst, c);
3929 #else /* not UTF2000 */
3930 Lisp_Object charset = str->iso2022.current_charset;
3932 #ifdef ENABLE_COMPOSITE_CHARS
3933 /* flags for handling composite chars. We do a little switcharoo
3934 on the source while we're outputting the composite char. */
3935 unsigned int saved_n = 0;
3936 CONST unsigned char *saved_src = NULL;
3937 int in_composite = 0;
3940 #endif /* ENABLE_COMPOSITE_CHARS */
3944 unsigned char c = *src++;
3946 if (BYTE_ASCII_P (c))
3947 { /* Processing ASCII character */
3951 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3952 Dynarr_add (dst, '\r');
3953 if (eol_type != EOL_CR)
3954 Dynarr_add (dst, c);
3957 encode_utf8 (Vcharset_ascii, c, 0, dst);
3960 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3961 { /* Processing Leading Byte */
3963 charset = CHARSET_BY_LEADING_BYTE (c);
3964 if (LEADING_BYTE_PREFIX_P(c))
3969 { /* Processing Non-ASCII character */
3971 if (EQ (charset, Vcharset_control_1))
3973 encode_utf8 (Vcharset_control_1, c, 0, dst);
3977 switch (XCHARSET_REP_BYTES (charset))
3980 encode_utf8 (charset, c, 0, dst);
3983 if (XCHARSET_PRIVATE_P (charset))
3985 encode_utf8 (charset, c, 0, dst);
3990 #ifdef ENABLE_COMPOSITE_CHARS
3991 if (EQ (charset, Vcharset_composite))
3995 /* #### Bother! We don't know how to
3997 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4001 Emchar emch = MAKE_CHAR (Vcharset_composite,
4002 ch & 0x7F, c & 0x7F);
4003 Lisp_Object lstr = composite_char_string (emch);
4007 src = XSTRING_DATA (lstr);
4008 n = XSTRING_LENGTH (lstr);
4012 #endif /* ENABLE_COMPOSITE_CHARS */
4014 encode_utf8 (charset, ch, c, dst);
4027 encode_utf8 (charset, ch, c, dst);
4043 #ifdef ENABLE_COMPOSITE_CHARS
4049 goto back_to_square_n; /* Wheeeeeeeee ..... */
4053 #endif /* not UTF2000 */
4056 str->iso2022.current_char_boundary = char_boundary;
4058 str->iso2022.current_charset = charset;
4061 /* Verbum caro factum est! */
4065 /************************************************************************/
4066 /* ISO2022 methods */
4067 /************************************************************************/
4069 /* The following note describes the coding system ISO2022 briefly.
4070 Since the intention of this note is to help understand the
4071 functions in this file, some parts are NOT ACCURATE or OVERLY
4072 SIMPLIFIED. For thorough understanding, please refer to the
4073 original document of ISO2022.
4075 ISO2022 provides many mechanisms to encode several character sets
4076 in 7-bit and 8-bit environments. For 7-bit environments, all text
4077 is encoded using bytes less than 128. This may make the encoded
4078 text a little bit longer, but the text passes more easily through
4079 several gateways, some of which strip off MSB (Most Signigant Bit).
4081 There are two kinds of character sets: control character set and
4082 graphic character set. The former contains control characters such
4083 as `newline' and `escape' to provide control functions (control
4084 functions are also provided by escape sequences). The latter
4085 contains graphic characters such as 'A' and '-'. Emacs recognizes
4086 two control character sets and many graphic character sets.
4088 Graphic character sets are classified into one of the following
4089 four classes, according to the number of bytes (DIMENSION) and
4090 number of characters in one dimension (CHARS) of the set:
4091 - DIMENSION1_CHARS94
4092 - DIMENSION1_CHARS96
4093 - DIMENSION2_CHARS94
4094 - DIMENSION2_CHARS96
4096 In addition, each character set is assigned an identification tag,
4097 unique for each set, called "final character" (denoted as <F>
4098 hereafter). The <F> of each character set is decided by ECMA(*)
4099 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4100 (0x30..0x3F are for private use only).
4102 Note (*): ECMA = European Computer Manufacturers Association
4104 Here are examples of graphic character set [NAME(<F>)]:
4105 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4106 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4107 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4108 o DIMENSION2_CHARS96 -- none for the moment
4110 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4111 C0 [0x00..0x1F] -- control character plane 0
4112 GL [0x20..0x7F] -- graphic character plane 0
4113 C1 [0x80..0x9F] -- control character plane 1
4114 GR [0xA0..0xFF] -- graphic character plane 1
4116 A control character set is directly designated and invoked to C0 or
4117 C1 by an escape sequence. The most common case is that:
4118 - ISO646's control character set is designated/invoked to C0, and
4119 - ISO6429's control character set is designated/invoked to C1,
4120 and usually these designations/invocations are omitted in encoded
4121 text. In a 7-bit environment, only C0 can be used, and a control
4122 character for C1 is encoded by an appropriate escape sequence to
4123 fit into the environment. All control characters for C1 are
4124 defined to have corresponding escape sequences.
4126 A graphic character set is at first designated to one of four
4127 graphic registers (G0 through G3), then these graphic registers are
4128 invoked to GL or GR. These designations and invocations can be
4129 done independently. The most common case is that G0 is invoked to
4130 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4131 these invocations and designations are omitted in encoded text.
4132 In a 7-bit environment, only GL can be used.
4134 When a graphic character set of CHARS94 is invoked to GL, codes
4135 0x20 and 0x7F of the GL area work as control characters SPACE and
4136 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4139 There are two ways of invocation: locking-shift and single-shift.
4140 With locking-shift, the invocation lasts until the next different
4141 invocation, whereas with single-shift, the invocation affects the
4142 following character only and doesn't affect the locking-shift
4143 state. Invocations are done by the following control characters or
4146 ----------------------------------------------------------------------
4147 abbrev function cntrl escape seq description
4148 ----------------------------------------------------------------------
4149 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4150 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4151 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4152 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4153 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4154 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4155 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4156 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4157 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4158 ----------------------------------------------------------------------
4159 (*) These are not used by any known coding system.
4161 Control characters for these functions are defined by macros
4162 ISO_CODE_XXX in `coding.h'.
4164 Designations are done by the following escape sequences:
4165 ----------------------------------------------------------------------
4166 escape sequence description
4167 ----------------------------------------------------------------------
4168 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4169 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4170 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4171 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4172 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4173 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4174 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4175 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4176 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4177 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4178 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4179 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4180 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4181 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4182 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4183 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4184 ----------------------------------------------------------------------
4186 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4187 of dimension 1, chars 94, and final character <F>, etc...
4189 Note (*): Although these designations are not allowed in ISO2022,
4190 Emacs accepts them on decoding, and produces them on encoding
4191 CHARS96 character sets in a coding system which is characterized as
4192 7-bit environment, non-locking-shift, and non-single-shift.
4194 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4195 '(' can be omitted. We refer to this as "short-form" hereafter.
4197 Now you may notice that there are a lot of ways for encoding the
4198 same multilingual text in ISO2022. Actually, there exist many
4199 coding systems such as Compound Text (used in X11's inter client
4200 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4201 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4202 localized platforms), and all of these are variants of ISO2022.
4204 In addition to the above, Emacs handles two more kinds of escape
4205 sequences: ISO6429's direction specification and Emacs' private
4206 sequence for specifying character composition.
4208 ISO6429's direction specification takes the following form:
4209 o CSI ']' -- end of the current direction
4210 o CSI '0' ']' -- end of the current direction
4211 o CSI '1' ']' -- start of left-to-right text
4212 o CSI '2' ']' -- start of right-to-left text
4213 The control character CSI (0x9B: control sequence introducer) is
4214 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4216 Character composition specification takes the following form:
4217 o ESC '0' -- start character composition
4218 o ESC '1' -- end character composition
4219 Since these are not standard escape sequences of any ISO standard,
4220 their use with these meanings is restricted to Emacs only. */
4223 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4227 for (i = 0; i < 4; i++)
4229 if (!NILP (coding_system))
4231 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4233 iso->charset[i] = Qt;
4234 iso->invalid_designated[i] = 0;
4236 iso->esc = ISO_ESC_NOTHING;
4237 iso->esc_bytes_index = 0;
4238 iso->register_left = 0;
4239 iso->register_right = 1;
4240 iso->switched_dir_and_no_valid_charset_yet = 0;
4241 iso->invalid_switch_dir = 0;
4242 iso->output_direction_sequence = 0;
4243 iso->output_literally = 0;
4244 #ifdef ENABLE_COMPOSITE_CHARS
4245 if (iso->composite_chars)
4246 Dynarr_reset (iso->composite_chars);
4251 fit_to_be_escape_quoted (unsigned char c)
4268 /* Parse one byte of an ISO2022 escape sequence.
4269 If the result is an invalid escape sequence, return 0 and
4270 do not change anything in STR. Otherwise, if the result is
4271 an incomplete escape sequence, update ISO2022.ESC and
4272 ISO2022.ESC_BYTES and return -1. Otherwise, update
4273 all the state variables (but not ISO2022.ESC_BYTES) and
4276 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4277 or invocation of an invalid character set and treat that as
4278 an unrecognized escape sequence. */
4281 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4282 unsigned char c, unsigned int *flags,
4283 int check_invalid_charsets)
4285 /* (1) If we're at the end of a designation sequence, CS is the
4286 charset being designated and REG is the register to designate
4289 (2) If we're at the end of a locking-shift sequence, REG is
4290 the register to invoke and HALF (0 == left, 1 == right) is
4291 the half to invoke it into.
4293 (3) If we're at the end of a single-shift sequence, REG is
4294 the register to invoke. */
4295 Lisp_Object cs = Qnil;
4298 /* NOTE: This code does goto's all over the fucking place.
4299 The reason for this is that we're basically implementing
4300 a state machine here, and hierarchical languages like C
4301 don't really provide a clean way of doing this. */
4303 if (! (*flags & CODING_STATE_ESCAPE))
4304 /* At beginning of escape sequence; we need to reset our
4305 escape-state variables. */
4306 iso->esc = ISO_ESC_NOTHING;
4308 iso->output_literally = 0;
4309 iso->output_direction_sequence = 0;
4313 case ISO_ESC_NOTHING:
4314 iso->esc_bytes_index = 0;
4317 case ISO_CODE_ESC: /* Start escape sequence */
4318 *flags |= CODING_STATE_ESCAPE;
4322 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4323 *flags |= CODING_STATE_ESCAPE;
4324 iso->esc = ISO_ESC_5_11;
4327 case ISO_CODE_SO: /* locking shift 1 */
4330 case ISO_CODE_SI: /* locking shift 0 */
4334 case ISO_CODE_SS2: /* single shift */
4337 case ISO_CODE_SS3: /* single shift */
4341 default: /* Other control characters */
4348 /**** single shift ****/
4350 case 'N': /* single shift 2 */
4353 case 'O': /* single shift 3 */
4357 /**** locking shift ****/
4359 case '~': /* locking shift 1 right */
4362 case 'n': /* locking shift 2 */
4365 case '}': /* locking shift 2 right */
4368 case 'o': /* locking shift 3 */
4371 case '|': /* locking shift 3 right */
4375 #ifdef ENABLE_COMPOSITE_CHARS
4376 /**** composite ****/
4379 iso->esc = ISO_ESC_START_COMPOSITE;
4380 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4381 CODING_STATE_COMPOSITE;
4385 iso->esc = ISO_ESC_END_COMPOSITE;
4386 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4387 ~CODING_STATE_COMPOSITE;
4389 #endif /* ENABLE_COMPOSITE_CHARS */
4391 /**** directionality ****/
4394 iso->esc = ISO_ESC_5_11;
4397 /**** designation ****/
4399 case '$': /* multibyte charset prefix */
4400 iso->esc = ISO_ESC_2_4;
4404 if (0x28 <= c && c <= 0x2F)
4406 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4410 /* This function is called with CODESYS equal to nil when
4411 doing coding-system detection. */
4413 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4414 && fit_to_be_escape_quoted (c))
4416 iso->esc = ISO_ESC_LITERAL;
4417 *flags &= CODING_STATE_ISO2022_LOCK;
4427 /**** directionality ****/
4429 case ISO_ESC_5_11: /* ISO6429 direction control */
4432 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4433 goto directionality;
4435 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4436 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4437 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4441 case ISO_ESC_5_11_0:
4444 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4445 goto directionality;
4449 case ISO_ESC_5_11_1:
4452 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4453 goto directionality;
4457 case ISO_ESC_5_11_2:
4460 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4461 goto directionality;
4466 iso->esc = ISO_ESC_DIRECTIONALITY;
4467 /* Various junk here to attempt to preserve the direction sequences
4468 literally in the text if they would otherwise be swallowed due
4469 to invalid designations that don't show up as actual charset
4470 changes in the text. */
4471 if (iso->invalid_switch_dir)
4473 /* We already inserted a direction switch literally into the
4474 text. We assume (#### this may not be right) that the
4475 next direction switch is the one going the other way,
4476 and we need to output that literally as well. */
4477 iso->output_literally = 1;
4478 iso->invalid_switch_dir = 0;
4484 /* If we are in the thrall of an invalid designation,
4485 then stick the directionality sequence literally into the
4486 output stream so it ends up in the original text again. */
4487 for (jj = 0; jj < 4; jj++)
4488 if (iso->invalid_designated[jj])
4492 iso->output_literally = 1;
4493 iso->invalid_switch_dir = 1;
4496 /* Indicate that we haven't yet seen a valid designation,
4497 so that if a switch-dir is directly followed by an
4498 invalid designation, both get inserted literally. */
4499 iso->switched_dir_and_no_valid_charset_yet = 1;
4504 /**** designation ****/
4507 if (0x28 <= c && c <= 0x2F)
4509 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4512 if (0x40 <= c && c <= 0x42)
4514 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4515 *flags & CODING_STATE_R2L ?
4516 CHARSET_RIGHT_TO_LEFT :
4517 CHARSET_LEFT_TO_RIGHT);
4527 if (c < '0' || c > '~')
4528 return 0; /* bad final byte */
4530 if (iso->esc >= ISO_ESC_2_8 &&
4531 iso->esc <= ISO_ESC_2_15)
4533 type = ((iso->esc >= ISO_ESC_2_12) ?
4534 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4535 reg = (iso->esc - ISO_ESC_2_8) & 3;
4537 else if (iso->esc >= ISO_ESC_2_4_8 &&
4538 iso->esc <= ISO_ESC_2_4_15)
4540 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4541 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4542 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4546 /* Can this ever be reached? -slb */
4550 cs = CHARSET_BY_ATTRIBUTES (type, c,
4551 *flags & CODING_STATE_R2L ?
4552 CHARSET_RIGHT_TO_LEFT :
4553 CHARSET_LEFT_TO_RIGHT);
4559 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4563 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4564 /* can't invoke something that ain't there. */
4566 iso->esc = ISO_ESC_SINGLE_SHIFT;
4567 *flags &= CODING_STATE_ISO2022_LOCK;
4569 *flags |= CODING_STATE_SS2;
4571 *flags |= CODING_STATE_SS3;
4575 if (check_invalid_charsets &&
4576 !CHARSETP (iso->charset[reg]))
4577 /* can't invoke something that ain't there. */
4580 iso->register_right = reg;
4582 iso->register_left = reg;
4583 *flags &= CODING_STATE_ISO2022_LOCK;
4584 iso->esc = ISO_ESC_LOCKING_SHIFT;
4588 if (NILP (cs) && check_invalid_charsets)
4590 iso->invalid_designated[reg] = 1;
4591 iso->charset[reg] = Vcharset_ascii;
4592 iso->esc = ISO_ESC_DESIGNATE;
4593 *flags &= CODING_STATE_ISO2022_LOCK;
4594 iso->output_literally = 1;
4595 if (iso->switched_dir_and_no_valid_charset_yet)
4597 /* We encountered a switch-direction followed by an
4598 invalid designation. Ensure that the switch-direction
4599 gets outputted; otherwise it will probably get eaten
4600 when the text is written out again. */
4601 iso->switched_dir_and_no_valid_charset_yet = 0;
4602 iso->output_direction_sequence = 1;
4603 /* And make sure that the switch-dir going the other
4604 way gets outputted, as well. */
4605 iso->invalid_switch_dir = 1;
4609 /* This function is called with CODESYS equal to nil when
4610 doing coding-system detection. */
4611 if (!NILP (codesys))
4613 charset_conversion_spec_dynarr *dyn =
4614 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4620 for (i = 0; i < Dynarr_length (dyn); i++)
4622 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4623 if (EQ (cs, spec->from_charset))
4624 cs = spec->to_charset;
4629 iso->charset[reg] = cs;
4630 iso->esc = ISO_ESC_DESIGNATE;
4631 *flags &= CODING_STATE_ISO2022_LOCK;
4632 if (iso->invalid_designated[reg])
4634 iso->invalid_designated[reg] = 0;
4635 iso->output_literally = 1;
4637 if (iso->switched_dir_and_no_valid_charset_yet)
4638 iso->switched_dir_and_no_valid_charset_yet = 0;
4643 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4648 /* #### There are serious deficiencies in the recognition mechanism
4649 here. This needs to be much smarter if it's going to cut it.
4650 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4651 it should be detected as Latin-1.
4652 All the ISO2022 stuff in this file should be synced up with the
4653 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4654 Perhaps we should wait till R2L works in FSF Emacs? */
4656 if (!st->iso2022.initted)
4658 reset_iso2022 (Qnil, &st->iso2022.iso);
4659 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4660 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4661 CODING_CATEGORY_ISO_8_1_MASK |
4662 CODING_CATEGORY_ISO_8_2_MASK |
4663 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4664 st->iso2022.flags = 0;
4665 st->iso2022.high_byte_count = 0;
4666 st->iso2022.saw_single_shift = 0;
4667 st->iso2022.initted = 1;
4670 mask = st->iso2022.mask;
4677 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4678 st->iso2022.high_byte_count++;
4682 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4684 if (st->iso2022.high_byte_count & 1)
4685 /* odd number of high bytes; assume not iso-8-2 */
4686 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4688 st->iso2022.high_byte_count = 0;
4689 st->iso2022.saw_single_shift = 0;
4691 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4693 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4694 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4695 { /* control chars */
4698 /* Allow and ignore control characters that you might
4699 reasonably see in a text file */
4704 case 8: /* backspace */
4705 case 11: /* vertical tab */
4706 case 12: /* form feed */
4707 case 26: /* MS-DOS C-z junk */
4708 case 31: /* '^_' -- for info */
4709 goto label_continue_loop;
4716 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4719 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4720 &st->iso2022.flags, 0))
4722 switch (st->iso2022.iso.esc)
4724 case ISO_ESC_DESIGNATE:
4725 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4726 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4728 case ISO_ESC_LOCKING_SHIFT:
4729 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4730 goto ran_out_of_chars;
4731 case ISO_ESC_SINGLE_SHIFT:
4732 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4733 st->iso2022.saw_single_shift = 1;
4742 goto ran_out_of_chars;
4745 label_continue_loop:;
4754 postprocess_iso2022_mask (int mask)
4756 /* #### kind of cheesy */
4757 /* If seven-bit ISO is allowed, then assume that the encoding is
4758 entirely seven-bit and turn off the eight-bit ones. */
4759 if (mask & CODING_CATEGORY_ISO_7_MASK)
4760 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4761 CODING_CATEGORY_ISO_8_1_MASK |
4762 CODING_CATEGORY_ISO_8_2_MASK);
4766 /* If FLAGS is a null pointer or specifies right-to-left motion,
4767 output a switch-dir-to-left-to-right sequence to DST.
4768 Also update FLAGS if it is not a null pointer.
4769 If INTERNAL_P is set, we are outputting in internal format and
4770 need to handle the CSI differently. */
4773 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4774 unsigned_char_dynarr *dst,
4775 unsigned int *flags,
4778 if (!flags || (*flags & CODING_STATE_R2L))
4780 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4782 Dynarr_add (dst, ISO_CODE_ESC);
4783 Dynarr_add (dst, '[');
4785 else if (internal_p)
4786 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4788 Dynarr_add (dst, ISO_CODE_CSI);
4789 Dynarr_add (dst, '0');
4790 Dynarr_add (dst, ']');
4792 *flags &= ~CODING_STATE_R2L;
4796 /* If FLAGS is a null pointer or specifies a direction different from
4797 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4798 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4799 sequence to DST. Also update FLAGS if it is not a null pointer.
4800 If INTERNAL_P is set, we are outputting in internal format and
4801 need to handle the CSI differently. */
4804 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4805 unsigned_char_dynarr *dst, unsigned int *flags,
4808 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4809 direction == CHARSET_LEFT_TO_RIGHT)
4810 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4811 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4812 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4813 direction == CHARSET_RIGHT_TO_LEFT)
4815 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4817 Dynarr_add (dst, ISO_CODE_ESC);
4818 Dynarr_add (dst, '[');
4820 else if (internal_p)
4821 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4823 Dynarr_add (dst, ISO_CODE_CSI);
4824 Dynarr_add (dst, '2');
4825 Dynarr_add (dst, ']');
4827 *flags |= CODING_STATE_R2L;
4831 /* Convert ISO2022-format data to internal format. */
4834 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4835 unsigned_char_dynarr *dst, unsigned int n)
4837 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4838 unsigned int flags = str->flags;
4839 unsigned int ch = str->ch;
4840 eol_type_t eol_type = str->eol_type;
4841 #ifdef ENABLE_COMPOSITE_CHARS
4842 unsigned_char_dynarr *real_dst = dst;
4844 Lisp_Object coding_system;
4846 XSETCODING_SYSTEM (coding_system, str->codesys);
4848 #ifdef ENABLE_COMPOSITE_CHARS
4849 if (flags & CODING_STATE_COMPOSITE)
4850 dst = str->iso2022.composite_chars;
4851 #endif /* ENABLE_COMPOSITE_CHARS */
4855 unsigned char c = *src++;
4856 if (flags & CODING_STATE_ESCAPE)
4857 { /* Within ESC sequence */
4858 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4863 switch (str->iso2022.esc)
4865 #ifdef ENABLE_COMPOSITE_CHARS
4866 case ISO_ESC_START_COMPOSITE:
4867 if (str->iso2022.composite_chars)
4868 Dynarr_reset (str->iso2022.composite_chars);
4870 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4871 dst = str->iso2022.composite_chars;
4873 case ISO_ESC_END_COMPOSITE:
4875 Bufbyte comstr[MAX_EMCHAR_LEN];
4877 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4878 Dynarr_length (dst));
4880 len = set_charptr_emchar (comstr, emch);
4881 Dynarr_add_many (dst, comstr, len);
4884 #endif /* ENABLE_COMPOSITE_CHARS */
4886 case ISO_ESC_LITERAL:
4887 DECODE_ADD_BINARY_CHAR (c, dst);
4891 /* Everything else handled already */
4896 /* Attempted error recovery. */
4897 if (str->iso2022.output_direction_sequence)
4898 ensure_correct_direction (flags & CODING_STATE_R2L ?
4899 CHARSET_RIGHT_TO_LEFT :
4900 CHARSET_LEFT_TO_RIGHT,
4901 str->codesys, dst, 0, 1);
4902 /* More error recovery. */
4903 if (!retval || str->iso2022.output_literally)
4905 /* Output the (possibly invalid) sequence */
4907 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4908 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4909 flags &= CODING_STATE_ISO2022_LOCK;
4911 n++, src--;/* Repeat the loop with the same character. */
4914 /* No sense in reprocessing the final byte of the
4915 escape sequence; it could mess things up anyway.
4917 DECODE_ADD_BINARY_CHAR (c, dst);
4922 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4923 { /* Control characters */
4925 /***** Error-handling *****/
4927 /* If we were in the middle of a character, dump out the
4928 partial character. */
4929 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4931 /* If we just saw a single-shift character, dump it out.
4932 This may dump out the wrong sort of single-shift character,
4933 but least it will give an indication that something went
4935 if (flags & CODING_STATE_SS2)
4937 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4938 flags &= ~CODING_STATE_SS2;
4940 if (flags & CODING_STATE_SS3)
4942 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4943 flags &= ~CODING_STATE_SS3;
4946 /***** Now handle the control characters. *****/
4949 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4951 flags &= CODING_STATE_ISO2022_LOCK;
4953 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4954 DECODE_ADD_BINARY_CHAR (c, dst);
4957 { /* Graphic characters */
4958 Lisp_Object charset;
4964 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4966 /* Now determine the charset. */
4967 reg = ((flags & CODING_STATE_SS2) ? 2
4968 : (flags & CODING_STATE_SS3) ? 3
4969 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4970 : str->iso2022.register_left);
4971 charset = str->iso2022.charset[reg];
4973 /* Error checking: */
4974 if (! CHARSETP (charset)
4975 || str->iso2022.invalid_designated[reg]
4976 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4977 && XCHARSET_CHARS (charset) == 94))
4978 /* Mrmph. We are trying to invoke a register that has no
4979 or an invalid charset in it, or trying to add a character
4980 outside the range of the charset. Insert that char literally
4981 to preserve it for the output. */
4983 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4984 DECODE_ADD_BINARY_CHAR (c, dst);
4989 /* Things are probably hunky-dorey. */
4991 /* Fetch reverse charset, maybe. */
4992 if (((flags & CODING_STATE_R2L) &&
4993 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4995 (!(flags & CODING_STATE_R2L) &&
4996 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4998 Lisp_Object new_charset =
4999 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5000 if (!NILP (new_charset))
5001 charset = new_charset;
5005 if (XCHARSET_DIMENSION (charset) == 1)
5007 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5009 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5014 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5020 lb = XCHARSET_LEADING_BYTE (charset);
5021 switch (XCHARSET_REP_BYTES (charset))
5024 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5025 Dynarr_add (dst, c & 0x7F);
5028 case 2: /* one-byte official */
5029 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5030 Dynarr_add (dst, lb);
5031 Dynarr_add (dst, c | 0x80);
5034 case 3: /* one-byte private or two-byte official */
5035 if (XCHARSET_PRIVATE_P (charset))
5037 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5038 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5039 Dynarr_add (dst, lb);
5040 Dynarr_add (dst, c | 0x80);
5046 Dynarr_add (dst, lb);
5047 Dynarr_add (dst, ch | 0x80);
5048 Dynarr_add (dst, c | 0x80);
5056 default: /* two-byte private */
5059 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5060 Dynarr_add (dst, lb);
5061 Dynarr_add (dst, ch | 0x80);
5062 Dynarr_add (dst, c | 0x80);
5072 flags &= CODING_STATE_ISO2022_LOCK;
5075 label_continue_loop:;
5078 if (flags & CODING_STATE_END)
5079 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5086 /***** ISO2022 encoder *****/
5088 /* Designate CHARSET into register REG. */
5091 iso2022_designate (Lisp_Object charset, unsigned char reg,
5092 struct encoding_stream *str, unsigned_char_dynarr *dst)
5094 static CONST char inter94[] = "()*+";
5095 static CONST char inter96[] = ",-./";
5097 unsigned char final;
5098 Lisp_Object old_charset = str->iso2022.charset[reg];
5100 str->iso2022.charset[reg] = charset;
5101 if (!CHARSETP (charset))
5102 /* charset might be an initial nil or t. */
5104 type = XCHARSET_TYPE (charset);
5105 final = XCHARSET_FINAL (charset);
5106 if (!str->iso2022.force_charset_on_output[reg] &&
5107 CHARSETP (old_charset) &&
5108 XCHARSET_TYPE (old_charset) == type &&
5109 XCHARSET_FINAL (old_charset) == final)
5112 str->iso2022.force_charset_on_output[reg] = 0;
5115 charset_conversion_spec_dynarr *dyn =
5116 str->codesys->iso2022.output_conv;
5122 for (i = 0; i < Dynarr_length (dyn); i++)
5124 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5125 if (EQ (charset, spec->from_charset))
5126 charset = spec->to_charset;
5131 Dynarr_add (dst, ISO_CODE_ESC);
5134 case CHARSET_TYPE_94:
5135 Dynarr_add (dst, inter94[reg]);
5137 case CHARSET_TYPE_96:
5138 Dynarr_add (dst, inter96[reg]);
5140 case CHARSET_TYPE_94X94:
5141 Dynarr_add (dst, '$');
5143 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5146 Dynarr_add (dst, inter94[reg]);
5148 case CHARSET_TYPE_96X96:
5149 Dynarr_add (dst, '$');
5150 Dynarr_add (dst, inter96[reg]);
5153 Dynarr_add (dst, final);
5157 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5159 if (str->iso2022.register_left != 0)
5161 Dynarr_add (dst, ISO_CODE_SI);
5162 str->iso2022.register_left = 0;
5167 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5169 if (str->iso2022.register_left != 1)
5171 Dynarr_add (dst, ISO_CODE_SO);
5172 str->iso2022.register_left = 1;
5176 /* Convert internally-formatted data to ISO2022 format. */
5179 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5180 unsigned_char_dynarr *dst, unsigned int n)
5182 unsigned char charmask, c;
5183 unsigned char char_boundary;
5184 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5185 unsigned int flags = str->flags;
5186 Emchar ch = str->ch;
5187 Lisp_Coding_System *codesys = str->codesys;
5188 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5190 Lisp_Object charset;
5193 unsigned int byte1, byte2;
5196 #ifdef ENABLE_COMPOSITE_CHARS
5197 /* flags for handling composite chars. We do a little switcharoo
5198 on the source while we're outputting the composite char. */
5199 unsigned int saved_n = 0;
5200 CONST unsigned char *saved_src = NULL;
5201 int in_composite = 0;
5202 #endif /* ENABLE_COMPOSITE_CHARS */
5204 char_boundary = str->iso2022.current_char_boundary;
5205 charset = str->iso2022.current_charset;
5206 half = str->iso2022.current_half;
5208 #ifdef ENABLE_COMPOSITE_CHARS
5216 switch (char_boundary)
5224 else if ( c >= 0xf8 )
5229 else if ( c >= 0xf0 )
5234 else if ( c >= 0xe0 )
5239 else if ( c >= 0xc0 )
5248 restore_left_to_right_direction (codesys, dst, &flags, 0);
5250 /* Make sure G0 contains ASCII */
5251 if ((c > ' ' && c < ISO_CODE_DEL) ||
5252 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5254 ensure_normal_shift (str, dst);
5255 iso2022_designate (Vcharset_ascii, 0, str, dst);
5258 /* If necessary, restore everything to the default state
5261 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5263 restore_left_to_right_direction (codesys, dst, &flags, 0);
5265 ensure_normal_shift (str, dst);
5267 for (i = 0; i < 4; i++)
5269 Lisp_Object initial_charset =
5270 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5271 iso2022_designate (initial_charset, i, str, dst);
5276 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5277 Dynarr_add (dst, '\r');
5278 if (eol_type != EOL_CR)
5279 Dynarr_add (dst, c);
5283 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5284 && fit_to_be_escape_quoted (c))
5285 Dynarr_add (dst, ISO_CODE_ESC);
5286 Dynarr_add (dst, c);
5292 ch = ( ch << 6 ) | ( c & 0x3f );
5295 if ( (0x80 <= ch) && (ch <= 0x9f) )
5297 charmask = (half == 0 ? 0x00 : 0x80);
5299 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5300 && fit_to_be_escape_quoted (ch))
5301 Dynarr_add (dst, ISO_CODE_ESC);
5302 /* you asked for it ... */
5303 Dynarr_add (dst, ch);
5309 BREAKUP_CHAR (ch, charset, byte1, byte2);
5310 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5311 codesys, dst, &flags, 0);
5313 /* Now determine which register to use. */
5315 for (i = 0; i < 4; i++)
5317 if (EQ (charset, str->iso2022.charset[i]) ||
5319 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5328 if (XCHARSET_GRAPHIC (charset) != 0)
5330 if (!NILP (str->iso2022.charset[1]) &&
5331 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5332 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5334 else if (!NILP (str->iso2022.charset[2]))
5336 else if (!NILP (str->iso2022.charset[3]))
5345 iso2022_designate (charset, reg, str, dst);
5347 /* Now invoke that register. */
5351 ensure_normal_shift (str, dst);
5356 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5358 ensure_shift_out (str, dst);
5366 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5368 Dynarr_add (dst, ISO_CODE_ESC);
5369 Dynarr_add (dst, 'N');
5374 Dynarr_add (dst, ISO_CODE_SS2);
5380 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5382 Dynarr_add (dst, ISO_CODE_ESC);
5383 Dynarr_add (dst, 'O');
5388 Dynarr_add (dst, ISO_CODE_SS3);
5397 charmask = (half == 0 ? 0x00 : 0x80);
5399 switch (XCHARSET_DIMENSION (charset))
5402 Dynarr_add (dst, byte1 | charmask);
5405 Dynarr_add (dst, byte1 | charmask);
5406 Dynarr_add (dst, byte2 | charmask);
5415 ch = ( ch << 6 ) | ( c & 0x3f );
5419 #else /* not UTF2000 */
5425 if (BYTE_ASCII_P (c))
5426 { /* Processing ASCII character */
5429 restore_left_to_right_direction (codesys, dst, &flags, 0);
5431 /* Make sure G0 contains ASCII */
5432 if ((c > ' ' && c < ISO_CODE_DEL) ||
5433 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5435 ensure_normal_shift (str, dst);
5436 iso2022_designate (Vcharset_ascii, 0, str, dst);
5439 /* If necessary, restore everything to the default state
5442 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5444 restore_left_to_right_direction (codesys, dst, &flags, 0);
5446 ensure_normal_shift (str, dst);
5448 for (i = 0; i < 4; i++)
5450 Lisp_Object initial_charset =
5451 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5452 iso2022_designate (initial_charset, i, str, dst);
5457 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5458 Dynarr_add (dst, '\r');
5459 if (eol_type != EOL_CR)
5460 Dynarr_add (dst, c);
5464 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5465 && fit_to_be_escape_quoted (c))
5466 Dynarr_add (dst, ISO_CODE_ESC);
5467 Dynarr_add (dst, c);
5472 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5473 { /* Processing Leading Byte */
5475 charset = CHARSET_BY_LEADING_BYTE (c);
5476 if (LEADING_BYTE_PREFIX_P(c))
5478 else if (!EQ (charset, Vcharset_control_1)
5479 #ifdef ENABLE_COMPOSITE_CHARS
5480 && !EQ (charset, Vcharset_composite)
5486 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5487 codesys, dst, &flags, 0);
5489 /* Now determine which register to use. */
5491 for (i = 0; i < 4; i++)
5493 if (EQ (charset, str->iso2022.charset[i]) ||
5495 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5504 if (XCHARSET_GRAPHIC (charset) != 0)
5506 if (!NILP (str->iso2022.charset[1]) &&
5507 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5508 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5510 else if (!NILP (str->iso2022.charset[2]))
5512 else if (!NILP (str->iso2022.charset[3]))
5521 iso2022_designate (charset, reg, str, dst);
5523 /* Now invoke that register. */
5527 ensure_normal_shift (str, dst);
5532 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5534 ensure_shift_out (str, dst);
5542 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5544 Dynarr_add (dst, ISO_CODE_ESC);
5545 Dynarr_add (dst, 'N');
5550 Dynarr_add (dst, ISO_CODE_SS2);
5556 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5558 Dynarr_add (dst, ISO_CODE_ESC);
5559 Dynarr_add (dst, 'O');
5564 Dynarr_add (dst, ISO_CODE_SS3);
5576 { /* Processing Non-ASCII character */
5577 charmask = (half == 0 ? 0x7F : 0xFF);
5579 if (EQ (charset, Vcharset_control_1))
5581 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5582 && fit_to_be_escape_quoted (c))
5583 Dynarr_add (dst, ISO_CODE_ESC);
5584 /* you asked for it ... */
5585 Dynarr_add (dst, c - 0x20);
5589 switch (XCHARSET_REP_BYTES (charset))
5592 Dynarr_add (dst, c & charmask);
5595 if (XCHARSET_PRIVATE_P (charset))
5597 Dynarr_add (dst, c & charmask);
5602 #ifdef ENABLE_COMPOSITE_CHARS
5603 if (EQ (charset, Vcharset_composite))
5607 /* #### Bother! We don't know how to
5609 Dynarr_add (dst, '~');
5613 Emchar emch = MAKE_CHAR (Vcharset_composite,
5614 ch & 0x7F, c & 0x7F);
5615 Lisp_Object lstr = composite_char_string (emch);
5619 src = XSTRING_DATA (lstr);
5620 n = XSTRING_LENGTH (lstr);
5621 Dynarr_add (dst, ISO_CODE_ESC);
5622 Dynarr_add (dst, '0'); /* start composing */
5626 #endif /* ENABLE_COMPOSITE_CHARS */
5628 Dynarr_add (dst, ch & charmask);
5629 Dynarr_add (dst, c & charmask);
5642 Dynarr_add (dst, ch & charmask);
5643 Dynarr_add (dst, c & charmask);
5658 #endif /* not UTF2000 */
5660 #ifdef ENABLE_COMPOSITE_CHARS
5666 Dynarr_add (dst, ISO_CODE_ESC);
5667 Dynarr_add (dst, '1'); /* end composing */
5668 goto back_to_square_n; /* Wheeeeeeeee ..... */
5670 #endif /* ENABLE_COMPOSITE_CHARS */
5673 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5675 if (char_boundary && flags & CODING_STATE_END)
5678 restore_left_to_right_direction (codesys, dst, &flags, 0);
5679 ensure_normal_shift (str, dst);
5680 for (i = 0; i < 4; i++)
5682 Lisp_Object initial_charset =
5683 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5684 iso2022_designate (initial_charset, i, str, dst);
5690 str->iso2022.current_char_boundary = char_boundary;
5691 str->iso2022.current_charset = charset;
5692 str->iso2022.current_half = half;
5694 /* Verbum caro factum est! */
5698 /************************************************************************/
5699 /* No-conversion methods */
5700 /************************************************************************/
5702 /* This is used when reading in "binary" files -- i.e. files that may
5703 contain all 256 possible byte values and that are not to be
5704 interpreted as being in any particular decoding. */
5706 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5707 unsigned_char_dynarr *dst, unsigned int n)
5710 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5711 unsigned int flags = str->flags;
5712 unsigned int ch = str->ch;
5713 eol_type_t eol_type = str->eol_type;
5719 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5720 DECODE_ADD_BINARY_CHAR (c, dst);
5721 label_continue_loop:;
5724 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5731 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5732 unsigned_char_dynarr *dst, unsigned int n)
5735 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5736 unsigned int flags = str->flags;
5737 unsigned int ch = str->ch;
5738 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5740 unsigned char char_boundary = str->iso2022.current_char_boundary;
5747 switch (char_boundary)
5755 else if ( c >= 0xf8 )
5760 else if ( c >= 0xf0 )
5765 else if ( c >= 0xe0 )
5770 else if ( c >= 0xc0 )
5781 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5782 Dynarr_add (dst, '\r');
5783 if (eol_type != EOL_CR)
5784 Dynarr_add (dst, c);
5787 Dynarr_add (dst, c);
5792 ch = ( ch << 6 ) | ( c & 0x3f );
5793 Dynarr_add (dst, ch & 0xff);
5797 ch = ( ch << 6 ) | ( c & 0x3f );
5800 #else /* not UTF2000 */
5803 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5804 Dynarr_add (dst, '\r');
5805 if (eol_type != EOL_CR)
5806 Dynarr_add (dst, '\n');
5809 else if (BYTE_ASCII_P (c))
5812 Dynarr_add (dst, c);
5814 else if (BUFBYTE_LEADING_BYTE_P (c))
5817 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5818 c == LEADING_BYTE_CONTROL_1)
5821 Dynarr_add (dst, '~'); /* untranslatable character */
5825 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5826 Dynarr_add (dst, c);
5827 else if (ch == LEADING_BYTE_CONTROL_1)
5830 Dynarr_add (dst, c - 0x20);
5832 /* else it should be the second or third byte of an
5833 untranslatable character, so ignore it */
5836 #endif /* not UTF2000 */
5842 str->iso2022.current_char_boundary = char_boundary;
5847 /************************************************************************/
5848 /* Simple internal/external functions */
5849 /************************************************************************/
5851 static Extbyte_dynarr *conversion_out_dynarr;
5852 static Bufbyte_dynarr *conversion_in_dynarr;
5854 /* Determine coding system from coding format */
5856 /* #### not correct for all values of `fmt'! */
5858 external_data_format_to_coding_system (enum external_data_format fmt)
5862 case FORMAT_FILENAME:
5863 case FORMAT_TERMINAL:
5864 if (EQ (Vfile_name_coding_system, Qnil) ||
5865 EQ (Vfile_name_coding_system, Qbinary))
5868 return Fget_coding_system (Vfile_name_coding_system);
5871 return Fget_coding_system (Qctext);
5879 convert_to_external_format (CONST Bufbyte *ptr,
5882 enum external_data_format fmt)
5884 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5886 if (!conversion_out_dynarr)
5887 conversion_out_dynarr = Dynarr_new (Extbyte);
5889 Dynarr_reset (conversion_out_dynarr);
5891 if (NILP (coding_system))
5893 CONST Bufbyte *end = ptr + len;
5899 (*ptr < 0xc0) ? *ptr :
5900 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5903 (BYTE_ASCII_P (*ptr)) ? *ptr :
5904 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5905 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5908 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5912 #ifdef ERROR_CHECK_BUFPOS
5913 assert (ptr == end);
5918 Lisp_Object instream, outstream, da_outstream;
5919 Lstream *istr, *ostr;
5920 struct gcpro gcpro1, gcpro2, gcpro3;
5921 char tempbuf[1024]; /* some random amount */
5923 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5924 da_outstream = make_dynarr_output_stream
5925 ((unsigned_char_dynarr *) conversion_out_dynarr);
5927 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5928 istr = XLSTREAM (instream);
5929 ostr = XLSTREAM (outstream);
5930 GCPRO3 (instream, outstream, da_outstream);
5933 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5936 Lstream_write (ostr, tempbuf, size_in_bytes);
5938 Lstream_close (istr);
5939 Lstream_close (ostr);
5941 Lstream_delete (istr);
5942 Lstream_delete (ostr);
5943 Lstream_delete (XLSTREAM (da_outstream));
5946 *len_out = Dynarr_length (conversion_out_dynarr);
5947 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5948 return Dynarr_atp (conversion_out_dynarr, 0);
5952 convert_from_external_format (CONST Extbyte *ptr,
5955 enum external_data_format fmt)
5957 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5959 if (!conversion_in_dynarr)
5960 conversion_in_dynarr = Dynarr_new (Bufbyte);
5962 Dynarr_reset (conversion_in_dynarr);
5964 if (NILP (coding_system))
5966 CONST Extbyte *end = ptr + len;
5967 for (; ptr < end; ptr++)
5970 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5975 Lisp_Object instream, outstream, da_outstream;
5976 Lstream *istr, *ostr;
5977 struct gcpro gcpro1, gcpro2, gcpro3;
5978 char tempbuf[1024]; /* some random amount */
5980 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5981 da_outstream = make_dynarr_output_stream
5982 ((unsigned_char_dynarr *) conversion_in_dynarr);
5984 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5985 istr = XLSTREAM (instream);
5986 ostr = XLSTREAM (outstream);
5987 GCPRO3 (instream, outstream, da_outstream);
5990 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5993 Lstream_write (ostr, tempbuf, size_in_bytes);
5995 Lstream_close (istr);
5996 Lstream_close (ostr);
5998 Lstream_delete (istr);
5999 Lstream_delete (ostr);
6000 Lstream_delete (XLSTREAM (da_outstream));
6003 *len_out = Dynarr_length (conversion_in_dynarr);
6004 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
6005 return Dynarr_atp (conversion_in_dynarr, 0);
6009 /************************************************************************/
6010 /* Initialization */
6011 /************************************************************************/
6014 syms_of_file_coding (void)
6016 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
6017 deferror (&Qcoding_system_error, "coding-system-error",
6018 "Coding-system error", Qio_error);
6020 DEFSUBR (Fcoding_system_p);
6021 DEFSUBR (Ffind_coding_system);
6022 DEFSUBR (Fget_coding_system);
6023 DEFSUBR (Fcoding_system_list);
6024 DEFSUBR (Fcoding_system_name);
6025 DEFSUBR (Fmake_coding_system);
6026 DEFSUBR (Fcopy_coding_system);
6027 DEFSUBR (Fdefine_coding_system_alias);
6028 DEFSUBR (Fsubsidiary_coding_system);
6030 DEFSUBR (Fcoding_system_type);
6031 DEFSUBR (Fcoding_system_doc_string);
6033 DEFSUBR (Fcoding_system_charset);
6035 DEFSUBR (Fcoding_system_property);
6037 DEFSUBR (Fcoding_category_list);
6038 DEFSUBR (Fset_coding_priority_list);
6039 DEFSUBR (Fcoding_priority_list);
6040 DEFSUBR (Fset_coding_category_system);
6041 DEFSUBR (Fcoding_category_system);
6043 DEFSUBR (Fdetect_coding_region);
6044 DEFSUBR (Fdecode_coding_region);
6045 DEFSUBR (Fencode_coding_region);
6047 DEFSUBR (Fdecode_shift_jis_char);
6048 DEFSUBR (Fencode_shift_jis_char);
6049 DEFSUBR (Fdecode_big5_char);
6050 DEFSUBR (Fencode_big5_char);
6051 DEFSUBR (Fset_ucs_char);
6052 DEFSUBR (Fucs_char);
6053 DEFSUBR (Fset_char_ucs);
6054 DEFSUBR (Fchar_ucs);
6056 defsymbol (&Qcoding_system_p, "coding-system-p");
6057 defsymbol (&Qno_conversion, "no-conversion");
6058 defsymbol (&Qraw_text, "raw-text");
6060 defsymbol (&Qbig5, "big5");
6061 defsymbol (&Qshift_jis, "shift-jis");
6062 defsymbol (&Qucs4, "ucs-4");
6063 defsymbol (&Qutf8, "utf-8");
6064 defsymbol (&Qccl, "ccl");
6065 defsymbol (&Qiso2022, "iso2022");
6067 defsymbol (&Qmnemonic, "mnemonic");
6068 defsymbol (&Qeol_type, "eol-type");
6069 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6070 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6072 defsymbol (&Qcr, "cr");
6073 defsymbol (&Qlf, "lf");
6074 defsymbol (&Qcrlf, "crlf");
6075 defsymbol (&Qeol_cr, "eol-cr");
6076 defsymbol (&Qeol_lf, "eol-lf");
6077 defsymbol (&Qeol_crlf, "eol-crlf");
6079 defsymbol (&Qcharset_g0, "charset-g0");
6080 defsymbol (&Qcharset_g1, "charset-g1");
6081 defsymbol (&Qcharset_g2, "charset-g2");
6082 defsymbol (&Qcharset_g3, "charset-g3");
6083 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6084 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6085 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6086 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6087 defsymbol (&Qno_iso6429, "no-iso6429");
6088 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6089 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6091 defsymbol (&Qshort, "short");
6092 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6093 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6094 defsymbol (&Qseven, "seven");
6095 defsymbol (&Qlock_shift, "lock-shift");
6096 defsymbol (&Qescape_quoted, "escape-quoted");
6098 defsymbol (&Qencode, "encode");
6099 defsymbol (&Qdecode, "decode");
6102 defsymbol (&Qctext, "ctext");
6103 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6105 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6107 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6109 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6111 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6113 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6115 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6117 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6119 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6122 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6127 lstream_type_create_file_coding (void)
6129 LSTREAM_HAS_METHOD (decoding, reader);
6130 LSTREAM_HAS_METHOD (decoding, writer);
6131 LSTREAM_HAS_METHOD (decoding, rewinder);
6132 LSTREAM_HAS_METHOD (decoding, seekable_p);
6133 LSTREAM_HAS_METHOD (decoding, flusher);
6134 LSTREAM_HAS_METHOD (decoding, closer);
6135 LSTREAM_HAS_METHOD (decoding, marker);
6137 LSTREAM_HAS_METHOD (encoding, reader);
6138 LSTREAM_HAS_METHOD (encoding, writer);
6139 LSTREAM_HAS_METHOD (encoding, rewinder);
6140 LSTREAM_HAS_METHOD (encoding, seekable_p);
6141 LSTREAM_HAS_METHOD (encoding, flusher);
6142 LSTREAM_HAS_METHOD (encoding, closer);
6143 LSTREAM_HAS_METHOD (encoding, marker);
6147 vars_of_file_coding (void)
6151 /* Initialize to something reasonable ... */
6152 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6154 coding_category_system[i] = Qnil;
6155 coding_category_by_priority[i] = i;
6158 Fprovide (intern ("file-coding"));
6160 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6161 Coding system used for TTY keyboard input.
6162 Not used under a windowing system.
6164 Vkeyboard_coding_system = Qnil;
6166 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6167 Coding system used for TTY display output.
6168 Not used under a windowing system.
6170 Vterminal_coding_system = Qnil;
6172 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6173 Overriding coding system used when writing a file or process.
6174 You should *bind* this, not set it. If this is non-nil, it specifies
6175 the coding system that will be used when a file or process is read
6176 in, and overrides `buffer-file-coding-system-for-read',
6177 `insert-file-contents-pre-hook', etc. Use those variables instead of
6178 this one for permanent changes to the environment.
6180 Vcoding_system_for_read = Qnil;
6182 DEFVAR_LISP ("coding-system-for-write",
6183 &Vcoding_system_for_write /*
6184 Overriding coding system used when writing a file or process.
6185 You should *bind* this, not set it. If this is non-nil, it specifies
6186 the coding system that will be used when a file or process is wrote
6187 in, and overrides `buffer-file-coding-system',
6188 `write-region-pre-hook', etc. Use those variables instead of this one
6189 for permanent changes to the environment.
6191 Vcoding_system_for_write = Qnil;
6193 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6194 Coding system used to convert pathnames when accessing files.
6196 Vfile_name_coding_system = Qnil;
6198 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6199 Non-nil means the buffer contents are regarded as multi-byte form
6200 of characters, not a binary code. This affects the display, file I/O,
6201 and behaviors of various editing commands.
6203 Setting this to nil does not do anything.
6205 enable_multibyte_characters = 1;
6209 complex_vars_of_file_coding (void)
6211 staticpro (&Vcoding_system_hash_table);
6212 Vcoding_system_hash_table =
6213 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6215 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6217 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6219 struct codesys_prop csp; \
6221 csp.prop_type = (Prop_Type); \
6222 Dynarr_add (the_codesys_prop_dynarr, csp); \
6225 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6226 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6227 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6228 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6229 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6230 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6231 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6233 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6234 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6235 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6236 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6237 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6238 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6239 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6240 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6241 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6242 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6243 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6244 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6245 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6246 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6247 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6248 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6249 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6251 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6252 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6254 /* Need to create this here or we're really screwed. */
6256 (Qraw_text, Qno_conversion,
6257 build_string ("Raw text, which means it converts only line-break-codes."),
6258 list2 (Qmnemonic, build_string ("Raw")));
6261 (Qbinary, Qno_conversion,
6262 build_string ("Binary, which means it does not convert anything."),
6263 list4 (Qeol_type, Qlf,
6264 Qmnemonic, build_string ("Binary")));
6269 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6270 list2 (Qmnemonic, build_string ("UTF8")));
6273 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6275 /* Need this for bootstrapping */
6276 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6277 Fget_coding_system (Qraw_text);
6280 coding_category_system[CODING_CATEGORY_UTF8]
6281 = Fget_coding_system (Qutf8);
6288 for (i = 0; i < 65536; i++)
6289 ucs_to_mule_table[i] = Qnil;
6291 staticpro (&mule_to_ucs_table);
6292 mule_to_ucs_table = Fmake_char_table(Qgeneric);