1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
52 struct file_coding_dump {
53 /* Coding system currently associated with each coding category. */
54 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
56 /* Table of all coding categories in decreasing order of priority.
57 This describes a permutation of the possible coding categories. */
58 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
61 Lisp_Object ucs_to_mule_table[65536];
65 static const struct lrecord_description fcd_description_1[] = {
66 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
68 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 },
73 static const struct struct_description fcd_description = {
74 sizeof(struct file_coding_dump),
78 Lisp_Object mule_to_ucs_table;
80 Lisp_Object Qcoding_systemp;
82 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
83 /* Qinternal in general.c */
85 Lisp_Object Qmnemonic, Qeol_type;
86 Lisp_Object Qcr, Qcrlf, Qlf;
87 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
88 Lisp_Object Qpost_read_conversion;
89 Lisp_Object Qpre_write_conversion;
92 Lisp_Object Qucs4, Qutf8;
93 Lisp_Object Qbig5, Qshift_jis;
94 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
95 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
96 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
97 Lisp_Object Qno_iso6429;
98 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
99 Lisp_Object Qctext, Qescape_quoted;
100 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
102 Lisp_Object Qencode, Qdecode;
104 Lisp_Object Vcoding_system_hash_table;
106 int enable_multibyte_characters;
109 /* Additional information used by the ISO2022 decoder and detector. */
110 struct iso2022_decoder
112 /* CHARSET holds the character sets currently assigned to the G0
113 through G3 variables. It is initialized from the array
114 INITIAL_CHARSET in CODESYS. */
115 Lisp_Object charset[4];
117 /* Which registers are currently invoked into the left (GL) and
118 right (GR) halves of the 8-bit encoding space? */
119 int register_left, register_right;
121 /* ISO_ESC holds a value indicating part of an escape sequence
122 that has already been seen. */
123 enum iso_esc_flag esc;
125 /* This records the bytes we've seen so far in an escape sequence,
126 in case the sequence is invalid (we spit out the bytes unchanged). */
127 unsigned char esc_bytes[8];
129 /* Index for next byte to store in ISO escape sequence. */
132 #ifdef ENABLE_COMPOSITE_CHARS
133 /* Stuff seen so far when composing a string. */
134 unsigned_char_dynarr *composite_chars;
137 /* If we saw an invalid designation sequence for a particular
138 register, we flag it here and switch to ASCII. The next time we
139 see a valid designation for this register, we turn off the flag
140 and do the designation normally, but pretend the sequence was
141 invalid. The effect of all this is that (most of the time) the
142 escape sequences for both the switch to the unknown charset, and
143 the switch back to the known charset, get inserted literally into
144 the buffer and saved out as such. The hope is that we can
145 preserve the escape sequences so that the resulting written out
146 file makes sense. If we don't do any of this, the designation
147 to the invalid charset will be preserved but that switch back
148 to the known charset will probably get eaten because it was
149 the same charset that was already present in the register. */
150 unsigned char invalid_designated[4];
152 /* We try to do similar things as above for direction-switching
153 sequences. If we encountered a direction switch while an
154 invalid designation was present, or an invalid designation
155 just after a direction switch (i.e. no valid designation
156 encountered yet), we insert the direction-switch escape
157 sequence literally into the output stream, and later on
158 insert the corresponding direction-restoring escape sequence
160 unsigned int switched_dir_and_no_valid_charset_yet :1;
161 unsigned int invalid_switch_dir :1;
163 /* Tells the decoder to output the escape sequence literally
164 even though it was valid. Used in the games we play to
165 avoid lossage when we encounter invalid designations. */
166 unsigned int output_literally :1;
167 /* We encountered a direction switch followed by an invalid
168 designation. We didn't output the direction switch
169 literally because we didn't know about the invalid designation;
170 but we have to do so now. */
171 unsigned int output_direction_sequence :1;
174 EXFUN (Fcopy_coding_system, 2);
176 struct detection_state;
177 static int detect_coding_sjis (struct detection_state *st,
178 CONST unsigned char *src,
180 static void decode_coding_sjis (Lstream *decoding,
181 CONST unsigned char *src,
182 unsigned_char_dynarr *dst,
184 static void encode_coding_sjis (Lstream *encoding,
185 CONST unsigned char *src,
186 unsigned_char_dynarr *dst,
188 static int detect_coding_big5 (struct detection_state *st,
189 CONST unsigned char *src,
191 static void decode_coding_big5 (Lstream *decoding,
192 CONST unsigned char *src,
193 unsigned_char_dynarr *dst, unsigned int n);
194 static void encode_coding_big5 (Lstream *encoding,
195 CONST unsigned char *src,
196 unsigned_char_dynarr *dst, unsigned int n);
197 static int detect_coding_ucs4 (struct detection_state *st,
198 CONST unsigned char *src,
200 static void decode_coding_ucs4 (Lstream *decoding,
201 CONST unsigned char *src,
202 unsigned_char_dynarr *dst, unsigned int n);
203 static void encode_coding_ucs4 (Lstream *encoding,
204 CONST unsigned char *src,
205 unsigned_char_dynarr *dst, unsigned int n);
206 static int detect_coding_utf8 (struct detection_state *st,
207 CONST unsigned char *src,
209 static void decode_coding_utf8 (Lstream *decoding,
210 CONST unsigned char *src,
211 unsigned_char_dynarr *dst, unsigned int n);
212 static void encode_coding_utf8 (Lstream *encoding,
213 CONST unsigned char *src,
214 unsigned_char_dynarr *dst, unsigned int n);
215 static int postprocess_iso2022_mask (int mask);
216 static void reset_iso2022 (Lisp_Object coding_system,
217 struct iso2022_decoder *iso);
218 static int detect_coding_iso2022 (struct detection_state *st,
219 CONST unsigned char *src,
221 static void decode_coding_iso2022 (Lstream *decoding,
222 CONST unsigned char *src,
223 unsigned_char_dynarr *dst, unsigned int n);
224 static void encode_coding_iso2022 (Lstream *encoding,
225 CONST unsigned char *src,
226 unsigned_char_dynarr *dst, unsigned int n);
228 static void decode_coding_no_conversion (Lstream *decoding,
229 CONST unsigned char *src,
230 unsigned_char_dynarr *dst,
232 static void encode_coding_no_conversion (Lstream *encoding,
233 CONST unsigned char *src,
234 unsigned_char_dynarr *dst,
236 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
237 unsigned_char_dynarr *dst, unsigned int n);
238 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
239 unsigned_char_dynarr *dst, unsigned int n);
241 typedef struct codesys_prop codesys_prop;
250 Dynarr_declare (codesys_prop);
251 } codesys_prop_dynarr;
253 static const struct lrecord_description codesys_prop_description_1[] = {
254 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
258 static const struct struct_description codesys_prop_description = {
259 sizeof(codesys_prop),
260 codesys_prop_description_1
263 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
264 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
268 static const struct struct_description codesys_prop_dynarr_description = {
269 sizeof(codesys_prop_dynarr),
270 codesys_prop_dynarr_description_1
273 codesys_prop_dynarr *the_codesys_prop_dynarr;
275 enum codesys_prop_enum
278 CODESYS_PROP_ISO2022,
283 /************************************************************************/
284 /* Coding system functions */
285 /************************************************************************/
287 static Lisp_Object mark_coding_system (Lisp_Object);
288 static void print_coding_system (Lisp_Object, Lisp_Object, int);
289 static void finalize_coding_system (void *header, int for_disksave);
292 static const struct lrecord_description ccs_description_1[] = {
293 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
297 static const struct struct_description ccs_description = {
298 sizeof(charset_conversion_spec),
302 static const struct lrecord_description ccsd_description_1[] = {
303 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
307 static const struct struct_description ccsd_description = {
308 sizeof(charset_conversion_spec_dynarr),
313 static const struct lrecord_description coding_system_description[] = {
314 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
315 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
316 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
318 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
319 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
320 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
321 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
326 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
327 mark_coding_system, print_coding_system,
328 finalize_coding_system,
329 0, 0, coding_system_description,
330 struct Lisp_Coding_System);
333 mark_coding_system (Lisp_Object obj)
335 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
337 mark_object (CODING_SYSTEM_NAME (codesys));
338 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
339 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
340 mark_object (CODING_SYSTEM_EOL_LF (codesys));
341 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
342 mark_object (CODING_SYSTEM_EOL_CR (codesys));
344 switch (CODING_SYSTEM_TYPE (codesys))
348 case CODESYS_ISO2022:
349 for (i = 0; i < 4; i++)
350 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
351 if (codesys->iso2022.input_conv)
353 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
355 struct charset_conversion_spec *ccs =
356 Dynarr_atp (codesys->iso2022.input_conv, i);
357 mark_object (ccs->from_charset);
358 mark_object (ccs->to_charset);
361 if (codesys->iso2022.output_conv)
363 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
365 struct charset_conversion_spec *ccs =
366 Dynarr_atp (codesys->iso2022.output_conv, i);
367 mark_object (ccs->from_charset);
368 mark_object (ccs->to_charset);
374 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
375 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
382 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
383 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
387 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
390 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
392 error ("printing unreadable object #<coding_system 0x%x>",
395 write_c_string ("#<coding_system ", printcharfun);
396 print_internal (c->name, printcharfun, 1);
397 write_c_string (">", printcharfun);
401 finalize_coding_system (void *header, int for_disksave)
403 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
404 /* Since coding systems never go away, this function is not
405 necessary. But it would be necessary if we changed things
406 so that coding systems could go away. */
407 if (!for_disksave) /* see comment in lstream.c */
409 switch (CODING_SYSTEM_TYPE (c))
412 case CODESYS_ISO2022:
413 if (c->iso2022.input_conv)
415 Dynarr_free (c->iso2022.input_conv);
416 c->iso2022.input_conv = 0;
418 if (c->iso2022.output_conv)
420 Dynarr_free (c->iso2022.output_conv);
421 c->iso2022.output_conv = 0;
432 symbol_to_eol_type (Lisp_Object symbol)
434 CHECK_SYMBOL (symbol);
435 if (NILP (symbol)) return EOL_AUTODETECT;
436 if (EQ (symbol, Qlf)) return EOL_LF;
437 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
438 if (EQ (symbol, Qcr)) return EOL_CR;
440 signal_simple_error ("Unrecognized eol type", symbol);
441 return EOL_AUTODETECT; /* not reached */
445 eol_type_to_symbol (enum eol_type type)
450 case EOL_LF: return Qlf;
451 case EOL_CRLF: return Qcrlf;
452 case EOL_CR: return Qcr;
453 case EOL_AUTODETECT: return Qnil;
458 setup_eol_coding_systems (Lisp_Coding_System *codesys)
460 Lisp_Object codesys_obj;
461 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
462 char *codesys_name = (char *) alloca (len + 7);
464 char *codesys_mnemonic=0;
466 Lisp_Object codesys_name_sym, sub_codesys_obj;
470 XSETCODING_SYSTEM (codesys_obj, codesys);
472 memcpy (codesys_name,
473 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
475 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
477 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
478 codesys_mnemonic = (char *) alloca (mlen + 7);
479 memcpy (codesys_mnemonic,
480 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
483 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
484 strcpy (codesys_name + len, "-" op_sys); \
486 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
487 codesys_name_sym = intern (codesys_name); \
488 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
489 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
491 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
492 build_string (codesys_mnemonic); \
493 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
496 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
497 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
498 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
501 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
502 Return t if OBJECT is a coding system.
503 A coding system is an object that defines how text containing multiple
504 character sets is encoded into a stream of (typically 8-bit) bytes.
505 The coding system is used to decode the stream into a series of
506 characters (which may be from multiple charsets) when the text is read
507 from a file or process, and is used to encode the text back into the
508 same format when it is written out to a file or process.
510 For example, many ISO2022-compliant coding systems (such as Compound
511 Text, which is used for inter-client data under the X Window System)
512 use escape sequences to switch between different charsets -- Japanese
513 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
514 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
515 `make-coding-system' for more information.
517 Coding systems are normally identified using a symbol, and the
518 symbol is accepted in place of the actual coding system object whenever
519 a coding system is called for. (This is similar to how faces work.)
523 return CODING_SYSTEMP (object) ? Qt : Qnil;
526 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
527 Retrieve the coding system of the given name.
529 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
530 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
531 If there is no such coding system, nil is returned. Otherwise the
532 associated coding system object is returned.
534 (coding_system_or_name))
536 if (CODING_SYSTEMP (coding_system_or_name))
537 return coding_system_or_name;
539 if (NILP (coding_system_or_name))
540 coding_system_or_name = Qbinary;
542 CHECK_SYMBOL (coding_system_or_name);
544 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
547 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
548 Retrieve the coding system of the given name.
549 Same as `find-coding-system' except that if there is no such
550 coding system, an error is signaled instead of returning nil.
554 Lisp_Object coding_system = Ffind_coding_system (name);
556 if (NILP (coding_system))
557 signal_simple_error ("No such coding system", name);
558 return coding_system;
561 /* We store the coding systems in hash tables with the names as the key and the
562 actual coding system object as the value. Occasionally we need to use them
563 in a list format. These routines provide us with that. */
564 struct coding_system_list_closure
566 Lisp_Object *coding_system_list;
570 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
571 void *coding_system_list_closure)
573 /* This function can GC */
574 struct coding_system_list_closure *cscl =
575 (struct coding_system_list_closure *) coding_system_list_closure;
576 Lisp_Object *coding_system_list = cscl->coding_system_list;
578 *coding_system_list = Fcons (key, *coding_system_list);
582 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
583 Return a list of the names of all defined coding systems.
587 Lisp_Object coding_system_list = Qnil;
589 struct coding_system_list_closure coding_system_list_closure;
591 GCPRO1 (coding_system_list);
592 coding_system_list_closure.coding_system_list = &coding_system_list;
593 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
594 &coding_system_list_closure);
597 return coding_system_list;
600 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
601 Return the name of the given coding system.
605 coding_system = Fget_coding_system (coding_system);
606 return XCODING_SYSTEM_NAME (coding_system);
609 static Lisp_Coding_System *
610 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
612 Lisp_Coding_System *codesys =
613 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
615 zero_lcrecord (codesys);
616 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
617 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
618 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
619 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
620 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
621 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
622 CODING_SYSTEM_TYPE (codesys) = type;
623 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
625 if (type == CODESYS_ISO2022)
628 for (i = 0; i < 4; i++)
629 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
631 else if (type == CODESYS_CCL)
633 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
634 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
637 CODING_SYSTEM_NAME (codesys) = name;
643 /* Given a list of charset conversion specs as specified in a Lisp
644 program, parse it into STORE_HERE. */
647 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
648 Lisp_Object spec_list)
652 EXTERNAL_LIST_LOOP (rest, spec_list)
654 Lisp_Object car = XCAR (rest);
655 Lisp_Object from, to;
656 struct charset_conversion_spec spec;
658 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
659 signal_simple_error ("Invalid charset conversion spec", car);
660 from = Fget_charset (XCAR (car));
661 to = Fget_charset (XCAR (XCDR (car)));
662 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
663 signal_simple_error_2
664 ("Attempted conversion between different charset types",
666 spec.from_charset = from;
667 spec.to_charset = to;
669 Dynarr_add (store_here, spec);
673 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
674 specs, return the equivalent as the Lisp programmer would see it.
676 If LOAD_HERE is 0, return Qnil. */
679 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
686 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
688 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
689 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
692 return Fnreverse (result);
697 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
698 Register symbol NAME as a coding system.
700 TYPE describes the conversion method used and should be one of
703 Automatic conversion. XEmacs attempts to detect the coding system
706 No conversion. Use this for binary files and such. On output,
707 graphic characters that are not in ASCII or Latin-1 will be
708 replaced by a ?. (For a no-conversion-encoded buffer, these
709 characters will only be present if you explicitly insert them.)
711 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
713 ISO 10646 UCS-4 encoding.
715 ISO 10646 UTF-8 encoding.
717 Any ISO2022-compliant encoding. Among other things, this includes
718 JIS (the Japanese encoding commonly used for e-mail), EUC (the
719 standard Unix encoding for Japanese and other languages), and
720 Compound Text (the encoding used in X11). You can specify more
721 specific information about the conversion with the FLAGS argument.
723 Big5 (the encoding commonly used for Taiwanese).
725 The conversion is performed using a user-written pseudo-code
726 program. CCL (Code Conversion Language) is the name of this
729 Write out or read in the raw contents of the memory representing
730 the buffer's text. This is primarily useful for debugging
731 purposes, and is only enabled when XEmacs has been compiled with
732 DEBUG_XEMACS defined (via the --debug configure option).
733 WARNING: Reading in a file using 'internal conversion can result
734 in an internal inconsistency in the memory representing a
735 buffer's text, which will produce unpredictable results and may
736 cause XEmacs to crash. Under normal circumstances you should
737 never use 'internal conversion.
739 DOC-STRING is a string describing the coding system.
741 PROPS is a property list, describing the specific nature of the
742 character set. Recognized properties are:
745 String to be displayed in the modeline when this coding system is
749 End-of-line conversion to be used. It should be one of
752 Automatically detect the end-of-line type (LF, CRLF,
753 or CR). Also generate subsidiary coding systems named
754 `NAME-unix', `NAME-dos', and `NAME-mac', that are
755 identical to this coding system but have an EOL-TYPE
756 value of 'lf, 'crlf, and 'cr, respectively.
758 The end of a line is marked externally using ASCII LF.
759 Since this is also the way that XEmacs represents an
760 end-of-line internally, specifying this option results
761 in no end-of-line conversion. This is the standard
762 format for Unix text files.
764 The end of a line is marked externally using ASCII
765 CRLF. This is the standard format for MS-DOS text
768 The end of a line is marked externally using ASCII CR.
769 This is the standard format for Macintosh text files.
771 Automatically detect the end-of-line type but do not
772 generate subsidiary coding systems. (This value is
773 converted to nil when stored internally, and
774 `coding-system-property' will return nil.)
776 'post-read-conversion
777 Function called after a file has been read in, to perform the
778 decoding. Called with two arguments, BEG and END, denoting
779 a region of the current buffer to be decoded.
781 'pre-write-conversion
782 Function called before a file is written out, to perform the
783 encoding. Called with two arguments, BEG and END, denoting
784 a region of the current buffer to be encoded.
787 The following additional properties are recognized if TYPE is 'iso2022:
793 The character set initially designated to the G0 - G3 registers.
794 The value should be one of
796 -- A charset object (designate that character set)
797 -- nil (do not ever use this register)
798 -- t (no character set is initially designated to
799 the register, but may be later on; this automatically
800 sets the corresponding `force-g*-on-output' property)
806 If non-nil, send an explicit designation sequence on output before
807 using the specified register.
810 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
811 "ESC $ B" on output in place of the full designation sequences
812 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
815 If non-nil, don't designate ASCII to G0 at each end of line on output.
816 Setting this to non-nil also suppresses other state-resetting that
817 normally happens at the end of a line.
820 If non-nil, don't designate ASCII to G0 before control chars on output.
823 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
827 If non-nil, use locking-shift (SO/SI) instead of single-shift
828 or designation by escape sequence.
831 If non-nil, don't use ISO6429's direction specification.
834 If non-nil, literal control characters that are the same as
835 the beginning of a recognized ISO2022 or ISO6429 escape sequence
836 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
837 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
838 so that they can be properly distinguished from an escape sequence.
839 (Note that doing this results in a non-portable encoding.) This
840 encoding flag is used for byte-compiled files. Note that ESC
841 is a good choice for a quoting character because there are no
842 escape sequences whose second byte is a character from the Control-0
843 or Control-1 character sets; this is explicitly disallowed by the
846 'input-charset-conversion
847 A list of conversion specifications, specifying conversion of
848 characters in one charset to another when decoding is performed.
849 Each specification is a list of two elements: the source charset,
850 and the destination charset.
852 'output-charset-conversion
853 A list of conversion specifications, specifying conversion of
854 characters in one charset to another when encoding is performed.
855 The form of each specification is the same as for
856 'input-charset-conversion.
859 The following additional properties are recognized (and required)
863 CCL program used for decoding (converting to internal format).
866 CCL program used for encoding (converting to external format).
868 (name, type, doc_string, props))
870 Lisp_Coding_System *codesys;
871 Lisp_Object rest, key, value;
872 enum coding_system_type ty;
873 int need_to_setup_eol_systems = 1;
875 /* Convert type to constant */
876 if (NILP (type) || EQ (type, Qundecided))
877 { ty = CODESYS_AUTODETECT; }
879 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
880 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
881 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
882 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
883 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
884 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
886 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
888 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
891 signal_simple_error ("Invalid coding system type", type);
895 codesys = allocate_coding_system (ty, name);
897 if (NILP (doc_string))
898 doc_string = build_string ("");
900 CHECK_STRING (doc_string);
901 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
903 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
905 if (EQ (key, Qmnemonic))
908 CHECK_STRING (value);
909 CODING_SYSTEM_MNEMONIC (codesys) = value;
912 else if (EQ (key, Qeol_type))
914 need_to_setup_eol_systems = NILP (value);
917 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
920 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
921 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
923 else if (ty == CODESYS_ISO2022)
925 #define FROB_INITIAL_CHARSET(charset_num) \
926 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
927 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
929 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
930 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
931 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
932 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
934 #define FROB_FORCE_CHARSET(charset_num) \
935 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
937 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
938 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
939 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
940 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
942 #define FROB_BOOLEAN_PROPERTY(prop) \
943 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
945 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
946 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
947 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
948 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
949 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
950 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
951 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
953 else if (EQ (key, Qinput_charset_conversion))
955 codesys->iso2022.input_conv =
956 Dynarr_new (charset_conversion_spec);
957 parse_charset_conversion_specs (codesys->iso2022.input_conv,
960 else if (EQ (key, Qoutput_charset_conversion))
962 codesys->iso2022.output_conv =
963 Dynarr_new (charset_conversion_spec);
964 parse_charset_conversion_specs (codesys->iso2022.output_conv,
968 signal_simple_error ("Unrecognized property", key);
970 else if (EQ (type, Qccl))
972 if (EQ (key, Qdecode))
974 CHECK_VECTOR (value);
975 CODING_SYSTEM_CCL_DECODE (codesys) = value;
977 else if (EQ (key, Qencode))
979 CHECK_VECTOR (value);
980 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
983 signal_simple_error ("Unrecognized property", key);
987 signal_simple_error ("Unrecognized property", key);
990 if (need_to_setup_eol_systems)
991 setup_eol_coding_systems (codesys);
994 Lisp_Object codesys_obj;
995 XSETCODING_SYSTEM (codesys_obj, codesys);
996 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1001 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1002 Copy OLD-CODING-SYSTEM to NEW-NAME.
1003 If NEW-NAME does not name an existing coding system, a new one will
1006 (old_coding_system, new_name))
1008 Lisp_Object new_coding_system;
1009 old_coding_system = Fget_coding_system (old_coding_system);
1010 new_coding_system = Ffind_coding_system (new_name);
1011 if (NILP (new_coding_system))
1013 XSETCODING_SYSTEM (new_coding_system,
1014 allocate_coding_system
1015 (XCODING_SYSTEM_TYPE (old_coding_system),
1017 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1021 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1022 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1023 memcpy (((char *) to ) + sizeof (to->header),
1024 ((char *) from) + sizeof (from->header),
1025 sizeof (*from) - sizeof (from->header));
1026 to->name = new_name;
1028 return new_coding_system;
1031 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1032 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1034 (alias, coding_system))
1036 CHECK_SYMBOL (alias);
1037 if (!NILP (Ffind_coding_system (alias)))
1038 signal_simple_error ("Symbol already names a coding system", alias);
1039 coding_system = Fget_coding_system (coding_system);
1040 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1042 /* Set up aliases for subsidiaries. */
1043 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1046 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1047 #define FROB(type, name) \
1049 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1050 if (!NILP (subsidiary)) \
1051 Fdefine_coding_system_alias \
1052 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1055 FROB (CRLF, "-dos");
1059 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1060 but it doesn't look intentional, so I'd rather return something
1061 meaningful or nothing at all. */
1066 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1068 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1069 Lisp_Object new_coding_system;
1071 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1072 return coding_system;
1076 case EOL_AUTODETECT: return coding_system;
1077 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1078 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1079 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1083 return NILP (new_coding_system) ? coding_system : new_coding_system;
1086 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1087 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1089 (coding_system, eol_type))
1091 coding_system = Fget_coding_system (coding_system);
1093 return subsidiary_coding_system (coding_system,
1094 symbol_to_eol_type (eol_type));
1098 /************************************************************************/
1099 /* Coding system accessors */
1100 /************************************************************************/
1102 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1103 Return the doc string for CODING-SYSTEM.
1107 coding_system = Fget_coding_system (coding_system);
1108 return XCODING_SYSTEM_DOC_STRING (coding_system);
1111 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1112 Return the type of CODING-SYSTEM.
1116 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1119 case CODESYS_AUTODETECT: return Qundecided;
1121 case CODESYS_SHIFT_JIS: return Qshift_jis;
1122 case CODESYS_ISO2022: return Qiso2022;
1123 case CODESYS_BIG5: return Qbig5;
1124 case CODESYS_UCS4: return Qucs4;
1125 case CODESYS_UTF8: return Qutf8;
1126 case CODESYS_CCL: return Qccl;
1128 case CODESYS_NO_CONVERSION: return Qno_conversion;
1130 case CODESYS_INTERNAL: return Qinternal;
1137 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1140 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1142 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1145 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1146 Return initial charset of CODING-SYSTEM designated to GNUM.
1149 (coding_system, gnum))
1151 coding_system = Fget_coding_system (coding_system);
1154 return coding_system_charset (coding_system, XINT (gnum));
1158 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1159 Return the PROP property of CODING-SYSTEM.
1161 (coding_system, prop))
1164 enum coding_system_type type;
1166 coding_system = Fget_coding_system (coding_system);
1167 CHECK_SYMBOL (prop);
1168 type = XCODING_SYSTEM_TYPE (coding_system);
1170 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1171 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1174 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1176 case CODESYS_PROP_ALL_OK:
1179 case CODESYS_PROP_ISO2022:
1180 if (type != CODESYS_ISO2022)
1182 ("Property only valid in ISO2022 coding systems",
1186 case CODESYS_PROP_CCL:
1187 if (type != CODESYS_CCL)
1189 ("Property only valid in CCL coding systems",
1199 signal_simple_error ("Unrecognized property", prop);
1201 if (EQ (prop, Qname))
1202 return XCODING_SYSTEM_NAME (coding_system);
1203 else if (EQ (prop, Qtype))
1204 return Fcoding_system_type (coding_system);
1205 else if (EQ (prop, Qdoc_string))
1206 return XCODING_SYSTEM_DOC_STRING (coding_system);
1207 else if (EQ (prop, Qmnemonic))
1208 return XCODING_SYSTEM_MNEMONIC (coding_system);
1209 else if (EQ (prop, Qeol_type))
1210 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1211 else if (EQ (prop, Qeol_lf))
1212 return XCODING_SYSTEM_EOL_LF (coding_system);
1213 else if (EQ (prop, Qeol_crlf))
1214 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1215 else if (EQ (prop, Qeol_cr))
1216 return XCODING_SYSTEM_EOL_CR (coding_system);
1217 else if (EQ (prop, Qpost_read_conversion))
1218 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1219 else if (EQ (prop, Qpre_write_conversion))
1220 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1222 else if (type == CODESYS_ISO2022)
1224 if (EQ (prop, Qcharset_g0))
1225 return coding_system_charset (coding_system, 0);
1226 else if (EQ (prop, Qcharset_g1))
1227 return coding_system_charset (coding_system, 1);
1228 else if (EQ (prop, Qcharset_g2))
1229 return coding_system_charset (coding_system, 2);
1230 else if (EQ (prop, Qcharset_g3))
1231 return coding_system_charset (coding_system, 3);
1233 #define FORCE_CHARSET(charset_num) \
1234 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1235 (coding_system, charset_num) ? Qt : Qnil)
1237 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1238 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1239 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1240 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1242 #define LISP_BOOLEAN(prop) \
1243 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1245 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1246 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1247 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1248 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1249 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1250 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1251 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1253 else if (EQ (prop, Qinput_charset_conversion))
1255 unparse_charset_conversion_specs
1256 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1257 else if (EQ (prop, Qoutput_charset_conversion))
1259 unparse_charset_conversion_specs
1260 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1264 else if (type == CODESYS_CCL)
1266 if (EQ (prop, Qdecode))
1267 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1268 else if (EQ (prop, Qencode))
1269 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1277 return Qnil; /* not reached */
1281 /************************************************************************/
1282 /* Coding category functions */
1283 /************************************************************************/
1286 decode_coding_category (Lisp_Object symbol)
1290 CHECK_SYMBOL (symbol);
1291 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1292 if (EQ (coding_category_symbol[i], symbol))
1295 signal_simple_error ("Unrecognized coding category", symbol);
1296 return 0; /* not reached */
1299 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1300 Return a list of all recognized coding categories.
1305 Lisp_Object list = Qnil;
1307 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1308 list = Fcons (coding_category_symbol[i], list);
1312 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1313 Change the priority order of the coding categories.
1314 LIST should be list of coding categories, in descending order of
1315 priority. Unspecified coding categories will be lower in priority
1316 than all specified ones, in the same relative order they were in
1321 int category_to_priority[CODING_CATEGORY_LAST + 1];
1325 /* First generate a list that maps coding categories to priorities. */
1327 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1328 category_to_priority[i] = -1;
1330 /* Highest priority comes from the specified list. */
1332 EXTERNAL_LIST_LOOP (rest, list)
1334 int cat = decode_coding_category (XCAR (rest));
1336 if (category_to_priority[cat] >= 0)
1337 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1338 category_to_priority[cat] = i++;
1341 /* Now go through the existing categories by priority to retrieve
1342 the categories not yet specified and preserve their priority
1344 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1346 int cat = fcd->coding_category_by_priority[j];
1347 if (category_to_priority[cat] < 0)
1348 category_to_priority[cat] = i++;
1351 /* Now we need to construct the inverse of the mapping we just
1354 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1355 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1357 /* Phew! That was confusing. */
1361 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1362 Return a list of coding categories in descending order of priority.
1367 Lisp_Object list = Qnil;
1369 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1370 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1375 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1376 Change the coding system associated with a coding category.
1378 (coding_category, coding_system))
1380 int cat = decode_coding_category (coding_category);
1382 coding_system = Fget_coding_system (coding_system);
1383 fcd->coding_category_system[cat] = coding_system;
1387 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1388 Return the coding system associated with a coding category.
1392 int cat = decode_coding_category (coding_category);
1393 Lisp_Object sys = fcd->coding_category_system[cat];
1396 return XCODING_SYSTEM_NAME (sys);
1401 /************************************************************************/
1402 /* Detecting the encoding of data */
1403 /************************************************************************/
1405 struct detection_state
1407 enum eol_type eol_type;
1443 struct iso2022_decoder iso;
1445 int high_byte_count;
1446 unsigned int saw_single_shift:1;
1459 acceptable_control_char_p (int c)
1463 /* Allow and ignore control characters that you might
1464 reasonably see in a text file */
1469 case 8: /* backspace */
1470 case 11: /* vertical tab */
1471 case 12: /* form feed */
1472 case 26: /* MS-DOS C-z junk */
1473 case 31: /* '^_' -- for info */
1481 mask_has_at_most_one_bit_p (int mask)
1483 /* Perhaps the only thing useful you learn from intensive Microsoft
1484 technical interviews */
1485 return (mask & (mask - 1)) == 0;
1488 static enum eol_type
1489 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1499 if (st->eol.just_saw_cr)
1501 else if (st->eol.seen_anything)
1504 else if (st->eol.just_saw_cr)
1507 st->eol.just_saw_cr = 1;
1509 st->eol.just_saw_cr = 0;
1510 st->eol.seen_anything = 1;
1513 return EOL_AUTODETECT;
1516 /* Attempt to determine the encoding and EOL type of the given text.
1517 Before calling this function for the first type, you must initialize
1518 st->eol_type as appropriate and initialize st->mask to ~0.
1520 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1523 st->mask holds the determined coding category mask, or ~0 if only
1524 ASCII has been seen so far.
1528 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1529 is present in st->mask
1530 1 == definitive answers are here for both st->eol_type and st->mask
1534 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1535 unsigned int n, int just_do_eol)
1539 if (st->eol_type == EOL_AUTODETECT)
1540 st->eol_type = detect_eol_type (st, src, n);
1543 return st->eol_type != EOL_AUTODETECT;
1545 if (!st->seen_non_ascii)
1547 for (; n; n--, src++)
1550 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1552 st->seen_non_ascii = 1;
1554 st->shift_jis.mask = ~0;
1558 st->iso2022.mask = ~0;
1568 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1569 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1570 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1571 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1572 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1573 st->big5.mask = detect_coding_big5 (st, src, n);
1574 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1575 st->utf8.mask = detect_coding_utf8 (st, src, n);
1576 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1577 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1580 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1581 | st->utf8.mask | st->ucs4.mask;
1584 int retval = mask_has_at_most_one_bit_p (st->mask);
1585 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1586 return retval && st->eol_type != EOL_AUTODETECT;
1591 coding_system_from_mask (int mask)
1595 /* If the file was entirely or basically ASCII, use the
1596 default value of `buffer-file-coding-system'. */
1597 Lisp_Object retval =
1598 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1601 retval = Ffind_coding_system (retval);
1605 (Qbad_variable, Qwarning,
1606 "Invalid `default-buffer-file-coding-system', set to nil");
1607 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1611 retval = Fget_coding_system (Qraw_text);
1619 mask = postprocess_iso2022_mask (mask);
1621 /* Look through the coding categories by priority and find
1622 the first one that is allowed. */
1623 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1625 cat = fcd->coding_category_by_priority[i];
1626 if ((mask & (1 << cat)) &&
1627 !NILP (fcd->coding_category_system[cat]))
1631 return fcd->coding_category_system[cat];
1633 return Fget_coding_system (Qraw_text);
1637 /* Given a seekable read stream and potential coding system and EOL type
1638 as specified, do any autodetection that is called for. If the
1639 coding system and/or EOL type are not `autodetect', they will be left
1640 alone; but this function will never return an autodetect coding system
1643 This function does not automatically fetch subsidiary coding systems;
1644 that should be unnecessary with the explicit eol-type argument. */
1646 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1649 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1650 enum eol_type *eol_type_in_out)
1652 struct detection_state decst;
1654 if (*eol_type_in_out == EOL_AUTODETECT)
1655 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1658 decst.eol_type = *eol_type_in_out;
1661 /* If autodetection is called for, do it now. */
1662 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1663 || *eol_type_in_out == EOL_AUTODETECT)
1666 Lisp_Object coding_system = Qnil;
1668 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1671 /* Look for initial "-*-"; mode line prefix */
1673 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1678 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1680 Extbyte *local_vars_beg = p + 3;
1681 /* Look for final "-*-"; mode line suffix */
1682 for (p = local_vars_beg,
1683 scan_end = buf + nread - LENGTH ("-*-");
1688 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1690 Extbyte *suffix = p;
1691 /* Look for "coding:" */
1692 for (p = local_vars_beg,
1693 scan_end = suffix - LENGTH ("coding:?");
1696 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1697 && (p == local_vars_beg
1698 || (*(p-1) == ' ' ||
1704 p += LENGTH ("coding:");
1705 while (*p == ' ' || *p == '\t') p++;
1707 /* Get coding system name */
1708 save = *suffix; *suffix = '\0';
1709 /* Characters valid in a MIME charset name (rfc 1521),
1710 and in a Lisp symbol name. */
1711 n = strspn ( (char *) p,
1712 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1713 "abcdefghijklmnopqrstuvwxyz"
1719 save = p[n]; p[n] = '\0';
1721 Ffind_coding_system (intern ((char *) p));
1731 if (NILP (coding_system))
1734 if (detect_coding_type (&decst, buf, nread,
1735 XCODING_SYSTEM_TYPE (*codesys_in_out)
1736 != CODESYS_AUTODETECT))
1738 nread = Lstream_read (stream, buf, sizeof (buf));
1744 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1745 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1748 if (detect_coding_type (&decst, buf, nread, 1))
1750 nread = Lstream_read (stream, buf, sizeof (buf));
1756 *eol_type_in_out = decst.eol_type;
1757 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1759 if (NILP (coding_system))
1760 *codesys_in_out = coding_system_from_mask (decst.mask);
1762 *codesys_in_out = coding_system;
1766 /* If we absolutely can't determine the EOL type, just assume LF. */
1767 if (*eol_type_in_out == EOL_AUTODETECT)
1768 *eol_type_in_out = EOL_LF;
1770 Lstream_rewind (stream);
1773 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1774 Detect coding system of the text in the region between START and END.
1775 Returned a list of possible coding systems ordered by priority.
1776 If only ASCII characters are found, it returns 'undecided or one of
1777 its subsidiary coding systems according to a detected end-of-line
1778 type. Optional arg BUFFER defaults to the current buffer.
1780 (start, end, buffer))
1782 Lisp_Object val = Qnil;
1783 struct buffer *buf = decode_buffer (buffer, 0);
1785 Lisp_Object instream, lb_instream;
1786 Lstream *istr, *lb_istr;
1787 struct detection_state decst;
1788 struct gcpro gcpro1, gcpro2;
1790 get_buffer_range_char (buf, start, end, &b, &e, 0);
1791 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1792 lb_istr = XLSTREAM (lb_instream);
1793 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1794 istr = XLSTREAM (instream);
1795 GCPRO2 (instream, lb_instream);
1797 decst.eol_type = EOL_AUTODETECT;
1801 unsigned char random_buffer[4096];
1802 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1806 if (detect_coding_type (&decst, random_buffer, nread, 0))
1810 if (decst.mask == ~0)
1811 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1819 decst.mask = postprocess_iso2022_mask (decst.mask);
1821 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1823 int sys = fcd->coding_category_by_priority[i];
1824 if (decst.mask & (1 << sys))
1826 Lisp_Object codesys = fcd->coding_category_system[sys];
1827 if (!NILP (codesys))
1828 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1829 val = Fcons (codesys, val);
1833 Lstream_close (istr);
1835 Lstream_delete (istr);
1836 Lstream_delete (lb_istr);
1841 /************************************************************************/
1842 /* Converting to internal Mule format ("decoding") */
1843 /************************************************************************/
1845 /* A decoding stream is a stream used for decoding text (i.e.
1846 converting from some external format to internal format).
1847 The decoding-stream object keeps track of the actual coding
1848 stream, the stream that is at the other end, and data that
1849 needs to be persistent across the lifetime of the stream. */
1851 /* Handle the EOL stuff related to just-read-in character C.
1852 EOL_TYPE is the EOL type of the coding stream.
1853 FLAGS is the current value of FLAGS in the coding stream, and may
1854 be modified by this macro. (The macro only looks at the
1855 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1856 bytes are to be written. You need to also define a local goto
1857 label "label_continue_loop" that is at the end of the main
1858 character-reading loop.
1860 If C is a CR character, then this macro handles it entirely and
1861 jumps to label_continue_loop. Otherwise, this macro does not add
1862 anything to DST, and continues normally. You should continue
1863 processing C normally after this macro. */
1865 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1869 if (eol_type == EOL_CR) \
1870 Dynarr_add (dst, '\n'); \
1871 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1872 Dynarr_add (dst, c); \
1874 flags |= CODING_STATE_CR; \
1875 goto label_continue_loop; \
1877 else if (flags & CODING_STATE_CR) \
1878 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1880 Dynarr_add (dst, '\r'); \
1881 flags &= ~CODING_STATE_CR; \
1885 /* C should be a binary character in the range 0 - 255; convert
1886 to internal format and add to Dynarr DST. */
1889 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1891 if (BYTE_ASCII_P (c)) \
1892 Dynarr_add (dst, c); \
1895 Dynarr_add (dst, (c >> 6) | 0xc0); \
1896 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1901 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1905 Dynarr_add (dst, c);
1907 else if ( c <= 0x7ff )
1909 Dynarr_add (dst, (c >> 6) | 0xc0);
1910 Dynarr_add (dst, (c & 0x3f) | 0x80);
1912 else if ( c <= 0xffff )
1914 Dynarr_add (dst, (c >> 12) | 0xe0);
1915 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1916 Dynarr_add (dst, (c & 0x3f) | 0x80);
1918 else if ( c <= 0x1fffff )
1920 Dynarr_add (dst, (c >> 18) | 0xf0);
1921 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1922 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1923 Dynarr_add (dst, (c & 0x3f) | 0x80);
1925 else if ( c <= 0x3ffffff )
1927 Dynarr_add (dst, (c >> 24) | 0xf8);
1928 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1929 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1930 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1931 Dynarr_add (dst, (c & 0x3f) | 0x80);
1935 Dynarr_add (dst, (c >> 30) | 0xfc);
1936 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1937 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1938 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1939 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1940 Dynarr_add (dst, (c & 0x3f) | 0x80);
1944 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1946 if (BYTE_ASCII_P (c)) \
1947 Dynarr_add (dst, c); \
1948 else if (BYTE_C1_P (c)) \
1950 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1951 Dynarr_add (dst, c + 0x20); \
1955 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1956 Dynarr_add (dst, c); \
1961 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1965 DECODE_ADD_BINARY_CHAR (ch, dst); \
1970 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1972 if (flags & CODING_STATE_END) \
1974 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1975 if (flags & CODING_STATE_CR) \
1976 Dynarr_add (dst, '\r'); \
1980 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1982 struct decoding_stream
1984 /* Coding system that governs the conversion. */
1985 Lisp_Coding_System *codesys;
1987 /* Stream that we read the encoded data from or
1988 write the decoded data to. */
1991 /* If we are reading, then we can return only a fixed amount of
1992 data, so if the conversion resulted in too much data, we store it
1993 here for retrieval the next time around. */
1994 unsigned_char_dynarr *runoff;
1996 /* FLAGS holds flags indicating the current state of the decoding.
1997 Some of these flags are dependent on the coding system. */
2000 /* CH holds a partially built-up character. Since we only deal
2001 with one- and two-byte characters at the moment, we only use
2002 this to store the first byte of a two-byte character. */
2005 /* EOL_TYPE specifies the type of end-of-line conversion that
2006 currently applies. We need to keep this separate from the
2007 EOL type stored in CODESYS because the latter might indicate
2008 automatic EOL-type detection while the former will always
2009 indicate a particular EOL type. */
2010 enum eol_type eol_type;
2012 /* Additional ISO2022 information. We define the structure above
2013 because it's also needed by the detection routines. */
2014 struct iso2022_decoder iso2022;
2016 /* Additional information (the state of the running CCL program)
2017 used by the CCL decoder. */
2018 struct ccl_program ccl;
2020 /* counter for UTF-8 or UCS-4 */
2021 unsigned char counter;
2023 struct detection_state decst;
2026 static ssize_t decoding_reader (Lstream *stream,
2027 unsigned char *data, size_t size);
2028 static ssize_t decoding_writer (Lstream *stream,
2029 CONST unsigned char *data, size_t size);
2030 static int decoding_rewinder (Lstream *stream);
2031 static int decoding_seekable_p (Lstream *stream);
2032 static int decoding_flusher (Lstream *stream);
2033 static int decoding_closer (Lstream *stream);
2035 static Lisp_Object decoding_marker (Lisp_Object stream);
2037 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2038 sizeof (struct decoding_stream));
2041 decoding_marker (Lisp_Object stream)
2043 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2044 Lisp_Object str_obj;
2046 /* We do not need to mark the coding systems or charsets stored
2047 within the stream because they are stored in a global list
2048 and automatically marked. */
2050 XSETLSTREAM (str_obj, str);
2051 mark_object (str_obj);
2052 if (str->imp->marker)
2053 return (str->imp->marker) (str_obj);
2058 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2059 so we read data from the other end, decode it, and store it into DATA. */
2062 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2064 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2065 unsigned char *orig_data = data;
2067 int error_occurred = 0;
2069 /* We need to interface to mule_decode(), which expects to take some
2070 amount of data and store the result into a Dynarr. We have
2071 mule_decode() store into str->runoff, and take data from there
2074 /* We loop until we have enough data, reading chunks from the other
2075 end and decoding it. */
2078 /* Take data from the runoff if we can. Make sure to take at
2079 most SIZE bytes, and delete the data from the runoff. */
2080 if (Dynarr_length (str->runoff) > 0)
2082 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2083 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2084 Dynarr_delete_many (str->runoff, 0, chunk);
2090 break; /* No more room for data */
2092 if (str->flags & CODING_STATE_END)
2093 /* This means that on the previous iteration, we hit the EOF on
2094 the other end. We loop once more so that mule_decode() can
2095 output any final stuff it may be holding, or any "go back
2096 to a sane state" escape sequences. (This latter makes sense
2097 during encoding.) */
2100 /* Exhausted the runoff, so get some more. DATA has at least
2101 SIZE bytes left of storage in it, so it's OK to read directly
2102 into it. (We'll be overwriting above, after we've decoded it
2103 into the runoff.) */
2104 read_size = Lstream_read (str->other_end, data, size);
2111 /* There might be some more end data produced in the translation.
2112 See the comment above. */
2113 str->flags |= CODING_STATE_END;
2114 mule_decode (stream, data, str->runoff, read_size);
2117 if (data - orig_data == 0)
2118 return error_occurred ? -1 : 0;
2120 return data - orig_data;
2124 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2126 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2129 /* Decode all our data into the runoff, and then attempt to write
2130 it all out to the other end. Remove whatever chunk we succeeded
2132 mule_decode (stream, data, str->runoff, size);
2133 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2134 Dynarr_length (str->runoff));
2136 Dynarr_delete_many (str->runoff, 0, retval);
2137 /* Do NOT return retval. The return value indicates how much
2138 of the incoming data was written, not how many bytes were
2144 reset_decoding_stream (struct decoding_stream *str)
2147 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2149 Lisp_Object coding_system;
2150 XSETCODING_SYSTEM (coding_system, str->codesys);
2151 reset_iso2022 (coding_system, &str->iso2022);
2153 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2155 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2159 str->flags = str->ch = 0;
2163 decoding_rewinder (Lstream *stream)
2165 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2166 reset_decoding_stream (str);
2167 Dynarr_reset (str->runoff);
2168 return Lstream_rewind (str->other_end);
2172 decoding_seekable_p (Lstream *stream)
2174 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2175 return Lstream_seekable_p (str->other_end);
2179 decoding_flusher (Lstream *stream)
2181 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2182 return Lstream_flush (str->other_end);
2186 decoding_closer (Lstream *stream)
2188 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2189 if (stream->flags & LSTREAM_FL_WRITE)
2191 str->flags |= CODING_STATE_END;
2192 decoding_writer (stream, 0, 0);
2194 Dynarr_free (str->runoff);
2196 #ifdef ENABLE_COMPOSITE_CHARS
2197 if (str->iso2022.composite_chars)
2198 Dynarr_free (str->iso2022.composite_chars);
2201 return Lstream_close (str->other_end);
2205 decoding_stream_coding_system (Lstream *stream)
2207 Lisp_Object coding_system;
2208 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2210 XSETCODING_SYSTEM (coding_system, str->codesys);
2211 return subsidiary_coding_system (coding_system, str->eol_type);
2215 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2217 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2218 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2220 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2221 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2222 reset_decoding_stream (str);
2225 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2226 stream for writing, no automatic code detection will be performed.
2227 The reason for this is that automatic code detection requires a
2228 seekable input. Things will also fail if you open a decoding
2229 stream for reading using a non-fully-specified coding system and
2230 a non-seekable input stream. */
2233 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2236 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2237 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2241 str->other_end = stream;
2242 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2243 str->eol_type = EOL_AUTODETECT;
2244 if (!strcmp (mode, "r")
2245 && Lstream_seekable_p (stream))
2246 /* We can determine the coding system now. */
2247 determine_real_coding_system (stream, &codesys, &str->eol_type);
2248 set_decoding_stream_coding_system (lstr, codesys);
2249 str->decst.eol_type = str->eol_type;
2250 str->decst.mask = ~0;
2251 XSETLSTREAM (obj, lstr);
2256 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2258 return make_decoding_stream_1 (stream, codesys, "r");
2262 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2264 return make_decoding_stream_1 (stream, codesys, "w");
2267 /* Note: the decode_coding_* functions all take the same
2268 arguments as mule_decode(), which is to say some SRC data of
2269 size N, which is to be stored into dynamic array DST.
2270 DECODING is the stream within which the decoding is
2271 taking place, but no data is actually read from or
2272 written to that stream; that is handled in decoding_reader()
2273 or decoding_writer(). This allows the same functions to
2274 be used for both reading and writing. */
2277 mule_decode (Lstream *decoding, CONST unsigned char *src,
2278 unsigned_char_dynarr *dst, unsigned int n)
2280 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2282 /* If necessary, do encoding-detection now. We do this when
2283 we're a writing stream or a non-seekable reading stream,
2284 meaning that we can't just process the whole input,
2285 rewind, and start over. */
2287 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2288 str->eol_type == EOL_AUTODETECT)
2290 Lisp_Object codesys;
2292 XSETCODING_SYSTEM (codesys, str->codesys);
2293 detect_coding_type (&str->decst, src, n,
2294 CODING_SYSTEM_TYPE (str->codesys) !=
2295 CODESYS_AUTODETECT);
2296 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2297 str->decst.mask != ~0)
2298 /* #### This is cheesy. What we really ought to do is
2299 buffer up a certain amount of data so as to get a
2300 less random result. */
2301 codesys = coding_system_from_mask (str->decst.mask);
2302 str->eol_type = str->decst.eol_type;
2303 if (XCODING_SYSTEM (codesys) != str->codesys)
2305 /* Preserve the CODING_STATE_END flag in case it was set.
2306 If we erase it, bad things might happen. */
2307 int was_end = str->flags & CODING_STATE_END;
2308 set_decoding_stream_coding_system (decoding, codesys);
2310 str->flags |= CODING_STATE_END;
2314 switch (CODING_SYSTEM_TYPE (str->codesys))
2317 case CODESYS_INTERNAL:
2318 Dynarr_add_many (dst, src, n);
2321 case CODESYS_AUTODETECT:
2322 /* If we got this far and still haven't decided on the coding
2323 system, then do no conversion. */
2324 case CODESYS_NO_CONVERSION:
2325 decode_coding_no_conversion (decoding, src, dst, n);
2328 case CODESYS_SHIFT_JIS:
2329 decode_coding_sjis (decoding, src, dst, n);
2332 decode_coding_big5 (decoding, src, dst, n);
2335 decode_coding_ucs4 (decoding, src, dst, n);
2338 decode_coding_utf8 (decoding, src, dst, n);
2341 str->ccl.last_block = str->flags & CODING_STATE_END;
2342 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2344 case CODESYS_ISO2022:
2345 decode_coding_iso2022 (decoding, src, dst, n);
2353 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2354 Decode the text between START and END which is encoded in CODING-SYSTEM.
2355 This is useful if you've read in encoded text from a file without decoding
2356 it (e.g. you read in a JIS-formatted file but used the `binary' or
2357 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2358 Return length of decoded text.
2359 BUFFER defaults to the current buffer if unspecified.
2361 (start, end, coding_system, buffer))
2364 struct buffer *buf = decode_buffer (buffer, 0);
2365 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2366 Lstream *istr, *ostr;
2367 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2369 get_buffer_range_char (buf, start, end, &b, &e, 0);
2371 barf_if_buffer_read_only (buf, b, e);
2373 coding_system = Fget_coding_system (coding_system);
2374 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2375 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2376 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2378 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2379 Fget_coding_system (Qbinary));
2380 istr = XLSTREAM (instream);
2381 ostr = XLSTREAM (outstream);
2382 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2384 /* The chain of streams looks like this:
2386 [BUFFER] <----- send through
2387 ------> [ENCODE AS BINARY]
2388 ------> [DECODE AS SPECIFIED]
2394 char tempbuf[1024]; /* some random amount */
2395 Bufpos newpos, even_newer_pos;
2396 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2397 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2401 newpos = lisp_buffer_stream_startpos (istr);
2402 Lstream_write (ostr, tempbuf, size_in_bytes);
2403 even_newer_pos = lisp_buffer_stream_startpos (istr);
2404 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2407 Lstream_close (istr);
2408 Lstream_close (ostr);
2410 Lstream_delete (istr);
2411 Lstream_delete (ostr);
2412 Lstream_delete (XLSTREAM (de_outstream));
2413 Lstream_delete (XLSTREAM (lb_outstream));
2418 /************************************************************************/
2419 /* Converting to an external encoding ("encoding") */
2420 /************************************************************************/
2422 /* An encoding stream is an output stream. When you create the
2423 stream, you specify the coding system that governs the encoding
2424 and another stream that the resulting encoded data is to be
2425 sent to, and then start sending data to it. */
2427 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2429 struct encoding_stream
2431 /* Coding system that governs the conversion. */
2432 Lisp_Coding_System *codesys;
2434 /* Stream that we read the encoded data from or
2435 write the decoded data to. */
2438 /* If we are reading, then we can return only a fixed amount of
2439 data, so if the conversion resulted in too much data, we store it
2440 here for retrieval the next time around. */
2441 unsigned_char_dynarr *runoff;
2443 /* FLAGS holds flags indicating the current state of the encoding.
2444 Some of these flags are dependent on the coding system. */
2447 /* CH holds a partially built-up character. Since we only deal
2448 with one- and two-byte characters at the moment, we only use
2449 this to store the first byte of a two-byte character. */
2452 /* Additional information used by the ISO2022 encoder. */
2455 /* CHARSET holds the character sets currently assigned to the G0
2456 through G3 registers. It is initialized from the array
2457 INITIAL_CHARSET in CODESYS. */
2458 Lisp_Object charset[4];
2460 /* Which registers are currently invoked into the left (GL) and
2461 right (GR) halves of the 8-bit encoding space? */
2462 int register_left, register_right;
2464 /* Whether we need to explicitly designate the charset in the
2465 G? register before using it. It is initialized from the
2466 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2467 unsigned char force_charset_on_output[4];
2469 /* Other state variables that need to be preserved across
2471 Lisp_Object current_charset;
2473 int current_char_boundary;
2476 /* Additional information (the state of the running CCL program)
2477 used by the CCL encoder. */
2478 struct ccl_program ccl;
2482 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2483 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2485 static int encoding_rewinder (Lstream *stream);
2486 static int encoding_seekable_p (Lstream *stream);
2487 static int encoding_flusher (Lstream *stream);
2488 static int encoding_closer (Lstream *stream);
2490 static Lisp_Object encoding_marker (Lisp_Object stream);
2492 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2493 sizeof (struct encoding_stream));
2496 encoding_marker (Lisp_Object stream)
2498 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2499 Lisp_Object str_obj;
2501 /* We do not need to mark the coding systems or charsets stored
2502 within the stream because they are stored in a global list
2503 and automatically marked. */
2505 XSETLSTREAM (str_obj, str);
2506 mark_object (str_obj);
2507 if (str->imp->marker)
2508 return (str->imp->marker) (str_obj);
2513 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2514 so we read data from the other end, encode it, and store it into DATA. */
2517 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2519 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2520 unsigned char *orig_data = data;
2522 int error_occurred = 0;
2524 /* We need to interface to mule_encode(), which expects to take some
2525 amount of data and store the result into a Dynarr. We have
2526 mule_encode() store into str->runoff, and take data from there
2529 /* We loop until we have enough data, reading chunks from the other
2530 end and encoding it. */
2533 /* Take data from the runoff if we can. Make sure to take at
2534 most SIZE bytes, and delete the data from the runoff. */
2535 if (Dynarr_length (str->runoff) > 0)
2537 int chunk = min ((int) size, Dynarr_length (str->runoff));
2538 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2539 Dynarr_delete_many (str->runoff, 0, chunk);
2545 break; /* No more room for data */
2547 if (str->flags & CODING_STATE_END)
2548 /* This means that on the previous iteration, we hit the EOF on
2549 the other end. We loop once more so that mule_encode() can
2550 output any final stuff it may be holding, or any "go back
2551 to a sane state" escape sequences. (This latter makes sense
2552 during encoding.) */
2555 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2556 left of storage in it, so it's OK to read directly into it.
2557 (We'll be overwriting above, after we've encoded it into the
2559 read_size = Lstream_read (str->other_end, data, size);
2566 /* There might be some more end data produced in the translation.
2567 See the comment above. */
2568 str->flags |= CODING_STATE_END;
2569 mule_encode (stream, data, str->runoff, read_size);
2572 if (data == orig_data)
2573 return error_occurred ? -1 : 0;
2575 return data - orig_data;
2579 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2581 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2584 /* Encode all our data into the runoff, and then attempt to write
2585 it all out to the other end. Remove whatever chunk we succeeded
2587 mule_encode (stream, data, str->runoff, size);
2588 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2589 Dynarr_length (str->runoff));
2591 Dynarr_delete_many (str->runoff, 0, retval);
2592 /* Do NOT return retval. The return value indicates how much
2593 of the incoming data was written, not how many bytes were
2599 reset_encoding_stream (struct encoding_stream *str)
2602 switch (CODING_SYSTEM_TYPE (str->codesys))
2604 case CODESYS_ISO2022:
2608 for (i = 0; i < 4; i++)
2610 str->iso2022.charset[i] =
2611 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2612 str->iso2022.force_charset_on_output[i] =
2613 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2615 str->iso2022.register_left = 0;
2616 str->iso2022.register_right = 1;
2617 str->iso2022.current_charset = Qnil;
2618 str->iso2022.current_half = 0;
2620 str->iso2022.current_char_boundary = 0;
2622 str->iso2022.current_char_boundary = 1;
2627 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2634 str->flags = str->ch = 0;
2638 encoding_rewinder (Lstream *stream)
2640 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2641 reset_encoding_stream (str);
2642 Dynarr_reset (str->runoff);
2643 return Lstream_rewind (str->other_end);
2647 encoding_seekable_p (Lstream *stream)
2649 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2650 return Lstream_seekable_p (str->other_end);
2654 encoding_flusher (Lstream *stream)
2656 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2657 return Lstream_flush (str->other_end);
2661 encoding_closer (Lstream *stream)
2663 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2664 if (stream->flags & LSTREAM_FL_WRITE)
2666 str->flags |= CODING_STATE_END;
2667 encoding_writer (stream, 0, 0);
2669 Dynarr_free (str->runoff);
2670 return Lstream_close (str->other_end);
2674 encoding_stream_coding_system (Lstream *stream)
2676 Lisp_Object coding_system;
2677 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2679 XSETCODING_SYSTEM (coding_system, str->codesys);
2680 return coding_system;
2684 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2686 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2687 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2689 reset_encoding_stream (str);
2693 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2696 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2697 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2701 str->runoff = Dynarr_new (unsigned_char);
2702 str->other_end = stream;
2703 set_encoding_stream_coding_system (lstr, codesys);
2704 XSETLSTREAM (obj, lstr);
2709 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2711 return make_encoding_stream_1 (stream, codesys, "r");
2715 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2717 return make_encoding_stream_1 (stream, codesys, "w");
2720 /* Convert N bytes of internally-formatted data stored in SRC to an
2721 external format, according to the encoding stream ENCODING.
2722 Store the encoded data into DST. */
2725 mule_encode (Lstream *encoding, CONST unsigned char *src,
2726 unsigned_char_dynarr *dst, unsigned int n)
2728 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2730 switch (CODING_SYSTEM_TYPE (str->codesys))
2733 case CODESYS_INTERNAL:
2734 Dynarr_add_many (dst, src, n);
2737 case CODESYS_AUTODETECT:
2738 /* If we got this far and still haven't decided on the coding
2739 system, then do no conversion. */
2740 case CODESYS_NO_CONVERSION:
2741 encode_coding_no_conversion (encoding, src, dst, n);
2744 case CODESYS_SHIFT_JIS:
2745 encode_coding_sjis (encoding, src, dst, n);
2748 encode_coding_big5 (encoding, src, dst, n);
2751 encode_coding_ucs4 (encoding, src, dst, n);
2754 encode_coding_utf8 (encoding, src, dst, n);
2757 str->ccl.last_block = str->flags & CODING_STATE_END;
2758 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2760 case CODESYS_ISO2022:
2761 encode_coding_iso2022 (encoding, src, dst, n);
2769 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2770 Encode the text between START and END using CODING-SYSTEM.
2771 This will, for example, convert Japanese characters into stuff such as
2772 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2773 text. BUFFER defaults to the current buffer if unspecified.
2775 (start, end, coding_system, buffer))
2778 struct buffer *buf = decode_buffer (buffer, 0);
2779 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2780 Lstream *istr, *ostr;
2781 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2783 get_buffer_range_char (buf, start, end, &b, &e, 0);
2785 barf_if_buffer_read_only (buf, b, e);
2787 coding_system = Fget_coding_system (coding_system);
2788 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2789 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2790 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2791 Fget_coding_system (Qbinary));
2792 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2794 istr = XLSTREAM (instream);
2795 ostr = XLSTREAM (outstream);
2796 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2797 /* The chain of streams looks like this:
2799 [BUFFER] <----- send through
2800 ------> [ENCODE AS SPECIFIED]
2801 ------> [DECODE AS BINARY]
2806 char tempbuf[1024]; /* some random amount */
2807 Bufpos newpos, even_newer_pos;
2808 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2809 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2813 newpos = lisp_buffer_stream_startpos (istr);
2814 Lstream_write (ostr, tempbuf, size_in_bytes);
2815 even_newer_pos = lisp_buffer_stream_startpos (istr);
2816 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2822 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2823 Lstream_close (istr);
2824 Lstream_close (ostr);
2826 Lstream_delete (istr);
2827 Lstream_delete (ostr);
2828 Lstream_delete (XLSTREAM (de_outstream));
2829 Lstream_delete (XLSTREAM (lb_outstream));
2830 return make_int (retlen);
2836 /************************************************************************/
2837 /* Shift-JIS methods */
2838 /************************************************************************/
2840 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2841 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2842 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2843 encoded by "position-code + 0x80". A character of JISX0208
2844 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2845 position-codes are divided and shifted so that it fit in the range
2848 --- CODE RANGE of Shift-JIS ---
2849 (character set) (range)
2851 JISX0201-Kana 0xA0 .. 0xDF
2852 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2853 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2854 -------------------------------
2858 /* Is this the first byte of a Shift-JIS two-byte char? */
2860 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2861 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2863 /* Is this the second byte of a Shift-JIS two-byte char? */
2865 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2866 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2868 #define BYTE_SJIS_KATAKANA_P(c) \
2869 ((c) >= 0xA1 && (c) <= 0xDF)
2872 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2880 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2882 if (st->shift_jis.in_second_byte)
2884 st->shift_jis.in_second_byte = 0;
2888 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2889 st->shift_jis.in_second_byte = 1;
2891 return CODING_CATEGORY_SHIFT_JIS_MASK;
2894 /* Convert Shift-JIS data to internal format. */
2897 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2898 unsigned_char_dynarr *dst, unsigned int n)
2901 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2902 unsigned int flags = str->flags;
2903 unsigned int ch = str->ch;
2904 eol_type_t eol_type = str->eol_type;
2912 /* Previous character was first byte of Shift-JIS Kanji char. */
2913 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2915 unsigned char e1, e2;
2917 DECODE_SJIS (ch, c, e1, e2);
2919 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2923 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2924 Dynarr_add (dst, e1);
2925 Dynarr_add (dst, e2);
2930 DECODE_ADD_BINARY_CHAR (ch, dst);
2931 DECODE_ADD_BINARY_CHAR (c, dst);
2937 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2938 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2940 else if (BYTE_SJIS_KATAKANA_P (c))
2943 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2946 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2947 Dynarr_add (dst, c);
2951 DECODE_ADD_BINARY_CHAR (c, dst);
2953 label_continue_loop:;
2956 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2962 /* Convert internally-formatted data to Shift-JIS. */
2965 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2966 unsigned_char_dynarr *dst, unsigned int n)
2969 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2970 unsigned int flags = str->flags;
2971 unsigned int ch = str->ch;
2972 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2974 unsigned char char_boundary = str->iso2022.current_char_boundary;
2981 switch (char_boundary)
2989 else if ( c >= 0xf8 )
2994 else if ( c >= 0xf0 )
2999 else if ( c >= 0xe0 )
3004 else if ( c >= 0xc0 )
3014 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3015 Dynarr_add (dst, '\r');
3016 if (eol_type != EOL_CR)
3017 Dynarr_add (dst, c);
3020 Dynarr_add (dst, c);
3025 ch = ( ch << 6 ) | ( c & 0x3f );
3027 Lisp_Object charset;
3028 unsigned int c1, c2, s1, s2;
3030 BREAKUP_CHAR (ch, charset, c1, c2);
3031 if (EQ(charset, Vcharset_katakana_jisx0201))
3033 Dynarr_add (dst, c1 | 0x80);
3035 else if (EQ(charset, Vcharset_japanese_jisx0208))
3037 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3038 Dynarr_add (dst, s1);
3039 Dynarr_add (dst, s2);
3045 ch = ( ch << 6 ) | ( c & 0x3f );
3051 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3052 Dynarr_add (dst, '\r');
3053 if (eol_type != EOL_CR)
3054 Dynarr_add (dst, '\n');
3057 else if (BYTE_ASCII_P (c))
3059 Dynarr_add (dst, c);
3062 else if (BUFBYTE_LEADING_BYTE_P (c))
3063 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3064 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3065 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3068 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3070 Dynarr_add (dst, c);
3073 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3074 ch == LEADING_BYTE_JAPANESE_JISX0208)
3078 unsigned char j1, j2;
3079 ENCODE_SJIS (ch, c, j1, j2);
3080 Dynarr_add (dst, j1);
3081 Dynarr_add (dst, j2);
3091 str->iso2022.current_char_boundary = char_boundary;
3095 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3096 Decode a JISX0208 character of Shift-JIS coding-system.
3097 CODE is the character code in Shift-JIS as a cons of type bytes.
3098 Return the corresponding character.
3102 unsigned char c1, c2, s1, s2;
3105 CHECK_INT (XCAR (code));
3106 CHECK_INT (XCDR (code));
3107 s1 = XINT (XCAR (code));
3108 s2 = XINT (XCDR (code));
3109 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3110 BYTE_SJIS_TWO_BYTE_2_P (s2))
3112 DECODE_SJIS (s1, s2, c1, c2);
3113 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3114 c1 & 0x7F, c2 & 0x7F));
3120 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3121 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3122 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3126 Lisp_Object charset;
3129 CHECK_CHAR_COERCE_INT (ch);
3130 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3131 if (EQ (charset, Vcharset_japanese_jisx0208))
3133 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3134 return Fcons (make_int (s1), make_int (s2));
3141 /************************************************************************/
3143 /************************************************************************/
3145 /* BIG5 is a coding system encoding two character sets: ASCII and
3146 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3147 character set and is encoded in two-byte.
3149 --- CODE RANGE of BIG5 ---
3150 (character set) (range)
3152 Big5 (1st byte) 0xA1 .. 0xFE
3153 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3154 --------------------------
3156 Since the number of characters in Big5 is larger than maximum
3157 characters in Emacs' charset (96x96), it can't be handled as one
3158 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3159 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3160 contains frequently used characters and the latter contains less
3161 frequently used characters. */
3163 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3164 ((c) >= 0xA1 && (c) <= 0xFE)
3166 /* Is this the second byte of a Shift-JIS two-byte char? */
3168 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3169 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3171 /* Number of Big5 characters which have the same code in 1st byte. */
3173 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3175 /* Code conversion macros. These are macros because they are used in
3176 inner loops during code conversion.
3178 Note that temporary variables in macros introduce the classic
3179 dynamic-scoping problems with variable names. We use capital-
3180 lettered variables in the assumption that XEmacs does not use
3181 capital letters in variables except in a very formalized way
3184 /* Convert Big5 code (b1, b2) into its internal string representation
3187 /* There is a much simpler way to split the Big5 charset into two.
3188 For the moment I'm going to leave the algorithm as-is because it
3189 claims to separate out the most-used characters into a single
3190 charset, which perhaps will lead to optimizations in various
3193 The way the algorithm works is something like this:
3195 Big5 can be viewed as a 94x157 charset, where the row is
3196 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3197 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3198 the split between low and high column numbers is apparently
3199 meaningless; ascending rows produce less and less frequent chars.
3200 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3201 the first charset, and the upper half (0xC9 .. 0xFE) to the
3202 second. To do the conversion, we convert the character into
3203 a single number where 0 .. 156 is the first row, 157 .. 313
3204 is the second, etc. That way, the characters are ordered by
3205 decreasing frequency. Then we just chop the space in two
3206 and coerce the result into a 94x94 space.
3209 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3211 int B1 = b1, B2 = b2; \
3213 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3217 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3221 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3222 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3224 c1 = I / (0xFF - 0xA1) + 0xA1; \
3225 c2 = I % (0xFF - 0xA1) + 0xA1; \
3228 /* Convert the internal string representation of a Big5 character
3229 (lb, c1, c2) into Big5 code (b1, b2). */
3231 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3233 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3235 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3237 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3239 b1 = I / BIG5_SAME_ROW + 0xA1; \
3240 b2 = I % BIG5_SAME_ROW; \
3241 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3245 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3253 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3254 (c >= 0x80 && c <= 0xA0))
3256 if (st->big5.in_second_byte)
3258 st->big5.in_second_byte = 0;
3259 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3263 st->big5.in_second_byte = 1;
3265 return CODING_CATEGORY_BIG5_MASK;
3268 /* Convert Big5 data to internal format. */
3271 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3272 unsigned_char_dynarr *dst, unsigned int n)
3275 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3276 unsigned int flags = str->flags;
3277 unsigned int ch = str->ch;
3278 eol_type_t eol_type = str->eol_type;
3285 /* Previous character was first byte of Big5 char. */
3286 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3288 unsigned char b1, b2, b3;
3289 DECODE_BIG5 (ch, c, b1, b2, b3);
3290 Dynarr_add (dst, b1);
3291 Dynarr_add (dst, b2);
3292 Dynarr_add (dst, b3);
3296 DECODE_ADD_BINARY_CHAR (ch, dst);
3297 DECODE_ADD_BINARY_CHAR (c, dst);
3303 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3304 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3307 DECODE_ADD_BINARY_CHAR (c, dst);
3309 label_continue_loop:;
3312 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3318 /* Convert internally-formatted data to Big5. */
3321 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3322 unsigned_char_dynarr *dst, unsigned int n)
3326 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3327 unsigned int flags = str->flags;
3328 unsigned int ch = str->ch;
3329 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3336 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3337 Dynarr_add (dst, '\r');
3338 if (eol_type != EOL_CR)
3339 Dynarr_add (dst, '\n');
3341 else if (BYTE_ASCII_P (c))
3344 Dynarr_add (dst, c);
3346 else if (BUFBYTE_LEADING_BYTE_P (c))
3348 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3349 c == LEADING_BYTE_CHINESE_BIG5_2)
3351 /* A recognized leading byte. */
3353 continue; /* not done with this character. */
3355 /* otherwise just ignore this character. */
3357 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3358 ch == LEADING_BYTE_CHINESE_BIG5_2)
3360 /* Previous char was a recognized leading byte. */
3362 continue; /* not done with this character. */
3366 /* Encountering second byte of a Big5 character. */
3367 unsigned char b1, b2;
3369 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3370 Dynarr_add (dst, b1);
3371 Dynarr_add (dst, b2);
3383 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3384 Decode a Big5 character CODE of BIG5 coding-system.
3385 CODE is the character code in BIG5, a cons of two integers.
3386 Return the corresponding character.
3390 unsigned char c1, c2, b1, b2;
3393 CHECK_INT (XCAR (code));
3394 CHECK_INT (XCDR (code));
3395 b1 = XINT (XCAR (code));
3396 b2 = XINT (XCDR (code));
3397 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3398 BYTE_BIG5_TWO_BYTE_2_P (b2))
3400 Charset_ID leading_byte;
3401 Lisp_Object charset;
3402 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3403 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3404 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3410 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3411 Encode the Big5 character CH to BIG5 coding-system.
3412 Return the corresponding character code in Big5.
3416 Lisp_Object charset;
3419 CHECK_CHAR_COERCE_INT (ch);
3420 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3421 if (EQ (charset, Vcharset_chinese_big5_1) ||
3422 EQ (charset, Vcharset_chinese_big5_2))
3424 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3426 return Fcons (make_int (b1), make_int (b2));
3433 /************************************************************************/
3436 /* UCS-4 character codes are implemented as nonnegative integers. */
3438 /************************************************************************/
3441 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3442 Map UCS-4 code CODE to Mule character CHARACTER.
3444 Return T on success, NIL on failure.
3450 CHECK_CHAR (character);
3454 if (c < sizeof (fcd->ucs_to_mule_table))
3456 fcd->ucs_to_mule_table[c] = character;
3464 ucs_to_char (unsigned long code)
3466 if (code < sizeof (fcd->ucs_to_mule_table))
3468 return fcd->ucs_to_mule_table[code];
3470 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3475 c = code % (94 * 94);
3477 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3478 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3479 CHARSET_LEFT_TO_RIGHT),
3480 c / 94 + 33, c % 94 + 33));
3486 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3487 Return Mule character corresponding to UCS code CODE (a positive integer).
3491 CHECK_NATNUM (code);
3492 return ucs_to_char (XINT (code));
3495 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3496 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3500 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3501 Fset_char_ucs is more restrictive on index arg, but should
3502 check code arg in a char_table method. */
3503 CHECK_CHAR (character);
3504 CHECK_NATNUM (code);
3505 return Fput_char_table (character, code, mule_to_ucs_table);
3508 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3509 Return the UCS code (a positive integer) corresponding to CHARACTER.
3513 return Fget_char_table (character, mule_to_ucs_table);
3518 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3520 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3521 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3522 is not found, instead.
3523 #### do something more appropriate (use blob?)
3524 Danger, Will Robinson! Data loss. Should we signal user? */
3526 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3528 Lisp_Object chr = ucs_to_char (ch);
3532 Bufbyte work[MAX_EMCHAR_LEN];
3537 simple_set_charptr_emchar (work, ch) :
3538 non_ascii_set_charptr_emchar (work, ch);
3539 Dynarr_add_many (dst, work, len);
3543 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3544 Dynarr_add (dst, 34 + 128);
3545 Dynarr_add (dst, 46 + 128);
3551 static unsigned long
3552 mule_char_to_ucs4 (Lisp_Object charset,
3553 unsigned char h, unsigned char l)
3556 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3563 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3564 (XCHARSET_CHARS (charset) == 94) )
3566 unsigned char final = XCHARSET_FINAL (charset);
3568 if ( ('@' <= final) && (final < 0x7f) )
3570 return 0xe00000 + (final - '@') * 94 * 94
3571 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3585 encode_ucs4 (Lisp_Object charset,
3586 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3588 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3589 Dynarr_add (dst, code >> 24);
3590 Dynarr_add (dst, (code >> 16) & 255);
3591 Dynarr_add (dst, (code >> 8) & 255);
3592 Dynarr_add (dst, code & 255);
3597 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3603 switch (st->ucs4.in_byte)
3612 st->ucs4.in_byte = 0;
3618 return CODING_CATEGORY_UCS4_MASK;
3622 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3623 unsigned_char_dynarr *dst, unsigned int n)
3625 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3626 unsigned int flags = str->flags;
3627 unsigned int ch = str->ch;
3628 unsigned char counter = str->counter;
3632 unsigned char c = *src++;
3640 decode_ucs4 ( ( ch << 8 ) | c, dst);
3645 ch = ( ch << 8 ) | c;
3649 if (counter & CODING_STATE_END)
3650 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3654 str->counter = counter;
3658 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3659 unsigned_char_dynarr *dst, unsigned int n)
3662 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3663 unsigned int flags = str->flags;
3664 unsigned int ch = str->ch;
3665 unsigned char char_boundary = str->iso2022.current_char_boundary;
3666 Lisp_Object charset = str->iso2022.current_charset;
3668 #ifdef ENABLE_COMPOSITE_CHARS
3669 /* flags for handling composite chars. We do a little switcharoo
3670 on the source while we're outputting the composite char. */
3671 unsigned int saved_n = 0;
3672 CONST unsigned char *saved_src = NULL;
3673 int in_composite = 0;
3680 unsigned char c = *src++;
3682 if (BYTE_ASCII_P (c))
3683 { /* Processing ASCII character */
3685 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3688 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3689 { /* Processing Leading Byte */
3691 charset = CHARSET_BY_LEADING_BYTE (c);
3692 if (LEADING_BYTE_PREFIX_P(c))
3697 { /* Processing Non-ASCII character */
3699 if (EQ (charset, Vcharset_control_1))
3701 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3705 switch (XCHARSET_REP_BYTES (charset))
3708 encode_ucs4 (charset, c, 0, dst);
3711 if (XCHARSET_PRIVATE_P (charset))
3713 encode_ucs4 (charset, c, 0, dst);
3718 #ifdef ENABLE_COMPOSITE_CHARS
3719 if (EQ (charset, Vcharset_composite))
3723 /* #### Bother! We don't know how to
3725 Dynarr_add (dst, 0);
3726 Dynarr_add (dst, 0);
3727 Dynarr_add (dst, 0);
3728 Dynarr_add (dst, '~');
3732 Emchar emch = MAKE_CHAR (Vcharset_composite,
3733 ch & 0x7F, c & 0x7F);
3734 Lisp_Object lstr = composite_char_string (emch);
3738 src = XSTRING_DATA (lstr);
3739 n = XSTRING_LENGTH (lstr);
3743 #endif /* ENABLE_COMPOSITE_CHARS */
3745 encode_ucs4(charset, ch, c, dst);
3758 encode_ucs4 (charset, ch, c, dst);
3774 #ifdef ENABLE_COMPOSITE_CHARS
3780 goto back_to_square_n; /* Wheeeeeeeee ..... */
3782 #endif /* ENABLE_COMPOSITE_CHARS */
3786 str->iso2022.current_char_boundary = char_boundary;
3787 str->iso2022.current_charset = charset;
3789 /* Verbum caro factum est! */
3794 /************************************************************************/
3796 /************************************************************************/
3799 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3804 unsigned char c = *src++;
3805 switch (st->utf8.in_byte)
3808 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3811 st->utf8.in_byte = 5;
3813 st->utf8.in_byte = 4;
3815 st->utf8.in_byte = 3;
3817 st->utf8.in_byte = 2;
3819 st->utf8.in_byte = 1;
3824 if ((c & 0xc0) != 0x80)
3830 return CODING_CATEGORY_UTF8_MASK;
3834 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3835 unsigned_char_dynarr *dst, unsigned int n)
3837 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3838 unsigned int flags = str->flags;
3839 unsigned int ch = str->ch;
3840 eol_type_t eol_type = str->eol_type;
3841 unsigned char counter = str->counter;
3845 unsigned char c = *src++;
3854 else if ( c >= 0xf8 )
3859 else if ( c >= 0xf0 )
3864 else if ( c >= 0xe0 )
3869 else if ( c >= 0xc0 )
3876 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3877 decode_ucs4 (c, dst);
3881 ch = ( ch << 6 ) | ( c & 0x3f );
3882 decode_ucs4 (ch, dst);
3887 ch = ( ch << 6 ) | ( c & 0x3f );
3890 label_continue_loop:;
3893 if (flags & CODING_STATE_END)
3894 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3898 str->counter = counter;
3903 encode_utf8 (Lisp_Object charset,
3904 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3906 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3909 Dynarr_add (dst, code);
3911 else if ( code <= 0x7ff )
3913 Dynarr_add (dst, (code >> 6) | 0xc0);
3914 Dynarr_add (dst, (code & 0x3f) | 0x80);
3916 else if ( code <= 0xffff )
3918 Dynarr_add (dst, (code >> 12) | 0xe0);
3919 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3920 Dynarr_add (dst, (code & 0x3f) | 0x80);
3922 else if ( code <= 0x1fffff )
3924 Dynarr_add (dst, (code >> 18) | 0xf0);
3925 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3926 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3927 Dynarr_add (dst, (code & 0x3f) | 0x80);
3929 else if ( code <= 0x3ffffff )
3931 Dynarr_add (dst, (code >> 24) | 0xf8);
3932 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3933 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3934 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3935 Dynarr_add (dst, (code & 0x3f) | 0x80);
3939 Dynarr_add (dst, (code >> 30) | 0xfc);
3940 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3941 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3942 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3943 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3944 Dynarr_add (dst, (code & 0x3f) | 0x80);
3950 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3951 unsigned_char_dynarr *dst, unsigned int n)
3953 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3954 unsigned int flags = str->flags;
3955 unsigned int ch = str->ch;
3956 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3957 unsigned char char_boundary = str->iso2022.current_char_boundary;
3962 unsigned char c = *src++;
3963 switch (char_boundary)
3968 Dynarr_add (dst, c);
3971 else if ( c >= 0xf8 )
3973 Dynarr_add (dst, c);
3976 else if ( c >= 0xf0 )
3978 Dynarr_add (dst, c);
3981 else if ( c >= 0xe0 )
3983 Dynarr_add (dst, c);
3986 else if ( c >= 0xc0 )
3988 Dynarr_add (dst, c);
3995 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3996 Dynarr_add (dst, '\r');
3997 if (eol_type != EOL_CR)
3998 Dynarr_add (dst, c);
4001 Dynarr_add (dst, c);
4006 Dynarr_add (dst, c);
4010 Dynarr_add (dst, c);
4014 #else /* not UTF2000 */
4015 Lisp_Object charset = str->iso2022.current_charset;
4017 #ifdef ENABLE_COMPOSITE_CHARS
4018 /* flags for handling composite chars. We do a little switcharoo
4019 on the source while we're outputting the composite char. */
4020 unsigned int saved_n = 0;
4021 CONST unsigned char *saved_src = NULL;
4022 int in_composite = 0;
4025 #endif /* ENABLE_COMPOSITE_CHARS */
4029 unsigned char c = *src++;
4031 if (BYTE_ASCII_P (c))
4032 { /* Processing ASCII character */
4036 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4037 Dynarr_add (dst, '\r');
4038 if (eol_type != EOL_CR)
4039 Dynarr_add (dst, c);
4042 encode_utf8 (Vcharset_ascii, c, 0, dst);
4045 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4046 { /* Processing Leading Byte */
4048 charset = CHARSET_BY_LEADING_BYTE (c);
4049 if (LEADING_BYTE_PREFIX_P(c))
4054 { /* Processing Non-ASCII character */
4056 if (EQ (charset, Vcharset_control_1))
4058 encode_utf8 (Vcharset_control_1, c, 0, dst);
4062 switch (XCHARSET_REP_BYTES (charset))
4065 encode_utf8 (charset, c, 0, dst);
4068 if (XCHARSET_PRIVATE_P (charset))
4070 encode_utf8 (charset, c, 0, dst);
4075 #ifdef ENABLE_COMPOSITE_CHARS
4076 if (EQ (charset, Vcharset_composite))
4080 /* #### Bother! We don't know how to
4082 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4086 Emchar emch = MAKE_CHAR (Vcharset_composite,
4087 ch & 0x7F, c & 0x7F);
4088 Lisp_Object lstr = composite_char_string (emch);
4092 src = XSTRING_DATA (lstr);
4093 n = XSTRING_LENGTH (lstr);
4097 #endif /* ENABLE_COMPOSITE_CHARS */
4099 encode_utf8 (charset, ch, c, dst);
4112 encode_utf8 (charset, ch, c, dst);
4128 #ifdef ENABLE_COMPOSITE_CHARS
4134 goto back_to_square_n; /* Wheeeeeeeee ..... */
4138 #endif /* not UTF2000 */
4141 str->iso2022.current_char_boundary = char_boundary;
4143 str->iso2022.current_charset = charset;
4146 /* Verbum caro factum est! */
4150 /************************************************************************/
4151 /* ISO2022 methods */
4152 /************************************************************************/
4154 /* The following note describes the coding system ISO2022 briefly.
4155 Since the intention of this note is to help understand the
4156 functions in this file, some parts are NOT ACCURATE or OVERLY
4157 SIMPLIFIED. For thorough understanding, please refer to the
4158 original document of ISO2022.
4160 ISO2022 provides many mechanisms to encode several character sets
4161 in 7-bit and 8-bit environments. For 7-bit environments, all text
4162 is encoded using bytes less than 128. This may make the encoded
4163 text a little bit longer, but the text passes more easily through
4164 several gateways, some of which strip off MSB (Most Signigant Bit).
4166 There are two kinds of character sets: control character set and
4167 graphic character set. The former contains control characters such
4168 as `newline' and `escape' to provide control functions (control
4169 functions are also provided by escape sequences). The latter
4170 contains graphic characters such as 'A' and '-'. Emacs recognizes
4171 two control character sets and many graphic character sets.
4173 Graphic character sets are classified into one of the following
4174 four classes, according to the number of bytes (DIMENSION) and
4175 number of characters in one dimension (CHARS) of the set:
4176 - DIMENSION1_CHARS94
4177 - DIMENSION1_CHARS96
4178 - DIMENSION2_CHARS94
4179 - DIMENSION2_CHARS96
4181 In addition, each character set is assigned an identification tag,
4182 unique for each set, called "final character" (denoted as <F>
4183 hereafter). The <F> of each character set is decided by ECMA(*)
4184 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4185 (0x30..0x3F are for private use only).
4187 Note (*): ECMA = European Computer Manufacturers Association
4189 Here are examples of graphic character set [NAME(<F>)]:
4190 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4191 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4192 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4193 o DIMENSION2_CHARS96 -- none for the moment
4195 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4196 C0 [0x00..0x1F] -- control character plane 0
4197 GL [0x20..0x7F] -- graphic character plane 0
4198 C1 [0x80..0x9F] -- control character plane 1
4199 GR [0xA0..0xFF] -- graphic character plane 1
4201 A control character set is directly designated and invoked to C0 or
4202 C1 by an escape sequence. The most common case is that:
4203 - ISO646's control character set is designated/invoked to C0, and
4204 - ISO6429's control character set is designated/invoked to C1,
4205 and usually these designations/invocations are omitted in encoded
4206 text. In a 7-bit environment, only C0 can be used, and a control
4207 character for C1 is encoded by an appropriate escape sequence to
4208 fit into the environment. All control characters for C1 are
4209 defined to have corresponding escape sequences.
4211 A graphic character set is at first designated to one of four
4212 graphic registers (G0 through G3), then these graphic registers are
4213 invoked to GL or GR. These designations and invocations can be
4214 done independently. The most common case is that G0 is invoked to
4215 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4216 these invocations and designations are omitted in encoded text.
4217 In a 7-bit environment, only GL can be used.
4219 When a graphic character set of CHARS94 is invoked to GL, codes
4220 0x20 and 0x7F of the GL area work as control characters SPACE and
4221 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4224 There are two ways of invocation: locking-shift and single-shift.
4225 With locking-shift, the invocation lasts until the next different
4226 invocation, whereas with single-shift, the invocation affects the
4227 following character only and doesn't affect the locking-shift
4228 state. Invocations are done by the following control characters or
4231 ----------------------------------------------------------------------
4232 abbrev function cntrl escape seq description
4233 ----------------------------------------------------------------------
4234 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4235 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4236 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4237 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4238 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4239 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4240 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4241 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4242 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4243 ----------------------------------------------------------------------
4244 (*) These are not used by any known coding system.
4246 Control characters for these functions are defined by macros
4247 ISO_CODE_XXX in `coding.h'.
4249 Designations are done by the following escape sequences:
4250 ----------------------------------------------------------------------
4251 escape sequence description
4252 ----------------------------------------------------------------------
4253 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4254 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4255 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4256 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4257 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4258 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4259 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4260 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4261 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4262 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4263 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4264 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4265 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4266 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4267 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4268 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4269 ----------------------------------------------------------------------
4271 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4272 of dimension 1, chars 94, and final character <F>, etc...
4274 Note (*): Although these designations are not allowed in ISO2022,
4275 Emacs accepts them on decoding, and produces them on encoding
4276 CHARS96 character sets in a coding system which is characterized as
4277 7-bit environment, non-locking-shift, and non-single-shift.
4279 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4280 '(' can be omitted. We refer to this as "short-form" hereafter.
4282 Now you may notice that there are a lot of ways for encoding the
4283 same multilingual text in ISO2022. Actually, there exist many
4284 coding systems such as Compound Text (used in X11's inter client
4285 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4286 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4287 localized platforms), and all of these are variants of ISO2022.
4289 In addition to the above, Emacs handles two more kinds of escape
4290 sequences: ISO6429's direction specification and Emacs' private
4291 sequence for specifying character composition.
4293 ISO6429's direction specification takes the following form:
4294 o CSI ']' -- end of the current direction
4295 o CSI '0' ']' -- end of the current direction
4296 o CSI '1' ']' -- start of left-to-right text
4297 o CSI '2' ']' -- start of right-to-left text
4298 The control character CSI (0x9B: control sequence introducer) is
4299 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4301 Character composition specification takes the following form:
4302 o ESC '0' -- start character composition
4303 o ESC '1' -- end character composition
4304 Since these are not standard escape sequences of any ISO standard,
4305 their use with these meanings is restricted to Emacs only. */
4308 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4312 for (i = 0; i < 4; i++)
4314 if (!NILP (coding_system))
4316 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4318 iso->charset[i] = Qt;
4319 iso->invalid_designated[i] = 0;
4321 iso->esc = ISO_ESC_NOTHING;
4322 iso->esc_bytes_index = 0;
4323 iso->register_left = 0;
4324 iso->register_right = 1;
4325 iso->switched_dir_and_no_valid_charset_yet = 0;
4326 iso->invalid_switch_dir = 0;
4327 iso->output_direction_sequence = 0;
4328 iso->output_literally = 0;
4329 #ifdef ENABLE_COMPOSITE_CHARS
4330 if (iso->composite_chars)
4331 Dynarr_reset (iso->composite_chars);
4336 fit_to_be_escape_quoted (unsigned char c)
4353 /* Parse one byte of an ISO2022 escape sequence.
4354 If the result is an invalid escape sequence, return 0 and
4355 do not change anything in STR. Otherwise, if the result is
4356 an incomplete escape sequence, update ISO2022.ESC and
4357 ISO2022.ESC_BYTES and return -1. Otherwise, update
4358 all the state variables (but not ISO2022.ESC_BYTES) and
4361 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4362 or invocation of an invalid character set and treat that as
4363 an unrecognized escape sequence. */
4366 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4367 unsigned char c, unsigned int *flags,
4368 int check_invalid_charsets)
4370 /* (1) If we're at the end of a designation sequence, CS is the
4371 charset being designated and REG is the register to designate
4374 (2) If we're at the end of a locking-shift sequence, REG is
4375 the register to invoke and HALF (0 == left, 1 == right) is
4376 the half to invoke it into.
4378 (3) If we're at the end of a single-shift sequence, REG is
4379 the register to invoke. */
4380 Lisp_Object cs = Qnil;
4383 /* NOTE: This code does goto's all over the fucking place.
4384 The reason for this is that we're basically implementing
4385 a state machine here, and hierarchical languages like C
4386 don't really provide a clean way of doing this. */
4388 if (! (*flags & CODING_STATE_ESCAPE))
4389 /* At beginning of escape sequence; we need to reset our
4390 escape-state variables. */
4391 iso->esc = ISO_ESC_NOTHING;
4393 iso->output_literally = 0;
4394 iso->output_direction_sequence = 0;
4398 case ISO_ESC_NOTHING:
4399 iso->esc_bytes_index = 0;
4402 case ISO_CODE_ESC: /* Start escape sequence */
4403 *flags |= CODING_STATE_ESCAPE;
4407 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4408 *flags |= CODING_STATE_ESCAPE;
4409 iso->esc = ISO_ESC_5_11;
4412 case ISO_CODE_SO: /* locking shift 1 */
4415 case ISO_CODE_SI: /* locking shift 0 */
4419 case ISO_CODE_SS2: /* single shift */
4422 case ISO_CODE_SS3: /* single shift */
4426 default: /* Other control characters */
4433 /**** single shift ****/
4435 case 'N': /* single shift 2 */
4438 case 'O': /* single shift 3 */
4442 /**** locking shift ****/
4444 case '~': /* locking shift 1 right */
4447 case 'n': /* locking shift 2 */
4450 case '}': /* locking shift 2 right */
4453 case 'o': /* locking shift 3 */
4456 case '|': /* locking shift 3 right */
4460 #ifdef ENABLE_COMPOSITE_CHARS
4461 /**** composite ****/
4464 iso->esc = ISO_ESC_START_COMPOSITE;
4465 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4466 CODING_STATE_COMPOSITE;
4470 iso->esc = ISO_ESC_END_COMPOSITE;
4471 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4472 ~CODING_STATE_COMPOSITE;
4474 #endif /* ENABLE_COMPOSITE_CHARS */
4476 /**** directionality ****/
4479 iso->esc = ISO_ESC_5_11;
4482 /**** designation ****/
4484 case '$': /* multibyte charset prefix */
4485 iso->esc = ISO_ESC_2_4;
4489 if (0x28 <= c && c <= 0x2F)
4491 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4495 /* This function is called with CODESYS equal to nil when
4496 doing coding-system detection. */
4498 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4499 && fit_to_be_escape_quoted (c))
4501 iso->esc = ISO_ESC_LITERAL;
4502 *flags &= CODING_STATE_ISO2022_LOCK;
4512 /**** directionality ****/
4514 case ISO_ESC_5_11: /* ISO6429 direction control */
4517 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4518 goto directionality;
4520 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4521 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4522 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4526 case ISO_ESC_5_11_0:
4529 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4530 goto directionality;
4534 case ISO_ESC_5_11_1:
4537 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4538 goto directionality;
4542 case ISO_ESC_5_11_2:
4545 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4546 goto directionality;
4551 iso->esc = ISO_ESC_DIRECTIONALITY;
4552 /* Various junk here to attempt to preserve the direction sequences
4553 literally in the text if they would otherwise be swallowed due
4554 to invalid designations that don't show up as actual charset
4555 changes in the text. */
4556 if (iso->invalid_switch_dir)
4558 /* We already inserted a direction switch literally into the
4559 text. We assume (#### this may not be right) that the
4560 next direction switch is the one going the other way,
4561 and we need to output that literally as well. */
4562 iso->output_literally = 1;
4563 iso->invalid_switch_dir = 0;
4569 /* If we are in the thrall of an invalid designation,
4570 then stick the directionality sequence literally into the
4571 output stream so it ends up in the original text again. */
4572 for (jj = 0; jj < 4; jj++)
4573 if (iso->invalid_designated[jj])
4577 iso->output_literally = 1;
4578 iso->invalid_switch_dir = 1;
4581 /* Indicate that we haven't yet seen a valid designation,
4582 so that if a switch-dir is directly followed by an
4583 invalid designation, both get inserted literally. */
4584 iso->switched_dir_and_no_valid_charset_yet = 1;
4589 /**** designation ****/
4592 if (0x28 <= c && c <= 0x2F)
4594 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4597 if (0x40 <= c && c <= 0x42)
4599 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4600 *flags & CODING_STATE_R2L ?
4601 CHARSET_RIGHT_TO_LEFT :
4602 CHARSET_LEFT_TO_RIGHT);
4612 if (c < '0' || c > '~')
4613 return 0; /* bad final byte */
4615 if (iso->esc >= ISO_ESC_2_8 &&
4616 iso->esc <= ISO_ESC_2_15)
4618 type = ((iso->esc >= ISO_ESC_2_12) ?
4619 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4620 reg = (iso->esc - ISO_ESC_2_8) & 3;
4622 else if (iso->esc >= ISO_ESC_2_4_8 &&
4623 iso->esc <= ISO_ESC_2_4_15)
4625 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4626 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4627 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4631 /* Can this ever be reached? -slb */
4635 cs = CHARSET_BY_ATTRIBUTES (type, c,
4636 *flags & CODING_STATE_R2L ?
4637 CHARSET_RIGHT_TO_LEFT :
4638 CHARSET_LEFT_TO_RIGHT);
4644 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4648 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4649 /* can't invoke something that ain't there. */
4651 iso->esc = ISO_ESC_SINGLE_SHIFT;
4652 *flags &= CODING_STATE_ISO2022_LOCK;
4654 *flags |= CODING_STATE_SS2;
4656 *flags |= CODING_STATE_SS3;
4660 if (check_invalid_charsets &&
4661 !CHARSETP (iso->charset[reg]))
4662 /* can't invoke something that ain't there. */
4665 iso->register_right = reg;
4667 iso->register_left = reg;
4668 *flags &= CODING_STATE_ISO2022_LOCK;
4669 iso->esc = ISO_ESC_LOCKING_SHIFT;
4673 if (NILP (cs) && check_invalid_charsets)
4675 iso->invalid_designated[reg] = 1;
4676 iso->charset[reg] = Vcharset_ascii;
4677 iso->esc = ISO_ESC_DESIGNATE;
4678 *flags &= CODING_STATE_ISO2022_LOCK;
4679 iso->output_literally = 1;
4680 if (iso->switched_dir_and_no_valid_charset_yet)
4682 /* We encountered a switch-direction followed by an
4683 invalid designation. Ensure that the switch-direction
4684 gets outputted; otherwise it will probably get eaten
4685 when the text is written out again. */
4686 iso->switched_dir_and_no_valid_charset_yet = 0;
4687 iso->output_direction_sequence = 1;
4688 /* And make sure that the switch-dir going the other
4689 way gets outputted, as well. */
4690 iso->invalid_switch_dir = 1;
4694 /* This function is called with CODESYS equal to nil when
4695 doing coding-system detection. */
4696 if (!NILP (codesys))
4698 charset_conversion_spec_dynarr *dyn =
4699 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4705 for (i = 0; i < Dynarr_length (dyn); i++)
4707 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4708 if (EQ (cs, spec->from_charset))
4709 cs = spec->to_charset;
4714 iso->charset[reg] = cs;
4715 iso->esc = ISO_ESC_DESIGNATE;
4716 *flags &= CODING_STATE_ISO2022_LOCK;
4717 if (iso->invalid_designated[reg])
4719 iso->invalid_designated[reg] = 0;
4720 iso->output_literally = 1;
4722 if (iso->switched_dir_and_no_valid_charset_yet)
4723 iso->switched_dir_and_no_valid_charset_yet = 0;
4728 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4733 /* #### There are serious deficiencies in the recognition mechanism
4734 here. This needs to be much smarter if it's going to cut it.
4735 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4736 it should be detected as Latin-1.
4737 All the ISO2022 stuff in this file should be synced up with the
4738 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4739 Perhaps we should wait till R2L works in FSF Emacs? */
4741 if (!st->iso2022.initted)
4743 reset_iso2022 (Qnil, &st->iso2022.iso);
4744 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4745 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4746 CODING_CATEGORY_ISO_8_1_MASK |
4747 CODING_CATEGORY_ISO_8_2_MASK |
4748 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4749 st->iso2022.flags = 0;
4750 st->iso2022.high_byte_count = 0;
4751 st->iso2022.saw_single_shift = 0;
4752 st->iso2022.initted = 1;
4755 mask = st->iso2022.mask;
4762 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4763 st->iso2022.high_byte_count++;
4767 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4769 if (st->iso2022.high_byte_count & 1)
4770 /* odd number of high bytes; assume not iso-8-2 */
4771 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4773 st->iso2022.high_byte_count = 0;
4774 st->iso2022.saw_single_shift = 0;
4776 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4778 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4779 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4780 { /* control chars */
4783 /* Allow and ignore control characters that you might
4784 reasonably see in a text file */
4789 case 8: /* backspace */
4790 case 11: /* vertical tab */
4791 case 12: /* form feed */
4792 case 26: /* MS-DOS C-z junk */
4793 case 31: /* '^_' -- for info */
4794 goto label_continue_loop;
4801 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4804 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4805 &st->iso2022.flags, 0))
4807 switch (st->iso2022.iso.esc)
4809 case ISO_ESC_DESIGNATE:
4810 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4811 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4813 case ISO_ESC_LOCKING_SHIFT:
4814 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4815 goto ran_out_of_chars;
4816 case ISO_ESC_SINGLE_SHIFT:
4817 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4818 st->iso2022.saw_single_shift = 1;
4827 goto ran_out_of_chars;
4830 label_continue_loop:;
4839 postprocess_iso2022_mask (int mask)
4841 /* #### kind of cheesy */
4842 /* If seven-bit ISO is allowed, then assume that the encoding is
4843 entirely seven-bit and turn off the eight-bit ones. */
4844 if (mask & CODING_CATEGORY_ISO_7_MASK)
4845 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4846 CODING_CATEGORY_ISO_8_1_MASK |
4847 CODING_CATEGORY_ISO_8_2_MASK);
4851 /* If FLAGS is a null pointer or specifies right-to-left motion,
4852 output a switch-dir-to-left-to-right sequence to DST.
4853 Also update FLAGS if it is not a null pointer.
4854 If INTERNAL_P is set, we are outputting in internal format and
4855 need to handle the CSI differently. */
4858 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4859 unsigned_char_dynarr *dst,
4860 unsigned int *flags,
4863 if (!flags || (*flags & CODING_STATE_R2L))
4865 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4867 Dynarr_add (dst, ISO_CODE_ESC);
4868 Dynarr_add (dst, '[');
4870 else if (internal_p)
4871 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4873 Dynarr_add (dst, ISO_CODE_CSI);
4874 Dynarr_add (dst, '0');
4875 Dynarr_add (dst, ']');
4877 *flags &= ~CODING_STATE_R2L;
4881 /* If FLAGS is a null pointer or specifies a direction different from
4882 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4883 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4884 sequence to DST. Also update FLAGS if it is not a null pointer.
4885 If INTERNAL_P is set, we are outputting in internal format and
4886 need to handle the CSI differently. */
4889 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4890 unsigned_char_dynarr *dst, unsigned int *flags,
4893 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4894 direction == CHARSET_LEFT_TO_RIGHT)
4895 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4896 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4897 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4898 direction == CHARSET_RIGHT_TO_LEFT)
4900 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4902 Dynarr_add (dst, ISO_CODE_ESC);
4903 Dynarr_add (dst, '[');
4905 else if (internal_p)
4906 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4908 Dynarr_add (dst, ISO_CODE_CSI);
4909 Dynarr_add (dst, '2');
4910 Dynarr_add (dst, ']');
4912 *flags |= CODING_STATE_R2L;
4916 /* Convert ISO2022-format data to internal format. */
4919 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4920 unsigned_char_dynarr *dst, unsigned int n)
4922 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4923 unsigned int flags = str->flags;
4924 unsigned int ch = str->ch;
4925 eol_type_t eol_type = str->eol_type;
4926 #ifdef ENABLE_COMPOSITE_CHARS
4927 unsigned_char_dynarr *real_dst = dst;
4929 Lisp_Object coding_system;
4931 XSETCODING_SYSTEM (coding_system, str->codesys);
4933 #ifdef ENABLE_COMPOSITE_CHARS
4934 if (flags & CODING_STATE_COMPOSITE)
4935 dst = str->iso2022.composite_chars;
4936 #endif /* ENABLE_COMPOSITE_CHARS */
4940 unsigned char c = *src++;
4941 if (flags & CODING_STATE_ESCAPE)
4942 { /* Within ESC sequence */
4943 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4948 switch (str->iso2022.esc)
4950 #ifdef ENABLE_COMPOSITE_CHARS
4951 case ISO_ESC_START_COMPOSITE:
4952 if (str->iso2022.composite_chars)
4953 Dynarr_reset (str->iso2022.composite_chars);
4955 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4956 dst = str->iso2022.composite_chars;
4958 case ISO_ESC_END_COMPOSITE:
4960 Bufbyte comstr[MAX_EMCHAR_LEN];
4962 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4963 Dynarr_length (dst));
4965 len = set_charptr_emchar (comstr, emch);
4966 Dynarr_add_many (dst, comstr, len);
4969 #endif /* ENABLE_COMPOSITE_CHARS */
4971 case ISO_ESC_LITERAL:
4972 DECODE_ADD_BINARY_CHAR (c, dst);
4976 /* Everything else handled already */
4981 /* Attempted error recovery. */
4982 if (str->iso2022.output_direction_sequence)
4983 ensure_correct_direction (flags & CODING_STATE_R2L ?
4984 CHARSET_RIGHT_TO_LEFT :
4985 CHARSET_LEFT_TO_RIGHT,
4986 str->codesys, dst, 0, 1);
4987 /* More error recovery. */
4988 if (!retval || str->iso2022.output_literally)
4990 /* Output the (possibly invalid) sequence */
4992 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4993 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4994 flags &= CODING_STATE_ISO2022_LOCK;
4996 n++, src--;/* Repeat the loop with the same character. */
4999 /* No sense in reprocessing the final byte of the
5000 escape sequence; it could mess things up anyway.
5002 DECODE_ADD_BINARY_CHAR (c, dst);
5007 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5008 { /* Control characters */
5010 /***** Error-handling *****/
5012 /* If we were in the middle of a character, dump out the
5013 partial character. */
5014 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5016 /* If we just saw a single-shift character, dump it out.
5017 This may dump out the wrong sort of single-shift character,
5018 but least it will give an indication that something went
5020 if (flags & CODING_STATE_SS2)
5022 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5023 flags &= ~CODING_STATE_SS2;
5025 if (flags & CODING_STATE_SS3)
5027 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5028 flags &= ~CODING_STATE_SS3;
5031 /***** Now handle the control characters. *****/
5034 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5036 flags &= CODING_STATE_ISO2022_LOCK;
5038 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5039 DECODE_ADD_BINARY_CHAR (c, dst);
5042 { /* Graphic characters */
5043 Lisp_Object charset;
5049 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5051 /* Now determine the charset. */
5052 reg = ((flags & CODING_STATE_SS2) ? 2
5053 : (flags & CODING_STATE_SS3) ? 3
5054 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5055 : str->iso2022.register_left);
5056 charset = str->iso2022.charset[reg];
5058 /* Error checking: */
5059 if (! CHARSETP (charset)
5060 || str->iso2022.invalid_designated[reg]
5061 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5062 && XCHARSET_CHARS (charset) == 94))
5063 /* Mrmph. We are trying to invoke a register that has no
5064 or an invalid charset in it, or trying to add a character
5065 outside the range of the charset. Insert that char literally
5066 to preserve it for the output. */
5068 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5069 DECODE_ADD_BINARY_CHAR (c, dst);
5074 /* Things are probably hunky-dorey. */
5076 /* Fetch reverse charset, maybe. */
5077 if (((flags & CODING_STATE_R2L) &&
5078 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5080 (!(flags & CODING_STATE_R2L) &&
5081 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5083 Lisp_Object new_charset =
5084 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5085 if (!NILP (new_charset))
5086 charset = new_charset;
5090 if (XCHARSET_DIMENSION (charset) == 1)
5092 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5094 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5099 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5105 lb = XCHARSET_LEADING_BYTE (charset);
5106 switch (XCHARSET_REP_BYTES (charset))
5109 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5110 Dynarr_add (dst, c & 0x7F);
5113 case 2: /* one-byte official */
5114 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5115 Dynarr_add (dst, lb);
5116 Dynarr_add (dst, c | 0x80);
5119 case 3: /* one-byte private or two-byte official */
5120 if (XCHARSET_PRIVATE_P (charset))
5122 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5123 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5124 Dynarr_add (dst, lb);
5125 Dynarr_add (dst, c | 0x80);
5131 Dynarr_add (dst, lb);
5132 Dynarr_add (dst, ch | 0x80);
5133 Dynarr_add (dst, c | 0x80);
5141 default: /* two-byte private */
5144 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5145 Dynarr_add (dst, lb);
5146 Dynarr_add (dst, ch | 0x80);
5147 Dynarr_add (dst, c | 0x80);
5157 flags &= CODING_STATE_ISO2022_LOCK;
5160 label_continue_loop:;
5163 if (flags & CODING_STATE_END)
5164 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5171 /***** ISO2022 encoder *****/
5173 /* Designate CHARSET into register REG. */
5176 iso2022_designate (Lisp_Object charset, unsigned char reg,
5177 struct encoding_stream *str, unsigned_char_dynarr *dst)
5179 static CONST char inter94[] = "()*+";
5180 static CONST char inter96[] = ",-./";
5182 unsigned char final;
5183 Lisp_Object old_charset = str->iso2022.charset[reg];
5185 str->iso2022.charset[reg] = charset;
5186 if (!CHARSETP (charset))
5187 /* charset might be an initial nil or t. */
5189 type = XCHARSET_TYPE (charset);
5190 final = XCHARSET_FINAL (charset);
5191 if (!str->iso2022.force_charset_on_output[reg] &&
5192 CHARSETP (old_charset) &&
5193 XCHARSET_TYPE (old_charset) == type &&
5194 XCHARSET_FINAL (old_charset) == final)
5197 str->iso2022.force_charset_on_output[reg] = 0;
5200 charset_conversion_spec_dynarr *dyn =
5201 str->codesys->iso2022.output_conv;
5207 for (i = 0; i < Dynarr_length (dyn); i++)
5209 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5210 if (EQ (charset, spec->from_charset))
5211 charset = spec->to_charset;
5216 Dynarr_add (dst, ISO_CODE_ESC);
5219 case CHARSET_TYPE_94:
5220 Dynarr_add (dst, inter94[reg]);
5222 case CHARSET_TYPE_96:
5223 Dynarr_add (dst, inter96[reg]);
5225 case CHARSET_TYPE_94X94:
5226 Dynarr_add (dst, '$');
5228 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5231 Dynarr_add (dst, inter94[reg]);
5233 case CHARSET_TYPE_96X96:
5234 Dynarr_add (dst, '$');
5235 Dynarr_add (dst, inter96[reg]);
5238 Dynarr_add (dst, final);
5242 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5244 if (str->iso2022.register_left != 0)
5246 Dynarr_add (dst, ISO_CODE_SI);
5247 str->iso2022.register_left = 0;
5252 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5254 if (str->iso2022.register_left != 1)
5256 Dynarr_add (dst, ISO_CODE_SO);
5257 str->iso2022.register_left = 1;
5261 /* Convert internally-formatted data to ISO2022 format. */
5264 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5265 unsigned_char_dynarr *dst, unsigned int n)
5267 unsigned char charmask, c;
5268 unsigned char char_boundary;
5269 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5270 unsigned int flags = str->flags;
5271 Emchar ch = str->ch;
5272 Lisp_Coding_System *codesys = str->codesys;
5273 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5275 Lisp_Object charset;
5278 unsigned int byte1, byte2;
5281 #ifdef ENABLE_COMPOSITE_CHARS
5282 /* flags for handling composite chars. We do a little switcharoo
5283 on the source while we're outputting the composite char. */
5284 unsigned int saved_n = 0;
5285 CONST unsigned char *saved_src = NULL;
5286 int in_composite = 0;
5287 #endif /* ENABLE_COMPOSITE_CHARS */
5289 char_boundary = str->iso2022.current_char_boundary;
5290 charset = str->iso2022.current_charset;
5291 half = str->iso2022.current_half;
5293 #ifdef ENABLE_COMPOSITE_CHARS
5301 switch (char_boundary)
5309 else if ( c >= 0xf8 )
5314 else if ( c >= 0xf0 )
5319 else if ( c >= 0xe0 )
5324 else if ( c >= 0xc0 )
5333 restore_left_to_right_direction (codesys, dst, &flags, 0);
5335 /* Make sure G0 contains ASCII */
5336 if ((c > ' ' && c < ISO_CODE_DEL) ||
5337 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5339 ensure_normal_shift (str, dst);
5340 iso2022_designate (Vcharset_ascii, 0, str, dst);
5343 /* If necessary, restore everything to the default state
5346 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5348 restore_left_to_right_direction (codesys, dst, &flags, 0);
5350 ensure_normal_shift (str, dst);
5352 for (i = 0; i < 4; i++)
5354 Lisp_Object initial_charset =
5355 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5356 iso2022_designate (initial_charset, i, str, dst);
5361 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5362 Dynarr_add (dst, '\r');
5363 if (eol_type != EOL_CR)
5364 Dynarr_add (dst, c);
5368 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5369 && fit_to_be_escape_quoted (c))
5370 Dynarr_add (dst, ISO_CODE_ESC);
5371 Dynarr_add (dst, c);
5377 ch = ( ch << 6 ) | ( c & 0x3f );
5380 if ( (0x80 <= ch) && (ch <= 0x9f) )
5382 charmask = (half == 0 ? 0x00 : 0x80);
5384 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5385 && fit_to_be_escape_quoted (ch))
5386 Dynarr_add (dst, ISO_CODE_ESC);
5387 /* you asked for it ... */
5388 Dynarr_add (dst, ch);
5394 BREAKUP_CHAR (ch, charset, byte1, byte2);
5395 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5396 codesys, dst, &flags, 0);
5398 /* Now determine which register to use. */
5400 for (i = 0; i < 4; i++)
5402 if (EQ (charset, str->iso2022.charset[i]) ||
5404 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5413 if (XCHARSET_GRAPHIC (charset) != 0)
5415 if (!NILP (str->iso2022.charset[1]) &&
5416 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5417 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5419 else if (!NILP (str->iso2022.charset[2]))
5421 else if (!NILP (str->iso2022.charset[3]))
5430 iso2022_designate (charset, reg, str, dst);
5432 /* Now invoke that register. */
5436 ensure_normal_shift (str, dst);
5441 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5443 ensure_shift_out (str, dst);
5451 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5453 Dynarr_add (dst, ISO_CODE_ESC);
5454 Dynarr_add (dst, 'N');
5459 Dynarr_add (dst, ISO_CODE_SS2);
5465 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5467 Dynarr_add (dst, ISO_CODE_ESC);
5468 Dynarr_add (dst, 'O');
5473 Dynarr_add (dst, ISO_CODE_SS3);
5482 charmask = (half == 0 ? 0x00 : 0x80);
5484 switch (XCHARSET_DIMENSION (charset))
5487 Dynarr_add (dst, byte1 | charmask);
5490 Dynarr_add (dst, byte1 | charmask);
5491 Dynarr_add (dst, byte2 | charmask);
5500 ch = ( ch << 6 ) | ( c & 0x3f );
5504 #else /* not UTF2000 */
5510 if (BYTE_ASCII_P (c))
5511 { /* Processing ASCII character */
5514 restore_left_to_right_direction (codesys, dst, &flags, 0);
5516 /* Make sure G0 contains ASCII */
5517 if ((c > ' ' && c < ISO_CODE_DEL) ||
5518 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5520 ensure_normal_shift (str, dst);
5521 iso2022_designate (Vcharset_ascii, 0, str, dst);
5524 /* If necessary, restore everything to the default state
5527 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5529 restore_left_to_right_direction (codesys, dst, &flags, 0);
5531 ensure_normal_shift (str, dst);
5533 for (i = 0; i < 4; i++)
5535 Lisp_Object initial_charset =
5536 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5537 iso2022_designate (initial_charset, i, str, dst);
5542 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5543 Dynarr_add (dst, '\r');
5544 if (eol_type != EOL_CR)
5545 Dynarr_add (dst, c);
5549 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5550 && fit_to_be_escape_quoted (c))
5551 Dynarr_add (dst, ISO_CODE_ESC);
5552 Dynarr_add (dst, c);
5557 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5558 { /* Processing Leading Byte */
5560 charset = CHARSET_BY_LEADING_BYTE (c);
5561 if (LEADING_BYTE_PREFIX_P(c))
5563 else if (!EQ (charset, Vcharset_control_1)
5564 #ifdef ENABLE_COMPOSITE_CHARS
5565 && !EQ (charset, Vcharset_composite)
5571 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5572 codesys, dst, &flags, 0);
5574 /* Now determine which register to use. */
5576 for (i = 0; i < 4; i++)
5578 if (EQ (charset, str->iso2022.charset[i]) ||
5580 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5589 if (XCHARSET_GRAPHIC (charset) != 0)
5591 if (!NILP (str->iso2022.charset[1]) &&
5592 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5593 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5595 else if (!NILP (str->iso2022.charset[2]))
5597 else if (!NILP (str->iso2022.charset[3]))
5606 iso2022_designate (charset, reg, str, dst);
5608 /* Now invoke that register. */
5612 ensure_normal_shift (str, dst);
5617 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5619 ensure_shift_out (str, dst);
5627 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5629 Dynarr_add (dst, ISO_CODE_ESC);
5630 Dynarr_add (dst, 'N');
5635 Dynarr_add (dst, ISO_CODE_SS2);
5641 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5643 Dynarr_add (dst, ISO_CODE_ESC);
5644 Dynarr_add (dst, 'O');
5649 Dynarr_add (dst, ISO_CODE_SS3);
5661 { /* Processing Non-ASCII character */
5662 charmask = (half == 0 ? 0x7F : 0xFF);
5664 if (EQ (charset, Vcharset_control_1))
5666 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5667 && fit_to_be_escape_quoted (c))
5668 Dynarr_add (dst, ISO_CODE_ESC);
5669 /* you asked for it ... */
5670 Dynarr_add (dst, c - 0x20);
5674 switch (XCHARSET_REP_BYTES (charset))
5677 Dynarr_add (dst, c & charmask);
5680 if (XCHARSET_PRIVATE_P (charset))
5682 Dynarr_add (dst, c & charmask);
5687 #ifdef ENABLE_COMPOSITE_CHARS
5688 if (EQ (charset, Vcharset_composite))
5692 /* #### Bother! We don't know how to
5694 Dynarr_add (dst, '~');
5698 Emchar emch = MAKE_CHAR (Vcharset_composite,
5699 ch & 0x7F, c & 0x7F);
5700 Lisp_Object lstr = composite_char_string (emch);
5704 src = XSTRING_DATA (lstr);
5705 n = XSTRING_LENGTH (lstr);
5706 Dynarr_add (dst, ISO_CODE_ESC);
5707 Dynarr_add (dst, '0'); /* start composing */
5711 #endif /* ENABLE_COMPOSITE_CHARS */
5713 Dynarr_add (dst, ch & charmask);
5714 Dynarr_add (dst, c & charmask);
5727 Dynarr_add (dst, ch & charmask);
5728 Dynarr_add (dst, c & charmask);
5743 #endif /* not UTF2000 */
5745 #ifdef ENABLE_COMPOSITE_CHARS
5751 Dynarr_add (dst, ISO_CODE_ESC);
5752 Dynarr_add (dst, '1'); /* end composing */
5753 goto back_to_square_n; /* Wheeeeeeeee ..... */
5755 #endif /* ENABLE_COMPOSITE_CHARS */
5758 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5760 if (char_boundary && flags & CODING_STATE_END)
5763 restore_left_to_right_direction (codesys, dst, &flags, 0);
5764 ensure_normal_shift (str, dst);
5765 for (i = 0; i < 4; i++)
5767 Lisp_Object initial_charset =
5768 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5769 iso2022_designate (initial_charset, i, str, dst);
5775 str->iso2022.current_char_boundary = char_boundary;
5776 str->iso2022.current_charset = charset;
5777 str->iso2022.current_half = half;
5779 /* Verbum caro factum est! */
5783 /************************************************************************/
5784 /* No-conversion methods */
5785 /************************************************************************/
5787 /* This is used when reading in "binary" files -- i.e. files that may
5788 contain all 256 possible byte values and that are not to be
5789 interpreted as being in any particular decoding. */
5791 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5792 unsigned_char_dynarr *dst, unsigned int n)
5795 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5796 unsigned int flags = str->flags;
5797 unsigned int ch = str->ch;
5798 eol_type_t eol_type = str->eol_type;
5804 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5805 DECODE_ADD_BINARY_CHAR (c, dst);
5806 label_continue_loop:;
5809 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5816 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5817 unsigned_char_dynarr *dst, unsigned int n)
5820 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5821 unsigned int flags = str->flags;
5822 unsigned int ch = str->ch;
5823 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5825 unsigned char char_boundary = str->iso2022.current_char_boundary;
5832 switch (char_boundary)
5840 else if ( c >= 0xf8 )
5845 else if ( c >= 0xf0 )
5850 else if ( c >= 0xe0 )
5855 else if ( c >= 0xc0 )
5866 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5867 Dynarr_add (dst, '\r');
5868 if (eol_type != EOL_CR)
5869 Dynarr_add (dst, c);
5872 Dynarr_add (dst, c);
5877 ch = ( ch << 6 ) | ( c & 0x3f );
5878 Dynarr_add (dst, ch & 0xff);
5882 ch = ( ch << 6 ) | ( c & 0x3f );
5885 #else /* not UTF2000 */
5888 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5889 Dynarr_add (dst, '\r');
5890 if (eol_type != EOL_CR)
5891 Dynarr_add (dst, '\n');
5894 else if (BYTE_ASCII_P (c))
5897 Dynarr_add (dst, c);
5899 else if (BUFBYTE_LEADING_BYTE_P (c))
5902 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5903 c == LEADING_BYTE_CONTROL_1)
5906 Dynarr_add (dst, '~'); /* untranslatable character */
5910 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5911 Dynarr_add (dst, c);
5912 else if (ch == LEADING_BYTE_CONTROL_1)
5915 Dynarr_add (dst, c - 0x20);
5917 /* else it should be the second or third byte of an
5918 untranslatable character, so ignore it */
5921 #endif /* not UTF2000 */
5927 str->iso2022.current_char_boundary = char_boundary;
5932 /************************************************************************/
5933 /* Simple internal/external functions */
5934 /************************************************************************/
5936 static Extbyte_dynarr *conversion_out_dynarr;
5937 static Bufbyte_dynarr *conversion_in_dynarr;
5939 /* Determine coding system from coding format */
5941 /* #### not correct for all values of `fmt'! */
5943 external_data_format_to_coding_system (enum external_data_format fmt)
5947 case FORMAT_FILENAME:
5948 case FORMAT_TERMINAL:
5949 if (EQ (Vfile_name_coding_system, Qnil) ||
5950 EQ (Vfile_name_coding_system, Qbinary))
5953 return Fget_coding_system (Vfile_name_coding_system);
5956 return Fget_coding_system (Qctext);
5964 convert_to_external_format (CONST Bufbyte *ptr,
5967 enum external_data_format fmt)
5969 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5971 if (!conversion_out_dynarr)
5972 conversion_out_dynarr = Dynarr_new (Extbyte);
5974 Dynarr_reset (conversion_out_dynarr);
5976 if (NILP (coding_system))
5978 CONST Bufbyte *end = ptr + len;
5984 (*ptr < 0xc0) ? *ptr :
5985 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5988 (BYTE_ASCII_P (*ptr)) ? *ptr :
5989 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5990 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5993 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5997 #ifdef ERROR_CHECK_BUFPOS
5998 assert (ptr == end);
6003 Lisp_Object instream, outstream, da_outstream;
6004 Lstream *istr, *ostr;
6005 struct gcpro gcpro1, gcpro2, gcpro3;
6006 char tempbuf[1024]; /* some random amount */
6008 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
6009 da_outstream = make_dynarr_output_stream
6010 ((unsigned_char_dynarr *) conversion_out_dynarr);
6012 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
6013 istr = XLSTREAM (instream);
6014 ostr = XLSTREAM (outstream);
6015 GCPRO3 (instream, outstream, da_outstream);
6018 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
6021 Lstream_write (ostr, tempbuf, size_in_bytes);
6023 Lstream_close (istr);
6024 Lstream_close (ostr);
6026 Lstream_delete (istr);
6027 Lstream_delete (ostr);
6028 Lstream_delete (XLSTREAM (da_outstream));
6031 *len_out = Dynarr_length (conversion_out_dynarr);
6032 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
6033 return Dynarr_atp (conversion_out_dynarr, 0);
6037 convert_from_external_format (CONST Extbyte *ptr,
6040 enum external_data_format fmt)
6042 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
6044 if (!conversion_in_dynarr)
6045 conversion_in_dynarr = Dynarr_new (Bufbyte);
6047 Dynarr_reset (conversion_in_dynarr);
6049 if (NILP (coding_system))
6051 CONST Extbyte *end = ptr + len;
6052 for (; ptr < end; ptr++)
6055 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
6060 Lisp_Object instream, outstream, da_outstream;
6061 Lstream *istr, *ostr;
6062 struct gcpro gcpro1, gcpro2, gcpro3;
6063 char tempbuf[1024]; /* some random amount */
6065 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
6066 da_outstream = make_dynarr_output_stream
6067 ((unsigned_char_dynarr *) conversion_in_dynarr);
6069 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
6070 istr = XLSTREAM (instream);
6071 ostr = XLSTREAM (outstream);
6072 GCPRO3 (instream, outstream, da_outstream);
6075 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
6078 Lstream_write (ostr, tempbuf, size_in_bytes);
6080 Lstream_close (istr);
6081 Lstream_close (ostr);
6083 Lstream_delete (istr);
6084 Lstream_delete (ostr);
6085 Lstream_delete (XLSTREAM (da_outstream));
6088 *len_out = Dynarr_length (conversion_in_dynarr);
6089 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
6090 return Dynarr_atp (conversion_in_dynarr, 0);
6094 /************************************************************************/
6095 /* Initialization */
6096 /************************************************************************/
6099 syms_of_file_coding (void)
6101 deferror (&Qcoding_system_error, "coding-system-error",
6102 "Coding-system error", Qio_error);
6104 DEFSUBR (Fcoding_system_p);
6105 DEFSUBR (Ffind_coding_system);
6106 DEFSUBR (Fget_coding_system);
6107 DEFSUBR (Fcoding_system_list);
6108 DEFSUBR (Fcoding_system_name);
6109 DEFSUBR (Fmake_coding_system);
6110 DEFSUBR (Fcopy_coding_system);
6111 DEFSUBR (Fdefine_coding_system_alias);
6112 DEFSUBR (Fsubsidiary_coding_system);
6114 DEFSUBR (Fcoding_system_type);
6115 DEFSUBR (Fcoding_system_doc_string);
6117 DEFSUBR (Fcoding_system_charset);
6119 DEFSUBR (Fcoding_system_property);
6121 DEFSUBR (Fcoding_category_list);
6122 DEFSUBR (Fset_coding_priority_list);
6123 DEFSUBR (Fcoding_priority_list);
6124 DEFSUBR (Fset_coding_category_system);
6125 DEFSUBR (Fcoding_category_system);
6127 DEFSUBR (Fdetect_coding_region);
6128 DEFSUBR (Fdecode_coding_region);
6129 DEFSUBR (Fencode_coding_region);
6131 DEFSUBR (Fdecode_shift_jis_char);
6132 DEFSUBR (Fencode_shift_jis_char);
6133 DEFSUBR (Fdecode_big5_char);
6134 DEFSUBR (Fencode_big5_char);
6136 DEFSUBR (Fset_ucs_char);
6137 DEFSUBR (Fucs_char);
6138 DEFSUBR (Fset_char_ucs);
6139 DEFSUBR (Fchar_ucs);
6140 #endif /* not UTF2000 */
6142 defsymbol (&Qcoding_systemp, "coding-system-p");
6143 defsymbol (&Qno_conversion, "no-conversion");
6144 defsymbol (&Qraw_text, "raw-text");
6146 defsymbol (&Qbig5, "big5");
6147 defsymbol (&Qshift_jis, "shift-jis");
6148 defsymbol (&Qucs4, "ucs-4");
6149 defsymbol (&Qutf8, "utf-8");
6150 defsymbol (&Qccl, "ccl");
6151 defsymbol (&Qiso2022, "iso2022");
6153 defsymbol (&Qmnemonic, "mnemonic");
6154 defsymbol (&Qeol_type, "eol-type");
6155 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6156 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6158 defsymbol (&Qcr, "cr");
6159 defsymbol (&Qlf, "lf");
6160 defsymbol (&Qcrlf, "crlf");
6161 defsymbol (&Qeol_cr, "eol-cr");
6162 defsymbol (&Qeol_lf, "eol-lf");
6163 defsymbol (&Qeol_crlf, "eol-crlf");
6165 defsymbol (&Qcharset_g0, "charset-g0");
6166 defsymbol (&Qcharset_g1, "charset-g1");
6167 defsymbol (&Qcharset_g2, "charset-g2");
6168 defsymbol (&Qcharset_g3, "charset-g3");
6169 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6170 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6171 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6172 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6173 defsymbol (&Qno_iso6429, "no-iso6429");
6174 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6175 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6177 defsymbol (&Qshort, "short");
6178 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6179 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6180 defsymbol (&Qseven, "seven");
6181 defsymbol (&Qlock_shift, "lock-shift");
6182 defsymbol (&Qescape_quoted, "escape-quoted");
6184 defsymbol (&Qencode, "encode");
6185 defsymbol (&Qdecode, "decode");
6188 defsymbol (&Qctext, "ctext");
6189 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6191 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6193 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6195 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6197 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6199 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6201 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6203 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6205 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6208 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6213 lstream_type_create_file_coding (void)
6215 LSTREAM_HAS_METHOD (decoding, reader);
6216 LSTREAM_HAS_METHOD (decoding, writer);
6217 LSTREAM_HAS_METHOD (decoding, rewinder);
6218 LSTREAM_HAS_METHOD (decoding, seekable_p);
6219 LSTREAM_HAS_METHOD (decoding, flusher);
6220 LSTREAM_HAS_METHOD (decoding, closer);
6221 LSTREAM_HAS_METHOD (decoding, marker);
6223 LSTREAM_HAS_METHOD (encoding, reader);
6224 LSTREAM_HAS_METHOD (encoding, writer);
6225 LSTREAM_HAS_METHOD (encoding, rewinder);
6226 LSTREAM_HAS_METHOD (encoding, seekable_p);
6227 LSTREAM_HAS_METHOD (encoding, flusher);
6228 LSTREAM_HAS_METHOD (encoding, closer);
6229 LSTREAM_HAS_METHOD (encoding, marker);
6233 vars_of_file_coding (void)
6237 fcd = xnew (struct file_coding_dump);
6238 dumpstruct (&fcd, &fcd_description);
6240 /* Initialize to something reasonable ... */
6241 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6243 fcd->coding_category_system[i] = Qnil;
6244 fcd->coding_category_by_priority[i] = i;
6247 Fprovide (intern ("file-coding"));
6249 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6250 Coding system used for TTY keyboard input.
6251 Not used under a windowing system.
6253 Vkeyboard_coding_system = Qnil;
6255 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6256 Coding system used for TTY display output.
6257 Not used under a windowing system.
6259 Vterminal_coding_system = Qnil;
6261 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6262 Overriding coding system used when writing a file or process.
6263 You should *bind* this, not set it. If this is non-nil, it specifies
6264 the coding system that will be used when a file or process is read
6265 in, and overrides `buffer-file-coding-system-for-read',
6266 `insert-file-contents-pre-hook', etc. Use those variables instead of
6267 this one for permanent changes to the environment.
6269 Vcoding_system_for_read = Qnil;
6271 DEFVAR_LISP ("coding-system-for-write",
6272 &Vcoding_system_for_write /*
6273 Overriding coding system used when writing a file or process.
6274 You should *bind* this, not set it. If this is non-nil, it specifies
6275 the coding system that will be used when a file or process is wrote
6276 in, and overrides `buffer-file-coding-system',
6277 `write-region-pre-hook', etc. Use those variables instead of this one
6278 for permanent changes to the environment.
6280 Vcoding_system_for_write = Qnil;
6282 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6283 Coding system used to convert pathnames when accessing files.
6285 Vfile_name_coding_system = Qnil;
6287 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6288 Non-nil means the buffer contents are regarded as multi-byte form
6289 of characters, not a binary code. This affects the display, file I/O,
6290 and behaviors of various editing commands.
6292 Setting this to nil does not do anything.
6294 enable_multibyte_characters = 1;
6298 complex_vars_of_file_coding (void)
6300 staticpro (&Vcoding_system_hash_table);
6301 Vcoding_system_hash_table =
6302 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6304 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6305 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6307 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6309 struct codesys_prop csp; \
6311 csp.prop_type = (Prop_Type); \
6312 Dynarr_add (the_codesys_prop_dynarr, csp); \
6315 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6316 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6317 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6318 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6319 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6320 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6321 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6323 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6324 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6325 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6326 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6327 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6328 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6329 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6330 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6331 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6332 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6333 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6334 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6335 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6336 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6337 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6338 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6339 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6341 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6342 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6344 /* Need to create this here or we're really screwed. */
6346 (Qraw_text, Qno_conversion,
6347 build_string ("Raw text, which means it converts only line-break-codes."),
6348 list2 (Qmnemonic, build_string ("Raw")));
6351 (Qbinary, Qno_conversion,
6352 build_string ("Binary, which means it does not convert anything."),
6353 list4 (Qeol_type, Qlf,
6354 Qmnemonic, build_string ("Binary")));
6359 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6360 list2 (Qmnemonic, build_string ("UTF8")));
6363 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6365 /* Need this for bootstrapping */
6366 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6367 Fget_coding_system (Qraw_text);
6370 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6371 = Fget_coding_system (Qutf8);
6374 #if defined(MULE) && !defined(UTF2000)
6378 for (i = 0; i < 65536; i++)
6379 fcd->ucs_to_mule_table[i] = Qnil;
6381 staticpro (&mule_to_ucs_table);
6382 mule_to_ucs_table = Fmake_char_table(Qgeneric);