1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST];
61 #if defined(MULE) && !defined(UTF2000)
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
68 #if defined(MULE) && !defined(UTF2000)
69 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
74 static const struct struct_description fcd_description = {
75 sizeof (struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis (struct detection_state *st,
179 const Extbyte *src, size_t n);
180 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
181 unsigned_char_dynarr *dst, size_t n);
182 static void encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
183 unsigned_char_dynarr *dst, size_t n);
184 static int detect_coding_big5 (struct detection_state *st,
185 const Extbyte *src, size_t n);
186 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
187 unsigned_char_dynarr *dst, size_t n);
188 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
189 unsigned_char_dynarr *dst, size_t n);
190 static int detect_coding_ucs4 (struct detection_state *st,
191 const Extbyte *src, size_t n);
192 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
193 unsigned_char_dynarr *dst, size_t n);
194 static void encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
195 unsigned_char_dynarr *dst, size_t n);
196 static int detect_coding_utf8 (struct detection_state *st,
197 const Extbyte *src, size_t n);
198 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
199 unsigned_char_dynarr *dst, size_t n);
200 static void encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
201 unsigned_char_dynarr *dst, size_t n);
202 static int postprocess_iso2022_mask (int mask);
203 static void reset_iso2022 (Lisp_Object coding_system,
204 struct iso2022_decoder *iso);
205 static int detect_coding_iso2022 (struct detection_state *st,
206 const Extbyte *src, size_t n);
207 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
208 unsigned_char_dynarr *dst, size_t n);
209 static void encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
210 unsigned_char_dynarr *dst, size_t n);
212 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
213 unsigned_char_dynarr *dst, size_t n);
214 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
215 unsigned_char_dynarr *dst, size_t n);
216 static void mule_decode (Lstream *decoding, const Extbyte *src,
217 unsigned_char_dynarr *dst, size_t n);
218 static void mule_encode (Lstream *encoding, const Bufbyte *src,
219 unsigned_char_dynarr *dst, size_t n);
221 typedef struct codesys_prop codesys_prop;
230 Dynarr_declare (codesys_prop);
231 } codesys_prop_dynarr;
233 static const struct lrecord_description codesys_prop_description_1[] = {
234 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
238 static const struct struct_description codesys_prop_description = {
239 sizeof (codesys_prop),
240 codesys_prop_description_1
243 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
244 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
248 static const struct struct_description codesys_prop_dynarr_description = {
249 sizeof (codesys_prop_dynarr),
250 codesys_prop_dynarr_description_1
253 codesys_prop_dynarr *the_codesys_prop_dynarr;
255 enum codesys_prop_enum
258 CODESYS_PROP_ISO2022,
263 /************************************************************************/
264 /* Coding system functions */
265 /************************************************************************/
267 static Lisp_Object mark_coding_system (Lisp_Object);
268 static void print_coding_system (Lisp_Object, Lisp_Object, int);
269 static void finalize_coding_system (void *header, int for_disksave);
272 static const struct lrecord_description ccs_description_1[] = {
273 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
274 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
278 static const struct struct_description ccs_description = {
279 sizeof (charset_conversion_spec),
283 static const struct lrecord_description ccsd_description_1[] = {
284 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
288 static const struct struct_description ccsd_description = {
289 sizeof (charset_conversion_spec_dynarr),
294 static const struct lrecord_description coding_system_description[] = {
295 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
296 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
297 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
298 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
299 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
300 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
301 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
302 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
304 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
305 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
306 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
307 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
308 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
313 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
314 mark_coding_system, print_coding_system,
315 finalize_coding_system,
316 0, 0, coding_system_description,
320 mark_coding_system (Lisp_Object obj)
322 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
324 mark_object (CODING_SYSTEM_NAME (codesys));
325 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
326 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
327 mark_object (CODING_SYSTEM_EOL_LF (codesys));
328 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
329 mark_object (CODING_SYSTEM_EOL_CR (codesys));
331 switch (CODING_SYSTEM_TYPE (codesys))
335 case CODESYS_ISO2022:
336 for (i = 0; i < 4; i++)
337 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
338 if (codesys->iso2022.input_conv)
340 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
342 struct charset_conversion_spec *ccs =
343 Dynarr_atp (codesys->iso2022.input_conv, i);
344 mark_object (ccs->from_charset);
345 mark_object (ccs->to_charset);
348 if (codesys->iso2022.output_conv)
350 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
352 struct charset_conversion_spec *ccs =
353 Dynarr_atp (codesys->iso2022.output_conv, i);
354 mark_object (ccs->from_charset);
355 mark_object (ccs->to_charset);
361 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
362 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
369 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
370 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
374 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
377 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
379 error ("printing unreadable object #<coding_system 0x%x>",
382 write_c_string ("#<coding_system ", printcharfun);
383 print_internal (c->name, printcharfun, 1);
384 write_c_string (">", printcharfun);
388 finalize_coding_system (void *header, int for_disksave)
390 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
391 /* Since coding systems never go away, this function is not
392 necessary. But it would be necessary if we changed things
393 so that coding systems could go away. */
394 if (!for_disksave) /* see comment in lstream.c */
396 switch (CODING_SYSTEM_TYPE (c))
399 case CODESYS_ISO2022:
400 if (c->iso2022.input_conv)
402 Dynarr_free (c->iso2022.input_conv);
403 c->iso2022.input_conv = 0;
405 if (c->iso2022.output_conv)
407 Dynarr_free (c->iso2022.output_conv);
408 c->iso2022.output_conv = 0;
419 symbol_to_eol_type (Lisp_Object symbol)
421 CHECK_SYMBOL (symbol);
422 if (NILP (symbol)) return EOL_AUTODETECT;
423 if (EQ (symbol, Qlf)) return EOL_LF;
424 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
425 if (EQ (symbol, Qcr)) return EOL_CR;
427 signal_simple_error ("Unrecognized eol type", symbol);
428 return EOL_AUTODETECT; /* not reached */
432 eol_type_to_symbol (eol_type_t type)
437 case EOL_LF: return Qlf;
438 case EOL_CRLF: return Qcrlf;
439 case EOL_CR: return Qcr;
440 case EOL_AUTODETECT: return Qnil;
445 setup_eol_coding_systems (Lisp_Coding_System *codesys)
447 Lisp_Object codesys_obj;
448 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
449 char *codesys_name = (char *) alloca (len + 7);
451 char *codesys_mnemonic=0;
453 Lisp_Object codesys_name_sym, sub_codesys_obj;
457 XSETCODING_SYSTEM (codesys_obj, codesys);
459 memcpy (codesys_name,
460 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
462 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
464 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
465 codesys_mnemonic = (char *) alloca (mlen + 7);
466 memcpy (codesys_mnemonic,
467 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
470 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
471 strcpy (codesys_name + len, "-" op_sys); \
473 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
474 codesys_name_sym = intern (codesys_name); \
475 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
476 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
478 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
479 build_string (codesys_mnemonic); \
480 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
483 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
484 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
485 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
488 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
489 Return t if OBJECT is a coding system.
490 A coding system is an object that defines how text containing multiple
491 character sets is encoded into a stream of (typically 8-bit) bytes.
492 The coding system is used to decode the stream into a series of
493 characters (which may be from multiple charsets) when the text is read
494 from a file or process, and is used to encode the text back into the
495 same format when it is written out to a file or process.
497 For example, many ISO2022-compliant coding systems (such as Compound
498 Text, which is used for inter-client data under the X Window System)
499 use escape sequences to switch between different charsets -- Japanese
500 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
501 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
502 `make-coding-system' for more information.
504 Coding systems are normally identified using a symbol, and the
505 symbol is accepted in place of the actual coding system object whenever
506 a coding system is called for. (This is similar to how faces work.)
510 return CODING_SYSTEMP (object) ? Qt : Qnil;
513 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
514 Retrieve the coding system of the given name.
516 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
517 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
518 If there is no such coding system, nil is returned. Otherwise the
519 associated coding system object is returned.
521 (coding_system_or_name))
523 if (NILP (coding_system_or_name))
524 coding_system_or_name = Qbinary;
525 else if (CODING_SYSTEMP (coding_system_or_name))
526 return coding_system_or_name;
528 CHECK_SYMBOL (coding_system_or_name);
532 coding_system_or_name =
533 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
535 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
536 return coding_system_or_name;
540 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
541 Retrieve the coding system of the given name.
542 Same as `find-coding-system' except that if there is no such
543 coding system, an error is signaled instead of returning nil.
547 Lisp_Object coding_system = Ffind_coding_system (name);
549 if (NILP (coding_system))
550 signal_simple_error ("No such coding system", name);
551 return coding_system;
554 /* We store the coding systems in hash tables with the names as the key and the
555 actual coding system object as the value. Occasionally we need to use them
556 in a list format. These routines provide us with that. */
557 struct coding_system_list_closure
559 Lisp_Object *coding_system_list;
563 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
564 void *coding_system_list_closure)
566 /* This function can GC */
567 struct coding_system_list_closure *cscl =
568 (struct coding_system_list_closure *) coding_system_list_closure;
569 Lisp_Object *coding_system_list = cscl->coding_system_list;
571 *coding_system_list = Fcons (key, *coding_system_list);
575 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
576 Return a list of the names of all defined coding systems.
580 Lisp_Object coding_system_list = Qnil;
582 struct coding_system_list_closure coding_system_list_closure;
584 GCPRO1 (coding_system_list);
585 coding_system_list_closure.coding_system_list = &coding_system_list;
586 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
587 &coding_system_list_closure);
590 return coding_system_list;
593 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
594 Return the name of the given coding system.
598 coding_system = Fget_coding_system (coding_system);
599 return XCODING_SYSTEM_NAME (coding_system);
602 static Lisp_Coding_System *
603 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
605 Lisp_Coding_System *codesys =
606 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
608 zero_lcrecord (codesys);
609 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
610 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
611 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
612 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
613 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
614 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
615 CODING_SYSTEM_TYPE (codesys) = type;
616 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
618 if (type == CODESYS_ISO2022)
621 for (i = 0; i < 4; i++)
622 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
624 else if (type == CODESYS_CCL)
626 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
627 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
630 CODING_SYSTEM_NAME (codesys) = name;
636 /* Given a list of charset conversion specs as specified in a Lisp
637 program, parse it into STORE_HERE. */
640 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
641 Lisp_Object spec_list)
645 EXTERNAL_LIST_LOOP (rest, spec_list)
647 Lisp_Object car = XCAR (rest);
648 Lisp_Object from, to;
649 struct charset_conversion_spec spec;
651 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
652 signal_simple_error ("Invalid charset conversion spec", car);
653 from = Fget_charset (XCAR (car));
654 to = Fget_charset (XCAR (XCDR (car)));
655 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
656 signal_simple_error_2
657 ("Attempted conversion between different charset types",
659 spec.from_charset = from;
660 spec.to_charset = to;
662 Dynarr_add (store_here, spec);
666 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
667 specs, return the equivalent as the Lisp programmer would see it.
669 If LOAD_HERE is 0, return Qnil. */
672 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
679 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
681 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
682 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
685 return Fnreverse (result);
690 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
691 Register symbol NAME as a coding system.
693 TYPE describes the conversion method used and should be one of
696 Automatic conversion. XEmacs attempts to detect the coding system
699 No conversion. Use this for binary files and such. On output,
700 graphic characters that are not in ASCII or Latin-1 will be
701 replaced by a ?. (For a no-conversion-encoded buffer, these
702 characters will only be present if you explicitly insert them.)
704 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
706 ISO 10646 UCS-4 encoding.
708 ISO 10646 UTF-8 encoding.
710 Any ISO2022-compliant encoding. Among other things, this includes
711 JIS (the Japanese encoding commonly used for e-mail), EUC (the
712 standard Unix encoding for Japanese and other languages), and
713 Compound Text (the encoding used in X11). You can specify more
714 specific information about the conversion with the PROPS argument.
716 Big5 (the encoding commonly used for Taiwanese).
718 The conversion is performed using a user-written pseudo-code
719 program. CCL (Code Conversion Language) is the name of this
722 Write out or read in the raw contents of the memory representing
723 the buffer's text. This is primarily useful for debugging
724 purposes, and is only enabled when XEmacs has been compiled with
725 DEBUG_XEMACS defined (via the --debug configure option).
726 WARNING: Reading in a file using 'internal conversion can result
727 in an internal inconsistency in the memory representing a
728 buffer's text, which will produce unpredictable results and may
729 cause XEmacs to crash. Under normal circumstances you should
730 never use 'internal conversion.
732 DOC-STRING is a string describing the coding system.
734 PROPS is a property list, describing the specific nature of the
735 character set. Recognized properties are:
738 String to be displayed in the modeline when this coding system is
742 End-of-line conversion to be used. It should be one of
745 Automatically detect the end-of-line type (LF, CRLF,
746 or CR). Also generate subsidiary coding systems named
747 `NAME-unix', `NAME-dos', and `NAME-mac', that are
748 identical to this coding system but have an EOL-TYPE
749 value of 'lf, 'crlf, and 'cr, respectively.
751 The end of a line is marked externally using ASCII LF.
752 Since this is also the way that XEmacs represents an
753 end-of-line internally, specifying this option results
754 in no end-of-line conversion. This is the standard
755 format for Unix text files.
757 The end of a line is marked externally using ASCII
758 CRLF. This is the standard format for MS-DOS text
761 The end of a line is marked externally using ASCII CR.
762 This is the standard format for Macintosh text files.
764 Automatically detect the end-of-line type but do not
765 generate subsidiary coding systems. (This value is
766 converted to nil when stored internally, and
767 `coding-system-property' will return nil.)
769 'post-read-conversion
770 Function called after a file has been read in, to perform the
771 decoding. Called with two arguments, START and END, denoting
772 a region of the current buffer to be decoded.
774 'pre-write-conversion
775 Function called before a file is written out, to perform the
776 encoding. Called with two arguments, START and END, denoting
777 a region of the current buffer to be encoded.
780 The following additional properties are recognized if TYPE is 'iso2022:
786 The character set initially designated to the G0 - G3 registers.
787 The value should be one of
789 -- A charset object (designate that character set)
790 -- nil (do not ever use this register)
791 -- t (no character set is initially designated to
792 the register, but may be later on; this automatically
793 sets the corresponding `force-g*-on-output' property)
799 If non-nil, send an explicit designation sequence on output before
800 using the specified register.
803 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
804 "ESC $ B" on output in place of the full designation sequences
805 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
808 If non-nil, don't designate ASCII to G0 at each end of line on output.
809 Setting this to non-nil also suppresses other state-resetting that
810 normally happens at the end of a line.
813 If non-nil, don't designate ASCII to G0 before control chars on output.
816 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
820 If non-nil, use locking-shift (SO/SI) instead of single-shift
821 or designation by escape sequence.
824 If non-nil, don't use ISO6429's direction specification.
827 If non-nil, literal control characters that are the same as
828 the beginning of a recognized ISO2022 or ISO6429 escape sequence
829 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
830 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
831 so that they can be properly distinguished from an escape sequence.
832 (Note that doing this results in a non-portable encoding.) This
833 encoding flag is used for byte-compiled files. Note that ESC
834 is a good choice for a quoting character because there are no
835 escape sequences whose second byte is a character from the Control-0
836 or Control-1 character sets; this is explicitly disallowed by the
839 'input-charset-conversion
840 A list of conversion specifications, specifying conversion of
841 characters in one charset to another when decoding is performed.
842 Each specification is a list of two elements: the source charset,
843 and the destination charset.
845 'output-charset-conversion
846 A list of conversion specifications, specifying conversion of
847 characters in one charset to another when encoding is performed.
848 The form of each specification is the same as for
849 'input-charset-conversion.
852 The following additional properties are recognized (and required)
856 CCL program used for decoding (converting to internal format).
859 CCL program used for encoding (converting to external format).
861 (name, type, doc_string, props))
863 Lisp_Coding_System *codesys;
864 enum coding_system_type ty;
865 int need_to_setup_eol_systems = 1;
867 /* Convert type to constant */
868 if (NILP (type) || EQ (type, Qundecided))
869 { ty = CODESYS_AUTODETECT; }
871 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
872 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
873 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
874 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
875 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
876 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
878 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
880 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
883 signal_simple_error ("Invalid coding system type", type);
887 codesys = allocate_coding_system (ty, name);
889 if (NILP (doc_string))
890 doc_string = build_string ("");
892 CHECK_STRING (doc_string);
893 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
896 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
898 if (EQ (key, Qmnemonic))
901 CHECK_STRING (value);
902 CODING_SYSTEM_MNEMONIC (codesys) = value;
905 else if (EQ (key, Qeol_type))
907 need_to_setup_eol_systems = NILP (value);
910 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
913 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
914 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
916 else if (ty == CODESYS_ISO2022)
918 #define FROB_INITIAL_CHARSET(charset_num) \
919 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
920 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
922 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
923 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
924 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
925 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
927 #define FROB_FORCE_CHARSET(charset_num) \
928 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
930 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
931 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
932 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
933 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
935 #define FROB_BOOLEAN_PROPERTY(prop) \
936 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
938 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
939 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
940 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
941 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
942 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
943 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
944 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
946 else if (EQ (key, Qinput_charset_conversion))
948 codesys->iso2022.input_conv =
949 Dynarr_new (charset_conversion_spec);
950 parse_charset_conversion_specs (codesys->iso2022.input_conv,
953 else if (EQ (key, Qoutput_charset_conversion))
955 codesys->iso2022.output_conv =
956 Dynarr_new (charset_conversion_spec);
957 parse_charset_conversion_specs (codesys->iso2022.output_conv,
961 signal_simple_error ("Unrecognized property", key);
963 else if (EQ (type, Qccl))
966 struct ccl_program test_ccl;
969 /* Check key first. */
970 if (EQ (key, Qdecode))
971 suffix = "-ccl-decode";
972 else if (EQ (key, Qencode))
973 suffix = "-ccl-encode";
975 signal_simple_error ("Unrecognized property", key);
977 /* If value is vector, register it as a ccl program
978 associated with an newly created symbol for
979 backward compatibility. */
982 sym = Fintern (concat2 (Fsymbol_name (name),
983 build_string (suffix)),
985 Fregister_ccl_program (sym, value);
989 CHECK_SYMBOL (value);
992 /* check if the given ccl programs are valid. */
993 if (setup_ccl_program (&test_ccl, sym) < 0)
994 signal_simple_error ("Invalid CCL program", value);
996 if (EQ (key, Qdecode))
997 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
998 else if (EQ (key, Qencode))
999 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1004 signal_simple_error ("Unrecognized property", key);
1008 if (need_to_setup_eol_systems)
1009 setup_eol_coding_systems (codesys);
1012 Lisp_Object codesys_obj;
1013 XSETCODING_SYSTEM (codesys_obj, codesys);
1014 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1019 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1020 Copy OLD-CODING-SYSTEM to NEW-NAME.
1021 If NEW-NAME does not name an existing coding system, a new one will
1024 (old_coding_system, new_name))
1026 Lisp_Object new_coding_system;
1027 old_coding_system = Fget_coding_system (old_coding_system);
1028 new_coding_system = Ffind_coding_system (new_name);
1029 if (NILP (new_coding_system))
1031 XSETCODING_SYSTEM (new_coding_system,
1032 allocate_coding_system
1033 (XCODING_SYSTEM_TYPE (old_coding_system),
1035 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1039 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1040 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1041 memcpy (((char *) to ) + sizeof (to->header),
1042 ((char *) from) + sizeof (from->header),
1043 sizeof (*from) - sizeof (from->header));
1044 to->name = new_name;
1046 return new_coding_system;
1049 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1050 Return t if OBJECT names a coding system, and is not a coding system alias.
1054 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1058 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1059 Return t if OBJECT is a coding system alias.
1060 All coding system aliases are created by `define-coding-system-alias'.
1064 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1068 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1069 Return the coding-system symbol for which symbol ALIAS is an alias.
1073 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1074 if (SYMBOLP (aliasee))
1077 signal_simple_error ("Symbol is not a coding system alias", alias);
1078 return Qnil; /* To keep the compiler happy */
1082 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1084 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1088 /* A maphash function, for removing dangling coding system aliases. */
1090 dangling_coding_system_alias_p (Lisp_Object alias,
1091 Lisp_Object aliasee,
1092 void *dangling_aliases)
1094 if (SYMBOLP (aliasee)
1095 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1097 (*(int *) dangling_aliases)++;
1104 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1105 Define symbol ALIAS as an alias for coding system ALIASEE.
1107 You can use this function to redefine an alias that has already been defined,
1108 but you cannot redefine a name which is the canonical name for a coding system.
1109 \(a canonical name of a coding system is what is returned when you call
1110 `coding-system-name' on a coding system).
1112 ALIASEE itself can be an alias, which allows you to define nested aliases.
1114 You are forbidden, however, from creating alias loops or `dangling' aliases.
1115 These will be detected, and an error will be signaled if you attempt to do so.
1117 If ALIASEE is nil, then ALIAS will simply be undefined.
1119 See also `coding-system-alias-p', `coding-system-aliasee',
1120 and `coding-system-canonical-name-p'.
1124 Lisp_Object real_coding_system, probe;
1126 CHECK_SYMBOL (alias);
1128 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1130 ("Symbol is the canonical name of a coding system and cannot be redefined",
1135 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1136 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1137 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1139 Fremhash (alias, Vcoding_system_hash_table);
1141 /* Undefine subsidiary aliases,
1142 presumably created by a previous call to this function */
1143 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1144 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1145 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1147 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1148 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1149 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1152 /* Undefine dangling coding system aliases. */
1154 int dangling_aliases;
1157 dangling_aliases = 0;
1158 elisp_map_remhash (dangling_coding_system_alias_p,
1159 Vcoding_system_hash_table,
1161 } while (dangling_aliases > 0);
1167 if (CODING_SYSTEMP (aliasee))
1168 aliasee = XCODING_SYSTEM_NAME (aliasee);
1170 /* Checks that aliasee names a coding-system */
1171 real_coding_system = Fget_coding_system (aliasee);
1173 /* Check for coding system alias loops */
1174 if (EQ (alias, aliasee))
1175 alias_loop: signal_simple_error_2
1176 ("Attempt to create a coding system alias loop", alias, aliasee);
1178 for (probe = aliasee;
1180 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1182 if (EQ (probe, alias))
1186 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1188 /* Set up aliases for subsidiaries.
1189 #### There must be a better way to handle subsidiary coding systems. */
1191 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1193 for (i = 0; i < countof (suffixes); i++)
1195 Lisp_Object alias_subsidiary =
1196 append_suffix_to_symbol (alias, suffixes[i]);
1197 Lisp_Object aliasee_subsidiary =
1198 append_suffix_to_symbol (aliasee, suffixes[i]);
1200 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1201 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1204 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1205 but it doesn't look intentional, so I'd rather return something
1206 meaningful or nothing at all. */
1211 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1213 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1214 Lisp_Object new_coding_system;
1216 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1217 return coding_system;
1221 case EOL_AUTODETECT: return coding_system;
1222 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1223 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1224 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1225 default: abort (); return Qnil;
1228 return NILP (new_coding_system) ? coding_system : new_coding_system;
1231 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1232 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1234 (coding_system, eol_type))
1236 coding_system = Fget_coding_system (coding_system);
1238 return subsidiary_coding_system (coding_system,
1239 symbol_to_eol_type (eol_type));
1243 /************************************************************************/
1244 /* Coding system accessors */
1245 /************************************************************************/
1247 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1248 Return the doc string for CODING-SYSTEM.
1252 coding_system = Fget_coding_system (coding_system);
1253 return XCODING_SYSTEM_DOC_STRING (coding_system);
1256 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1257 Return the type of CODING-SYSTEM.
1261 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1264 case CODESYS_AUTODETECT: return Qundecided;
1266 case CODESYS_SHIFT_JIS: return Qshift_jis;
1267 case CODESYS_ISO2022: return Qiso2022;
1268 case CODESYS_BIG5: return Qbig5;
1269 case CODESYS_UCS4: return Qucs4;
1270 case CODESYS_UTF8: return Qutf8;
1271 case CODESYS_CCL: return Qccl;
1273 case CODESYS_NO_CONVERSION: return Qno_conversion;
1275 case CODESYS_INTERNAL: return Qinternal;
1282 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1285 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1287 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1290 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1291 Return initial charset of CODING-SYSTEM designated to GNUM.
1294 (coding_system, gnum))
1296 coding_system = Fget_coding_system (coding_system);
1299 return coding_system_charset (coding_system, XINT (gnum));
1303 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1304 Return the PROP property of CODING-SYSTEM.
1306 (coding_system, prop))
1309 enum coding_system_type type;
1311 coding_system = Fget_coding_system (coding_system);
1312 CHECK_SYMBOL (prop);
1313 type = XCODING_SYSTEM_TYPE (coding_system);
1315 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1316 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1319 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1321 case CODESYS_PROP_ALL_OK:
1324 case CODESYS_PROP_ISO2022:
1325 if (type != CODESYS_ISO2022)
1327 ("Property only valid in ISO2022 coding systems",
1331 case CODESYS_PROP_CCL:
1332 if (type != CODESYS_CCL)
1334 ("Property only valid in CCL coding systems",
1344 signal_simple_error ("Unrecognized property", prop);
1346 if (EQ (prop, Qname))
1347 return XCODING_SYSTEM_NAME (coding_system);
1348 else if (EQ (prop, Qtype))
1349 return Fcoding_system_type (coding_system);
1350 else if (EQ (prop, Qdoc_string))
1351 return XCODING_SYSTEM_DOC_STRING (coding_system);
1352 else if (EQ (prop, Qmnemonic))
1353 return XCODING_SYSTEM_MNEMONIC (coding_system);
1354 else if (EQ (prop, Qeol_type))
1355 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1356 else if (EQ (prop, Qeol_lf))
1357 return XCODING_SYSTEM_EOL_LF (coding_system);
1358 else if (EQ (prop, Qeol_crlf))
1359 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1360 else if (EQ (prop, Qeol_cr))
1361 return XCODING_SYSTEM_EOL_CR (coding_system);
1362 else if (EQ (prop, Qpost_read_conversion))
1363 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1364 else if (EQ (prop, Qpre_write_conversion))
1365 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1367 else if (type == CODESYS_ISO2022)
1369 if (EQ (prop, Qcharset_g0))
1370 return coding_system_charset (coding_system, 0);
1371 else if (EQ (prop, Qcharset_g1))
1372 return coding_system_charset (coding_system, 1);
1373 else if (EQ (prop, Qcharset_g2))
1374 return coding_system_charset (coding_system, 2);
1375 else if (EQ (prop, Qcharset_g3))
1376 return coding_system_charset (coding_system, 3);
1378 #define FORCE_CHARSET(charset_num) \
1379 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1380 (coding_system, charset_num) ? Qt : Qnil)
1382 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1383 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1384 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1385 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1387 #define LISP_BOOLEAN(prop) \
1388 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1390 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1391 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1392 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1393 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1394 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1395 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1396 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1398 else if (EQ (prop, Qinput_charset_conversion))
1400 unparse_charset_conversion_specs
1401 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1402 else if (EQ (prop, Qoutput_charset_conversion))
1404 unparse_charset_conversion_specs
1405 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1409 else if (type == CODESYS_CCL)
1411 if (EQ (prop, Qdecode))
1412 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1413 else if (EQ (prop, Qencode))
1414 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1422 return Qnil; /* not reached */
1426 /************************************************************************/
1427 /* Coding category functions */
1428 /************************************************************************/
1431 decode_coding_category (Lisp_Object symbol)
1435 CHECK_SYMBOL (symbol);
1436 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1437 if (EQ (coding_category_symbol[i], symbol))
1440 signal_simple_error ("Unrecognized coding category", symbol);
1441 return 0; /* not reached */
1444 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1445 Return a list of all recognized coding categories.
1450 Lisp_Object list = Qnil;
1452 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1453 list = Fcons (coding_category_symbol[i], list);
1457 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1458 Change the priority order of the coding categories.
1459 LIST should be list of coding categories, in descending order of
1460 priority. Unspecified coding categories will be lower in priority
1461 than all specified ones, in the same relative order they were in
1466 int category_to_priority[CODING_CATEGORY_LAST];
1470 /* First generate a list that maps coding categories to priorities. */
1472 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1473 category_to_priority[i] = -1;
1475 /* Highest priority comes from the specified list. */
1477 EXTERNAL_LIST_LOOP (rest, list)
1479 int cat = decode_coding_category (XCAR (rest));
1481 if (category_to_priority[cat] >= 0)
1482 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1483 category_to_priority[cat] = i++;
1486 /* Now go through the existing categories by priority to retrieve
1487 the categories not yet specified and preserve their priority
1489 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1491 int cat = fcd->coding_category_by_priority[j];
1492 if (category_to_priority[cat] < 0)
1493 category_to_priority[cat] = i++;
1496 /* Now we need to construct the inverse of the mapping we just
1499 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1500 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1502 /* Phew! That was confusing. */
1506 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1507 Return a list of coding categories in descending order of priority.
1512 Lisp_Object list = Qnil;
1514 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1515 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1520 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1521 Change the coding system associated with a coding category.
1523 (coding_category, coding_system))
1525 int cat = decode_coding_category (coding_category);
1527 coding_system = Fget_coding_system (coding_system);
1528 fcd->coding_category_system[cat] = coding_system;
1532 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1533 Return the coding system associated with a coding category.
1537 int cat = decode_coding_category (coding_category);
1538 Lisp_Object sys = fcd->coding_category_system[cat];
1541 return XCODING_SYSTEM_NAME (sys);
1546 /************************************************************************/
1547 /* Detecting the encoding of data */
1548 /************************************************************************/
1550 struct detection_state
1552 eol_type_t eol_type;
1588 struct iso2022_decoder iso;
1590 int high_byte_count;
1591 unsigned int saw_single_shift:1;
1604 acceptable_control_char_p (int c)
1608 /* Allow and ignore control characters that you might
1609 reasonably see in a text file */
1614 case 8: /* backspace */
1615 case 11: /* vertical tab */
1616 case 12: /* form feed */
1617 case 26: /* MS-DOS C-z junk */
1618 case 31: /* '^_' -- for info */
1626 mask_has_at_most_one_bit_p (int mask)
1628 /* Perhaps the only thing useful you learn from intensive Microsoft
1629 technical interviews */
1630 return (mask & (mask - 1)) == 0;
1634 detect_eol_type (struct detection_state *st, const Extbyte *src,
1639 unsigned char c = *(unsigned char *)src++;
1642 if (st->eol.just_saw_cr)
1644 else if (st->eol.seen_anything)
1647 else if (st->eol.just_saw_cr)
1650 st->eol.just_saw_cr = 1;
1652 st->eol.just_saw_cr = 0;
1653 st->eol.seen_anything = 1;
1656 return EOL_AUTODETECT;
1659 /* Attempt to determine the encoding and EOL type of the given text.
1660 Before calling this function for the first type, you must initialize
1661 st->eol_type as appropriate and initialize st->mask to ~0.
1663 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1666 st->mask holds the determined coding category mask, or ~0 if only
1667 ASCII has been seen so far.
1671 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1672 is present in st->mask
1673 1 == definitive answers are here for both st->eol_type and st->mask
1677 detect_coding_type (struct detection_state *st, const Extbyte *src,
1678 size_t n, int just_do_eol)
1680 if (st->eol_type == EOL_AUTODETECT)
1681 st->eol_type = detect_eol_type (st, src, n);
1684 return st->eol_type != EOL_AUTODETECT;
1686 if (!st->seen_non_ascii)
1688 for (; n; n--, src++)
1690 unsigned char c = *(unsigned char *) src;
1691 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1693 st->seen_non_ascii = 1;
1695 st->shift_jis.mask = ~0;
1699 st->iso2022.mask = ~0;
1709 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1710 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1711 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1712 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1713 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1714 st->big5.mask = detect_coding_big5 (st, src, n);
1715 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1716 st->utf8.mask = detect_coding_utf8 (st, src, n);
1717 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1718 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1721 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1722 | st->utf8.mask | st->ucs4.mask;
1725 int retval = mask_has_at_most_one_bit_p (st->mask);
1726 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1727 return retval && st->eol_type != EOL_AUTODETECT;
1732 coding_system_from_mask (int mask)
1736 /* If the file was entirely or basically ASCII, use the
1737 default value of `buffer-file-coding-system'. */
1738 Lisp_Object retval =
1739 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1742 retval = Ffind_coding_system (retval);
1746 (Qbad_variable, Qwarning,
1747 "Invalid `default-buffer-file-coding-system', set to nil");
1748 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1752 retval = Fget_coding_system (Qraw_text);
1760 mask = postprocess_iso2022_mask (mask);
1762 /* Look through the coding categories by priority and find
1763 the first one that is allowed. */
1764 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1766 cat = fcd->coding_category_by_priority[i];
1767 if ((mask & (1 << cat)) &&
1768 !NILP (fcd->coding_category_system[cat]))
1772 return fcd->coding_category_system[cat];
1774 return Fget_coding_system (Qraw_text);
1778 /* Given a seekable read stream and potential coding system and EOL type
1779 as specified, do any autodetection that is called for. If the
1780 coding system and/or EOL type are not `autodetect', they will be left
1781 alone; but this function will never return an autodetect coding system
1784 This function does not automatically fetch subsidiary coding systems;
1785 that should be unnecessary with the explicit eol-type argument. */
1787 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1790 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1791 eol_type_t *eol_type_in_out)
1793 struct detection_state decst;
1795 if (*eol_type_in_out == EOL_AUTODETECT)
1796 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1799 decst.eol_type = *eol_type_in_out;
1802 /* If autodetection is called for, do it now. */
1803 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1804 || *eol_type_in_out == EOL_AUTODETECT)
1807 Lisp_Object coding_system = Qnil;
1809 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1812 /* Look for initial "-*-"; mode line prefix */
1814 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1819 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1821 Extbyte *local_vars_beg = p + 3;
1822 /* Look for final "-*-"; mode line suffix */
1823 for (p = local_vars_beg,
1824 scan_end = buf + nread - LENGTH ("-*-");
1829 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1831 Extbyte *suffix = p;
1832 /* Look for "coding:" */
1833 for (p = local_vars_beg,
1834 scan_end = suffix - LENGTH ("coding:?");
1837 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1838 && (p == local_vars_beg
1839 || (*(p-1) == ' ' ||
1845 p += LENGTH ("coding:");
1846 while (*p == ' ' || *p == '\t') p++;
1848 /* Get coding system name */
1849 save = *suffix; *suffix = '\0';
1850 /* Characters valid in a MIME charset name (rfc 1521),
1851 and in a Lisp symbol name. */
1852 n = strspn ( (char *) p,
1853 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1854 "abcdefghijklmnopqrstuvwxyz"
1860 save = p[n]; p[n] = '\0';
1862 Ffind_coding_system (intern ((char *) p));
1872 if (NILP (coding_system))
1875 if (detect_coding_type (&decst, buf, nread,
1876 XCODING_SYSTEM_TYPE (*codesys_in_out)
1877 != CODESYS_AUTODETECT))
1879 nread = Lstream_read (stream, buf, sizeof (buf));
1885 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1886 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1889 if (detect_coding_type (&decst, buf, nread, 1))
1891 nread = Lstream_read (stream, buf, sizeof (buf));
1897 *eol_type_in_out = decst.eol_type;
1898 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1900 if (NILP (coding_system))
1901 *codesys_in_out = coding_system_from_mask (decst.mask);
1903 *codesys_in_out = coding_system;
1907 /* If we absolutely can't determine the EOL type, just assume LF. */
1908 if (*eol_type_in_out == EOL_AUTODETECT)
1909 *eol_type_in_out = EOL_LF;
1911 Lstream_rewind (stream);
1914 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1915 Detect coding system of the text in the region between START and END.
1916 Return a list of possible coding systems ordered by priority.
1917 If only ASCII characters are found, return 'undecided or one of
1918 its subsidiary coding systems according to a detected end-of-line
1919 type. Optional arg BUFFER defaults to the current buffer.
1921 (start, end, buffer))
1923 Lisp_Object val = Qnil;
1924 struct buffer *buf = decode_buffer (buffer, 0);
1926 Lisp_Object instream, lb_instream;
1927 Lstream *istr, *lb_istr;
1928 struct detection_state decst;
1929 struct gcpro gcpro1, gcpro2;
1931 get_buffer_range_char (buf, start, end, &b, &e, 0);
1932 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1933 lb_istr = XLSTREAM (lb_instream);
1934 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1935 istr = XLSTREAM (instream);
1936 GCPRO2 (instream, lb_instream);
1938 decst.eol_type = EOL_AUTODETECT;
1942 Extbyte random_buffer[4096];
1943 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1947 if (detect_coding_type (&decst, random_buffer, nread, 0))
1951 if (decst.mask == ~0)
1952 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1960 decst.mask = postprocess_iso2022_mask (decst.mask);
1962 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1964 int sys = fcd->coding_category_by_priority[i];
1965 if (decst.mask & (1 << sys))
1967 Lisp_Object codesys = fcd->coding_category_system[sys];
1968 if (!NILP (codesys))
1969 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1970 val = Fcons (codesys, val);
1974 Lstream_close (istr);
1976 Lstream_delete (istr);
1977 Lstream_delete (lb_istr);
1982 /************************************************************************/
1983 /* Converting to internal Mule format ("decoding") */
1984 /************************************************************************/
1986 /* A decoding stream is a stream used for decoding text (i.e.
1987 converting from some external format to internal format).
1988 The decoding-stream object keeps track of the actual coding
1989 stream, the stream that is at the other end, and data that
1990 needs to be persistent across the lifetime of the stream. */
1992 /* Handle the EOL stuff related to just-read-in character C.
1993 EOL_TYPE is the EOL type of the coding stream.
1994 FLAGS is the current value of FLAGS in the coding stream, and may
1995 be modified by this macro. (The macro only looks at the
1996 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1997 bytes are to be written. You need to also define a local goto
1998 label "label_continue_loop" that is at the end of the main
1999 character-reading loop.
2001 If C is a CR character, then this macro handles it entirely and
2002 jumps to label_continue_loop. Otherwise, this macro does not add
2003 anything to DST, and continues normally. You should continue
2004 processing C normally after this macro. */
2006 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2010 if (eol_type == EOL_CR) \
2011 Dynarr_add (dst, '\n'); \
2012 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2013 Dynarr_add (dst, c); \
2015 flags |= CODING_STATE_CR; \
2016 goto label_continue_loop; \
2018 else if (flags & CODING_STATE_CR) \
2019 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2021 Dynarr_add (dst, '\r'); \
2022 flags &= ~CODING_STATE_CR; \
2026 /* C should be a binary character in the range 0 - 255; convert
2027 to internal format and add to Dynarr DST. */
2030 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2032 if (BYTE_ASCII_P (c)) \
2033 Dynarr_add (dst, c); \
2036 Dynarr_add (dst, (c >> 6) | 0xc0); \
2037 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2042 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2046 Dynarr_add (dst, c);
2048 else if ( c <= 0x7ff )
2050 Dynarr_add (dst, (c >> 6) | 0xc0);
2051 Dynarr_add (dst, (c & 0x3f) | 0x80);
2053 else if ( c <= 0xffff )
2055 Dynarr_add (dst, (c >> 12) | 0xe0);
2056 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2057 Dynarr_add (dst, (c & 0x3f) | 0x80);
2059 else if ( c <= 0x1fffff )
2061 Dynarr_add (dst, (c >> 18) | 0xf0);
2062 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2063 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2064 Dynarr_add (dst, (c & 0x3f) | 0x80);
2066 else if ( c <= 0x3ffffff )
2068 Dynarr_add (dst, (c >> 24) | 0xf8);
2069 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2070 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2071 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2072 Dynarr_add (dst, (c & 0x3f) | 0x80);
2076 Dynarr_add (dst, (c >> 30) | 0xfc);
2077 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2078 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2079 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2080 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2081 Dynarr_add (dst, (c & 0x3f) | 0x80);
2085 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2087 if (BYTE_ASCII_P (c)) \
2088 Dynarr_add (dst, c); \
2089 else if (BYTE_C1_P (c)) \
2091 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2092 Dynarr_add (dst, c + 0x20); \
2096 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2097 Dynarr_add (dst, c); \
2102 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2106 DECODE_ADD_BINARY_CHAR (ch, dst); \
2111 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2113 if (flags & CODING_STATE_END) \
2115 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2116 if (flags & CODING_STATE_CR) \
2117 Dynarr_add (dst, '\r'); \
2121 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2123 struct decoding_stream
2125 /* Coding system that governs the conversion. */
2126 Lisp_Coding_System *codesys;
2128 /* Stream that we read the encoded data from or
2129 write the decoded data to. */
2132 /* If we are reading, then we can return only a fixed amount of
2133 data, so if the conversion resulted in too much data, we store it
2134 here for retrieval the next time around. */
2135 unsigned_char_dynarr *runoff;
2137 /* FLAGS holds flags indicating the current state of the decoding.
2138 Some of these flags are dependent on the coding system. */
2141 /* CH holds a partially built-up character. Since we only deal
2142 with one- and two-byte characters at the moment, we only use
2143 this to store the first byte of a two-byte character. */
2146 /* EOL_TYPE specifies the type of end-of-line conversion that
2147 currently applies. We need to keep this separate from the
2148 EOL type stored in CODESYS because the latter might indicate
2149 automatic EOL-type detection while the former will always
2150 indicate a particular EOL type. */
2151 eol_type_t eol_type;
2153 /* Additional ISO2022 information. We define the structure above
2154 because it's also needed by the detection routines. */
2155 struct iso2022_decoder iso2022;
2157 /* Additional information (the state of the running CCL program)
2158 used by the CCL decoder. */
2159 struct ccl_program ccl;
2161 /* counter for UTF-8 or UCS-4 */
2162 unsigned char counter;
2164 struct detection_state decst;
2167 static ssize_t decoding_reader (Lstream *stream,
2168 unsigned char *data, size_t size);
2169 static ssize_t decoding_writer (Lstream *stream,
2170 const unsigned char *data, size_t size);
2171 static int decoding_rewinder (Lstream *stream);
2172 static int decoding_seekable_p (Lstream *stream);
2173 static int decoding_flusher (Lstream *stream);
2174 static int decoding_closer (Lstream *stream);
2176 static Lisp_Object decoding_marker (Lisp_Object stream);
2178 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2179 sizeof (struct decoding_stream));
2182 decoding_marker (Lisp_Object stream)
2184 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2185 Lisp_Object str_obj;
2187 /* We do not need to mark the coding systems or charsets stored
2188 within the stream because they are stored in a global list
2189 and automatically marked. */
2191 XSETLSTREAM (str_obj, str);
2192 mark_object (str_obj);
2193 if (str->imp->marker)
2194 return (str->imp->marker) (str_obj);
2199 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2200 so we read data from the other end, decode it, and store it into DATA. */
2203 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2205 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2206 unsigned char *orig_data = data;
2208 int error_occurred = 0;
2210 /* We need to interface to mule_decode(), which expects to take some
2211 amount of data and store the result into a Dynarr. We have
2212 mule_decode() store into str->runoff, and take data from there
2215 /* We loop until we have enough data, reading chunks from the other
2216 end and decoding it. */
2219 /* Take data from the runoff if we can. Make sure to take at
2220 most SIZE bytes, and delete the data from the runoff. */
2221 if (Dynarr_length (str->runoff) > 0)
2223 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2224 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2225 Dynarr_delete_many (str->runoff, 0, chunk);
2231 break; /* No more room for data */
2233 if (str->flags & CODING_STATE_END)
2234 /* This means that on the previous iteration, we hit the EOF on
2235 the other end. We loop once more so that mule_decode() can
2236 output any final stuff it may be holding, or any "go back
2237 to a sane state" escape sequences. (This latter makes sense
2238 during encoding.) */
2241 /* Exhausted the runoff, so get some more. DATA has at least
2242 SIZE bytes left of storage in it, so it's OK to read directly
2243 into it. (We'll be overwriting above, after we've decoded it
2244 into the runoff.) */
2245 read_size = Lstream_read (str->other_end, data, size);
2252 /* There might be some more end data produced in the translation.
2253 See the comment above. */
2254 str->flags |= CODING_STATE_END;
2255 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2258 if (data - orig_data == 0)
2259 return error_occurred ? -1 : 0;
2261 return data - orig_data;
2265 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2267 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2270 /* Decode all our data into the runoff, and then attempt to write
2271 it all out to the other end. Remove whatever chunk we succeeded
2273 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2274 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2275 Dynarr_length (str->runoff));
2277 Dynarr_delete_many (str->runoff, 0, retval);
2278 /* Do NOT return retval. The return value indicates how much
2279 of the incoming data was written, not how many bytes were
2285 reset_decoding_stream (struct decoding_stream *str)
2288 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2290 Lisp_Object coding_system;
2291 XSETCODING_SYSTEM (coding_system, str->codesys);
2292 reset_iso2022 (coding_system, &str->iso2022);
2294 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2296 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2300 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2301 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2304 str->decst.eol_type = EOL_AUTODETECT;
2305 str->decst.mask = ~0;
2307 str->flags = str->ch = 0;
2311 decoding_rewinder (Lstream *stream)
2313 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2314 reset_decoding_stream (str);
2315 Dynarr_reset (str->runoff);
2316 return Lstream_rewind (str->other_end);
2320 decoding_seekable_p (Lstream *stream)
2322 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2323 return Lstream_seekable_p (str->other_end);
2327 decoding_flusher (Lstream *stream)
2329 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2330 return Lstream_flush (str->other_end);
2334 decoding_closer (Lstream *stream)
2336 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2337 if (stream->flags & LSTREAM_FL_WRITE)
2339 str->flags |= CODING_STATE_END;
2340 decoding_writer (stream, 0, 0);
2342 Dynarr_free (str->runoff);
2344 #ifdef ENABLE_COMPOSITE_CHARS
2345 if (str->iso2022.composite_chars)
2346 Dynarr_free (str->iso2022.composite_chars);
2349 return Lstream_close (str->other_end);
2353 decoding_stream_coding_system (Lstream *stream)
2355 Lisp_Object coding_system;
2356 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2358 XSETCODING_SYSTEM (coding_system, str->codesys);
2359 return subsidiary_coding_system (coding_system, str->eol_type);
2363 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2365 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2366 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2368 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2369 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2370 reset_decoding_stream (str);
2373 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2374 stream for writing, no automatic code detection will be performed.
2375 The reason for this is that automatic code detection requires a
2376 seekable input. Things will also fail if you open a decoding
2377 stream for reading using a non-fully-specified coding system and
2378 a non-seekable input stream. */
2381 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2384 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2385 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2389 str->other_end = stream;
2390 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2391 str->eol_type = EOL_AUTODETECT;
2392 if (!strcmp (mode, "r")
2393 && Lstream_seekable_p (stream))
2394 /* We can determine the coding system now. */
2395 determine_real_coding_system (stream, &codesys, &str->eol_type);
2396 set_decoding_stream_coding_system (lstr, codesys);
2397 str->decst.eol_type = str->eol_type;
2398 str->decst.mask = ~0;
2399 XSETLSTREAM (obj, lstr);
2404 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2406 return make_decoding_stream_1 (stream, codesys, "r");
2410 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2412 return make_decoding_stream_1 (stream, codesys, "w");
2415 /* Note: the decode_coding_* functions all take the same
2416 arguments as mule_decode(), which is to say some SRC data of
2417 size N, which is to be stored into dynamic array DST.
2418 DECODING is the stream within which the decoding is
2419 taking place, but no data is actually read from or
2420 written to that stream; that is handled in decoding_reader()
2421 or decoding_writer(). This allows the same functions to
2422 be used for both reading and writing. */
2425 mule_decode (Lstream *decoding, const Extbyte *src,
2426 unsigned_char_dynarr *dst, size_t n)
2428 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2430 /* If necessary, do encoding-detection now. We do this when
2431 we're a writing stream or a non-seekable reading stream,
2432 meaning that we can't just process the whole input,
2433 rewind, and start over. */
2435 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2436 str->eol_type == EOL_AUTODETECT)
2438 Lisp_Object codesys;
2440 XSETCODING_SYSTEM (codesys, str->codesys);
2441 detect_coding_type (&str->decst, src, n,
2442 CODING_SYSTEM_TYPE (str->codesys) !=
2443 CODESYS_AUTODETECT);
2444 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2445 str->decst.mask != ~0)
2446 /* #### This is cheesy. What we really ought to do is
2447 buffer up a certain amount of data so as to get a
2448 less random result. */
2449 codesys = coding_system_from_mask (str->decst.mask);
2450 str->eol_type = str->decst.eol_type;
2451 if (XCODING_SYSTEM (codesys) != str->codesys)
2453 /* Preserve the CODING_STATE_END flag in case it was set.
2454 If we erase it, bad things might happen. */
2455 int was_end = str->flags & CODING_STATE_END;
2456 set_decoding_stream_coding_system (decoding, codesys);
2458 str->flags |= CODING_STATE_END;
2462 switch (CODING_SYSTEM_TYPE (str->codesys))
2465 case CODESYS_INTERNAL:
2466 Dynarr_add_many (dst, src, n);
2469 case CODESYS_AUTODETECT:
2470 /* If we got this far and still haven't decided on the coding
2471 system, then do no conversion. */
2472 case CODESYS_NO_CONVERSION:
2473 decode_coding_no_conversion (decoding, src, dst, n);
2476 case CODESYS_SHIFT_JIS:
2477 decode_coding_sjis (decoding, src, dst, n);
2480 decode_coding_big5 (decoding, src, dst, n);
2483 decode_coding_ucs4 (decoding, src, dst, n);
2486 decode_coding_utf8 (decoding, src, dst, n);
2489 str->ccl.last_block = str->flags & CODING_STATE_END;
2490 /* When applying ccl program to stream, MUST NOT set NULL
2492 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2493 dst, n, 0, CCL_MODE_DECODING);
2495 case CODESYS_ISO2022:
2496 decode_coding_iso2022 (decoding, src, dst, n);
2504 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2505 Decode the text between START and END which is encoded in CODING-SYSTEM.
2506 This is useful if you've read in encoded text from a file without decoding
2507 it (e.g. you read in a JIS-formatted file but used the `binary' or
2508 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2509 Return length of decoded text.
2510 BUFFER defaults to the current buffer if unspecified.
2512 (start, end, coding_system, buffer))
2515 struct buffer *buf = decode_buffer (buffer, 0);
2516 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2517 Lstream *istr, *ostr;
2518 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2520 get_buffer_range_char (buf, start, end, &b, &e, 0);
2522 barf_if_buffer_read_only (buf, b, e);
2524 coding_system = Fget_coding_system (coding_system);
2525 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2526 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2527 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2529 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2530 Fget_coding_system (Qbinary));
2531 istr = XLSTREAM (instream);
2532 ostr = XLSTREAM (outstream);
2533 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2535 /* The chain of streams looks like this:
2537 [BUFFER] <----- send through
2538 ------> [ENCODE AS BINARY]
2539 ------> [DECODE AS SPECIFIED]
2545 char tempbuf[1024]; /* some random amount */
2546 Bufpos newpos, even_newer_pos;
2547 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2548 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2552 newpos = lisp_buffer_stream_startpos (istr);
2553 Lstream_write (ostr, tempbuf, size_in_bytes);
2554 even_newer_pos = lisp_buffer_stream_startpos (istr);
2555 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2558 Lstream_close (istr);
2559 Lstream_close (ostr);
2561 Lstream_delete (istr);
2562 Lstream_delete (ostr);
2563 Lstream_delete (XLSTREAM (de_outstream));
2564 Lstream_delete (XLSTREAM (lb_outstream));
2569 /************************************************************************/
2570 /* Converting to an external encoding ("encoding") */
2571 /************************************************************************/
2573 /* An encoding stream is an output stream. When you create the
2574 stream, you specify the coding system that governs the encoding
2575 and another stream that the resulting encoded data is to be
2576 sent to, and then start sending data to it. */
2578 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2580 struct encoding_stream
2582 /* Coding system that governs the conversion. */
2583 Lisp_Coding_System *codesys;
2585 /* Stream that we read the encoded data from or
2586 write the decoded data to. */
2589 /* If we are reading, then we can return only a fixed amount of
2590 data, so if the conversion resulted in too much data, we store it
2591 here for retrieval the next time around. */
2592 unsigned_char_dynarr *runoff;
2594 /* FLAGS holds flags indicating the current state of the encoding.
2595 Some of these flags are dependent on the coding system. */
2598 /* CH holds a partially built-up character. Since we only deal
2599 with one- and two-byte characters at the moment, we only use
2600 this to store the first byte of a two-byte character. */
2603 /* Additional information used by the ISO2022 encoder. */
2606 /* CHARSET holds the character sets currently assigned to the G0
2607 through G3 registers. It is initialized from the array
2608 INITIAL_CHARSET in CODESYS. */
2609 Lisp_Object charset[4];
2611 /* Which registers are currently invoked into the left (GL) and
2612 right (GR) halves of the 8-bit encoding space? */
2613 int register_left, register_right;
2615 /* Whether we need to explicitly designate the charset in the
2616 G? register before using it. It is initialized from the
2617 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2618 unsigned char force_charset_on_output[4];
2620 /* Other state variables that need to be preserved across
2622 Lisp_Object current_charset;
2624 int current_char_boundary;
2627 /* Additional information (the state of the running CCL program)
2628 used by the CCL encoder. */
2629 struct ccl_program ccl;
2633 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2634 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2636 static int encoding_rewinder (Lstream *stream);
2637 static int encoding_seekable_p (Lstream *stream);
2638 static int encoding_flusher (Lstream *stream);
2639 static int encoding_closer (Lstream *stream);
2641 static Lisp_Object encoding_marker (Lisp_Object stream);
2643 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2644 sizeof (struct encoding_stream));
2647 encoding_marker (Lisp_Object stream)
2649 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2650 Lisp_Object str_obj;
2652 /* We do not need to mark the coding systems or charsets stored
2653 within the stream because they are stored in a global list
2654 and automatically marked. */
2656 XSETLSTREAM (str_obj, str);
2657 mark_object (str_obj);
2658 if (str->imp->marker)
2659 return (str->imp->marker) (str_obj);
2664 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2665 so we read data from the other end, encode it, and store it into DATA. */
2668 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2670 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2671 unsigned char *orig_data = data;
2673 int error_occurred = 0;
2675 /* We need to interface to mule_encode(), which expects to take some
2676 amount of data and store the result into a Dynarr. We have
2677 mule_encode() store into str->runoff, and take data from there
2680 /* We loop until we have enough data, reading chunks from the other
2681 end and encoding it. */
2684 /* Take data from the runoff if we can. Make sure to take at
2685 most SIZE bytes, and delete the data from the runoff. */
2686 if (Dynarr_length (str->runoff) > 0)
2688 int chunk = min ((int) size, Dynarr_length (str->runoff));
2689 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2690 Dynarr_delete_many (str->runoff, 0, chunk);
2696 break; /* No more room for data */
2698 if (str->flags & CODING_STATE_END)
2699 /* This means that on the previous iteration, we hit the EOF on
2700 the other end. We loop once more so that mule_encode() can
2701 output any final stuff it may be holding, or any "go back
2702 to a sane state" escape sequences. (This latter makes sense
2703 during encoding.) */
2706 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2707 left of storage in it, so it's OK to read directly into it.
2708 (We'll be overwriting above, after we've encoded it into the
2710 read_size = Lstream_read (str->other_end, data, size);
2717 /* There might be some more end data produced in the translation.
2718 See the comment above. */
2719 str->flags |= CODING_STATE_END;
2720 mule_encode (stream, data, str->runoff, read_size);
2723 if (data == orig_data)
2724 return error_occurred ? -1 : 0;
2726 return data - orig_data;
2730 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2732 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2735 /* Encode all our data into the runoff, and then attempt to write
2736 it all out to the other end. Remove whatever chunk we succeeded
2738 mule_encode (stream, data, str->runoff, size);
2739 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2740 Dynarr_length (str->runoff));
2742 Dynarr_delete_many (str->runoff, 0, retval);
2743 /* Do NOT return retval. The return value indicates how much
2744 of the incoming data was written, not how many bytes were
2750 reset_encoding_stream (struct encoding_stream *str)
2753 switch (CODING_SYSTEM_TYPE (str->codesys))
2755 case CODESYS_ISO2022:
2759 for (i = 0; i < 4; i++)
2761 str->iso2022.charset[i] =
2762 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2763 str->iso2022.force_charset_on_output[i] =
2764 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2766 str->iso2022.register_left = 0;
2767 str->iso2022.register_right = 1;
2768 str->iso2022.current_charset = Qnil;
2769 str->iso2022.current_half = 0;
2771 str->iso2022.current_char_boundary = 0;
2773 str->iso2022.current_char_boundary = 1;
2778 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2785 str->flags = str->ch = 0;
2789 encoding_rewinder (Lstream *stream)
2791 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2792 reset_encoding_stream (str);
2793 Dynarr_reset (str->runoff);
2794 return Lstream_rewind (str->other_end);
2798 encoding_seekable_p (Lstream *stream)
2800 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2801 return Lstream_seekable_p (str->other_end);
2805 encoding_flusher (Lstream *stream)
2807 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2808 return Lstream_flush (str->other_end);
2812 encoding_closer (Lstream *stream)
2814 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2815 if (stream->flags & LSTREAM_FL_WRITE)
2817 str->flags |= CODING_STATE_END;
2818 encoding_writer (stream, 0, 0);
2820 Dynarr_free (str->runoff);
2821 return Lstream_close (str->other_end);
2825 encoding_stream_coding_system (Lstream *stream)
2827 Lisp_Object coding_system;
2828 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2830 XSETCODING_SYSTEM (coding_system, str->codesys);
2831 return coding_system;
2835 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2837 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2838 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2840 reset_encoding_stream (str);
2844 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2847 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2848 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2852 str->runoff = Dynarr_new (unsigned_char);
2853 str->other_end = stream;
2854 set_encoding_stream_coding_system (lstr, codesys);
2855 XSETLSTREAM (obj, lstr);
2860 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2862 return make_encoding_stream_1 (stream, codesys, "r");
2866 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2868 return make_encoding_stream_1 (stream, codesys, "w");
2871 /* Convert N bytes of internally-formatted data stored in SRC to an
2872 external format, according to the encoding stream ENCODING.
2873 Store the encoded data into DST. */
2876 mule_encode (Lstream *encoding, const Bufbyte *src,
2877 unsigned_char_dynarr *dst, size_t n)
2879 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2881 switch (CODING_SYSTEM_TYPE (str->codesys))
2884 case CODESYS_INTERNAL:
2885 Dynarr_add_many (dst, src, n);
2888 case CODESYS_AUTODETECT:
2889 /* If we got this far and still haven't decided on the coding
2890 system, then do no conversion. */
2891 case CODESYS_NO_CONVERSION:
2892 encode_coding_no_conversion (encoding, src, dst, n);
2895 case CODESYS_SHIFT_JIS:
2896 encode_coding_sjis (encoding, src, dst, n);
2899 encode_coding_big5 (encoding, src, dst, n);
2902 encode_coding_ucs4 (encoding, src, dst, n);
2905 encode_coding_utf8 (encoding, src, dst, n);
2908 str->ccl.last_block = str->flags & CODING_STATE_END;
2909 /* When applying ccl program to stream, MUST NOT set NULL
2911 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
2912 dst, n, 0, CCL_MODE_ENCODING);
2914 case CODESYS_ISO2022:
2915 encode_coding_iso2022 (encoding, src, dst, n);
2923 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2924 Encode the text between START and END using CODING-SYSTEM.
2925 This will, for example, convert Japanese characters into stuff such as
2926 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2927 text. BUFFER defaults to the current buffer if unspecified.
2929 (start, end, coding_system, buffer))
2932 struct buffer *buf = decode_buffer (buffer, 0);
2933 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2934 Lstream *istr, *ostr;
2935 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2937 get_buffer_range_char (buf, start, end, &b, &e, 0);
2939 barf_if_buffer_read_only (buf, b, e);
2941 coding_system = Fget_coding_system (coding_system);
2942 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2943 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2944 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2945 Fget_coding_system (Qbinary));
2946 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2948 istr = XLSTREAM (instream);
2949 ostr = XLSTREAM (outstream);
2950 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2951 /* The chain of streams looks like this:
2953 [BUFFER] <----- send through
2954 ------> [ENCODE AS SPECIFIED]
2955 ------> [DECODE AS BINARY]
2960 char tempbuf[1024]; /* some random amount */
2961 Bufpos newpos, even_newer_pos;
2962 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2963 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2967 newpos = lisp_buffer_stream_startpos (istr);
2968 Lstream_write (ostr, tempbuf, size_in_bytes);
2969 even_newer_pos = lisp_buffer_stream_startpos (istr);
2970 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2976 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2977 Lstream_close (istr);
2978 Lstream_close (ostr);
2980 Lstream_delete (istr);
2981 Lstream_delete (ostr);
2982 Lstream_delete (XLSTREAM (de_outstream));
2983 Lstream_delete (XLSTREAM (lb_outstream));
2984 return make_int (retlen);
2990 /************************************************************************/
2991 /* Shift-JIS methods */
2992 /************************************************************************/
2994 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2995 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2996 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2997 encoded by "position-code + 0x80". A character of JISX0208
2998 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2999 position-codes are divided and shifted so that it fit in the range
3002 --- CODE RANGE of Shift-JIS ---
3003 (character set) (range)
3005 JISX0201-Kana 0xA0 .. 0xDF
3006 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3007 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3008 -------------------------------
3012 /* Is this the first byte of a Shift-JIS two-byte char? */
3014 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3015 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3017 /* Is this the second byte of a Shift-JIS two-byte char? */
3019 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3020 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3022 #define BYTE_SJIS_KATAKANA_P(c) \
3023 ((c) >= 0xA1 && (c) <= 0xDF)
3026 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3030 unsigned char c = *(unsigned char *)src++;
3031 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3033 if (st->shift_jis.in_second_byte)
3035 st->shift_jis.in_second_byte = 0;
3039 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3040 st->shift_jis.in_second_byte = 1;
3042 return CODING_CATEGORY_SHIFT_JIS_MASK;
3045 /* Convert Shift-JIS data to internal format. */
3048 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3049 unsigned_char_dynarr *dst, size_t n)
3051 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3052 unsigned int flags = str->flags;
3053 unsigned int ch = str->ch;
3054 eol_type_t eol_type = str->eol_type;
3058 unsigned char c = *(unsigned char *)src++;
3062 /* Previous character was first byte of Shift-JIS Kanji char. */
3063 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3065 unsigned char e1, e2;
3067 DECODE_SJIS (ch, c, e1, e2);
3069 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3073 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3074 Dynarr_add (dst, e1);
3075 Dynarr_add (dst, e2);
3080 DECODE_ADD_BINARY_CHAR (ch, dst);
3081 DECODE_ADD_BINARY_CHAR (c, dst);
3087 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3088 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3090 else if (BYTE_SJIS_KATAKANA_P (c))
3093 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3096 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3097 Dynarr_add (dst, c);
3101 DECODE_ADD_BINARY_CHAR (c, dst);
3103 label_continue_loop:;
3106 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3112 /* Convert internally-formatted data to Shift-JIS. */
3115 encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
3116 unsigned_char_dynarr *dst, size_t n)
3118 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3119 unsigned int flags = str->flags;
3120 unsigned int ch = str->ch;
3121 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3123 unsigned char char_boundary = str->iso2022.current_char_boundary;
3130 switch (char_boundary)
3138 else if ( c >= 0xf8 )
3143 else if ( c >= 0xf0 )
3148 else if ( c >= 0xe0 )
3153 else if ( c >= 0xc0 )
3163 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3164 Dynarr_add (dst, '\r');
3165 if (eol_type != EOL_CR)
3166 Dynarr_add (dst, c);
3169 Dynarr_add (dst, c);
3174 ch = ( ch << 6 ) | ( c & 0x3f );
3176 Lisp_Object charset;
3177 unsigned int c1, c2, s1, s2;
3179 BREAKUP_CHAR (ch, charset, c1, c2);
3180 if (EQ(charset, Vcharset_katakana_jisx0201))
3182 Dynarr_add (dst, c1 | 0x80);
3184 else if (EQ(charset, Vcharset_japanese_jisx0208))
3186 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3187 Dynarr_add (dst, s1);
3188 Dynarr_add (dst, s2);
3194 ch = ( ch << 6 ) | ( c & 0x3f );
3200 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3201 Dynarr_add (dst, '\r');
3202 if (eol_type != EOL_CR)
3203 Dynarr_add (dst, '\n');
3206 else if (BYTE_ASCII_P (c))
3208 Dynarr_add (dst, c);
3211 else if (BUFBYTE_LEADING_BYTE_P (c))
3212 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3213 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3214 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3217 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3219 Dynarr_add (dst, c);
3222 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3223 ch == LEADING_BYTE_JAPANESE_JISX0208)
3227 unsigned char j1, j2;
3228 ENCODE_SJIS (ch, c, j1, j2);
3229 Dynarr_add (dst, j1);
3230 Dynarr_add (dst, j2);
3240 str->iso2022.current_char_boundary = char_boundary;
3244 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3245 Decode a JISX0208 character of Shift-JIS coding-system.
3246 CODE is the character code in Shift-JIS as a cons of type bytes.
3247 Return the corresponding character.
3251 unsigned char c1, c2, s1, s2;
3254 CHECK_INT (XCAR (code));
3255 CHECK_INT (XCDR (code));
3256 s1 = XINT (XCAR (code));
3257 s2 = XINT (XCDR (code));
3258 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3259 BYTE_SJIS_TWO_BYTE_2_P (s2))
3261 DECODE_SJIS (s1, s2, c1, c2);
3262 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3263 c1 & 0x7F, c2 & 0x7F));
3269 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3270 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3271 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3275 Lisp_Object charset;
3278 CHECK_CHAR_COERCE_INT (character);
3279 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3280 if (EQ (charset, Vcharset_japanese_jisx0208))
3282 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3283 return Fcons (make_int (s1), make_int (s2));
3290 /************************************************************************/
3292 /************************************************************************/
3294 /* BIG5 is a coding system encoding two character sets: ASCII and
3295 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3296 character set and is encoded in two-byte.
3298 --- CODE RANGE of BIG5 ---
3299 (character set) (range)
3301 Big5 (1st byte) 0xA1 .. 0xFE
3302 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3303 --------------------------
3305 Since the number of characters in Big5 is larger than maximum
3306 characters in Emacs' charset (96x96), it can't be handled as one
3307 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3308 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3309 contains frequently used characters and the latter contains less
3310 frequently used characters. */
3312 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3313 ((c) >= 0xA1 && (c) <= 0xFE)
3315 /* Is this the second byte of a Shift-JIS two-byte char? */
3317 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3318 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3320 /* Number of Big5 characters which have the same code in 1st byte. */
3322 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3324 /* Code conversion macros. These are macros because they are used in
3325 inner loops during code conversion.
3327 Note that temporary variables in macros introduce the classic
3328 dynamic-scoping problems with variable names. We use capital-
3329 lettered variables in the assumption that XEmacs does not use
3330 capital letters in variables except in a very formalized way
3333 /* Convert Big5 code (b1, b2) into its internal string representation
3336 /* There is a much simpler way to split the Big5 charset into two.
3337 For the moment I'm going to leave the algorithm as-is because it
3338 claims to separate out the most-used characters into a single
3339 charset, which perhaps will lead to optimizations in various
3342 The way the algorithm works is something like this:
3344 Big5 can be viewed as a 94x157 charset, where the row is
3345 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3346 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3347 the split between low and high column numbers is apparently
3348 meaningless; ascending rows produce less and less frequent chars.
3349 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3350 the first charset, and the upper half (0xC9 .. 0xFE) to the
3351 second. To do the conversion, we convert the character into
3352 a single number where 0 .. 156 is the first row, 157 .. 313
3353 is the second, etc. That way, the characters are ordered by
3354 decreasing frequency. Then we just chop the space in two
3355 and coerce the result into a 94x94 space.
3358 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3360 int B1 = b1, B2 = b2; \
3362 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3366 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3370 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3371 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3373 c1 = I / (0xFF - 0xA1) + 0xA1; \
3374 c2 = I % (0xFF - 0xA1) + 0xA1; \
3377 /* Convert the internal string representation of a Big5 character
3378 (lb, c1, c2) into Big5 code (b1, b2). */
3380 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3382 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3384 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3386 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3388 b1 = I / BIG5_SAME_ROW + 0xA1; \
3389 b2 = I % BIG5_SAME_ROW; \
3390 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3394 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3398 unsigned char c = *(unsigned char *)src++;
3399 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3400 (c >= 0x80 && c <= 0xA0))
3402 if (st->big5.in_second_byte)
3404 st->big5.in_second_byte = 0;
3405 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3409 st->big5.in_second_byte = 1;
3411 return CODING_CATEGORY_BIG5_MASK;
3414 /* Convert Big5 data to internal format. */
3417 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3418 unsigned_char_dynarr *dst, size_t n)
3420 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3421 unsigned int flags = str->flags;
3422 unsigned int ch = str->ch;
3423 eol_type_t eol_type = str->eol_type;
3427 unsigned char c = *(unsigned char *)src++;
3430 /* Previous character was first byte of Big5 char. */
3431 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3433 unsigned char b1, b2, b3;
3434 DECODE_BIG5 (ch, c, b1, b2, b3);
3435 Dynarr_add (dst, b1);
3436 Dynarr_add (dst, b2);
3437 Dynarr_add (dst, b3);
3441 DECODE_ADD_BINARY_CHAR (ch, dst);
3442 DECODE_ADD_BINARY_CHAR (c, dst);
3448 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3449 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3452 DECODE_ADD_BINARY_CHAR (c, dst);
3454 label_continue_loop:;
3457 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3463 /* Convert internally-formatted data to Big5. */
3466 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3467 unsigned_char_dynarr *dst, size_t n)
3471 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3472 unsigned int flags = str->flags;
3473 unsigned int ch = str->ch;
3474 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3481 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3482 Dynarr_add (dst, '\r');
3483 if (eol_type != EOL_CR)
3484 Dynarr_add (dst, '\n');
3486 else if (BYTE_ASCII_P (c))
3489 Dynarr_add (dst, c);
3491 else if (BUFBYTE_LEADING_BYTE_P (c))
3493 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3494 c == LEADING_BYTE_CHINESE_BIG5_2)
3496 /* A recognized leading byte. */
3498 continue; /* not done with this character. */
3500 /* otherwise just ignore this character. */
3502 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3503 ch == LEADING_BYTE_CHINESE_BIG5_2)
3505 /* Previous char was a recognized leading byte. */
3507 continue; /* not done with this character. */
3511 /* Encountering second byte of a Big5 character. */
3512 unsigned char b1, b2;
3514 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3515 Dynarr_add (dst, b1);
3516 Dynarr_add (dst, b2);
3528 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3529 Decode a Big5 character CODE of BIG5 coding-system.
3530 CODE is the character code in BIG5, a cons of two integers.
3531 Return the corresponding character.
3535 unsigned char c1, c2, b1, b2;
3538 CHECK_INT (XCAR (code));
3539 CHECK_INT (XCDR (code));
3540 b1 = XINT (XCAR (code));
3541 b2 = XINT (XCDR (code));
3542 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3543 BYTE_BIG5_TWO_BYTE_2_P (b2))
3545 Charset_ID leading_byte;
3546 Lisp_Object charset;
3547 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3548 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3549 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3555 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3556 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3557 Return the corresponding character code in Big5.
3561 Lisp_Object charset;
3564 CHECK_CHAR_COERCE_INT (character);
3565 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3566 if (EQ (charset, Vcharset_chinese_big5_1) ||
3567 EQ (charset, Vcharset_chinese_big5_2))
3569 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3571 return Fcons (make_int (b1), make_int (b2));
3578 /************************************************************************/
3581 /* UCS-4 character codes are implemented as nonnegative integers. */
3583 /************************************************************************/
3586 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3587 Map UCS-4 code CODE to Mule character CHARACTER.
3589 Return T on success, NIL on failure.
3595 CHECK_CHAR (character);
3596 CHECK_NATNUM (code);
3599 if (c < countof (fcd->ucs_to_mule_table))
3601 fcd->ucs_to_mule_table[c] = character;
3609 ucs_to_char (unsigned long code)
3611 if (code < countof (fcd->ucs_to_mule_table))
3613 return fcd->ucs_to_mule_table[code];
3615 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3620 c = code % (94 * 94);
3622 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3623 (94, 2, code / (94 * 94) + '@',
3624 CHARSET_LEFT_TO_RIGHT),
3625 c / 94 + 33, c % 94 + 33));
3631 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3632 Return Mule character corresponding to UCS code CODE (a positive integer).
3636 CHECK_NATNUM (code);
3637 return ucs_to_char (XINT (code));
3640 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3641 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3645 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3646 Fset_char_ucs is more restrictive on index arg, but should
3647 check code arg in a char_table method. */
3648 CHECK_CHAR (character);
3649 CHECK_NATNUM (code);
3650 return Fput_char_table (character, code, mule_to_ucs_table);
3653 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3654 Return the UCS code (a positive integer) corresponding to CHARACTER.
3658 return Fget_char_table (character, mule_to_ucs_table);
3663 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3665 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3666 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3667 is not found, instead.
3668 #### do something more appropriate (use blob?)
3669 Danger, Will Robinson! Data loss. Should we signal user? */
3671 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3673 Lisp_Object chr = ucs_to_char (ch);
3677 Bufbyte work[MAX_EMCHAR_LEN];
3682 simple_set_charptr_emchar (work, ch) :
3683 non_ascii_set_charptr_emchar (work, ch);
3684 Dynarr_add_many (dst, work, len);
3688 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3689 Dynarr_add (dst, 34 + 128);
3690 Dynarr_add (dst, 46 + 128);
3696 static unsigned long
3697 mule_char_to_ucs4 (Lisp_Object charset,
3698 unsigned char h, unsigned char l)
3701 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3708 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3709 (XCHARSET_CHARS (charset) == 94) )
3711 unsigned char final = XCHARSET_FINAL (charset);
3713 if ( ('@' <= final) && (final < 0x7f) )
3715 return 0xe00000 + (final - '@') * 94 * 94
3716 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3730 encode_ucs4 (Lisp_Object charset,
3731 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3733 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3734 Dynarr_add (dst, code >> 24);
3735 Dynarr_add (dst, (code >> 16) & 255);
3736 Dynarr_add (dst, (code >> 8) & 255);
3737 Dynarr_add (dst, code & 255);
3742 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3746 unsigned char c = *(unsigned char *)src++;
3747 switch (st->ucs4.in_byte)
3756 st->ucs4.in_byte = 0;
3762 return CODING_CATEGORY_UCS4_MASK;
3766 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3767 unsigned_char_dynarr *dst, size_t n)
3769 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3770 unsigned int flags = str->flags;
3771 unsigned int ch = str->ch;
3772 unsigned char counter = str->counter;
3776 unsigned char c = *(unsigned char *)src++;
3784 decode_ucs4 ( ( ch << 8 ) | c, dst);
3789 ch = ( ch << 8 ) | c;
3793 if (counter & CODING_STATE_END)
3794 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3798 str->counter = counter;
3802 encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
3803 unsigned_char_dynarr *dst, size_t n)
3806 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3807 unsigned int flags = str->flags;
3808 unsigned int ch = str->ch;
3809 unsigned char char_boundary = str->iso2022.current_char_boundary;
3810 Lisp_Object charset = str->iso2022.current_charset;
3812 #ifdef ENABLE_COMPOSITE_CHARS
3813 /* flags for handling composite chars. We do a little switcharoo
3814 on the source while we're outputting the composite char. */
3815 unsigned int saved_n = 0;
3816 const unsigned char *saved_src = NULL;
3817 int in_composite = 0;
3824 unsigned char c = *src++;
3826 if (BYTE_ASCII_P (c))
3827 { /* Processing ASCII character */
3829 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3832 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3833 { /* Processing Leading Byte */
3835 charset = CHARSET_BY_LEADING_BYTE (c);
3836 if (LEADING_BYTE_PREFIX_P(c))
3841 { /* Processing Non-ASCII character */
3843 if (EQ (charset, Vcharset_control_1))
3845 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3849 switch (XCHARSET_REP_BYTES (charset))
3852 encode_ucs4 (charset, c, 0, dst);
3855 if (XCHARSET_PRIVATE_P (charset))
3857 encode_ucs4 (charset, c, 0, dst);
3862 #ifdef ENABLE_COMPOSITE_CHARS
3863 if (EQ (charset, Vcharset_composite))
3867 /* #### Bother! We don't know how to
3869 Dynarr_add (dst, '\0');
3870 Dynarr_add (dst, '\0');
3871 Dynarr_add (dst, '\0');
3872 Dynarr_add (dst, '~');
3876 Emchar emch = MAKE_CHAR (Vcharset_composite,
3877 ch & 0x7F, c & 0x7F);
3878 Lisp_Object lstr = composite_char_string (emch);
3882 src = XSTRING_DATA (lstr);
3883 n = XSTRING_LENGTH (lstr);
3887 #endif /* ENABLE_COMPOSITE_CHARS */
3889 encode_ucs4(charset, ch, c, dst);
3902 encode_ucs4 (charset, ch, c, dst);
3918 #ifdef ENABLE_COMPOSITE_CHARS
3924 goto back_to_square_n; /* Wheeeeeeeee ..... */
3926 #endif /* ENABLE_COMPOSITE_CHARS */
3930 str->iso2022.current_char_boundary = char_boundary;
3931 str->iso2022.current_charset = charset;
3933 /* Verbum caro factum est! */
3938 /************************************************************************/
3940 /************************************************************************/
3943 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3947 unsigned char c = *(unsigned char *)src++;
3948 switch (st->utf8.in_byte)
3951 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3954 st->utf8.in_byte = 5;
3956 st->utf8.in_byte = 4;
3958 st->utf8.in_byte = 3;
3960 st->utf8.in_byte = 2;
3962 st->utf8.in_byte = 1;
3967 if ((c & 0xc0) != 0x80)
3973 return CODING_CATEGORY_UTF8_MASK;
3977 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3978 unsigned_char_dynarr *dst, size_t n)
3980 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3981 unsigned int flags = str->flags;
3982 unsigned int ch = str->ch;
3983 eol_type_t eol_type = str->eol_type;
3984 unsigned char counter = str->counter;
3988 unsigned char c = *(unsigned char *)src++;
3997 else if ( c >= 0xf8 )
4002 else if ( c >= 0xf0 )
4007 else if ( c >= 0xe0 )
4012 else if ( c >= 0xc0 )
4019 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4020 decode_ucs4 (c, dst);
4024 ch = ( ch << 6 ) | ( c & 0x3f );
4025 decode_ucs4 (ch, dst);
4030 ch = ( ch << 6 ) | ( c & 0x3f );
4033 label_continue_loop:;
4036 if (flags & CODING_STATE_END)
4037 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4041 str->counter = counter;
4046 encode_utf8 (Lisp_Object charset,
4047 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
4049 unsigned long code = mule_char_to_ucs4 (charset, h, l);
4052 Dynarr_add (dst, code);
4054 else if ( code <= 0x7ff )
4056 Dynarr_add (dst, (code >> 6) | 0xc0);
4057 Dynarr_add (dst, (code & 0x3f) | 0x80);
4059 else if ( code <= 0xffff )
4061 Dynarr_add (dst, (code >> 12) | 0xe0);
4062 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4063 Dynarr_add (dst, (code & 0x3f) | 0x80);
4065 else if ( code <= 0x1fffff )
4067 Dynarr_add (dst, (code >> 18) | 0xf0);
4068 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4069 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4070 Dynarr_add (dst, (code & 0x3f) | 0x80);
4072 else if ( code <= 0x3ffffff )
4074 Dynarr_add (dst, (code >> 24) | 0xf8);
4075 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4076 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4077 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4078 Dynarr_add (dst, (code & 0x3f) | 0x80);
4082 Dynarr_add (dst, (code >> 30) | 0xfc);
4083 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
4084 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4085 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4086 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4087 Dynarr_add (dst, (code & 0x3f) | 0x80);
4093 encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
4094 unsigned_char_dynarr *dst, size_t n)
4096 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4097 unsigned int flags = str->flags;
4098 unsigned int ch = str->ch;
4099 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4100 unsigned char char_boundary = str->iso2022.current_char_boundary;
4105 unsigned char c = *src++;
4106 switch (char_boundary)
4111 Dynarr_add (dst, c);
4114 else if ( c >= 0xf8 )
4116 Dynarr_add (dst, c);
4119 else if ( c >= 0xf0 )
4121 Dynarr_add (dst, c);
4124 else if ( c >= 0xe0 )
4126 Dynarr_add (dst, c);
4129 else if ( c >= 0xc0 )
4131 Dynarr_add (dst, c);
4138 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4139 Dynarr_add (dst, '\r');
4140 if (eol_type != EOL_CR)
4141 Dynarr_add (dst, c);
4144 Dynarr_add (dst, c);
4149 Dynarr_add (dst, c);
4153 Dynarr_add (dst, c);
4157 #else /* not UTF2000 */
4158 Lisp_Object charset = str->iso2022.current_charset;
4160 #ifdef ENABLE_COMPOSITE_CHARS
4161 /* flags for handling composite chars. We do a little switcharoo
4162 on the source while we're outputting the composite char. */
4163 unsigned int saved_n = 0;
4164 const unsigned char *saved_src = NULL;
4165 int in_composite = 0;
4168 #endif /* ENABLE_COMPOSITE_CHARS */
4172 unsigned char c = *src++;
4174 if (BYTE_ASCII_P (c))
4175 { /* Processing ASCII character */
4179 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4180 Dynarr_add (dst, '\r');
4181 if (eol_type != EOL_CR)
4182 Dynarr_add (dst, c);
4185 encode_utf8 (Vcharset_ascii, c, 0, dst);
4188 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4189 { /* Processing Leading Byte */
4191 charset = CHARSET_BY_LEADING_BYTE (c);
4192 if (LEADING_BYTE_PREFIX_P(c))
4197 { /* Processing Non-ASCII character */
4199 if (EQ (charset, Vcharset_control_1))
4201 encode_utf8 (Vcharset_control_1, c, 0, dst);
4205 switch (XCHARSET_REP_BYTES (charset))
4208 encode_utf8 (charset, c, 0, dst);
4211 if (XCHARSET_PRIVATE_P (charset))
4213 encode_utf8 (charset, c, 0, dst);
4218 #ifdef ENABLE_COMPOSITE_CHARS
4219 if (EQ (charset, Vcharset_composite))
4223 /* #### Bother! We don't know how to
4225 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4229 Emchar emch = MAKE_CHAR (Vcharset_composite,
4230 ch & 0x7F, c & 0x7F);
4231 Lisp_Object lstr = composite_char_string (emch);
4235 src = XSTRING_DATA (lstr);
4236 n = XSTRING_LENGTH (lstr);
4240 #endif /* ENABLE_COMPOSITE_CHARS */
4242 encode_utf8 (charset, ch, c, dst);
4255 encode_utf8 (charset, ch, c, dst);
4271 #ifdef ENABLE_COMPOSITE_CHARS
4277 goto back_to_square_n; /* Wheeeeeeeee ..... */
4281 #endif /* not UTF2000 */
4284 str->iso2022.current_char_boundary = char_boundary;
4286 str->iso2022.current_charset = charset;
4289 /* Verbum caro factum est! */
4293 /************************************************************************/
4294 /* ISO2022 methods */
4295 /************************************************************************/
4297 /* The following note describes the coding system ISO2022 briefly.
4298 Since the intention of this note is to help understand the
4299 functions in this file, some parts are NOT ACCURATE or OVERLY
4300 SIMPLIFIED. For thorough understanding, please refer to the
4301 original document of ISO2022.
4303 ISO2022 provides many mechanisms to encode several character sets
4304 in 7-bit and 8-bit environments. For 7-bit environments, all text
4305 is encoded using bytes less than 128. This may make the encoded
4306 text a little bit longer, but the text passes more easily through
4307 several gateways, some of which strip off MSB (Most Signigant Bit).
4309 There are two kinds of character sets: control character set and
4310 graphic character set. The former contains control characters such
4311 as `newline' and `escape' to provide control functions (control
4312 functions are also provided by escape sequences). The latter
4313 contains graphic characters such as 'A' and '-'. Emacs recognizes
4314 two control character sets and many graphic character sets.
4316 Graphic character sets are classified into one of the following
4317 four classes, according to the number of bytes (DIMENSION) and
4318 number of characters in one dimension (CHARS) of the set:
4319 - DIMENSION1_CHARS94
4320 - DIMENSION1_CHARS96
4321 - DIMENSION2_CHARS94
4322 - DIMENSION2_CHARS96
4324 In addition, each character set is assigned an identification tag,
4325 unique for each set, called "final character" (denoted as <F>
4326 hereafter). The <F> of each character set is decided by ECMA(*)
4327 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4328 (0x30..0x3F are for private use only).
4330 Note (*): ECMA = European Computer Manufacturers Association
4332 Here are examples of graphic character set [NAME(<F>)]:
4333 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4334 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4335 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4336 o DIMENSION2_CHARS96 -- none for the moment
4338 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4339 C0 [0x00..0x1F] -- control character plane 0
4340 GL [0x20..0x7F] -- graphic character plane 0
4341 C1 [0x80..0x9F] -- control character plane 1
4342 GR [0xA0..0xFF] -- graphic character plane 1
4344 A control character set is directly designated and invoked to C0 or
4345 C1 by an escape sequence. The most common case is that:
4346 - ISO646's control character set is designated/invoked to C0, and
4347 - ISO6429's control character set is designated/invoked to C1,
4348 and usually these designations/invocations are omitted in encoded
4349 text. In a 7-bit environment, only C0 can be used, and a control
4350 character for C1 is encoded by an appropriate escape sequence to
4351 fit into the environment. All control characters for C1 are
4352 defined to have corresponding escape sequences.
4354 A graphic character set is at first designated to one of four
4355 graphic registers (G0 through G3), then these graphic registers are
4356 invoked to GL or GR. These designations and invocations can be
4357 done independently. The most common case is that G0 is invoked to
4358 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4359 these invocations and designations are omitted in encoded text.
4360 In a 7-bit environment, only GL can be used.
4362 When a graphic character set of CHARS94 is invoked to GL, codes
4363 0x20 and 0x7F of the GL area work as control characters SPACE and
4364 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4367 There are two ways of invocation: locking-shift and single-shift.
4368 With locking-shift, the invocation lasts until the next different
4369 invocation, whereas with single-shift, the invocation affects the
4370 following character only and doesn't affect the locking-shift
4371 state. Invocations are done by the following control characters or
4374 ----------------------------------------------------------------------
4375 abbrev function cntrl escape seq description
4376 ----------------------------------------------------------------------
4377 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4378 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4379 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4380 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4381 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4382 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4383 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4384 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4385 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4386 ----------------------------------------------------------------------
4387 (*) These are not used by any known coding system.
4389 Control characters for these functions are defined by macros
4390 ISO_CODE_XXX in `coding.h'.
4392 Designations are done by the following escape sequences:
4393 ----------------------------------------------------------------------
4394 escape sequence description
4395 ----------------------------------------------------------------------
4396 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4397 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4398 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4399 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4400 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4401 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4402 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4403 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4404 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4405 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4406 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4407 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4408 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4409 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4410 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4411 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4412 ----------------------------------------------------------------------
4414 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4415 of dimension 1, chars 94, and final character <F>, etc...
4417 Note (*): Although these designations are not allowed in ISO2022,
4418 Emacs accepts them on decoding, and produces them on encoding
4419 CHARS96 character sets in a coding system which is characterized as
4420 7-bit environment, non-locking-shift, and non-single-shift.
4422 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4423 '(' can be omitted. We refer to this as "short-form" hereafter.
4425 Now you may notice that there are a lot of ways for encoding the
4426 same multilingual text in ISO2022. Actually, there exist many
4427 coding systems such as Compound Text (used in X11's inter client
4428 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4429 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4430 localized platforms), and all of these are variants of ISO2022.
4432 In addition to the above, Emacs handles two more kinds of escape
4433 sequences: ISO6429's direction specification and Emacs' private
4434 sequence for specifying character composition.
4436 ISO6429's direction specification takes the following form:
4437 o CSI ']' -- end of the current direction
4438 o CSI '0' ']' -- end of the current direction
4439 o CSI '1' ']' -- start of left-to-right text
4440 o CSI '2' ']' -- start of right-to-left text
4441 The control character CSI (0x9B: control sequence introducer) is
4442 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4444 Character composition specification takes the following form:
4445 o ESC '0' -- start character composition
4446 o ESC '1' -- end character composition
4447 Since these are not standard escape sequences of any ISO standard,
4448 their use with these meanings is restricted to Emacs only. */
4451 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4455 for (i = 0; i < 4; i++)
4457 if (!NILP (coding_system))
4459 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4461 iso->charset[i] = Qt;
4462 iso->invalid_designated[i] = 0;
4464 iso->esc = ISO_ESC_NOTHING;
4465 iso->esc_bytes_index = 0;
4466 iso->register_left = 0;
4467 iso->register_right = 1;
4468 iso->switched_dir_and_no_valid_charset_yet = 0;
4469 iso->invalid_switch_dir = 0;
4470 iso->output_direction_sequence = 0;
4471 iso->output_literally = 0;
4472 #ifdef ENABLE_COMPOSITE_CHARS
4473 if (iso->composite_chars)
4474 Dynarr_reset (iso->composite_chars);
4479 fit_to_be_escape_quoted (unsigned char c)
4496 /* Parse one byte of an ISO2022 escape sequence.
4497 If the result is an invalid escape sequence, return 0 and
4498 do not change anything in STR. Otherwise, if the result is
4499 an incomplete escape sequence, update ISO2022.ESC and
4500 ISO2022.ESC_BYTES and return -1. Otherwise, update
4501 all the state variables (but not ISO2022.ESC_BYTES) and
4504 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4505 or invocation of an invalid character set and treat that as
4506 an unrecognized escape sequence.
4508 ********************************************************************
4510 #### Strategies for error annotation and coding orthogonalization
4512 We really want to separate out a number of things. Conceptually,
4513 there is a nested syntax.
4515 At the top level is the ISO 2022 extension syntax, including charset
4516 designation and invocation, and certain auxiliary controls such as the
4517 ISO 6429 direction specification. These are octet-oriented, with the
4518 single exception (AFAIK) of the "exit Unicode" sequence which uses the
4519 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4520 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
4521 (deprecated) special case in Unicode processing.
4523 The middle layer is ISO 2022 character interpretation. This will depend
4524 on the current state of the ISO 2022 registers, and assembles octets
4525 into the character's internal representation.
4527 The lowest level is translating system control conventions. At present
4528 this is restricted to newline translation, but one could imagine doing
4529 tab conversion or line wrapping here. "Escape from Unicode" processing
4530 would be done at this level.
4532 At each level the parser will verify the syntax. In the case of a
4533 syntax error or warning (such as a redundant escape sequence that affects
4534 no characters), the parser will take some action, typically inserting the
4535 erroneous octets directly into the output and creating an annotation
4536 which can be used by higher level I/O to mark the affected region.
4538 This should make it possible to do something sensible about separating
4539 newline convention processing from character construction, and about
4540 preventing ISO 2022 escape sequences from being recognized
4543 The basic strategy will be to have octet classification tables, and
4544 switch processing according to the table entry.
4546 It's possible that, by doing the processing with tables of functions or
4547 the like, the parser can be used for both detection and translation. */
4550 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4551 unsigned char c, unsigned int *flags,
4552 int check_invalid_charsets)
4554 /* (1) If we're at the end of a designation sequence, CS is the
4555 charset being designated and REG is the register to designate
4558 (2) If we're at the end of a locking-shift sequence, REG is
4559 the register to invoke and HALF (0 == left, 1 == right) is
4560 the half to invoke it into.
4562 (3) If we're at the end of a single-shift sequence, REG is
4563 the register to invoke. */
4564 Lisp_Object cs = Qnil;
4567 /* NOTE: This code does goto's all over the fucking place.
4568 The reason for this is that we're basically implementing
4569 a state machine here, and hierarchical languages like C
4570 don't really provide a clean way of doing this. */
4572 if (! (*flags & CODING_STATE_ESCAPE))
4573 /* At beginning of escape sequence; we need to reset our
4574 escape-state variables. */
4575 iso->esc = ISO_ESC_NOTHING;
4577 iso->output_literally = 0;
4578 iso->output_direction_sequence = 0;
4582 case ISO_ESC_NOTHING:
4583 iso->esc_bytes_index = 0;
4586 case ISO_CODE_ESC: /* Start escape sequence */
4587 *flags |= CODING_STATE_ESCAPE;
4591 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4592 *flags |= CODING_STATE_ESCAPE;
4593 iso->esc = ISO_ESC_5_11;
4596 case ISO_CODE_SO: /* locking shift 1 */
4599 case ISO_CODE_SI: /* locking shift 0 */
4603 case ISO_CODE_SS2: /* single shift */
4606 case ISO_CODE_SS3: /* single shift */
4610 default: /* Other control characters */
4617 /**** single shift ****/
4619 case 'N': /* single shift 2 */
4622 case 'O': /* single shift 3 */
4626 /**** locking shift ****/
4628 case '~': /* locking shift 1 right */
4631 case 'n': /* locking shift 2 */
4634 case '}': /* locking shift 2 right */
4637 case 'o': /* locking shift 3 */
4640 case '|': /* locking shift 3 right */
4644 #ifdef ENABLE_COMPOSITE_CHARS
4645 /**** composite ****/
4648 iso->esc = ISO_ESC_START_COMPOSITE;
4649 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4650 CODING_STATE_COMPOSITE;
4654 iso->esc = ISO_ESC_END_COMPOSITE;
4655 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4656 ~CODING_STATE_COMPOSITE;
4658 #endif /* ENABLE_COMPOSITE_CHARS */
4660 /**** directionality ****/
4663 iso->esc = ISO_ESC_5_11;
4666 /**** designation ****/
4668 case '$': /* multibyte charset prefix */
4669 iso->esc = ISO_ESC_2_4;
4673 if (0x28 <= c && c <= 0x2F)
4675 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4679 /* This function is called with CODESYS equal to nil when
4680 doing coding-system detection. */
4682 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4683 && fit_to_be_escape_quoted (c))
4685 iso->esc = ISO_ESC_LITERAL;
4686 *flags &= CODING_STATE_ISO2022_LOCK;
4696 /**** directionality ****/
4698 case ISO_ESC_5_11: /* ISO6429 direction control */
4701 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4702 goto directionality;
4704 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4705 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4706 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4710 case ISO_ESC_5_11_0:
4713 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4714 goto directionality;
4718 case ISO_ESC_5_11_1:
4721 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4722 goto directionality;
4726 case ISO_ESC_5_11_2:
4729 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4730 goto directionality;
4735 iso->esc = ISO_ESC_DIRECTIONALITY;
4736 /* Various junk here to attempt to preserve the direction sequences
4737 literally in the text if they would otherwise be swallowed due
4738 to invalid designations that don't show up as actual charset
4739 changes in the text. */
4740 if (iso->invalid_switch_dir)
4742 /* We already inserted a direction switch literally into the
4743 text. We assume (#### this may not be right) that the
4744 next direction switch is the one going the other way,
4745 and we need to output that literally as well. */
4746 iso->output_literally = 1;
4747 iso->invalid_switch_dir = 0;
4753 /* If we are in the thrall of an invalid designation,
4754 then stick the directionality sequence literally into the
4755 output stream so it ends up in the original text again. */
4756 for (jj = 0; jj < 4; jj++)
4757 if (iso->invalid_designated[jj])
4761 iso->output_literally = 1;
4762 iso->invalid_switch_dir = 1;
4765 /* Indicate that we haven't yet seen a valid designation,
4766 so that if a switch-dir is directly followed by an
4767 invalid designation, both get inserted literally. */
4768 iso->switched_dir_and_no_valid_charset_yet = 1;
4773 /**** designation ****/
4776 if (0x28 <= c && c <= 0x2F)
4778 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4781 if (0x40 <= c && c <= 0x42)
4783 cs = CHARSET_BY_ATTRIBUTES (94, 2, c,
4784 *flags & CODING_STATE_R2L ?
4785 CHARSET_RIGHT_TO_LEFT :
4786 CHARSET_LEFT_TO_RIGHT);
4798 if (c < '0' || c > '~')
4799 return 0; /* bad final byte */
4801 if (iso->esc >= ISO_ESC_2_8 &&
4802 iso->esc <= ISO_ESC_2_15)
4804 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4805 single = 1; /* single-byte */
4806 reg = (iso->esc - ISO_ESC_2_8) & 3;
4808 else if (iso->esc >= ISO_ESC_2_4_8 &&
4809 iso->esc <= ISO_ESC_2_4_15)
4811 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4812 single = -1; /* multi-byte */
4813 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4817 /* Can this ever be reached? -slb */
4822 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4823 *flags & CODING_STATE_R2L ?
4824 CHARSET_RIGHT_TO_LEFT :
4825 CHARSET_LEFT_TO_RIGHT);
4831 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4835 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4836 /* can't invoke something that ain't there. */
4838 iso->esc = ISO_ESC_SINGLE_SHIFT;
4839 *flags &= CODING_STATE_ISO2022_LOCK;
4841 *flags |= CODING_STATE_SS2;
4843 *flags |= CODING_STATE_SS3;
4847 if (check_invalid_charsets &&
4848 !CHARSETP (iso->charset[reg]))
4849 /* can't invoke something that ain't there. */
4852 iso->register_right = reg;
4854 iso->register_left = reg;
4855 *flags &= CODING_STATE_ISO2022_LOCK;
4856 iso->esc = ISO_ESC_LOCKING_SHIFT;
4860 if (NILP (cs) && check_invalid_charsets)
4862 iso->invalid_designated[reg] = 1;
4863 iso->charset[reg] = Vcharset_ascii;
4864 iso->esc = ISO_ESC_DESIGNATE;
4865 *flags &= CODING_STATE_ISO2022_LOCK;
4866 iso->output_literally = 1;
4867 if (iso->switched_dir_and_no_valid_charset_yet)
4869 /* We encountered a switch-direction followed by an
4870 invalid designation. Ensure that the switch-direction
4871 gets outputted; otherwise it will probably get eaten
4872 when the text is written out again. */
4873 iso->switched_dir_and_no_valid_charset_yet = 0;
4874 iso->output_direction_sequence = 1;
4875 /* And make sure that the switch-dir going the other
4876 way gets outputted, as well. */
4877 iso->invalid_switch_dir = 1;
4881 /* This function is called with CODESYS equal to nil when
4882 doing coding-system detection. */
4883 if (!NILP (codesys))
4885 charset_conversion_spec_dynarr *dyn =
4886 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4892 for (i = 0; i < Dynarr_length (dyn); i++)
4894 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4895 if (EQ (cs, spec->from_charset))
4896 cs = spec->to_charset;
4901 iso->charset[reg] = cs;
4902 iso->esc = ISO_ESC_DESIGNATE;
4903 *flags &= CODING_STATE_ISO2022_LOCK;
4904 if (iso->invalid_designated[reg])
4906 iso->invalid_designated[reg] = 0;
4907 iso->output_literally = 1;
4909 if (iso->switched_dir_and_no_valid_charset_yet)
4910 iso->switched_dir_and_no_valid_charset_yet = 0;
4915 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4919 /* #### There are serious deficiencies in the recognition mechanism
4920 here. This needs to be much smarter if it's going to cut it.
4921 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4922 it should be detected as Latin-1.
4923 All the ISO2022 stuff in this file should be synced up with the
4924 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4925 Perhaps we should wait till R2L works in FSF Emacs? */
4927 if (!st->iso2022.initted)
4929 reset_iso2022 (Qnil, &st->iso2022.iso);
4930 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4931 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4932 CODING_CATEGORY_ISO_8_1_MASK |
4933 CODING_CATEGORY_ISO_8_2_MASK |
4934 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4935 st->iso2022.flags = 0;
4936 st->iso2022.high_byte_count = 0;
4937 st->iso2022.saw_single_shift = 0;
4938 st->iso2022.initted = 1;
4941 mask = st->iso2022.mask;
4945 unsigned char c = *(unsigned char *)src++;
4948 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4949 st->iso2022.high_byte_count++;
4953 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4955 if (st->iso2022.high_byte_count & 1)
4956 /* odd number of high bytes; assume not iso-8-2 */
4957 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4959 st->iso2022.high_byte_count = 0;
4960 st->iso2022.saw_single_shift = 0;
4962 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4964 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4965 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4966 { /* control chars */
4969 /* Allow and ignore control characters that you might
4970 reasonably see in a text file */
4975 case 8: /* backspace */
4976 case 11: /* vertical tab */
4977 case 12: /* form feed */
4978 case 26: /* MS-DOS C-z junk */
4979 case 31: /* '^_' -- for info */
4980 goto label_continue_loop;
4987 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4990 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4991 &st->iso2022.flags, 0))
4993 switch (st->iso2022.iso.esc)
4995 case ISO_ESC_DESIGNATE:
4996 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4997 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4999 case ISO_ESC_LOCKING_SHIFT:
5000 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5001 goto ran_out_of_chars;
5002 case ISO_ESC_SINGLE_SHIFT:
5003 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5004 st->iso2022.saw_single_shift = 1;
5013 goto ran_out_of_chars;
5016 label_continue_loop:;
5025 postprocess_iso2022_mask (int mask)
5027 /* #### kind of cheesy */
5028 /* If seven-bit ISO is allowed, then assume that the encoding is
5029 entirely seven-bit and turn off the eight-bit ones. */
5030 if (mask & CODING_CATEGORY_ISO_7_MASK)
5031 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5032 CODING_CATEGORY_ISO_8_1_MASK |
5033 CODING_CATEGORY_ISO_8_2_MASK);
5037 /* If FLAGS is a null pointer or specifies right-to-left motion,
5038 output a switch-dir-to-left-to-right sequence to DST.
5039 Also update FLAGS if it is not a null pointer.
5040 If INTERNAL_P is set, we are outputting in internal format and
5041 need to handle the CSI differently. */
5044 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5045 unsigned_char_dynarr *dst,
5046 unsigned int *flags,
5049 if (!flags || (*flags & CODING_STATE_R2L))
5051 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5053 Dynarr_add (dst, ISO_CODE_ESC);
5054 Dynarr_add (dst, '[');
5056 else if (internal_p)
5057 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5059 Dynarr_add (dst, ISO_CODE_CSI);
5060 Dynarr_add (dst, '0');
5061 Dynarr_add (dst, ']');
5063 *flags &= ~CODING_STATE_R2L;
5067 /* If FLAGS is a null pointer or specifies a direction different from
5068 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5069 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5070 sequence to DST. Also update FLAGS if it is not a null pointer.
5071 If INTERNAL_P is set, we are outputting in internal format and
5072 need to handle the CSI differently. */
5075 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5076 unsigned_char_dynarr *dst, unsigned int *flags,
5079 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5080 direction == CHARSET_LEFT_TO_RIGHT)
5081 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5082 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5083 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5084 direction == CHARSET_RIGHT_TO_LEFT)
5086 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5088 Dynarr_add (dst, ISO_CODE_ESC);
5089 Dynarr_add (dst, '[');
5091 else if (internal_p)
5092 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5094 Dynarr_add (dst, ISO_CODE_CSI);
5095 Dynarr_add (dst, '2');
5096 Dynarr_add (dst, ']');
5098 *flags |= CODING_STATE_R2L;
5102 /* Convert ISO2022-format data to internal format. */
5105 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5106 unsigned_char_dynarr *dst, size_t n)
5108 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5109 unsigned int flags = str->flags;
5110 unsigned int ch = str->ch;
5111 eol_type_t eol_type = str->eol_type;
5112 #ifdef ENABLE_COMPOSITE_CHARS
5113 unsigned_char_dynarr *real_dst = dst;
5115 Lisp_Object coding_system;
5117 XSETCODING_SYSTEM (coding_system, str->codesys);
5119 #ifdef ENABLE_COMPOSITE_CHARS
5120 if (flags & CODING_STATE_COMPOSITE)
5121 dst = str->iso2022.composite_chars;
5122 #endif /* ENABLE_COMPOSITE_CHARS */
5126 unsigned char c = *(unsigned char *)src++;
5127 if (flags & CODING_STATE_ESCAPE)
5128 { /* Within ESC sequence */
5129 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5134 switch (str->iso2022.esc)
5136 #ifdef ENABLE_COMPOSITE_CHARS
5137 case ISO_ESC_START_COMPOSITE:
5138 if (str->iso2022.composite_chars)
5139 Dynarr_reset (str->iso2022.composite_chars);
5141 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5142 dst = str->iso2022.composite_chars;
5144 case ISO_ESC_END_COMPOSITE:
5146 Bufbyte comstr[MAX_EMCHAR_LEN];
5148 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5149 Dynarr_length (dst));
5151 len = set_charptr_emchar (comstr, emch);
5152 Dynarr_add_many (dst, comstr, len);
5155 #endif /* ENABLE_COMPOSITE_CHARS */
5157 case ISO_ESC_LITERAL:
5158 DECODE_ADD_BINARY_CHAR (c, dst);
5162 /* Everything else handled already */
5167 /* Attempted error recovery. */
5168 if (str->iso2022.output_direction_sequence)
5169 ensure_correct_direction (flags & CODING_STATE_R2L ?
5170 CHARSET_RIGHT_TO_LEFT :
5171 CHARSET_LEFT_TO_RIGHT,
5172 str->codesys, dst, 0, 1);
5173 /* More error recovery. */
5174 if (!retval || str->iso2022.output_literally)
5176 /* Output the (possibly invalid) sequence */
5178 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5179 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5180 flags &= CODING_STATE_ISO2022_LOCK;
5182 n++, src--;/* Repeat the loop with the same character. */
5185 /* No sense in reprocessing the final byte of the
5186 escape sequence; it could mess things up anyway.
5188 DECODE_ADD_BINARY_CHAR (c, dst);
5193 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5194 { /* Control characters */
5196 /***** Error-handling *****/
5198 /* If we were in the middle of a character, dump out the
5199 partial character. */
5200 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5202 /* If we just saw a single-shift character, dump it out.
5203 This may dump out the wrong sort of single-shift character,
5204 but least it will give an indication that something went
5206 if (flags & CODING_STATE_SS2)
5208 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5209 flags &= ~CODING_STATE_SS2;
5211 if (flags & CODING_STATE_SS3)
5213 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5214 flags &= ~CODING_STATE_SS3;
5217 /***** Now handle the control characters. *****/
5220 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5222 flags &= CODING_STATE_ISO2022_LOCK;
5224 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5225 DECODE_ADD_BINARY_CHAR (c, dst);
5228 { /* Graphic characters */
5229 Lisp_Object charset;
5235 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5237 /* Now determine the charset. */
5238 reg = ((flags & CODING_STATE_SS2) ? 2
5239 : (flags & CODING_STATE_SS3) ? 3
5240 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5241 : str->iso2022.register_left);
5242 charset = str->iso2022.charset[reg];
5244 /* Error checking: */
5245 if (! CHARSETP (charset)
5246 || str->iso2022.invalid_designated[reg]
5247 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5248 && XCHARSET_CHARS (charset) == 94))
5249 /* Mrmph. We are trying to invoke a register that has no
5250 or an invalid charset in it, or trying to add a character
5251 outside the range of the charset. Insert that char literally
5252 to preserve it for the output. */
5254 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5255 DECODE_ADD_BINARY_CHAR (c, dst);
5260 /* Things are probably hunky-dorey. */
5262 /* Fetch reverse charset, maybe. */
5263 if (((flags & CODING_STATE_R2L) &&
5264 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5266 (!(flags & CODING_STATE_R2L) &&
5267 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5269 Lisp_Object new_charset =
5270 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5271 if (!NILP (new_charset))
5272 charset = new_charset;
5276 if (XCHARSET_DIMENSION (charset) == 1)
5278 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5280 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5285 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5291 lb = XCHARSET_LEADING_BYTE (charset);
5292 switch (XCHARSET_REP_BYTES (charset))
5295 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5296 Dynarr_add (dst, c & 0x7F);
5299 case 2: /* one-byte official */
5300 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5301 Dynarr_add (dst, lb);
5302 Dynarr_add (dst, c | 0x80);
5305 case 3: /* one-byte private or two-byte official */
5306 if (XCHARSET_PRIVATE_P (charset))
5308 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5309 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5310 Dynarr_add (dst, lb);
5311 Dynarr_add (dst, c | 0x80);
5317 Dynarr_add (dst, lb);
5318 Dynarr_add (dst, ch | 0x80);
5319 Dynarr_add (dst, c | 0x80);
5327 default: /* two-byte private */
5330 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5331 Dynarr_add (dst, lb);
5332 Dynarr_add (dst, ch | 0x80);
5333 Dynarr_add (dst, c | 0x80);
5343 flags &= CODING_STATE_ISO2022_LOCK;
5346 label_continue_loop:;
5349 if (flags & CODING_STATE_END)
5350 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5357 /***** ISO2022 encoder *****/
5359 /* Designate CHARSET into register REG. */
5362 iso2022_designate (Lisp_Object charset, unsigned char reg,
5363 struct encoding_stream *str, unsigned_char_dynarr *dst)
5365 static const char inter94[] = "()*+";
5366 static const char inter96[] = ",-./";
5368 unsigned char final;
5369 Lisp_Object old_charset = str->iso2022.charset[reg];
5371 str->iso2022.charset[reg] = charset;
5372 if (!CHARSETP (charset))
5373 /* charset might be an initial nil or t. */
5375 type = XCHARSET_TYPE (charset);
5376 final = XCHARSET_FINAL (charset);
5377 if (!str->iso2022.force_charset_on_output[reg] &&
5378 CHARSETP (old_charset) &&
5379 XCHARSET_TYPE (old_charset) == type &&
5380 XCHARSET_FINAL (old_charset) == final)
5383 str->iso2022.force_charset_on_output[reg] = 0;
5386 charset_conversion_spec_dynarr *dyn =
5387 str->codesys->iso2022.output_conv;
5393 for (i = 0; i < Dynarr_length (dyn); i++)
5395 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5396 if (EQ (charset, spec->from_charset))
5397 charset = spec->to_charset;
5402 Dynarr_add (dst, ISO_CODE_ESC);
5405 case CHARSET_TYPE_94:
5406 Dynarr_add (dst, inter94[reg]);
5408 case CHARSET_TYPE_96:
5409 Dynarr_add (dst, inter96[reg]);
5411 case CHARSET_TYPE_94X94:
5412 Dynarr_add (dst, '$');
5414 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5417 Dynarr_add (dst, inter94[reg]);
5419 case CHARSET_TYPE_96X96:
5420 Dynarr_add (dst, '$');
5421 Dynarr_add (dst, inter96[reg]);
5424 Dynarr_add (dst, final);
5428 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5430 if (str->iso2022.register_left != 0)
5432 Dynarr_add (dst, ISO_CODE_SI);
5433 str->iso2022.register_left = 0;
5438 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5440 if (str->iso2022.register_left != 1)
5442 Dynarr_add (dst, ISO_CODE_SO);
5443 str->iso2022.register_left = 1;
5447 /* Convert internally-formatted data to ISO2022 format. */
5450 encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
5451 unsigned_char_dynarr *dst, size_t n)
5453 unsigned char charmask, c;
5454 unsigned char char_boundary;
5455 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5456 unsigned int flags = str->flags;
5457 Emchar ch = str->ch;
5458 Lisp_Coding_System *codesys = str->codesys;
5459 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5461 Lisp_Object charset;
5464 unsigned int byte1, byte2;
5467 #ifdef ENABLE_COMPOSITE_CHARS
5468 /* flags for handling composite chars. We do a little switcharoo
5469 on the source while we're outputting the composite char. */
5470 unsigned int saved_n = 0;
5471 const unsigned char *saved_src = NULL;
5472 int in_composite = 0;
5473 #endif /* ENABLE_COMPOSITE_CHARS */
5475 char_boundary = str->iso2022.current_char_boundary;
5476 charset = str->iso2022.current_charset;
5477 half = str->iso2022.current_half;
5479 #ifdef ENABLE_COMPOSITE_CHARS
5487 switch (char_boundary)
5495 else if ( c >= 0xf8 )
5500 else if ( c >= 0xf0 )
5505 else if ( c >= 0xe0 )
5510 else if ( c >= 0xc0 )
5519 restore_left_to_right_direction (codesys, dst, &flags, 0);
5521 /* Make sure G0 contains ASCII */
5522 if ((c > ' ' && c < ISO_CODE_DEL) ||
5523 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5525 ensure_normal_shift (str, dst);
5526 iso2022_designate (Vcharset_ascii, 0, str, dst);
5529 /* If necessary, restore everything to the default state
5532 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5534 restore_left_to_right_direction (codesys, dst, &flags, 0);
5536 ensure_normal_shift (str, dst);
5538 for (i = 0; i < 4; i++)
5540 Lisp_Object initial_charset =
5541 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5542 iso2022_designate (initial_charset, i, str, dst);
5547 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5548 Dynarr_add (dst, '\r');
5549 if (eol_type != EOL_CR)
5550 Dynarr_add (dst, c);
5554 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5555 && fit_to_be_escape_quoted (c))
5556 Dynarr_add (dst, ISO_CODE_ESC);
5557 Dynarr_add (dst, c);
5563 ch = ( ch << 6 ) | ( c & 0x3f );
5566 if ( (0x80 <= ch) && (ch <= 0x9f) )
5568 charmask = (half == 0 ? 0x00 : 0x80);
5570 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5571 && fit_to_be_escape_quoted (ch))
5572 Dynarr_add (dst, ISO_CODE_ESC);
5573 /* you asked for it ... */
5574 Dynarr_add (dst, ch);
5580 BREAKUP_CHAR (ch, charset, byte1, byte2);
5581 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5582 codesys, dst, &flags, 0);
5584 /* Now determine which register to use. */
5586 for (i = 0; i < 4; i++)
5588 if (EQ (charset, str->iso2022.charset[i]) ||
5590 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5599 if (XCHARSET_GRAPHIC (charset) != 0)
5601 if (!NILP (str->iso2022.charset[1]) &&
5602 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5603 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5605 else if (!NILP (str->iso2022.charset[2]))
5607 else if (!NILP (str->iso2022.charset[3]))
5616 iso2022_designate (charset, reg, str, dst);
5618 /* Now invoke that register. */
5622 ensure_normal_shift (str, dst);
5627 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5629 ensure_shift_out (str, dst);
5637 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5639 Dynarr_add (dst, ISO_CODE_ESC);
5640 Dynarr_add (dst, 'N');
5645 Dynarr_add (dst, ISO_CODE_SS2);
5651 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5653 Dynarr_add (dst, ISO_CODE_ESC);
5654 Dynarr_add (dst, 'O');
5659 Dynarr_add (dst, ISO_CODE_SS3);
5668 charmask = (half == 0 ? 0x00 : 0x80);
5670 switch (XCHARSET_DIMENSION (charset))
5673 Dynarr_add (dst, byte1 | charmask);
5676 Dynarr_add (dst, byte1 | charmask);
5677 Dynarr_add (dst, byte2 | charmask);
5686 ch = ( ch << 6 ) | ( c & 0x3f );
5690 #else /* not UTF2000 */
5696 if (BYTE_ASCII_P (c))
5697 { /* Processing ASCII character */
5700 restore_left_to_right_direction (codesys, dst, &flags, 0);
5702 /* Make sure G0 contains ASCII */
5703 if ((c > ' ' && c < ISO_CODE_DEL) ||
5704 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5706 ensure_normal_shift (str, dst);
5707 iso2022_designate (Vcharset_ascii, 0, str, dst);
5710 /* If necessary, restore everything to the default state
5713 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5715 restore_left_to_right_direction (codesys, dst, &flags, 0);
5717 ensure_normal_shift (str, dst);
5719 for (i = 0; i < 4; i++)
5721 Lisp_Object initial_charset =
5722 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5723 iso2022_designate (initial_charset, i, str, dst);
5728 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5729 Dynarr_add (dst, '\r');
5730 if (eol_type != EOL_CR)
5731 Dynarr_add (dst, c);
5735 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5736 && fit_to_be_escape_quoted (c))
5737 Dynarr_add (dst, ISO_CODE_ESC);
5738 Dynarr_add (dst, c);
5743 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5744 { /* Processing Leading Byte */
5746 charset = CHARSET_BY_LEADING_BYTE (c);
5747 if (LEADING_BYTE_PREFIX_P(c))
5749 else if (!EQ (charset, Vcharset_control_1)
5750 #ifdef ENABLE_COMPOSITE_CHARS
5751 && !EQ (charset, Vcharset_composite)
5757 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5758 codesys, dst, &flags, 0);
5760 /* Now determine which register to use. */
5762 for (i = 0; i < 4; i++)
5764 if (EQ (charset, str->iso2022.charset[i]) ||
5766 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5775 if (XCHARSET_GRAPHIC (charset) != 0)
5777 if (!NILP (str->iso2022.charset[1]) &&
5778 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5779 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5781 else if (!NILP (str->iso2022.charset[2]))
5783 else if (!NILP (str->iso2022.charset[3]))
5792 iso2022_designate (charset, reg, str, dst);
5794 /* Now invoke that register. */
5798 ensure_normal_shift (str, dst);
5803 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5805 ensure_shift_out (str, dst);
5813 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5815 Dynarr_add (dst, ISO_CODE_ESC);
5816 Dynarr_add (dst, 'N');
5821 Dynarr_add (dst, ISO_CODE_SS2);
5827 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5829 Dynarr_add (dst, ISO_CODE_ESC);
5830 Dynarr_add (dst, 'O');
5835 Dynarr_add (dst, ISO_CODE_SS3);
5847 { /* Processing Non-ASCII character */
5848 charmask = (half == 0 ? 0x7F : 0xFF);
5850 if (EQ (charset, Vcharset_control_1))
5852 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5853 && fit_to_be_escape_quoted (c))
5854 Dynarr_add (dst, ISO_CODE_ESC);
5855 /* you asked for it ... */
5856 Dynarr_add (dst, c - 0x20);
5860 switch (XCHARSET_REP_BYTES (charset))
5863 Dynarr_add (dst, c & charmask);
5866 if (XCHARSET_PRIVATE_P (charset))
5868 Dynarr_add (dst, c & charmask);
5873 #ifdef ENABLE_COMPOSITE_CHARS
5874 if (EQ (charset, Vcharset_composite))
5878 /* #### Bother! We don't know how to
5880 Dynarr_add (dst, '~');
5884 Emchar emch = MAKE_CHAR (Vcharset_composite,
5885 ch & 0x7F, c & 0x7F);
5886 Lisp_Object lstr = composite_char_string (emch);
5890 src = XSTRING_DATA (lstr);
5891 n = XSTRING_LENGTH (lstr);
5892 Dynarr_add (dst, ISO_CODE_ESC);
5893 Dynarr_add (dst, '0'); /* start composing */
5897 #endif /* ENABLE_COMPOSITE_CHARS */
5899 Dynarr_add (dst, ch & charmask);
5900 Dynarr_add (dst, c & charmask);
5913 Dynarr_add (dst, ch & charmask);
5914 Dynarr_add (dst, c & charmask);
5929 #endif /* not UTF2000 */
5931 #ifdef ENABLE_COMPOSITE_CHARS
5937 Dynarr_add (dst, ISO_CODE_ESC);
5938 Dynarr_add (dst, '1'); /* end composing */
5939 goto back_to_square_n; /* Wheeeeeeeee ..... */
5941 #endif /* ENABLE_COMPOSITE_CHARS */
5944 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5946 if (char_boundary && flags & CODING_STATE_END)
5949 restore_left_to_right_direction (codesys, dst, &flags, 0);
5950 ensure_normal_shift (str, dst);
5951 for (i = 0; i < 4; i++)
5953 Lisp_Object initial_charset =
5954 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5955 iso2022_designate (initial_charset, i, str, dst);
5961 str->iso2022.current_char_boundary = char_boundary;
5962 str->iso2022.current_charset = charset;
5963 str->iso2022.current_half = half;
5965 /* Verbum caro factum est! */
5969 /************************************************************************/
5970 /* No-conversion methods */
5971 /************************************************************************/
5973 /* This is used when reading in "binary" files -- i.e. files that may
5974 contain all 256 possible byte values and that are not to be
5975 interpreted as being in any particular decoding. */
5977 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5978 unsigned_char_dynarr *dst, size_t n)
5980 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5981 unsigned int flags = str->flags;
5982 unsigned int ch = str->ch;
5983 eol_type_t eol_type = str->eol_type;
5987 unsigned char c = *(unsigned char *)src++;
5989 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5990 DECODE_ADD_BINARY_CHAR (c, dst);
5991 label_continue_loop:;
5994 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
6001 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6002 unsigned_char_dynarr *dst, size_t n)
6005 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6006 unsigned int flags = str->flags;
6007 unsigned int ch = str->ch;
6008 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6010 unsigned char char_boundary = str->iso2022.current_char_boundary;
6017 switch (char_boundary)
6025 else if ( c >= 0xf8 )
6030 else if ( c >= 0xf0 )
6035 else if ( c >= 0xe0 )
6040 else if ( c >= 0xc0 )
6051 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6052 Dynarr_add (dst, '\r');
6053 if (eol_type != EOL_CR)
6054 Dynarr_add (dst, c);
6057 Dynarr_add (dst, c);
6062 ch = ( ch << 6 ) | ( c & 0x3f );
6063 Dynarr_add (dst, ch & 0xff);
6067 ch = ( ch << 6 ) | ( c & 0x3f );
6070 #else /* not UTF2000 */
6073 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6074 Dynarr_add (dst, '\r');
6075 if (eol_type != EOL_CR)
6076 Dynarr_add (dst, '\n');
6079 else if (BYTE_ASCII_P (c))
6082 Dynarr_add (dst, c);
6084 else if (BUFBYTE_LEADING_BYTE_P (c))
6087 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6088 c == LEADING_BYTE_CONTROL_1)
6091 Dynarr_add (dst, '~'); /* untranslatable character */
6095 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6096 Dynarr_add (dst, c);
6097 else if (ch == LEADING_BYTE_CONTROL_1)
6100 Dynarr_add (dst, c - 0x20);
6102 /* else it should be the second or third byte of an
6103 untranslatable character, so ignore it */
6106 #endif /* not UTF2000 */
6112 str->iso2022.current_char_boundary = char_boundary;
6118 /************************************************************************/
6119 /* Initialization */
6120 /************************************************************************/
6123 syms_of_file_coding (void)
6125 INIT_LRECORD_IMPLEMENTATION (coding_system);
6127 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6129 DEFSUBR (Fcoding_system_p);
6130 DEFSUBR (Ffind_coding_system);
6131 DEFSUBR (Fget_coding_system);
6132 DEFSUBR (Fcoding_system_list);
6133 DEFSUBR (Fcoding_system_name);
6134 DEFSUBR (Fmake_coding_system);
6135 DEFSUBR (Fcopy_coding_system);
6136 DEFSUBR (Fcoding_system_canonical_name_p);
6137 DEFSUBR (Fcoding_system_alias_p);
6138 DEFSUBR (Fcoding_system_aliasee);
6139 DEFSUBR (Fdefine_coding_system_alias);
6140 DEFSUBR (Fsubsidiary_coding_system);
6142 DEFSUBR (Fcoding_system_type);
6143 DEFSUBR (Fcoding_system_doc_string);
6145 DEFSUBR (Fcoding_system_charset);
6147 DEFSUBR (Fcoding_system_property);
6149 DEFSUBR (Fcoding_category_list);
6150 DEFSUBR (Fset_coding_priority_list);
6151 DEFSUBR (Fcoding_priority_list);
6152 DEFSUBR (Fset_coding_category_system);
6153 DEFSUBR (Fcoding_category_system);
6155 DEFSUBR (Fdetect_coding_region);
6156 DEFSUBR (Fdecode_coding_region);
6157 DEFSUBR (Fencode_coding_region);
6159 DEFSUBR (Fdecode_shift_jis_char);
6160 DEFSUBR (Fencode_shift_jis_char);
6161 DEFSUBR (Fdecode_big5_char);
6162 DEFSUBR (Fencode_big5_char);
6164 DEFSUBR (Fset_ucs_char);
6165 DEFSUBR (Fucs_char);
6166 DEFSUBR (Fset_char_ucs);
6167 DEFSUBR (Fchar_ucs);
6168 #endif /* not UTF2000 */
6170 defsymbol (&Qcoding_systemp, "coding-system-p");
6171 defsymbol (&Qno_conversion, "no-conversion");
6172 defsymbol (&Qraw_text, "raw-text");
6174 defsymbol (&Qbig5, "big5");
6175 defsymbol (&Qshift_jis, "shift-jis");
6176 defsymbol (&Qucs4, "ucs-4");
6177 defsymbol (&Qutf8, "utf-8");
6178 defsymbol (&Qccl, "ccl");
6179 defsymbol (&Qiso2022, "iso2022");
6181 defsymbol (&Qmnemonic, "mnemonic");
6182 defsymbol (&Qeol_type, "eol-type");
6183 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6184 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6186 defsymbol (&Qcr, "cr");
6187 defsymbol (&Qlf, "lf");
6188 defsymbol (&Qcrlf, "crlf");
6189 defsymbol (&Qeol_cr, "eol-cr");
6190 defsymbol (&Qeol_lf, "eol-lf");
6191 defsymbol (&Qeol_crlf, "eol-crlf");
6193 defsymbol (&Qcharset_g0, "charset-g0");
6194 defsymbol (&Qcharset_g1, "charset-g1");
6195 defsymbol (&Qcharset_g2, "charset-g2");
6196 defsymbol (&Qcharset_g3, "charset-g3");
6197 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6198 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6199 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6200 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6201 defsymbol (&Qno_iso6429, "no-iso6429");
6202 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6203 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6205 defsymbol (&Qshort, "short");
6206 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6207 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6208 defsymbol (&Qseven, "seven");
6209 defsymbol (&Qlock_shift, "lock-shift");
6210 defsymbol (&Qescape_quoted, "escape-quoted");
6212 defsymbol (&Qencode, "encode");
6213 defsymbol (&Qdecode, "decode");
6216 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6218 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6220 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6222 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6224 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6226 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6228 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6230 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6232 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6235 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6240 lstream_type_create_file_coding (void)
6242 LSTREAM_HAS_METHOD (decoding, reader);
6243 LSTREAM_HAS_METHOD (decoding, writer);
6244 LSTREAM_HAS_METHOD (decoding, rewinder);
6245 LSTREAM_HAS_METHOD (decoding, seekable_p);
6246 LSTREAM_HAS_METHOD (decoding, flusher);
6247 LSTREAM_HAS_METHOD (decoding, closer);
6248 LSTREAM_HAS_METHOD (decoding, marker);
6250 LSTREAM_HAS_METHOD (encoding, reader);
6251 LSTREAM_HAS_METHOD (encoding, writer);
6252 LSTREAM_HAS_METHOD (encoding, rewinder);
6253 LSTREAM_HAS_METHOD (encoding, seekable_p);
6254 LSTREAM_HAS_METHOD (encoding, flusher);
6255 LSTREAM_HAS_METHOD (encoding, closer);
6256 LSTREAM_HAS_METHOD (encoding, marker);
6260 vars_of_file_coding (void)
6264 fcd = xnew (struct file_coding_dump);
6265 dump_add_root_struct_ptr (&fcd, &fcd_description);
6267 /* Initialize to something reasonable ... */
6268 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6270 fcd->coding_category_system[i] = Qnil;
6271 fcd->coding_category_by_priority[i] = i;
6274 Fprovide (intern ("file-coding"));
6276 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6277 Coding system used for TTY keyboard input.
6278 Not used under a windowing system.
6280 Vkeyboard_coding_system = Qnil;
6282 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6283 Coding system used for TTY display output.
6284 Not used under a windowing system.
6286 Vterminal_coding_system = Qnil;
6288 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6289 Overriding coding system used when reading from a file or process.
6290 You should bind this variable with `let', but do not set it globally.
6291 If this is non-nil, it specifies the coding system that will be used
6292 to decode input on read operations, such as from a file or process.
6293 It overrides `buffer-file-coding-system-for-read',
6294 `insert-file-contents-pre-hook', etc. Use those variables instead of
6295 this one for permanent changes to the environment. */ );
6296 Vcoding_system_for_read = Qnil;
6298 DEFVAR_LISP ("coding-system-for-write",
6299 &Vcoding_system_for_write /*
6300 Overriding coding system used when writing to a file or process.
6301 You should bind this variable with `let', but do not set it globally.
6302 If this is non-nil, it specifies the coding system that will be used
6303 to encode output for write operations, such as to a file or process.
6304 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6305 Use those variables instead of this one for permanent changes to the
6307 Vcoding_system_for_write = Qnil;
6309 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6310 Coding system used to convert pathnames when accessing files.
6312 Vfile_name_coding_system = Qnil;
6314 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6315 Non-nil means the buffer contents are regarded as multi-byte form
6316 of characters, not a binary code. This affects the display, file I/O,
6317 and behaviors of various editing commands.
6319 Setting this to nil does not do anything.
6321 enable_multibyte_characters = 1;
6325 complex_vars_of_file_coding (void)
6327 staticpro (&Vcoding_system_hash_table);
6328 Vcoding_system_hash_table =
6329 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6331 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6332 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6334 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6336 struct codesys_prop csp; \
6338 csp.prop_type = (Prop_Type); \
6339 Dynarr_add (the_codesys_prop_dynarr, csp); \
6342 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6343 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6344 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6345 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6346 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6347 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6348 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6350 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6351 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6352 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6353 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6354 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6355 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6356 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6357 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6358 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6359 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6360 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6361 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6362 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6363 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6364 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6365 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6366 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6368 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6369 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6371 /* Need to create this here or we're really screwed. */
6373 (Qraw_text, Qno_conversion,
6374 build_string ("Raw text, which means it converts only line-break-codes."),
6375 list2 (Qmnemonic, build_string ("Raw")));
6378 (Qbinary, Qno_conversion,
6379 build_string ("Binary, which means it does not convert anything."),
6380 list4 (Qeol_type, Qlf,
6381 Qmnemonic, build_string ("Binary")));
6386 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6387 list2 (Qmnemonic, build_string ("UTF8")));
6390 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6392 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6394 Fdefine_coding_system_alias (Qterminal, Qbinary);
6395 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6397 /* Need this for bootstrapping */
6398 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6399 Fget_coding_system (Qraw_text);
6402 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6403 = Fget_coding_system (Qutf8);
6406 #if defined(MULE) && !defined(UTF2000)
6410 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6411 fcd->ucs_to_mule_table[i] = Qnil;
6413 staticpro (&mule_to_ucs_table);
6414 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6415 #endif /* defined(MULE) && !defined(UTF2000) */