1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
/* Global Lisp_Object variables defined by this file.  Going by their
   names these are an error symbol plus the coding-system variables for
   keyboard, terminal, read, write and file-name use -- NOTE(review):
   confirm against the places where they are initialized. */
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
/* Per-category coding-system state gathered into a single structure.
   NOTE(review): the name suggests this is the part of the module state
   that gets dumped -- confirm.  The closing lines of this definition are
   not visible in this excerpt. */
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST];
61 #if defined(MULE) && !defined(UTF2000)
/* Table of 65536 Lisp_Objects, i.e. one slot per possible 16-bit index;
   only compiled for MULE builds without UTF2000. */
62 Lisp_Object ucs_to_mule_table[65536];
/* Description tables for struct file_coding_dump.  fcd_description_1
   lists the structure's Lisp_Object arrays as (kind, offset, element
   count) entries; fcd_description begins with the structure's size.
   NOTE(review): presumably consumed by the dumper/GC machinery -- confirm.
   The terminating entries of both tables are not visible here. */
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
68 #if defined(MULE) && !defined(UTF2000)
69 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
74 static const struct struct_description fcd_description = {
75 sizeof (struct file_coding_dump),
/* Symbols (Q...) and variables used throughout this file.  Several of
   the symbols are compared against the TYPE argument and the property
   keys in make-coding-system (e.g. Qiso2022, Qccl, Qmnemonic, Qeol_type,
   Qcharset_g0 ...), and Qlf/Qcrlf/Qcr are the values accepted by
   symbol_to_eol_type.  Vcoding_system_hash_table is the table in which
   coding systems are looked up and registered by name. */
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
/* NOTE(review): some lines of the following definition are not visible
   in this excerpt -- the braces, the member described by the "Index for
   next byte" comment, and the end of the direction-switching comment. */
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
/* Forward declarations.  Shift-JIS, Big5, UCS-4, UTF-8 and ISO2022 each
   have a detect_coding_* / decode_coding_* / encode_coding_* triple;
   no-conversion has only the decode/encode pair.  Every decoder takes
   external bytes (Extbyte) and every encoder takes internal bytes
   (Bufbyte); both append their output to the DST dynarr.  mule_decode
   and mule_encode share the same signatures as the per-encoding
   decoders and encoders. */
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis (struct detection_state *st,
179 const Extbyte *src, Lstream_data_count n);
180 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
181 unsigned_char_dynarr *dst, Lstream_data_count n);
182 static void encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
183 unsigned_char_dynarr *dst, Lstream_data_count n);
184 static int detect_coding_big5 (struct detection_state *st,
185 const Extbyte *src, Lstream_data_count n);
186 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
187 unsigned_char_dynarr *dst, Lstream_data_count n);
188 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
189 unsigned_char_dynarr *dst, Lstream_data_count n);
190 static int detect_coding_ucs4 (struct detection_state *st,
191 const Extbyte *src, Lstream_data_count n);
192 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
193 unsigned_char_dynarr *dst, Lstream_data_count n);
194 static void encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
195 unsigned_char_dynarr *dst, Lstream_data_count n);
196 static int detect_coding_utf8 (struct detection_state *st,
197 const Extbyte *src, Lstream_data_count n);
198 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
199 unsigned_char_dynarr *dst, Lstream_data_count n);
200 static void encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
201 unsigned_char_dynarr *dst, Lstream_data_count n);
202 static int postprocess_iso2022_mask (int mask);
203 static void reset_iso2022 (Lisp_Object coding_system,
204 struct iso2022_decoder *iso);
205 static int detect_coding_iso2022 (struct detection_state *st,
206 const Extbyte *src, Lstream_data_count n);
207 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
208 unsigned_char_dynarr *dst, Lstream_data_count n);
209 static void encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
210 unsigned_char_dynarr *dst, Lstream_data_count n);
212 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
213 unsigned_char_dynarr *dst, Lstream_data_count n);
214 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
215 unsigned_char_dynarr *dst, Lstream_data_count n);
216 static void mule_decode (Lstream *decoding, const Extbyte *src,
217 unsigned_char_dynarr *dst, Lstream_data_count n);
218 static void mule_encode (Lstream *encoding, const Bufbyte *src,
219 unsigned_char_dynarr *dst, Lstream_data_count n);
/* The codesys_prop record type, a dynarr type holding such records
   (codesys_prop_dynarr), the description tables for both, the global
   array the_codesys_prop_dynarr, and the start of enum
   codesys_prop_enum.  The record description names one Lisp_Object
   field, `sym'.  NOTE(review): the body of struct codesys_prop, the
   table terminators and most of the enum are not visible in this
   excerpt. */
221 typedef struct codesys_prop codesys_prop;
230 Dynarr_declare (codesys_prop);
231 } codesys_prop_dynarr;
233 static const struct lrecord_description codesys_prop_description_1[] = {
234 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
238 static const struct struct_description codesys_prop_description = {
239 sizeof (codesys_prop),
240 codesys_prop_description_1
243 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
244 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
248 static const struct struct_description codesys_prop_dynarr_description = {
249 sizeof (codesys_prop_dynarr),
250 codesys_prop_dynarr_description_1
253 codesys_prop_dynarr *the_codesys_prop_dynarr;
255 enum codesys_prop_enum
258 CODESYS_PROP_ISO2022,
263 /************************************************************************/
264 /* Coding system functions */
265 /************************************************************************/
/* Prototypes of the coding-system object's mark, print and finalize
   methods, followed by the description tables and the lrecord
   implementation that registers those methods. */
267 static Lisp_Object mark_coding_system (Lisp_Object);
268 static void print_coding_system (Lisp_Object, Lisp_Object, int);
269 static void finalize_coding_system (void *header, int for_disksave);
/* A charset conversion spec holds two Lisp_Objects: the source and the
   destination charset. */
272 static const struct lrecord_description ccs_description_1[] = {
273 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
274 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
278 static const struct struct_description ccs_description = {
279 sizeof (charset_conversion_spec),
/* Description of a dynarr of charset conversion specs. */
283 static const struct lrecord_description ccsd_description_1[] = {
284 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
288 static const struct struct_description ccsd_description = {
289 sizeof (charset_conversion_spec_dynarr),
/* Slots of Lisp_Coding_System that hold Lisp_Objects, plus the two
   ISO2022 conversion dynarrs, which are described as struct pointers. */
294 static const struct lrecord_description coding_system_description[] = {
295 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
296 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
297 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
298 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
299 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
300 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
301 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
302 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
304 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
305 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
306 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
307 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
308 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
313 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
314 mark_coding_system, print_coding_system,
315 finalize_coding_system,
316 0, 0, coding_system_description,
/* Mark method for coding-system objects.  Marks the name, doc string,
   mnemonic and the three EOL subsidiary slots, then the type-specific
   slots: for ISO2022 the four initial charsets and the charsets held in
   the input and output conversion arrays; the CCL decode and encode
   slots are marked as well.  Finally marks the pre-write conversion and
   returns the post-read conversion instead of marking it here.
   NOTE(review): presumably the caller marks the returned object --
   confirm against the mark-method convention.  Some case labels and
   declarations are not visible in this excerpt. */
320 mark_coding_system (Lisp_Object obj)
322 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
324 mark_object (CODING_SYSTEM_NAME (codesys));
325 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
326 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
327 mark_object (CODING_SYSTEM_EOL_LF (codesys));
328 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
329 mark_object (CODING_SYSTEM_EOL_CR (codesys));
331 switch (CODING_SYSTEM_TYPE (codesys))
335 case CODESYS_ISO2022:
336 for (i = 0; i < 4; i++)
337 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
/* The conversion arrays are optional; walk them only when allocated. */
338 if (codesys->iso2022.input_conv)
340 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
342 struct charset_conversion_spec *ccs =
343 Dynarr_atp (codesys->iso2022.input_conv, i);
344 mark_object (ccs->from_charset);
345 mark_object (ccs->to_charset);
348 if (codesys->iso2022.output_conv)
350 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
352 struct charset_conversion_spec *ccs =
353 Dynarr_atp (codesys->iso2022.output_conv, i);
354 mark_object (ccs->from_charset);
355 mark_object (ccs->to_charset);
361 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
362 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
369 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
370 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding-system objects: writes "#<coding_system ",
   the object's name, and ">" to PRINTCHARFUN.  The error call reports
   the object as unreadable.  NOTE(review): the condition guarding the
   error call and its argument are not visible in this excerpt. */
374 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
377 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
379 error ("printing unreadable object #<coding_system 0x%x>",
382 write_c_string ("#<coding_system ", printcharfun);
383 print_internal (c->name, printcharfun, 1);
384 write_c_string (">", printcharfun);
/* Finalizer for coding-system objects.  HEADER is the object being
   finalized.  Unless FOR_DISKSAVE is set, an ISO2022 coding system has
   its input and output charset-conversion dynarrs freed and the
   pointers reset to 0, so each array is released at most once. */
388 finalize_coding_system (void *header, int for_disksave)
390 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
391 /* Since coding systems never go away, this function is not
392 necessary. But it would be necessary if we changed things
393 so that coding systems could go away. */
394 if (!for_disksave) /* see comment in lstream.c */
396 switch (CODING_SYSTEM_TYPE (c))
399 case CODESYS_ISO2022:
400 if (c->iso2022.input_conv)
402 Dynarr_free (c->iso2022.input_conv);
403 c->iso2022.input_conv = 0;
405 if (c->iso2022.output_conv)
407 Dynarr_free (c->iso2022.output_conv);
408 c->iso2022.output_conv = 0;
/* Convert an EOL-type symbol to the corresponding eol_type_t value:
   nil -> EOL_AUTODETECT, Qlf -> EOL_LF, Qcrlf -> EOL_CRLF,
   Qcr -> EOL_CR.  SYMBOL is first checked with CHECK_SYMBOL; any other
   symbol signals "Unrecognized eol type". */
419 symbol_to_eol_type (Lisp_Object symbol)
421 CHECK_SYMBOL (symbol);
422 if (NILP (symbol)) return EOL_AUTODETECT;
423 if (EQ (symbol, Qlf)) return EOL_LF;
424 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
425 if (EQ (symbol, Qcr)) return EOL_CR;
427 signal_simple_error ("Unrecognized eol type", symbol);
428 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: EOL_LF -> Qlf, EOL_CRLF -> Qcrlf,
   EOL_CR -> Qcr, EOL_AUTODETECT -> Qnil.  NOTE(review): the switch
   header and any default case are not visible in this excerpt. */
432 eol_type_to_symbol (eol_type_t type)
437 case EOL_LF: return Qlf;
438 case EOL_CRLF: return Qcrlf;
439 case EOL_CR: return Qcr;
440 case EOL_AUTODETECT: return Qnil;
/* Create the three EOL-specific variants of CODESYS.  For each of
   "unix"/EOL_LF, "dos"/EOL_CRLF and "mac"/EOL_CR, the coding system is
   copied with Fcopy_coding_system under the name NAME-<suffix>, the
   copy's EOL type is set, and the copy is stored in the matching
   CODING_SYSTEM_EOL_* slot of CODESYS.  When the mnemonic of CODESYS is
   a string, the copy's mnemonic is that string followed by "", ":T" or
   ":t" respectively.  NOTE(review): the guards around the mnemonic
   statements inside the macro are not visible in this excerpt. */
445 setup_eol_coding_systems (Lisp_Coding_System *codesys)
447 Lisp_Object codesys_obj;
448 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* 7 spare bytes: enough for the longest suffix, "-unix", plus the
   terminating NUL. */
449 char *codesys_name = (char *) alloca (len + 7);
451 char *codesys_mnemonic=0;
453 Lisp_Object codesys_name_sym, sub_codesys_obj;
457 XSETCODING_SYSTEM (codesys_obj, codesys);
/* Copy the base name once; each macro expansion below overwrites only
   the suffix part starting at offset LEN. */
459 memcpy (codesys_name,
460 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
462 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
464 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
465 codesys_mnemonic = (char *) alloca (mlen + 7);
466 memcpy (codesys_mnemonic,
467 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
470 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
471 strcpy (codesys_name + len, "-" op_sys); \
473 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
474 codesys_name_sym = intern (codesys_name); \
475 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
476 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
478 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
479 build_string (codesys_mnemonic); \
480 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
483 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
484 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
485 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp primitive `coding-system-p' (exactly one argument).  Returns Qt
   when OBJECT satisfies CODING_SYSTEMP and Qnil otherwise. */
488 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
489 Return t if OBJECT is a coding system.
490 A coding system is an object that defines how text containing multiple
491 character sets is encoded into a stream of (typically 8-bit) bytes.
492 The coding system is used to decode the stream into a series of
493 characters (which may be from multiple charsets) when the text is read
494 from a file or process, and is used to encode the text back into the
495 same format when it is written out to a file or process.
497 For example, many ISO2022-compliant coding systems (such as Compound
498 Text, which is used for inter-client data under the X Window System)
499 use escape sequences to switch between different charsets -- Japanese
500 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
501 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
502 `make-coding-system' for more information.
504 Coding systems are normally identified using a symbol, and the
505 symbol is accepted in place of the actual coding system object whenever
506 a coding system is called for. (This is similar to how faces work.)
510 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* Lisp primitive `find-coding-system'.  A nil argument is replaced by
   Qbinary; a coding-system object is returned unchanged; anything else
   must be a symbol and is looked up in Vcoding_system_hash_table.  The
   lookup result is returned once it is a coding system or nil.
   NOTE(review): the result is assigned back to the argument, which
   suggests a loop that follows alias symbols -- the loop construct is
   not visible in this excerpt. */
513 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
514 Retrieve the coding system of the given name.
516 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
517 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
518 If there is no such coding system, nil is returned. Otherwise the
519 associated coding system object is returned.
521 (coding_system_or_name))
523 if (NILP (coding_system_or_name))
524 coding_system_or_name = Qbinary;
525 else if (CODING_SYSTEMP (coding_system_or_name))
526 return coding_system_or_name;
528 CHECK_SYMBOL (coding_system_or_name);
532 coding_system_or_name =
533 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
535 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
536 return coding_system_or_name;
/* Lisp primitive `get-coding-system'.  Calls Ffind_coding_system on
   NAME and returns its result, except that a nil result signals
   "No such coding system" with NAME as the error datum. */
540 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
541 Retrieve the coding system of the given name.
542 Same as `find-coding-system' except that if there is no such
543 coding system, an error is signaled instead of returning nil.
547 Lisp_Object coding_system = Ffind_coding_system (name);
549 if (NILP (coding_system))
550 signal_simple_error ("No such coding system", name);
551 return coding_system;
554 /* We store the coding systems in hash tables with the names as the key and the
555 actual coding system object as the value. Occasionally we need to use them
556 in a list format. These routines provide us with that. */
557 struct coding_system_list_closure
/* Address of the Lisp list being built; the mapping callback conses
   each new key onto the list stored there. */
559 Lisp_Object *coding_system_list;
/* Hash-table mapping callback.  Conses KEY onto the front of the list
   addressed through the closure passed as the third argument; VALUE is
   not used by the visible code.  NOTE(review): the return statement is
   not visible in this excerpt. */
563 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
564 void *coding_system_list_closure)
566 /* This function can GC */
567 struct coding_system_list_closure *cscl =
568 (struct coding_system_list_closure *) coding_system_list_closure;
569 Lisp_Object *coding_system_list = cscl->coding_system_list;
571 *coding_system_list = Fcons (key, *coding_system_list);
/* Lisp primitive `coding-system-list' (no arguments).  Starts from an
   empty list, protects it with GCPRO1, and maps
   add_coding_system_to_list_mapper over Vcoding_system_hash_table so
   that every key of the table is consed onto the list, which is then
   returned.  NOTE(review): the matching UNGCPRO is not visible in this
   excerpt. */
575 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
576 Return a list of the names of all defined coding systems.
580 Lisp_Object coding_system_list = Qnil;
582 struct coding_system_list_closure coding_system_list_closure;
584 GCPRO1 (coding_system_list);
585 coding_system_list_closure.coding_system_list = &coding_system_list;
586 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
587 &coding_system_list_closure);
590 return coding_system_list;
/* Lisp primitive `coding-system-name'.  Resolves the argument with
   Fget_coding_system (which signals an error when no such coding system
   exists) and returns the name stored in the resulting object. */
593 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
594 Return the name of the given coding system.
598 coding_system = Fget_coding_system (coding_system);
599 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a new Lisp_Coding_System of the given TYPE named NAME.  The
   record is zeroed first; then the conversion-function, EOL-subsidiary
   and mnemonic slots are set to Qnil and the EOL type to
   EOL_AUTODETECT.  For CODESYS_ISO2022 the four initial-charset slots
   are set to Qnil; for CODESYS_CCL the decode and encode slots are set
   to Qnil.  NOTE(review): the return statement is not visible in this
   excerpt. */
602 static Lisp_Coding_System *
603 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
605 Lisp_Coding_System *codesys =
606 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
/* Zeroing clears the non-Lisp fields; the Lisp_Object slots are then
   explicitly set to Qnil below. */
608 zero_lcrecord (codesys);
609 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
610 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
611 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
612 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
613 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
614 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
615 CODING_SYSTEM_TYPE (codesys) = type;
616 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
618 if (type == CODESYS_ISO2022)
621 for (i = 0; i < 4; i++)
622 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
624 else if (type == CODESYS_CCL)
626 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
627 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
630 CODING_SYSTEM_NAME (codesys) = name;
636 /* Given a list of charset conversion specs as specified in a Lisp
637 program, parse it into STORE_HERE. */
/* Each element of SPEC_LIST must be a list of exactly two elements,
   (FROM TO); anything else signals "Invalid charset conversion spec".
   Both elements are resolved with Fget_charset, and the two charsets
   must have the same XCHARSET_TYPE, otherwise an error is signaled.
   Each accepted pair is appended to STORE_HERE. */
640 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
641 Lisp_Object spec_list)
645 EXTERNAL_LIST_LOOP (rest, spec_list)
647 Lisp_Object car = XCAR (rest);
648 Lisp_Object from, to;
649 struct charset_conversion_spec spec;
651 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
652 signal_simple_error ("Invalid charset conversion spec", car);
653 from = Fget_charset (XCAR (car));
654 to = Fget_charset (XCAR (XCDR (car)));
655 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
656 signal_simple_error_2
657 ("Attempted conversion between different charset types",
659 spec.from_charset = from;
660 spec.to_charset = to;
662 Dynarr_add (store_here, spec);
666 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
667 specs, return the equivalent as the Lisp programmer would see it.
669 If LOAD_HERE is 0, return Qnil. */
672 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
/* Each spec becomes a two-element list (FROM TO).  The lists are consed
   onto the front of RESULT, so the final Fnreverse restores the order
   in which the specs are stored in the dynarr. */
679 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
681 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
682 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
685 return Fnreverse (result);
/* Lisp primitive `make-coding-system' (2 to 4 arguments: NAME, TYPE,
   DOC-STRING, PROPS).  The visible code maps TYPE to an enum
   coding_system_type (signaling "Invalid coding system type" otherwise),
   allocates the object, stores DOC-STRING (an empty string when nil),
   and then applies each key/value pair of PROPS: the common properties
   first, then the ISO2022-specific or CCL-specific ones depending on
   the type.  Unknown keys signal "Unrecognized property".  Finally the
   EOL subsidiary systems are created when required and the object is
   stored in Vcoding_system_hash_table under NAME.
   NOTE(review): a number of lines of this function (braces, some
   declarations and conditions, the return) are not visible in this
   excerpt. */
690 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
691 Register symbol NAME as a coding system.
693 TYPE describes the conversion method used and should be one of
696 Automatic conversion. XEmacs attempts to detect the coding system
699 No conversion. Use this for binary files and such. On output,
700 graphic characters that are not in ASCII or Latin-1 will be
701 replaced by a ?. (For a no-conversion-encoded buffer, these
702 characters will only be present if you explicitly insert them.)
704 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
706 ISO 10646 UCS-4 encoding.
708 ISO 10646 UTF-8 encoding.
710 Any ISO2022-compliant encoding. Among other things, this includes
711 JIS (the Japanese encoding commonly used for e-mail), EUC (the
712 standard Unix encoding for Japanese and other languages), and
713 Compound Text (the encoding used in X11). You can specify more
714 specific information about the conversion with the PROPS argument.
716 Big5 (the encoding commonly used for Taiwanese).
718 The conversion is performed using a user-written pseudo-code
719 program. CCL (Code Conversion Language) is the name of this
722 Write out or read in the raw contents of the memory representing
723 the buffer's text. This is primarily useful for debugging
724 purposes, and is only enabled when XEmacs has been compiled with
725 DEBUG_XEMACS defined (via the --debug configure option).
726 WARNING: Reading in a file using 'internal conversion can result
727 in an internal inconsistency in the memory representing a
728 buffer's text, which will produce unpredictable results and may
729 cause XEmacs to crash. Under normal circumstances you should
730 never use 'internal conversion.
732 DOC-STRING is a string describing the coding system.
734 PROPS is a property list, describing the specific nature of the
735 character set. Recognized properties are:
738 String to be displayed in the modeline when this coding system is
742 End-of-line conversion to be used. It should be one of
745 Automatically detect the end-of-line type (LF, CRLF,
746 or CR). Also generate subsidiary coding systems named
747 `NAME-unix', `NAME-dos', and `NAME-mac', that are
748 identical to this coding system but have an EOL-TYPE
749 value of 'lf, 'crlf, and 'cr, respectively.
751 The end of a line is marked externally using ASCII LF.
752 Since this is also the way that XEmacs represents an
753 end-of-line internally, specifying this option results
754 in no end-of-line conversion. This is the standard
755 format for Unix text files.
757 The end of a line is marked externally using ASCII
758 CRLF. This is the standard format for MS-DOS text
761 The end of a line is marked externally using ASCII CR.
762 This is the standard format for Macintosh text files.
764 Automatically detect the end-of-line type but do not
765 generate subsidiary coding systems. (This value is
766 converted to nil when stored internally, and
767 `coding-system-property' will return nil.)
769 'post-read-conversion
770 Function called after a file has been read in, to perform the
771 decoding. Called with two arguments, START and END, denoting
772 a region of the current buffer to be decoded.
774 'pre-write-conversion
775 Function called before a file is written out, to perform the
776 encoding. Called with two arguments, START and END, denoting
777 a region of the current buffer to be encoded.
780 The following additional properties are recognized if TYPE is 'iso2022:
786 The character set initially designated to the G0 - G3 registers.
787 The value should be one of
789 -- A charset object (designate that character set)
790 -- nil (do not ever use this register)
791 -- t (no character set is initially designated to
792 the register, but may be later on; this automatically
793 sets the corresponding `force-g*-on-output' property)
799 If non-nil, send an explicit designation sequence on output before
800 using the specified register.
803 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
804 "ESC $ B" on output in place of the full designation sequences
805 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
808 If non-nil, don't designate ASCII to G0 at each end of line on output.
809 Setting this to non-nil also suppresses other state-resetting that
810 normally happens at the end of a line.
813 If non-nil, don't designate ASCII to G0 before control chars on output.
816 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
820 If non-nil, use locking-shift (SO/SI) instead of single-shift
821 or designation by escape sequence.
824 If non-nil, don't use ISO6429's direction specification.
827 If non-nil, literal control characters that are the same as
828 the beginning of a recognized ISO2022 or ISO6429 escape sequence
829 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
830 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
831 so that they can be properly distinguished from an escape sequence.
832 (Note that doing this results in a non-portable encoding.) This
833 encoding flag is used for byte-compiled files. Note that ESC
834 is a good choice for a quoting character because there are no
835 escape sequences whose second byte is a character from the Control-0
836 or Control-1 character sets; this is explicitly disallowed by the
839 'input-charset-conversion
840 A list of conversion specifications, specifying conversion of
841 characters in one charset to another when decoding is performed.
842 Each specification is a list of two elements: the source charset,
843 and the destination charset.
845 'output-charset-conversion
846 A list of conversion specifications, specifying conversion of
847 characters in one charset to another when encoding is performed.
848 The form of each specification is the same as for
849 'input-charset-conversion.
852 The following additional properties are recognized (and required)
856 CCL program used for decoding (converting to internal format).
859 CCL program used for encoding (converting to external format).
861 (name, type, doc_string, props))
863 Lisp_Coding_System *codesys;
864 enum coding_system_type ty;
865 int need_to_setup_eol_systems = 1;
867 /* Convert type to constant */
868 if (NILP (type) || EQ (type, Qundecided))
869 { ty = CODESYS_AUTODETECT; }
871 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
872 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
873 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
874 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
875 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
876 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
878 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
880 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
883 signal_simple_error ("Invalid coding system type", type);
887 codesys = allocate_coding_system (ty, name);
/* A nil DOC-STRING defaults to the empty string; anything else must be
   a string. */
889 if (NILP (doc_string))
890 doc_string = build_string ("");
892 CHECK_STRING (doc_string);
893 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
/* Walk PROPS as a property list of KEY/VALUE pairs. */
896 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
898 if (EQ (key, Qmnemonic))
901 CHECK_STRING (value);
902 CODING_SYSTEM_MNEMONIC (codesys) = value;
905 else if (EQ (key, Qeol_type))
/* Subsidiary EOL coding systems are only requested when the eol-type
   value is nil. */
907 need_to_setup_eol_systems = NILP (value);
910 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
913 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
914 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
916 else if (ty == CODESYS_ISO2022)
/* t and nil are stored as is; any other value is resolved to a charset
   with Fget_charset. */
918 #define FROB_INITIAL_CHARSET(charset_num) \
919 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
920 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
922 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
923 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
924 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
925 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
/* The force-gN-on-output flags are stored as C booleans (non-nil -> 1). */
927 #define FROB_FORCE_CHARSET(charset_num) \
928 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
930 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
931 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
932 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
933 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
/* The remaining ISO2022 flags are likewise stored as C booleans. */
935 #define FROB_BOOLEAN_PROPERTY(prop) \
936 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
938 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
939 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
940 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
941 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
942 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
943 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
944 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
946 else if (EQ (key, Qinput_charset_conversion))
/* NOTE(review): a fresh dynarr is allocated each time this key appears;
   if the same key occurs twice in PROPS the earlier array is no longer
   referenced -- confirm whether that can happen. */
948 codesys->iso2022.input_conv =
949 Dynarr_new (charset_conversion_spec);
950 parse_charset_conversion_specs (codesys->iso2022.input_conv,
953 else if (EQ (key, Qoutput_charset_conversion))
955 codesys->iso2022.output_conv =
956 Dynarr_new (charset_conversion_spec);
957 parse_charset_conversion_specs (codesys->iso2022.output_conv,
961 signal_simple_error ("Unrecognized property", key);
963 else if (EQ (type, Qccl))
966 struct ccl_program test_ccl;
969 /* Check key first. */
970 if (EQ (key, Qdecode))
971 suffix = "-ccl-decode";
972 else if (EQ (key, Qencode))
973 suffix = "-ccl-encode";
975 signal_simple_error ("Unrecognized property", key);
977 /* If value is vector, register it as a ccl program
978 associated with a newly created symbol for
979 backward compatibility. */
982 sym = Fintern (concat2 (Fsymbol_name (name),
983 build_string (suffix)),
985 Fregister_ccl_program (sym, value);
989 CHECK_SYMBOL (value);
992 /* check if the given ccl programs are valid. */
993 if (setup_ccl_program (&test_ccl, sym) < 0)
994 signal_simple_error ("Invalid CCL program", value);
996 if (EQ (key, Qdecode))
997 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
998 else if (EQ (key, Qencode))
999 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1004 signal_simple_error ("Unrecognized property", key);
1008 if (need_to_setup_eol_systems)
1009 setup_eol_coding_systems (codesys);
/* Register the finished object in the hash table under NAME. */
1012 Lisp_Object codesys_obj;
1013 XSETCODING_SYSTEM (codesys_obj, codesys);
1014 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* Lisp primitive `copy-coding-system' (OLD-CODING-SYSTEM, NEW-NAME).

   OLD-CODING-SYSTEM is resolved with Fget_coding_system and NEW-NAME is
   looked up with Ffind_coding_system.  When no coding system is found
   under NEW-NAME, a fresh object of the old system's type is allocated
   and entered in Vcoding_system_hash_table under NEW-NAME.  The visible
   code then memcpy's everything that follows the `header' member from
   the old object into the new one, resets the `name' slot to NEW-NAME
   and returns the new coding system.

   NOTE(review): since the copy is a raw memcpy, pointer-valued members
   (for instance the iso2022 input_conv/output_conv Dynarrs created by
   make-coding-system) end up shared by both objects -- confirm this is
   intended.  */
1019 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1020 Copy OLD-CODING-SYSTEM to NEW-NAME.
1021 If NEW-NAME does not name an existing coding system, a new one will
1024 (old_coding_system, new_name))
1026 Lisp_Object new_coding_system;
1027 old_coding_system = Fget_coding_system (old_coding_system);
1028 new_coding_system = Ffind_coding_system (new_name);
1029 if (NILP (new_coding_system))
1031 XSETCODING_SYSTEM (new_coding_system,
1032 allocate_coding_system
1033 (XCODING_SYSTEM_TYPE (old_coding_system),
1035 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1039 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1040 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1041 memcpy (((char *) to ) + sizeof (to->header),
1042 ((char *) from) + sizeof (from->header),
1043 sizeof (*from) - sizeof (from->header));
1044 to->name = new_name;
1046 return new_coding_system;
/* Lisp primitive `coding-system-canonical-name-p' (OBJECT).

   The test is whether the value stored for OBJECT in
   Vcoding_system_hash_table is a coding system object (CODING_SYSTEMP).
   The lookup default is Qnil, so an OBJECT with no entry fails the
   test, as does an alias (aliases are stored as symbols, see
   `coding-system-alias-p').  */
1049 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1050 Return t if OBJECT names a coding system, and is not a coding system alias.
1054 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
/* Lisp primitive `coding-system-alias-p' (OBJECT).

   The test is whether the value stored for OBJECT in
   Vcoding_system_hash_table is a symbol.  The lookup default is Qzero
   rather than Qnil -- presumably because Qzero is not a symbol, so a
   missing entry is not mistaken for an alias.  */
1058 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1059 Return t if OBJECT is a coding system alias.
1060 All coding system aliases are created by `define-coding-system-alias'.
1064 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
/* Lisp primitive `coding-system-aliasee' (ALIAS).

   ALIAS is looked up in Vcoding_system_hash_table.  A symbol-valued
   entry is the aliasee (the statement taken in that case is not visible
   here; presumably it returns the symbol).  Otherwise the error
   "Symbol is not a coding system alias" is signaled; the trailing
   `return Qnil' exists only to satisfy the compiler.

   NOTE(review): the lookup default here is Qnil, whereas
   `coding-system-alias-p' uses Qzero.  If nil counts as a symbol for
   SYMBOLP, an ALIAS with no entry at all would pass the SYMBOLP test
   instead of reaching the error -- confirm whether that is intended.  */
1068 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1069 Return the coding-system symbol for which symbol ALIAS is an alias.
1073 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1074 if (SYMBOLP (aliasee))
1077 signal_simple_error ("Symbol is not a coding system alias", alias);
1078 return Qnil; /* To keep the compiler happy */
/* Return the interned symbol whose name is SYMBOL's name followed by
   ASCII_STRING.  Used below to derive the "-unix", "-dos" and "-mac"
   subsidiary names of a coding system alias.  */
1082 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1084 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
/* Callback handed to elisp_map_remhash by `define-coding-system-alias'.
   ALIAS/ALIASEE are one key/value pair of Vcoding_system_hash_table.
   When ALIASEE is a symbol that has no entry of its own in the table,
   the int that DANGLING_ALIASES points to is incremented.  The return
   statements are not visible here; presumably a nonzero return asks
   elisp_map_remhash to remove the pair.  */
1088 /* A maphash function, for removing dangling coding system aliases. */
1090 dangling_coding_system_alias_p (Lisp_Object alias,
1091 Lisp_Object aliasee,
1092 void *dangling_aliases)
1094 if (SYMBOLP (aliasee)
1095 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1097 (*(int *) dangling_aliases)++;
/* Lisp primitive `define-coding-system-alias' (ALIAS, ALIASEE).

   Visible steps, in order:

   - ALIAS must be a symbol and must not be the canonical name of a
     coding system (checked via Fcoding_system_canonical_name_p).

   - Removal path (per the doc string this is the ALIASEE-is-nil case;
     the guarding condition itself is not visible here): ALIAS is
     removed from Vcoding_system_hash_table; when ALIAS-unix, ALIAS-dos
     and ALIAS-mac are all aliases they are undefined through recursive
     calls; the table is then swept with dangling_coding_system_alias_p
     again and again until a sweep counts no dangling alias (removing
     one alias can leave another one dangling, hence the loop).

   - Definition path: a coding system object passed as ALIASEE is
     replaced by its name; Fget_coding_system verifies that ALIASEE
     names a coding system; ALIAS being EQ to ALIASEE, or being reached
     while following ALIASEE's chain of aliases, is reported as an
     alias loop; otherwise ALIAS -> ALIASEE is stored in the table, and
     for each of "-unix", "-dos" and "-mac" the corresponding subsidiary
     alias is defined recursively when the ALIASEE subsidiary exists.  */
1104 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1105 Define symbol ALIAS as an alias for coding system ALIASEE.
1107 You can use this function to redefine an alias that has already been defined,
1108 but you cannot redefine a name which is the canonical name for a coding system.
1109 \(a canonical name of a coding system is what is returned when you call
1110 `coding-system-name' on a coding system).
1112 ALIASEE itself can be an alias, which allows you to define nested aliases.
1114 You are forbidden, however, from creating alias loops or `dangling' aliases.
1115 These will be detected, and an error will be signaled if you attempt to do so.
1117 If ALIASEE is nil, then ALIAS will simply be undefined.
1119 See also `coding-system-alias-p', `coding-system-aliasee',
1120 and `coding-system-canonical-name-p'.
1124 Lisp_Object real_coding_system, probe;
1126 CHECK_SYMBOL (alias);
1128 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1130 ("Symbol is the canonical name of a coding system and cannot be redefined",
1135 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1136 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1137 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1139 Fremhash (alias, Vcoding_system_hash_table);
1141 /* Undefine subsidiary aliases,
1142 presumably created by a previous call to this function */
1143 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1144 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1145 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1147 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1148 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1149 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1152 /* Undefine dangling coding system aliases. */
1154 int dangling_aliases;
1157 dangling_aliases = 0;
1158 elisp_map_remhash (dangling_coding_system_alias_p,
1159 Vcoding_system_hash_table,
1161 } while (dangling_aliases > 0);
1167 if (CODING_SYSTEMP (aliasee))
1168 aliasee = XCODING_SYSTEM_NAME (aliasee);
1170 /* Checks that aliasee names a coding-system */
1171 real_coding_system = Fget_coding_system (aliasee);
1173 /* Check for coding system alias loops */
1174 if (EQ (alias, aliasee))
1175 alias_loop: signal_simple_error_2
1176 ("Attempt to create a coding system alias loop", alias, aliasee);
1178 for (probe = aliasee;
1180 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1182 if (EQ (probe, alias))
1186 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1188 /* Set up aliases for subsidiaries.
1189 #### There must be a better way to handle subsidiary coding systems. */
1191 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1193 for (i = 0; i < countof (suffixes); i++)
1195 Lisp_Object alias_subsidiary =
1196 append_suffix_to_symbol (alias, suffixes[i]);
1197 Lisp_Object aliasee_subsidiary =
1198 append_suffix_to_symbol (aliasee, suffixes[i]);
1200 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1201 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1204 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1205 but it doesn't look intentional, so I'd rather return something
1206 meaningful or nothing at all. */
/* Return the variant of CODING_SYSTEM for end-of-line convention TYPE.

   A coding system whose own EOL type is anything other than
   EOL_AUTODETECT is returned unchanged.  Otherwise TYPE selects the
   stored LF, CR or CRLF subsidiary; EOL_AUTODETECT returns
   CODING_SYSTEM itself and any other value aborts.  If the selected
   subsidiary slot is nil, CODING_SYSTEM is returned instead.  */
1211 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1213 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1214 Lisp_Object new_coding_system;
1216 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1217 return coding_system;
1221 case EOL_AUTODETECT: return coding_system;
1222 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1223 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1224 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1225 default: abort (); return Qnil;
1228 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* Lisp primitive `subsidiary-coding-system' (CODING-SYSTEM, EOL-TYPE).
   Resolves CODING-SYSTEM with Fget_coding_system, converts the EOL-TYPE
   symbol with symbol_to_eol_type and delegates to
   subsidiary_coding_system.  */
1231 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1232 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1234 (coding_system, eol_type))
1236 coding_system = Fget_coding_system (coding_system);
1238 return subsidiary_coding_system (coding_system,
1239 symbol_to_eol_type (eol_type));
1243 /************************************************************************/
1244 /* Coding system accessors */
1245 /************************************************************************/
/* Lisp primitive `coding-system-doc-string' (CODING-SYSTEM).
   Resolves the argument with Fget_coding_system and returns the
   doc-string slot of the resulting coding system object.  */
1247 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1248 Return the doc string for CODING-SYSTEM.
1252 coding_system = Fget_coding_system (coding_system);
1253 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* Lisp primitive `coding-system-type' (CODING-SYSTEM).
   Resolves the argument with Fget_coding_system and maps the internal
   type code of the result onto the matching Lisp symbol (Qundecided,
   Qshift_jis, Qiso2022, Qbig5, Qucs4, Qutf8, Qccl, Qno_conversion,
   Qinternal).  Some of the cases are presumably guarded by preprocessor
   conditionals that are not visible here.  */
1256 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1257 Return the type of CODING-SYSTEM.
1261 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1264 case CODESYS_AUTODETECT: return Qundecided;
1266 case CODESYS_SHIFT_JIS: return Qshift_jis;
1267 case CODESYS_ISO2022: return Qiso2022;
1268 case CODESYS_BIG5: return Qbig5;
1269 case CODESYS_UCS4: return Qucs4;
1270 case CODESYS_UTF8: return Qutf8;
1271 case CODESYS_CCL: return Qccl;
1273 case CODESYS_NO_CONVERSION: return Qno_conversion;
1275 case CODESYS_INTERNAL: return Qinternal;
/* Return the name of the initial charset recorded for slot GNUM of the
   ISO2022 coding system CODING_SYSTEM, or Qnil when that slot does not
   hold a charset object.  CODING_SYSTEM must already be a coding system
   object; no lookup is done here.  */
1282 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1285 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1287 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp primitive `coding-system-charset' (CODING-SYSTEM, GNUM).
   Resolves CODING-SYSTEM with Fget_coding_system and forwards to
   coding_system_charset with GNUM converted by XINT.  Any validation of
   GNUM (type or range) is not visible here -- confirm it is checked
   before the XINT.  */
1290 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1291 Return initial charset of CODING-SYSTEM designated to GNUM.
1294 (coding_system, gnum))
1296 coding_system = Fget_coding_system (coding_system);
1299 return coding_system_charset (coding_system, XINT (gnum));
/* Lisp primitive `coding-system-property' (CODING-SYSTEM, PROP).

   PROP is first looked up in the_codesys_prop_dynarr.  An entry tagged
   CODESYS_PROP_ISO2022 or CODESYS_PROP_CCL is accepted only for a
   coding system of the matching type; otherwise "Property only valid
   in ... coding systems" is signaled.  "Unrecognized property" is
   signaled after the loop, apparently when no entry matched.

   The value is then produced by one long if/else chain:
   - name, type, doc-string, mnemonic, eol-type, the three EOL
     subsidiaries and the post-read / pre-write conversions are served
     for every coding system type;
   - charset-gN, force-gN-on-output, the ISO2022 boolean flags and the
     input/output charset conversion specs only when the type is
     CODESYS_ISO2022;
   - decode and encode only when the type is CODESYS_CCL.

   The helper macros FORCE_CHARSET and LISP_BOOLEAN are defined in the
   middle of the chain; no matching #undef is visible here.  */
1303 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1304 Return the PROP property of CODING-SYSTEM.
1306 (coding_system, prop))
1309 enum coding_system_type type;
1311 coding_system = Fget_coding_system (coding_system);
1312 CHECK_SYMBOL (prop);
1313 type = XCODING_SYSTEM_TYPE (coding_system);
1315 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1316 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1319 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1321 case CODESYS_PROP_ALL_OK:
1324 case CODESYS_PROP_ISO2022:
1325 if (type != CODESYS_ISO2022)
1327 ("Property only valid in ISO2022 coding systems",
1331 case CODESYS_PROP_CCL:
1332 if (type != CODESYS_CCL)
1334 ("Property only valid in CCL coding systems",
1344 signal_simple_error ("Unrecognized property", prop);
1346 if (EQ (prop, Qname))
1347 return XCODING_SYSTEM_NAME (coding_system);
1348 else if (EQ (prop, Qtype))
1349 return Fcoding_system_type (coding_system);
1350 else if (EQ (prop, Qdoc_string))
1351 return XCODING_SYSTEM_DOC_STRING (coding_system);
1352 else if (EQ (prop, Qmnemonic))
1353 return XCODING_SYSTEM_MNEMONIC (coding_system);
1354 else if (EQ (prop, Qeol_type))
1355 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1356 else if (EQ (prop, Qeol_lf))
1357 return XCODING_SYSTEM_EOL_LF (coding_system);
1358 else if (EQ (prop, Qeol_crlf))
1359 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1360 else if (EQ (prop, Qeol_cr))
1361 return XCODING_SYSTEM_EOL_CR (coding_system);
1362 else if (EQ (prop, Qpost_read_conversion))
1363 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1364 else if (EQ (prop, Qpre_write_conversion))
1365 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1367 else if (type == CODESYS_ISO2022)
1369 if (EQ (prop, Qcharset_g0))
1370 return coding_system_charset (coding_system, 0);
1371 else if (EQ (prop, Qcharset_g1))
1372 return coding_system_charset (coding_system, 1);
1373 else if (EQ (prop, Qcharset_g2))
1374 return coding_system_charset (coding_system, 2);
1375 else if (EQ (prop, Qcharset_g3))
1376 return coding_system_charset (coding_system, 3);
1378 #define FORCE_CHARSET(charset_num) \
1379 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1380 (coding_system, charset_num) ? Qt : Qnil)
1382 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1383 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1384 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1385 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1387 #define LISP_BOOLEAN(prop) \
1388 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1390 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1391 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1392 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1393 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1394 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1395 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1396 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1398 else if (EQ (prop, Qinput_charset_conversion))
1400 unparse_charset_conversion_specs
1401 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1402 else if (EQ (prop, Qoutput_charset_conversion))
1404 unparse_charset_conversion_specs
1405 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1409 else if (type == CODESYS_CCL)
1411 if (EQ (prop, Qdecode))
1412 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1413 else if (EQ (prop, Qencode))
1414 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1422 return Qnil; /* not reached */
1426 /************************************************************************/
1427 /* Coding category functions */
1428 /************************************************************************/
/* Translate the coding category symbol SYMBOL into its index in
   coding_category_symbol[].  SYMBOL is checked to be a symbol, then the
   table is searched linearly; an unknown symbol signals "Unrecognized
   coding category".  The statement executed on a match is not visible
   here; presumably it returns the index, since callers use the result
   to subscript the per-category tables.  */
1431 decode_coding_category (Lisp_Object symbol)
1435 CHECK_SYMBOL (symbol);
1436 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1437 if (EQ (coding_category_symbol[i], symbol))
1440 signal_simple_error ("Unrecognized coding category", symbol);
1441 return 0; /* not reached */
/* Lisp primitive `coding-category-list'.
   Conses up the symbols of coding_category_symbol[], walking the table
   from the last index down to 0 so that the finished list is in
   ascending index order.  */
1444 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1445 Return a list of all recognized coding categories.
1450 Lisp_Object list = Qnil;
1452 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1453 list = Fcons (coding_category_symbol[i], list);
/* Lisp primitive `set-coding-priority-list' (LIST).

   Works in three passes over a scratch array category_to_priority[],
   indexed by category and initialised to -1 ("no priority yet"):
   1. every category named in LIST receives the next priority number, in
      list order; a category named twice signals "Duplicate coding
      category in list";
   2. the categories still at -1 receive the following numbers, visited
      in their current order of fcd->coding_category_by_priority[], so
      their relative order is preserved;
   3. the mapping is inverted back into
      fcd->coding_category_by_priority[] (priority -> category).

   The counter `i' is shared by passes 1 and 2; its initialisation
   before pass 1 is not visible here.  */
1457 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1458 Change the priority order of the coding categories.
1459 LIST should be list of coding categories, in descending order of
1460 priority. Unspecified coding categories will be lower in priority
1461 than all specified ones, in the same relative order they were in
1466 int category_to_priority[CODING_CATEGORY_LAST];
1470 /* First generate a list that maps coding categories to priorities. */
1472 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1473 category_to_priority[i] = -1;
1475 /* Highest priority comes from the specified list. */
1477 EXTERNAL_LIST_LOOP (rest, list)
1479 int cat = decode_coding_category (XCAR (rest));
1481 if (category_to_priority[cat] >= 0)
1482 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1483 category_to_priority[cat] = i++;
1486 /* Now go through the existing categories by priority to retrieve
1487 the categories not yet specified and preserve their priority
1489 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1491 int cat = fcd->coding_category_by_priority[j];
1492 if (category_to_priority[cat] < 0)
1493 category_to_priority[cat] = i++;
1496 /* Now we need to construct the inverse of the mapping we just
1499 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1500 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1502 /* Phew! That was confusing. */
/* Lisp primitive `coding-priority-list'.
   Conses up the category symbols by walking
   fcd->coding_category_by_priority[] from its last slot to its first,
   so the finished list starts with the slot-0 (highest priority)
   category.  */
1506 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1507 Return a list of coding categories in descending order of priority.
1512 Lisp_Object list = Qnil;
1514 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1515 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
/* Lisp primitive `set-coding-category-system'
   (CODING-CATEGORY, CODING-SYSTEM).
   Decodes the category symbol to its index, resolves CODING-SYSTEM with
   Fget_coding_system and stores the resulting object in
   fcd->coding_category_system[] for that category.  */
1520 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1521 Change the coding system associated with a coding category.
1523 (coding_category, coding_system))
1525 int cat = decode_coding_category (coding_category);
1527 coding_system = Fget_coding_system (coding_system);
1528 fcd->coding_category_system[cat] = coding_system;
/* Lisp primitive `coding-category-system' (CODING-CATEGORY).
   Decodes the category symbol to its index and reads the coding system
   stored for it in fcd->coding_category_system[]; the visible return
   yields that coding system's name.  How an unset (nil) slot is handled
   is not visible here.  */
1532 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1533 Return the coding system associated with a coding category.
1537 int cat = decode_coding_category (coding_category);
1538 Lisp_Object sys = fcd->coding_category_system[cat];
1541 return XCODING_SYSTEM_NAME (sys);
1546 /************************************************************************/
1547 /* Detecting the encoding of data */
1548 /************************************************************************/
/* State carried between successive calls of the detection routines
   below (detect_eol_type, detect_coding_type and the per-encoding
   detectors).  Only a few members are visible at this point: the EOL
   type determined so far, an embedded struct iso2022_decoder, a high
   byte counter and a one-bit single-shift flag.  The routines below
   also use st->mask, st->seen_non_ascii, st->eol.* and per-encoding
   `mask' members, which are declared on lines not visible here.  */
1550 struct detection_state
1552 eol_type_t eol_type;
1588 struct iso2022_decoder iso;
1590 int high_byte_count;
1591 unsigned int saw_single_shift:1;
/* Predicate on a byte value C, used by detect_coding_type: a control
   character (C < 0x20) accepted here does not make the text count as
   non-ASCII.  The visible accepted values are 8, 11, 12, 26 and 31;
   further case labels and the return statements are on lines not
   visible here.  */
1604 acceptable_control_char_p (int c)
1608 /* Allow and ignore control characters that you might
1609 reasonably see in a text file */
1614 case 8: /* backspace */
1615 case 11: /* vertical tab */
1616 case 12: /* form feed */
1617 case 26: /* MS-DOS C-z junk */
1618 case 31: /* '^_' -- for info */
/* Return nonzero when MASK has zero bits or exactly one bit set, i.e.
   when a coding category mask has been narrowed down to at most one
   category.

   `x & (x - 1)' clears the lowest set bit of x, so the result is zero
   exactly when there was no second bit.  The arithmetic is done on an
   unsigned copy: with a signed int, a mask consisting solely of the
   sign bit (INT_MIN) would make `mask - 1' overflow, which is undefined
   behavior.  The result for every other input is unchanged.  */
static int
mask_has_at_most_one_bit_p (int mask)
{
  /* Perhaps the only thing useful you learn from intensive Microsoft
     technical interviews */
  unsigned int bits = (unsigned int) mask;

  return (bits & (bits - 1)) == 0;
}
/* Scan the N bytes at SRC for an end-of-line convention, one byte at a
   time.  Progress is kept in ST->eol (just_saw_cr: the previous byte
   was a CR; seen_anything: at least one byte has been examined), so the
   scan can continue across calls.  EOL_AUTODETECT is returned when the
   data runs out without a decision; the returns for the decided cases
   are on lines not visible here.  */
1634 detect_eol_type (struct detection_state *st, const Extbyte *src,
1635 Lstream_data_count n)
1639 unsigned char c = *(unsigned char *)src++;
1642 if (st->eol.just_saw_cr)
1644 else if (st->eol.seen_anything)
1647 else if (st->eol.just_saw_cr)
1650 st->eol.just_saw_cr = 1;
1652 st->eol.just_saw_cr = 0;
1653 st->eol.seen_anything = 1;
1656 return EOL_AUTODETECT;
1659 /* Attempt to determine the encoding and EOL type of the given text.
1660 Before calling this function for the first time, you must initialize
1661 st->eol_type as appropriate and initialize st->mask to ~0.
1663 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1666 st->mask holds the determined coding category mask, or ~0 if only
1667 ASCII has been seen so far.
1671 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1672 is present in st->mask
1673 1 == definitive answers are here for both st->eol_type and st->mask
/* Implementation notes:

   - EOL detection runs only while st->eol_type is still EOL_AUTODETECT.
     The early `return st->eol_type != EOL_AUTODETECT' is presumably the
     JUST_DO_EOL case (its guard is not visible here).
   - Until a byte >= 0x80 or an unacceptable control character has been
     seen (st->seen_non_ascii), the input is only scanned for such a
     byte; when one is found the per-encoding masks are set to ~0.
   - Each per-encoding detector (iso2022, shift_jis, big5, utf8, ucs4)
     is run only while its own mask still has more than one bit set.
     The masks are then OR-ed together (presumably into st->mask; the
     left-hand side is not visible here).
   - CODING_CATEGORY_NO_CONVERSION_MASK is always added to st->mask, but
     the "at most one bit" test for the return value is taken before
     that bit is added.  */
1677 detect_coding_type (struct detection_state *st, const Extbyte *src,
1678 Lstream_data_count n, int just_do_eol)
1680 if (st->eol_type == EOL_AUTODETECT)
1681 st->eol_type = detect_eol_type (st, src, n);
1684 return st->eol_type != EOL_AUTODETECT;
1686 if (!st->seen_non_ascii)
1688 for (; n; n--, src++)
1690 unsigned char c = *(unsigned char *) src;
1691 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1693 st->seen_non_ascii = 1;
1695 st->shift_jis.mask = ~0;
1699 st->iso2022.mask = ~0;
1709 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1710 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1711 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1712 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1713 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1714 st->big5.mask = detect_coding_big5 (st, src, n);
1715 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1716 st->utf8.mask = detect_coding_utf8 (st, src, n);
1717 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1718 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1721 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1722 | st->utf8.mask | st->ucs4.mask;
1725 int retval = mask_has_at_most_one_bit_p (st->mask);
1726 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1727 return retval && st->eol_type != EOL_AUTODETECT;
/* Choose a coding system for the detection result MASK.

   ASCII case (its guarding condition is not visible here): the default
   buffer's buffer_file_coding_system is looked up with
   Ffind_coding_system.  An invalid value produces the warning "Invalid
   `default-buffer-file-coding-system', set to nil" and the default is
   reset to Qnil; Qraw_text is the fallback in this branch.

   Otherwise MASK is passed through postprocess_iso2022_mask and the
   categories are tried in priority order: the first category whose bit
   (1 << cat) is set in MASK and which has a non-nil coding system
   assigned is returned.  Qraw_text is the last resort.  */
1732 coding_system_from_mask (int mask)
1736 /* If the file was entirely or basically ASCII, use the
1737 default value of `buffer-file-coding-system'. */
1738 Lisp_Object retval =
1739 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1742 retval = Ffind_coding_system (retval);
1746 (Qbad_variable, Qwarning,
1747 "Invalid `default-buffer-file-coding-system', set to nil");
1748 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1752 retval = Fget_coding_system (Qraw_text);
1760 mask = postprocess_iso2022_mask (mask);
1762 /* Look through the coding categories by priority and find
1763 the first one that is allowed. */
1764 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1766 cat = fcd->coding_category_by_priority[i];
1767 if ((mask & (1 << cat)) &&
1768 !NILP (fcd->coding_category_system[cat]))
1772 return fcd->coding_category_system[cat];
1774 return Fget_coding_system (Qraw_text);
1778 /* Given a seekable read stream and potential coding system and EOL type
1779 as specified, do any autodetection that is called for. If the
1780 coding system and/or EOL type are not `autodetect', they will be left
1781 alone; but this function will never return an autodetect coding system
1784 This function does not automatically fetch subsidiary coding systems;
1785 that should be unnecessary with the explicit eol-type argument. */
/* Length of a string literal, not counting the terminating NUL.  */
1787 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1788 /* number of leading lines to check for a coding cookie */
1789 #define LINES_TO_CHECK 2
/* Implementation notes:

   - An EOL type of EOL_AUTODETECT on entry is first replaced by the EOL
     type of *CODESYS_IN_OUT itself.
   - Detection is attempted only if the coding system type is
     CODESYS_AUTODETECT or the EOL type is still EOL_AUTODETECT.
   - The first buffer read from STREAM is searched, within the first
     LINES_TO_CHECK lines, for a mode line of the form
     "-*- ... coding: NAME ... -*-".  NAME is delimited in place by
     temporarily writing NUL bytes into the buffer (the overwritten
     bytes are kept in `save'), restricted by strspn to the listed
     charset-name characters, and looked up with Ffind_coding_system.
   - With no cookie, detect_coding_type is fed successive reads; its
     last argument restricts it to EOL detection when the coding system
     type was given explicitly.  With a cookie naming a coding system
     whose own EOL type is EOL_AUTODETECT, only EOL detection is run.
   - The cookie's coding system, or else the one derived from the
     detection mask, replaces *CODESYS_IN_OUT only when the type was
     CODESYS_AUTODETECT.  An EOL type that is still undetermined
     becomes EOL_LF.
   - STREAM is rewound before returning, which is why it must be
     seekable.  */
1792 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1793 eol_type_t *eol_type_in_out)
1795 struct detection_state decst;
1797 if (*eol_type_in_out == EOL_AUTODETECT)
1798 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1801 decst.eol_type = *eol_type_in_out;
1804 /* If autodetection is called for, do it now. */
1805 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1806 || *eol_type_in_out == EOL_AUTODETECT)
1809 Lisp_Object coding_system = Qnil;
1811 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1813 int lines_checked = 0;
1815 /* Look for initial "-*-"; mode line prefix */
1817 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1819 && lines_checked < LINES_TO_CHECK;
1821 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1823 Extbyte *local_vars_beg = p + 3;
1824 /* Look for final "-*-"; mode line suffix */
1825 for (p = local_vars_beg,
1826 scan_end = buf + nread - LENGTH ("-*-");
1828 && lines_checked < LINES_TO_CHECK;
1830 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1832 Extbyte *suffix = p;
1833 /* Look for "coding:" */
1834 for (p = local_vars_beg,
1835 scan_end = suffix - LENGTH ("coding:?");
1838 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1839 && (p == local_vars_beg
1840 || (*(p-1) == ' ' ||
1846 p += LENGTH ("coding:");
1847 while (*p == ' ' || *p == '\t') p++;
1849 /* Get coding system name */
1850 save = *suffix; *suffix = '\0';
1851 /* Characters valid in a MIME charset name (rfc 1521),
1852 and in a Lisp symbol name. */
1853 n = strspn ( (char *) p,
1854 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1855 "abcdefghijklmnopqrstuvwxyz"
1861 save = p[n]; p[n] = '\0';
1863 Ffind_coding_system (intern ((char *) p));
1870 /* #### file must use standard EOLs or we miss 2d line */
1871 /* #### not to mention this is broken for UTF-16 DOS files */
1872 else if (*p == '\n' || *p == '\r')
1875 /* skip past multibyte (DOS) newline */
1876 if (*p == '\r' && *(p+1) == '\n') p++;
1880 /* #### file must use standard EOLs or we miss 2d line */
1881 /* #### not to mention this is broken for UTF-16 DOS files */
1882 else if (*p == '\n' || *p == '\r')
1885 /* skip past multibyte (DOS) newline */
1886 if (*p == '\r' && *(p+1) == '\n') p++;
1889 if (NILP (coding_system))
1892 if (detect_coding_type (&decst, buf, nread,
1893 XCODING_SYSTEM_TYPE (*codesys_in_out)
1894 != CODESYS_AUTODETECT))
1896 nread = Lstream_read (stream, buf, sizeof (buf));
1902 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1903 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1906 if (detect_coding_type (&decst, buf, nread, 1))
1908 nread = Lstream_read (stream, buf, sizeof (buf));
1914 *eol_type_in_out = decst.eol_type;
1915 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1917 if (NILP (coding_system))
1918 *codesys_in_out = coding_system_from_mask (decst.mask);
1920 *codesys_in_out = coding_system;
1924 /* If we absolutely can't determine the EOL type, just assume LF. */
1925 if (*eol_type_in_out == EOL_AUTODETECT)
1926 *eol_type_in_out = EOL_LF;
1928 Lstream_rewind (stream);
/* Lisp primitive `detect-coding-region' (START, END, BUFFER).

   The region is read through a Lisp buffer input stream wrapped in an
   encoding stream for the `binary' coding system, so the detector is
   fed bytes.  Both stream objects are GC-protected while in use.
   Chunks of up to 4096 bytes are passed to detect_coding_type with
   just_do_eol == 0.

   Result: when decst.mask is still ~0 (nothing but ASCII was seen) the
   value is derived from the `undecided' coding system via
   subsidiary_coding_system.  Otherwise the mask goes through
   postprocess_iso2022_mask and a list is built by walking the priority
   table from lowest to highest priority and consing at the front, so
   the finished list is in descending priority; every coding system in
   it is replaced by its subsidiary for the detected EOL type.

   The encoding stream is closed, and both streams are deleted
   explicitly, before returning.  */
1931 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1932 Detect coding system of the text in the region between START and END.
1933 Return a list of possible coding systems ordered by priority.
1934 If only ASCII characters are found, return 'undecided or one of
1935 its subsidiary coding systems according to a detected end-of-line
1936 type. Optional arg BUFFER defaults to the current buffer.
1938 (start, end, buffer))
1940 Lisp_Object val = Qnil;
1941 struct buffer *buf = decode_buffer (buffer, 0);
1943 Lisp_Object instream, lb_instream;
1944 Lstream *istr, *lb_istr;
1945 struct detection_state decst;
1946 struct gcpro gcpro1, gcpro2;
1948 get_buffer_range_char (buf, start, end, &b, &e, 0);
1949 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1950 lb_istr = XLSTREAM (lb_instream);
1951 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1952 istr = XLSTREAM (instream);
1953 GCPRO2 (instream, lb_instream);
1955 decst.eol_type = EOL_AUTODETECT;
1959 Extbyte random_buffer[4096];
1960 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1964 if (detect_coding_type (&decst, random_buffer, nread, 0))
1968 if (decst.mask == ~0)
1969 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1977 decst.mask = postprocess_iso2022_mask (decst.mask);
1979 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1981 int sys = fcd->coding_category_by_priority[i];
1982 if (decst.mask & (1 << sys))
1984 Lisp_Object codesys = fcd->coding_category_system[sys];
1985 if (!NILP (codesys))
1986 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1987 val = Fcons (codesys, val);
1991 Lstream_close (istr);
1993 Lstream_delete (istr);
1994 Lstream_delete (lb_istr);
1999 /************************************************************************/
2000 /* Converting to internal Mule format ("decoding") */
2001 /************************************************************************/
2003 /* A decoding stream is a stream used for decoding text (i.e.
2004 converting from some external format to internal format).
2005 The decoding-stream object keeps track of the actual coding
2006 stream, the stream that is at the other end, and data that
2007 needs to be persistent across the lifetime of the stream. */
2009 /* Handle the EOL stuff related to just-read-in character C.
2010 EOL_TYPE is the EOL type of the coding stream.
2011 FLAGS is the current value of FLAGS in the coding stream, and may
2012 be modified by this macro. (The macro only looks at the
2013 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2014 bytes are to be written. You need to also define a local goto
2015 label "label_continue_loop" that is at the end of the main
2016 character-reading loop.
2018 If C is a CR character, then this macro handles it entirely and
2019 jumps to label_continue_loop. Otherwise, this macro does not add
2020 anything to DST, and continues normally. You should continue
2021 processing C normally after this macro. */
/* NOTE: the macro parameters are pasted into the expansion without
   parentheses, and FLAGS and DST each appear more than once, so pass
   plain variables rather than expressions with side effects.  FLAGS
   must be an lvalue because it is updated with |= and &=.  */
2023 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2027 if (eol_type == EOL_CR) \
2028 Dynarr_add (dst, '\n'); \
2029 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2030 Dynarr_add (dst, c); \
2032 flags |= CODING_STATE_CR; \
2033 goto label_continue_loop; \
2035 else if (flags & CODING_STATE_CR) \
2036 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2038 Dynarr_add (dst, '\r'); \
2039 flags &= ~CODING_STATE_CR; \
2043 /* C should be a binary character in the range 0 - 255; convert
2044 to internal format and add to Dynarr DST. */
/* In this variant an ASCII byte is added unchanged, and the other
   visible branch emits two bytes: (c >> 6) | 0xc0 followed by
   (c & 0x3f) | 0x80, which for 0x80..0xff is the two-byte UTF-8 form of
   the same code point.  A second definition of this macro name appears
   further down; the preprocessor conditional that selects between the
   two is not visible here.  */
2047 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2049 if (BYTE_ASCII_P (c)) \
2050 Dynarr_add (dst, c); \
2053 Dynarr_add (dst, (c >> 6) | 0xc0); \
2054 Dynarr_add (dst, (c & 0x3f) | 0x80); \
/* Append character code C to DST as a variable-length byte sequence
   with lead bytes 0xc0 / 0xe0 / 0xf0 / 0xf8 / 0xfc and continuation
   bytes of the form 0x80 | six bits, i.e. the UTF-8 layout extended to
   six bytes.  Visible thresholds: <= 0x7ff gives 2 bytes, <= 0xffff
   gives 3, <= 0x1fffff gives 4, <= 0x3ffffff gives 5, and anything
   larger gives 6.  The first branch adds C as a single byte; its
   condition is not visible here.  */
2059 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2063 Dynarr_add (dst, c);
2065 else if ( c <= 0x7ff )
2067 Dynarr_add (dst, (c >> 6) | 0xc0);
2068 Dynarr_add (dst, (c & 0x3f) | 0x80);
2070 else if ( c <= 0xffff )
2072 Dynarr_add (dst, (c >> 12) | 0xe0);
2073 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2074 Dynarr_add (dst, (c & 0x3f) | 0x80);
2076 else if ( c <= 0x1fffff )
2078 Dynarr_add (dst, (c >> 18) | 0xf0);
2079 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2080 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2081 Dynarr_add (dst, (c & 0x3f) | 0x80);
2083 else if ( c <= 0x3ffffff )
2085 Dynarr_add (dst, (c >> 24) | 0xf8);
2086 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2087 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2088 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2089 Dynarr_add (dst, (c & 0x3f) | 0x80);
2093 Dynarr_add (dst, (c >> 30) | 0xfc);
2094 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2095 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2096 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2097 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2098 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Second variant of DECODE_ADD_BINARY_CHAR (the preprocessor
   conditional selecting it is not visible here).  An ASCII byte is
   added unchanged; a byte satisfying BYTE_C1_P is added as
   LEADING_BYTE_CONTROL_1 followed by c + 0x20; any other byte is added
   as LEADING_BYTE_LATIN_ISO8859_1 followed by the byte itself.  C is
   expanded several times, so it must not have side effects.  */
2102 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2104 if (BYTE_ASCII_P (c)) \
2105 Dynarr_add (dst, c); \
2106 else if (BYTE_C1_P (c)) \
2108 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2109 Dynarr_add (dst, c + 0x20); \
2113 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2114 Dynarr_add (dst, c); \
/* Emit the pending partial character CH through
   DECODE_ADD_BINARY_CHAR.  `dst' is not a macro parameter: it is taken
   from the scope of the expansion site, which therefore has to provide
   a variable of that name.  The condition around the visible statement
   is not visible here.  */
2119 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2123 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* To be expanded once the input has been consumed.  When FLAGS has
   CODING_STATE_END set, any partial character held in CH is output,
   and a CR still pending in FLAGS (CODING_STATE_CR) is written to DST
   as a literal '\r'.  */
2128 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2130 if (flags & CODING_STATE_END) \
2132 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2133 if (flags & CODING_STATE_CR) \
2134 Dynarr_add (dst, '\r'); \
/* Accessor for the private data of a "decoding" lstream.  */
2138 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Private data of a "decoding" lstream.  Several members described by
   the comments below (the other-end stream, FLAGS and CH) are declared
   on lines not visible here; the functions that follow access them as
   str->other_end and str->flags.  */
2140 struct decoding_stream
2142 /* Coding system that governs the conversion. */
2143 Lisp_Coding_System *codesys;
2145 /* Stream that we read the encoded data from or
2146 write the decoded data to. */
2149 /* If we are reading, then we can return only a fixed amount of
2150 data, so if the conversion resulted in too much data, we store it
2151 here for retrieval the next time around. */
2152 unsigned_char_dynarr *runoff;
2154 /* FLAGS holds flags indicating the current state of the decoding.
2155 Some of these flags are dependent on the coding system. */
2158 /* CH holds a partially built-up character. Since we only deal
2159 with one- and two-byte characters at the moment, we only use
2160 this to store the first byte of a two-byte character. */
2163 /* EOL_TYPE specifies the type of end-of-line conversion that
2164 currently applies. We need to keep this separate from the
2165 EOL type stored in CODESYS because the latter might indicate
2166 automatic EOL-type detection while the former will always
2167 indicate a particular EOL type. */
2168 eol_type_t eol_type;
2170 /* Additional ISO2022 information. We define the structure above
2171 because it's also needed by the detection routines. */
2172 struct iso2022_decoder iso2022;
2174 /* Additional information (the state of the running CCL program)
2175 used by the CCL decoder. */
2176 struct ccl_program ccl;
2178 /* counter for UTF-8 or UCS-4 */
2179 unsigned char counter;
2181 struct detection_state decst;
/* Forward declarations of the method functions of the "decoding"
   lstream type (reader, writer, rewinder, seekable-p, flusher, closer
   and GC marker), followed by the DEFINE_LSTREAM_IMPLEMENTATION for
   that type, whose per-stream data is sized as struct
   decoding_stream.  */
2184 static Lstream_data_count decoding_reader (Lstream *stream,
2185 unsigned char *data, Lstream_data_count size);
2186 static Lstream_data_count decoding_writer (Lstream *stream,
2187 const unsigned char *data, Lstream_data_count size);
2188 static int decoding_rewinder (Lstream *stream);
2189 static int decoding_seekable_p (Lstream *stream);
2190 static int decoding_flusher (Lstream *stream);
2191 static int decoding_closer (Lstream *stream);
2193 static Lisp_Object decoding_marker (Lisp_Object stream);
2195 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2196 sizeof (struct decoding_stream));
/* GC mark method of a decoding stream.  Marks the stream at the other
   end and, when that stream's implementation provides a marker of its
   own, chains to it and returns its result.  The return taken when
   there is no such marker is not visible here.  */
2199 decoding_marker (Lisp_Object stream)
2201 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2202 Lisp_Object str_obj;
2204 /* We do not need to mark the coding systems or charsets stored
2205 within the stream because they are stored in a global list
2206 and automatically marked. */
2208 XSETLSTREAM (str_obj, str);
2209 mark_object (str_obj);
2210 if (str->imp->marker)
2211 return (str->imp->marker) (str_obj);
2216 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2217 so we read data from the other end, decode it, and store it into DATA. */
/* Returns the number of bytes stored into DATA.  When nothing was
   stored, the result is -1 if error_occurred is set and 0 otherwise.
   Decoded bytes that do not fit into SIZE remain in str->runoff and are
   handed out first on the next call.  DATA doubles as the scratch
   buffer for the raw read from the other end: the raw bytes are decoded
   into the runoff before DATA is overwritten with decoded output.  */
2219 static Lstream_data_count
2220 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2222 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2223 unsigned char *orig_data = data;
2224 Lstream_data_count read_size;
2225 int error_occurred = 0;
2227 /* We need to interface to mule_decode(), which expects to take some
2228 amount of data and store the result into a Dynarr. We have
2229 mule_decode() store into str->runoff, and take data from there
2232 /* We loop until we have enough data, reading chunks from the other
2233 end and decoding it. */
2236 /* Take data from the runoff if we can. Make sure to take at
2237 most SIZE bytes, and delete the data from the runoff. */
2238 if (Dynarr_length (str->runoff) > 0)
2240 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2241 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2242 Dynarr_delete_many (str->runoff, 0, chunk);
2248 break; /* No more room for data */
2250 if (str->flags & CODING_STATE_END)
2251 /* This means that on the previous iteration, we hit the EOF on
2252 the other end. We loop once more so that mule_decode() can
2253 output any final stuff it may be holding, or any "go back
2254 to a sane state" escape sequences. (This latter makes sense
2255 during encoding.) */
2258 /* Exhausted the runoff, so get some more. DATA has at least
2259 SIZE bytes left of storage in it, so it's OK to read directly
2260 into it. (We'll be overwriting above, after we've decoded it
2261 into the runoff.) */
2262 read_size = Lstream_read (str->other_end, data, size);
2269 /* There might be some more end data produced in the translation.
2270 See the comment above. */
2271 str->flags |= CODING_STATE_END;
2272 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2275 if (data - orig_data == 0)
2276 return error_occurred ? -1 : 0;
2278 return data - orig_data;
/* Writer method of a decoding stream.  Decodes SIZE bytes of DATA into
   str->runoff with mule_decode(), hands the whole runoff to
   Lstream_write() on the other end, and uses the count that call
   reports (retval) to delete the written prefix from the runoff. */
2281 static Lstream_data_count
2282 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2284 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2285 Lstream_data_count retval;
2287 /* Decode all our data into the runoff, and then attempt to write
2288 it all out to the other end. Remove whatever chunk we succeeded
2290 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2291 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2292 Dynarr_length (str->runoff));
2294 Dynarr_delete_many (str->runoff, 0, retval);
2295 /* Do NOT return retval. The return value indicates how much
2296 of the incoming data was written, not how many bytes were
/* Put the decoding state of STR back to its initial condition.
   Depending on the coding system type this re-initializes the ISO2022
   state (reset_iso2022) or the CCL program (setup_ccl_program).  When
   the coding system type or its EOL type is still "autodetect", the
   detection state is reset as well (eol_type to EOL_AUTODETECT, mask to
   all ones).  Finally flags and ch are cleared. */
2302 reset_decoding_stream (struct decoding_stream *str)
2305 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2307 Lisp_Object coding_system;
2308 XSETCODING_SYSTEM (coding_system, str->codesys);
2309 reset_iso2022 (coding_system, &str->iso2022);
2311 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2313 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2317 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2318 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2321 str->decst.eol_type = EOL_AUTODETECT;
2322 str->decst.mask = ~0;
2324 str->flags = str->ch = 0;
/* Rewinder method of a decoding stream: reset the decoding state, empty
   the runoff buffer, and return the result of rewinding the other end. */
2328 decoding_rewinder (Lstream *stream)
2330 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2331 reset_decoding_stream (str);
2332 Dynarr_reset (str->runoff);
2333 return Lstream_rewind (str->other_end);
/* A decoding stream reports itself seekable exactly when the stream at
   its other end does. */
2337 decoding_seekable_p (Lstream *stream)
2339 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2340 return Lstream_seekable_p (str->other_end);
/* Flusher method of a decoding stream: delegates to Lstream_flush() on
   the other end and returns its result. */
2344 decoding_flusher (Lstream *stream)
2346 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2347 return Lstream_flush (str->other_end);
/* Closer method of a decoding stream.  For a stream opened for writing,
   CODING_STATE_END is set and decoding_writer() is called with no data
   (presumably so that any pending final output is pushed to the other
   end).  The runoff buffer is then freed, as is the ISO2022
   composite_chars array when ENABLE_COMPOSITE_CHARS is defined and the
   array exists, and the result of closing the other end is returned. */
2351 decoding_closer (Lstream *stream)
2353 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2354 if (stream->flags & LSTREAM_FL_WRITE)
2356 str->flags |= CODING_STATE_END;
2357 decoding_writer (stream, 0, 0);
2359 Dynarr_free (str->runoff);
2361 #ifdef ENABLE_COMPOSITE_CHARS
2362 if (str->iso2022.composite_chars)
2363 Dynarr_free (str->iso2022.composite_chars);
2366 return Lstream_close (str->other_end);
/* Return the coding system in use by decoding stream STREAM: the result
   of subsidiary_coding_system() applied to the stream's coding system
   and its current eol_type. */
2370 decoding_stream_coding_system (Lstream *stream)
2372 Lisp_Object coding_system;
2373 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2375 XSETCODING_SYSTEM (coding_system, str->codesys);
2376 return subsidiary_coding_system (coding_system, str->eol_type);
/* Setter for the coding system of decoding stream LSTR.  If CODESYS has
   a definite EOL type (anything but EOL_AUTODETECT), that type becomes
   the stream's eol_type; otherwise eol_type is left alone.  The decoding
   state is then reset with reset_decoding_stream(). */
2380 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2382 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2383 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2385 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2386 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2387 reset_decoding_stream (str);
2390 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2391 stream for writing, no automatic code detection will be performed.
2392 The reason for this is that automatic code detection requires a
2393 seekable input. Things will also fail if you open a decoding
2394 stream for reading using a non-fully-specified coding system and
2395 a non-seekable input stream. */
/* Common constructor for decoding streams.  Creates a new lstream of
   type lstream_decoding in the given mode with STREAM as its other end,
   an empty runoff buffer and eol_type EOL_AUTODETECT.  When the mode is
   "r" and STREAM is seekable, determine_real_coding_system() is called
   first so that CODESYS and eol_type are settled before the stream's
   coding system is installed. */
2398 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2401 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2402 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2406 str->other_end = stream;
2407 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2408 str->eol_type = EOL_AUTODETECT;
2409 if (!strcmp (mode, "r")
2410 && Lstream_seekable_p (stream))
2411 /* We can determine the coding system now. */
2412 determine_real_coding_system (stream, &codesys, &str->eol_type);
2413 set_decoding_stream_coding_system (lstr, codesys);
2414 str->decst.eol_type = str->eol_type;
2415 str->decst.mask = ~0;
2416 XSETLSTREAM (obj, lstr);
/* Create a decoding stream opened for reading ("r") on top of STREAM,
   governed by CODESYS. */
2421 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2423 return make_decoding_stream_1 (stream, codesys, "r");
/* Create a decoding stream opened for writing ("w") on top of STREAM,
   governed by CODESYS. */
2427 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2429 return make_decoding_stream_1 (stream, codesys, "w");
2432 /* Note: the decode_coding_* functions all take the same
2433 arguments as mule_decode(), which is to say some SRC data of
2434 size N, which is to be stored into dynamic array DST.
2435 DECODING is the stream within which the decoding is
2436 taking place, but no data is actually read from or
2437 written to that stream; that is handled in decoding_reader()
2438 or decoding_writer(). This allows the same functions to
2439 be used for both reading and writing. */
/* Decode N bytes at SRC into DST according to the coding system of the
   stream DECODING.  If the coding system type or the EOL type is still
   undetermined, detection is run on SRC first and the stream may be
   switched to the detected coding system.  The work is then dispatched
   on the coding system type; a type that is still CODESYS_AUTODETECT is
   handled like CODESYS_NO_CONVERSION. */
2442 mule_decode (Lstream *decoding, const Extbyte *src,
2443 unsigned_char_dynarr *dst, Lstream_data_count n)
2445 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2447 /* If necessary, do encoding-detection now. We do this when
2448 we're a writing stream or a non-seekable reading stream,
2449 meaning that we can't just process the whole input,
2450 rewind, and start over. */
2452 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2453 str->eol_type == EOL_AUTODETECT)
2455 Lisp_Object codesys;
2457 XSETCODING_SYSTEM (codesys, str->codesys);
2458 detect_coding_type (&str->decst, src, n,
2459 CODING_SYSTEM_TYPE (str->codesys) !=
2460 CODESYS_AUTODETECT);
2461 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2462 str->decst.mask != ~0)
2463 /* #### This is cheesy. What we really ought to do is
2464 buffer up a certain amount of data so as to get a
2465 less random result. */
2466 codesys = coding_system_from_mask (str->decst.mask);
2467 str->eol_type = str->decst.eol_type;
2468 if (XCODING_SYSTEM (codesys) != str->codesys)
2470 /* Preserve the CODING_STATE_END flag in case it was set.
2471 If we erase it, bad things might happen. */
2472 int was_end = str->flags & CODING_STATE_END;
2473 set_decoding_stream_coding_system (decoding, codesys);
2475 str->flags |= CODING_STATE_END;
2479 switch (CODING_SYSTEM_TYPE (str->codesys))
2482 case CODESYS_INTERNAL:
2483 Dynarr_add_many (dst, src, n);
2486 case CODESYS_AUTODETECT:
2487 /* If we got this far and still haven't decided on the coding
2488 system, then do no conversion. */
2489 case CODESYS_NO_CONVERSION:
2490 decode_coding_no_conversion (decoding, src, dst, n);
2493 case CODESYS_SHIFT_JIS:
2494 decode_coding_sjis (decoding, src, dst, n);
2497 decode_coding_big5 (decoding, src, dst, n);
2500 decode_coding_ucs4 (decoding, src, dst, n);
2503 decode_coding_utf8 (decoding, src, dst, n);
2506 str->ccl.last_block = str->flags & CODING_STATE_END;
2507 /* When applying ccl program to stream, MUST NOT set NULL
2509 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2510 dst, n, 0, CCL_MODE_DECODING);
2512 case CODESYS_ISO2022:
2513 decode_coding_iso2022 (decoding, src, dst, n);
/* Lisp primitive `decode-coding-region'.  The region is read through a
   Lisp-buffer input stream in chunks of at most sizeof (tempbuf) bytes.
   Each chunk is written to an encoding stream that uses the `binary'
   coding system, which feeds a decoding stream for CODING-SYSTEM, which
   in turn feeds a Lisp-buffer output stream created at the start of the
   region.  After each write, the part of the original text that was just
   consumed is removed with buffer_delete_range().  All four streams are
   closed/deleted explicitly at the end. */
2521 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2522 Decode the text between START and END which is encoded in CODING-SYSTEM.
2523 This is useful if you've read in encoded text from a file without decoding
2524 it (e.g. you read in a JIS-formatted file but used the `binary' or
2525 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2526 Return length of decoded text.
2527 BUFFER defaults to the current buffer if unspecified.
2529 (start, end, coding_system, buffer))
2532 struct buffer *buf = decode_buffer (buffer, 0);
2533 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2534 Lstream *istr, *ostr;
2535 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2537 get_buffer_range_char (buf, start, end, &b, &e, 0);
2539 barf_if_buffer_read_only (buf, b, e);
2541 coding_system = Fget_coding_system (coding_system);
2542 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2543 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2544 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2546 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2547 Fget_coding_system (Qbinary));
2548 istr = XLSTREAM (instream);
2549 ostr = XLSTREAM (outstream);
2550 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2552 /* The chain of streams looks like this:
2554 [BUFFER] <----- send through
2555 ------> [ENCODE AS BINARY]
2556 ------> [DECODE AS SPECIFIED]
2562 char tempbuf[1024]; /* some random amount */
2563 Bufpos newpos, even_newer_pos;
2564 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2565 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2569 newpos = lisp_buffer_stream_startpos (istr);
2570 Lstream_write (ostr, tempbuf, size_in_bytes);
2571 even_newer_pos = lisp_buffer_stream_startpos (istr);
2572 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2575 Lstream_close (istr);
2576 Lstream_close (ostr);
2578 Lstream_delete (istr);
2579 Lstream_delete (ostr);
2580 Lstream_delete (XLSTREAM (de_outstream));
2581 Lstream_delete (XLSTREAM (lb_outstream));
2586 /************************************************************************/
2587 /* Converting to an external encoding ("encoding") */
2588 /************************************************************************/
2590 /* An encoding stream is an output stream. When you create the
2591 stream, you specify the coding system that governs the encoding
2592 and another stream that the resulting encoded data is to be
2593 sent to, and then start sending data to it. */
/* Access the encoding-specific data of lstream STREAM (expands to
   LSTREAM_TYPE_DATA for the `encoding' stream type). */
2595 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
/* Per-stream state of an encoding lstream: the governing coding system,
   the buffered output (runoff), and the encoder state that has to
   survive between calls, including ISO2022- and CCL-specific state. */
2597 struct encoding_stream
2599 /* Coding system that governs the conversion. */
2600 Lisp_Coding_System *codesys;
2602 /* Stream that we read the unencoded data from or
2603 write the encoded data to. */
2606 /* If we are reading, then we can return only a fixed amount of
2607 data, so if the conversion resulted in too much data, we store it
2608 here for retrieval the next time around. */
2609 unsigned_char_dynarr *runoff;
2611 /* FLAGS holds flags indicating the current state of the encoding.
2612 Some of these flags are dependent on the coding system. */
2615 /* CH holds a partially built-up character. Since we only deal
2616 with one- and two-byte characters at the moment, we only use
2617 this to store the first byte of a two-byte character. */
2620 /* Additional information used by the ISO2022 encoder. */
2623 /* CHARSET holds the character sets currently assigned to the G0
2624 through G3 registers. It is initialized from the array
2625 INITIAL_CHARSET in CODESYS. */
2626 Lisp_Object charset[4];
2628 /* Which registers are currently invoked into the left (GL) and
2629 right (GR) halves of the 8-bit encoding space? */
2630 int register_left, register_right;
2632 /* Whether we need to explicitly designate the charset in the
2633 G? register before using it. It is initialized from the
2634 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2635 unsigned char force_charset_on_output[4];
2637 /* Other state variables that need to be preserved across
2639 Lisp_Object current_charset;
2641 int current_char_boundary;
2644 /* Additional information (the state of the running CCL program)
2645 used by the CCL encoder. */
2646 struct ccl_program ccl;
/* Forward declarations of the encoding-stream lstream methods, followed
   by the definition of the "encoding" lstream implementation
   (lstream_encoding), whose size argument is sizeof (struct
   encoding_stream). */
2650 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2651 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2652 Lstream_data_count size);
2653 static int encoding_rewinder (Lstream *stream);
2654 static int encoding_seekable_p (Lstream *stream);
2655 static int encoding_flusher (Lstream *stream);
2656 static int encoding_closer (Lstream *stream);
2658 static Lisp_Object encoding_marker (Lisp_Object stream);
2660 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2661 sizeof (struct encoding_stream));
/* Marker method of an encoding stream.  Marks the lstream stored in
   other_end and, when that stream's implementation provides a marker
   method of its own, returns the result of calling it on that stream. */
2664 encoding_marker (Lisp_Object stream)
2666 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2667 Lisp_Object str_obj;
2669 /* We do not need to mark the coding systems or charsets stored
2670 within the stream because they are stored in a global list
2671 and automatically marked. */
2673 XSETLSTREAM (str_obj, str);
2674 mark_object (str_obj);
2675 if (str->imp->marker)
2676 return (str->imp->marker) (str_obj);
2681 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2682 so we read data from the other end, encode it, and store it into DATA.
   The value returned is the number of bytes stored (data - orig_data);
   when nothing was stored it is -1 if error_occurred is set, and 0
   otherwise.
   NOTE(review): the runoff chunk size is computed as an int here (with
   SIZE cast to int), whereas decoding_reader() uses Lstream_data_count
   for the same computation -- confirm whether the narrowing is
   intended. */
2684 static Lstream_data_count
2685 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2687 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2688 unsigned char *orig_data = data;
2689 Lstream_data_count read_size;
2690 int error_occurred = 0;
2692 /* We need to interface to mule_encode(), which expects to take some
2693 amount of data and store the result into a Dynarr. We have
2694 mule_encode() store into str->runoff, and take data from there
2697 /* We loop until we have enough data, reading chunks from the other
2698 end and encoding it. */
2701 /* Take data from the runoff if we can. Make sure to take at
2702 most SIZE bytes, and delete the data from the runoff. */
2703 if (Dynarr_length (str->runoff) > 0)
2705 int chunk = min ((int) size, Dynarr_length (str->runoff));
2706 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2707 Dynarr_delete_many (str->runoff, 0, chunk);
2713 break; /* No more room for data */
2715 if (str->flags & CODING_STATE_END)
2716 /* This means that on the previous iteration, we hit the EOF on
2717 the other end. We loop once more so that mule_encode() can
2718 output any final stuff it may be holding, or any "go back
2719 to a sane state" escape sequences. (This latter makes sense
2720 during encoding.) */
2723 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2724 left of storage in it, so it's OK to read directly into it.
2725 (We'll be overwriting above, after we've encoded it into the
2727 read_size = Lstream_read (str->other_end, data, size);
2734 /* There might be some more end data produced in the translation.
2735 See the comment above. */
2736 str->flags |= CODING_STATE_END;
2737 mule_encode (stream, data, str->runoff, read_size);
2740 if (data == orig_data)
2741 return error_occurred ? -1 : 0;
2743 return data - orig_data;
/* Writer method of an encoding stream.  Encodes SIZE bytes of DATA into
   str->runoff with mule_encode(), hands the whole runoff to
   Lstream_write() on the other end, and uses the count that call
   reports (retval) to delete the written prefix from the runoff. */
2746 static Lstream_data_count
2747 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2749 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2750 Lstream_data_count retval;
2752 /* Encode all our data into the runoff, and then attempt to write
2753 it all out to the other end. Remove whatever chunk we succeeded
2755 mule_encode (stream, data, str->runoff, size);
2756 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2757 Dynarr_length (str->runoff));
2759 Dynarr_delete_many (str->runoff, 0, retval);
2760 /* Do NOT return retval. The return value indicates how much
2761 of the incoming data was written, not how many bytes were
/* Put the encoding state of STR back to its initial condition.  For an
   ISO2022 coding system the four register charsets and the
   force-charset-on-output flags are reloaded from the coding system,
   register_left/register_right are set to 0/1, and current_charset,
   current_half and current_char_boundary are re-initialized.  For CCL
   the encode program is set up again.  Finally flags and ch are
   cleared. */
2767 reset_encoding_stream (struct encoding_stream *str)
2770 switch (CODING_SYSTEM_TYPE (str->codesys))
2772 case CODESYS_ISO2022:
2776 for (i = 0; i < 4; i++)
2778 str->iso2022.charset[i] =
2779 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2780 str->iso2022.force_charset_on_output[i] =
2781 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2783 str->iso2022.register_left = 0;
2784 str->iso2022.register_right = 1;
2785 str->iso2022.current_charset = Qnil;
2786 str->iso2022.current_half = 0;
2788 str->iso2022.current_char_boundary = 0;
2790 str->iso2022.current_char_boundary = 1;
2795 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2802 str->flags = str->ch = 0;
/* Rewinder method of an encoding stream: reset the encoding state, empty
   the runoff buffer, and return the result of rewinding the other end. */
2806 encoding_rewinder (Lstream *stream)
2808 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2809 reset_encoding_stream (str);
2810 Dynarr_reset (str->runoff);
2811 return Lstream_rewind (str->other_end);
/* An encoding stream reports itself seekable exactly when the stream at
   its other end does. */
2815 encoding_seekable_p (Lstream *stream)
2817 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2818 return Lstream_seekable_p (str->other_end);
/* Flusher method of an encoding stream: delegates to Lstream_flush() on
   the other end and returns its result. */
2822 encoding_flusher (Lstream *stream)
2824 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2825 return Lstream_flush (str->other_end);
/* Closer method of an encoding stream.  For a stream opened for writing,
   CODING_STATE_END is set and encoding_writer() is called with no data
   (presumably so that any pending final output is pushed to the other
   end).  The runoff buffer is then freed and the result of closing the
   other end is returned. */
2829 encoding_closer (Lstream *stream)
2831 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2832 if (stream->flags & LSTREAM_FL_WRITE)
2834 str->flags |= CODING_STATE_END;
2835 encoding_writer (stream, 0, 0);
2837 Dynarr_free (str->runoff);
2838 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system stored in encoding stream
   STREAM. */
2842 encoding_stream_coding_system (Lstream *stream)
2844 Lisp_Object coding_system;
2845 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2847 XSETCODING_SYSTEM (coding_system, str->codesys);
2848 return coding_system;
/* Setter for the coding system of encoding stream LSTR: CODESYS is
   unwrapped with XCODING_SYSTEM() and the stream's encoding state is
   reset with reset_encoding_stream(). */
2852 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2854 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2855 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2857 reset_encoding_stream (str);
/* Common constructor for encoding streams.  Creates a new lstream of
   type lstream_encoding in the given mode, gives it an empty runoff
   buffer and STREAM as its other end, and installs CODESYS with
   set_encoding_stream_coding_system(). */
2861 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2864 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2865 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2869 str->runoff = Dynarr_new (unsigned_char);
2870 str->other_end = stream;
2871 set_encoding_stream_coding_system (lstr, codesys);
2872 XSETLSTREAM (obj, lstr);
/* Create an encoding stream opened for reading ("r") on top of STREAM,
   governed by CODESYS. */
2877 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2879 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream opened for writing ("w") on top of STREAM,
   governed by CODESYS. */
2883 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2885 return make_encoding_stream_1 (stream, codesys, "w");
2888 /* Convert N bytes of internally-formatted data stored in SRC to an
2889 external format, according to the encoding stream ENCODING.
2890 Store the encoded data into DST.
   The work is dispatched on the type of the stream's coding system; a
   type of CODESYS_AUTODETECT is handled like CODESYS_NO_CONVERSION.
   For the CCL driver a null SRC is replaced by an empty string. */
2893 mule_encode (Lstream *encoding, const Bufbyte *src,
2894 unsigned_char_dynarr *dst, Lstream_data_count n)
2896 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2898 switch (CODING_SYSTEM_TYPE (str->codesys))
2901 case CODESYS_INTERNAL:
2902 Dynarr_add_many (dst, src, n);
2905 case CODESYS_AUTODETECT:
2906 /* If we got this far and still haven't decided on the coding
2907 system, then do no conversion. */
2908 case CODESYS_NO_CONVERSION:
2909 encode_coding_no_conversion (encoding, src, dst, n);
2912 case CODESYS_SHIFT_JIS:
2913 encode_coding_sjis (encoding, src, dst, n);
2916 encode_coding_big5 (encoding, src, dst, n);
2919 encode_coding_ucs4 (encoding, src, dst, n);
2922 encode_coding_utf8 (encoding, src, dst, n);
2925 str->ccl.last_block = str->flags & CODING_STATE_END;
2926 /* When applying ccl program to stream, MUST NOT set NULL
2928 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
2929 dst, n, 0, CCL_MODE_ENCODING);
2931 case CODESYS_ISO2022:
2932 encode_coding_iso2022 (encoding, src, dst, n);
/* Lisp primitive `encode-coding-region'.  The region is read through a
   Lisp-buffer input stream in chunks of at most sizeof (tempbuf) bytes.
   Each chunk is written to an encoding stream for CODING-SYSTEM, which
   feeds a decoding stream that uses the `binary' coding system, which in
   turn feeds a Lisp-buffer output stream created at the start of the
   region.  After each write, the part of the original text that was just
   consumed is removed with buffer_delete_range().  All four streams are
   closed/deleted explicitly, and retlen is returned as a Lisp
   integer. */
2940 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2941 Encode the text between START and END using CODING-SYSTEM.
2942 This will, for example, convert Japanese characters into stuff such as
2943 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2944 text. BUFFER defaults to the current buffer if unspecified.
2946 (start, end, coding_system, buffer))
2949 struct buffer *buf = decode_buffer (buffer, 0);
2950 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2951 Lstream *istr, *ostr;
2952 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2954 get_buffer_range_char (buf, start, end, &b, &e, 0);
2956 barf_if_buffer_read_only (buf, b, e);
2958 coding_system = Fget_coding_system (coding_system);
2959 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2960 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2961 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2962 Fget_coding_system (Qbinary));
2963 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2965 istr = XLSTREAM (instream);
2966 ostr = XLSTREAM (outstream);
2967 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2968 /* The chain of streams looks like this:
2970 [BUFFER] <----- send through
2971 ------> [ENCODE AS SPECIFIED]
2972 ------> [DECODE AS BINARY]
2977 char tempbuf[1024]; /* some random amount */
2978 Bufpos newpos, even_newer_pos;
2979 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2980 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2984 newpos = lisp_buffer_stream_startpos (istr);
2985 Lstream_write (ostr, tempbuf, size_in_bytes);
2986 even_newer_pos = lisp_buffer_stream_startpos (istr);
2987 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2993 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2994 Lstream_close (istr);
2995 Lstream_close (ostr);
2997 Lstream_delete (istr);
2998 Lstream_delete (ostr);
2999 Lstream_delete (XLSTREAM (de_outstream));
3000 Lstream_delete (XLSTREAM (lb_outstream));
3001 return make_int (retlen);
3007 /************************************************************************/
3008 /* Shift-JIS methods */
3009 /************************************************************************/
3011 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3012 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3013 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3014 encoded by "position-code + 0x80". A character of JISX0208
3015 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
3016 position-codes are divided and shifted so that they fit in the range
3019 --- CODE RANGE of Shift-JIS ---
3020 (character set) (range)
3022 JISX0201-Kana 0xA0 .. 0xDF
3023 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3024 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3025 -------------------------------
3029 /* Is this the first byte of a Shift-JIS two-byte char? */
3031 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3032 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3034 /* Is this the second byte of a Shift-JIS two-byte char? */
3036 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3037 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a single-byte JISX0201-Kana code (0xA1 .. 0xDF)?
   NOTE(review): the code-range table in the Shift-JIS overview comment
   lists 0xA0 .. 0xDF for JISX0201-Kana and 0x80 .. 0x9F for the first
   byte, while these macros start at 0xA1 and 0x81 respectively --
   confirm which is intended. */
3039 #define BYTE_SJIS_KATAKANA_P(c) \
3040 ((c) >= 0xA1 && (c) <= 0xDF)
/* Detection helper for Shift-JIS.  Examines the N bytes at SRC one at a
   time, using st->shift_jis.in_second_byte to remember across bytes (and
   across calls, since it lives in ST) that the previous byte was in the
   range 0x80 .. 0x9F or >= 0xE0 and so is taken to start a two-byte
   character.  ESC, SI and SO are tested for explicitly.  The final
   return hands back CODING_CATEGORY_SHIFT_JIS_MASK. */
3043 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3047 unsigned char c = *(unsigned char *)src++;
3048 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3050 if (st->shift_jis.in_second_byte)
3052 st->shift_jis.in_second_byte = 0;
3056 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3057 st->shift_jis.in_second_byte = 1;
3059 return CODING_CATEGORY_SHIFT_JIS_MASK;
3062 /* Convert Shift-JIS data to internal format.
   Takes N bytes from SRC and appends the result to DST; flags, ch and
   eol_type are taken from the decoding stream DECODING.  A valid second
   byte following a held first byte is converted with DECODE_SJIS into a
   JISX0208 character; the two DECODE_ADD_BINARY_CHAR calls on ch and c
   apparently handle an invalid second byte by emitting both bytes as
   binary characters.  Otherwise a byte is run through EOL handling and
   then classified as a two-byte lead, a JISX0201 katakana byte, or a
   plain binary character. */
3065 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3066 unsigned_char_dynarr *dst, Lstream_data_count n)
3068 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3069 unsigned int flags = str->flags;
3070 unsigned int ch = str->ch;
3071 eol_type_t eol_type = str->eol_type;
3075 unsigned char c = *(unsigned char *)src++;
3079 /* Previous character was first byte of Shift-JIS Kanji char. */
3080 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3082 unsigned char e1, e2;
3084 DECODE_SJIS (ch, c, e1, e2);
3086 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3090 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3091 Dynarr_add (dst, e1);
3092 Dynarr_add (dst, e2);
3097 DECODE_ADD_BINARY_CHAR (ch, dst);
3098 DECODE_ADD_BINARY_CHAR (c, dst);
3104 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3105 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3107 else if (BYTE_SJIS_KATAKANA_P (c))
3110 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3113 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3114 Dynarr_add (dst, c);
3118 DECODE_ADD_BINARY_CHAR (c, dst);
3120 label_continue_loop:;
3123 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3129 /* Convert internally-formatted data to Shift-JIS.
   Takes N bytes from SRC and appends the result to DST; flags and ch
   are taken from the encoding stream ENCODING and the EOL type from its
   coding system.  In the variant that tracks char_boundary, that value
   is loaded from and written back to str->iso2022.current_char_boundary
   so that a character split across calls can be continued.
   End-of-line output: a '\r' is emitted first unless the EOL type is
   EOL_LF or EOL_AUTODETECT, and the newline itself is emitted unless the
   EOL type is EOL_CR.  Only JISX0201 katakana and JISX0208 characters
   are converted with ENCODE_SJIS / the 0x80 offset; ASCII bytes are
   copied as is. */
3132 encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
3133 unsigned_char_dynarr *dst, Lstream_data_count n)
3135 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3136 unsigned int flags = str->flags;
3137 unsigned int ch = str->ch;
3138 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3140 unsigned char char_boundary = str->iso2022.current_char_boundary;
3147 switch (char_boundary)
3155 else if ( c >= 0xf8 )
3160 else if ( c >= 0xf0 )
3165 else if ( c >= 0xe0 )
3170 else if ( c >= 0xc0 )
3180 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3181 Dynarr_add (dst, '\r');
3182 if (eol_type != EOL_CR)
3183 Dynarr_add (dst, c);
3186 Dynarr_add (dst, c);
3191 ch = ( ch << 6 ) | ( c & 0x3f );
3193 Lisp_Object charset;
3194 unsigned int c1, c2, s1, s2;
3196 BREAKUP_CHAR (ch, charset, c1, c2);
3197 if (EQ(charset, Vcharset_katakana_jisx0201))
3199 Dynarr_add (dst, c1 | 0x80);
3201 else if (EQ(charset, Vcharset_japanese_jisx0208))
3203 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3204 Dynarr_add (dst, s1);
3205 Dynarr_add (dst, s2);
3211 ch = ( ch << 6 ) | ( c & 0x3f );
3217 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3218 Dynarr_add (dst, '\r');
3219 if (eol_type != EOL_CR)
3220 Dynarr_add (dst, '\n');
3223 else if (BYTE_ASCII_P (c))
3225 Dynarr_add (dst, c);
3228 else if (BUFBYTE_LEADING_BYTE_P (c))
3229 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3230 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3231 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3234 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3236 Dynarr_add (dst, c);
3239 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3240 ch == LEADING_BYTE_JAPANESE_JISX0208)
3244 unsigned char j1, j2;
3245 ENCODE_SJIS (ch, c, j1, j2);
3246 Dynarr_add (dst, j1);
3247 Dynarr_add (dst, j2);
3257 str->iso2022.current_char_boundary = char_boundary;
/* Lisp primitive `decode-shift-jis-char'.  Both halves of the cons CODE
   are checked to be integers; when they form a valid Shift-JIS
   first/second byte pair they are converted with DECODE_SJIS and
   returned as a character of Vcharset_japanese_jisx0208 built from the
   low 7 bits of each decoded byte. */
3261 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3262 Decode a JISX0208 character of Shift-JIS coding-system.
3263 CODE is the character code in Shift-JIS as a cons of two bytes.
3264 Return the corresponding character.
3268 unsigned char c1, c2, s1, s2;
3271 CHECK_INT (XCAR (code));
3272 CHECK_INT (XCDR (code));
3273 s1 = XINT (XCAR (code));
3274 s2 = XINT (XCDR (code));
3275 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3276 BYTE_SJIS_TWO_BYTE_2_P (s2))
3278 DECODE_SJIS (s1, s2, c1, c2);
3279 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3280 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-shift-jis-char'.  CHARACTER is broken up into
   its charset and position codes; when the charset is
   Vcharset_japanese_jisx0208, the position codes (with the high bit set)
   are converted with ENCODE_SJIS and returned as a cons of the two
   resulting byte values. */
3286 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3287 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3288 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3292 Lisp_Object charset;
3295 CHECK_CHAR_COERCE_INT (character);
3296 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3297 if (EQ (charset, Vcharset_japanese_jisx0208))
3299 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3300 return Fcons (make_int (s1), make_int (s2));
3307 /************************************************************************/
3309 /************************************************************************/
3311 /* BIG5 is a coding system encoding two character sets: ASCII and
3312 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3313 character set and is encoded in two bytes.
3315 --- CODE RANGE of BIG5 ---
3316 (character set) (range)
3318 Big5 (1st byte) 0xA1 .. 0xFE
3319 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3320 --------------------------
3322 Since the number of characters in Big5 is larger than the maximum
3323 number of characters in an Emacs charset (96x96), it can't be handled as one
3324 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3325 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3326 contains frequently used characters and the latter contains less
3327 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char? */
3329 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3330 ((c) >= 0xA1 && (c) <= 0xFE)
3332 /* Is this the second byte of a Big5 two-byte char? */
3334 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3335 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3337 /* Number of Big5 characters which have the same code in 1st byte
   (94 + 63 = 157). */
3339 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3341 /* Code conversion macros. These are macros because they are used in
3342 inner loops during code conversion.
3344 Note that temporary variables in macros introduce the classic
3345 dynamic-scoping problems with variable names. We use capital-
3346 lettered variables in the assumption that XEmacs does not use
3347 capital letters in variables except in a very formalized way
3350 /* Convert Big5 code (b1, b2) into its internal string representation
3353 /* There is a much simpler way to split the Big5 charset into two.
3354 For the moment I'm going to leave the algorithm as-is because it
3355 claims to separate out the most-used characters into a single
3356 charset, which perhaps will lead to optimizations in various
3359 The way the algorithm works is something like this:
3361 Big5 can be viewed as a 94x157 charset, where the row is
3362 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3363 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3364 the split between low and high column numbers is apparently
3365 meaningless; ascending rows produce less and less frequent chars.
3366 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3367 the first charset, and the upper half (0xC9 .. 0xFE) to the
3368 second. To do the conversion, we convert the character into
3369 a single number where 0 .. 156 is the first row, 157 .. 313
3370 is the second, etc. That way, the characters are ordered by
3371 decreasing frequency. Then we just chop the space in two
3372 and coerce the result into a 94x94 space.
/* Decode the Big5 byte pair (b1, b2): store either
   LEADING_BYTE_CHINESE_BIG5_1 or LEADING_BYTE_CHINESE_BIG5_2 into lb and
   the two position bytes, each offset by 0xA1, into c1 and c2.  b1 and
   b2 are copied into the locals B1 and B2 before use; lb, c1 and c2 must
   be assignable. */
3375 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3377 int B1 = b1, B2 = b2; \
3379 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3383 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3387 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3388 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3390 c1 = I / (0xFF - 0xA1) + 0xA1; \
3391 c2 = I % (0xFF - 0xA1) + 0xA1; \
3394 /* Convert the internal string representation of a Big5 character
3395 (lb, c1, c2) into Big5 code (b1, b2).
   b1 and b2 must be assignable; b2 is assigned and then read again
   within the expansion.  NOTE(review): lb is used in the comparison
   without surrounding parentheses, so it should be passed as a simple
   expression. */
3397 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3399 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3401 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3403 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3405 b1 = I / BIG5_SAME_ROW + 0xA1; \
3406 b2 = I % BIG5_SAME_ROW; \
3407 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Detection helper for Big5.  Examines the N bytes at SRC one at a time,
   using st->big5.in_second_byte to remember across bytes (and across
   calls, since it lives in ST) that the previous byte started a two-byte
   character.  ESC, SI, SO and bytes in 0x80 .. 0xA0 are tested for
   explicitly, and a second byte below 0x40 or in 0x80 .. 0xA0 is tested
   for as well.  The final return hands back
   CODING_CATEGORY_BIG5_MASK. */
3411 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3415 unsigned char c = *(unsigned char *)src++;
3416 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3417 (c >= 0x80 && c <= 0xA0))
3419 if (st->big5.in_second_byte)
3421 st->big5.in_second_byte = 0;
3422 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3426 st->big5.in_second_byte = 1;
3428 return CODING_CATEGORY_BIG5_MASK;
3431 /* Convert Big5 data to internal format.
   Takes N bytes from SRC and appends the result to DST; flags, ch and
   eol_type are taken from the decoding stream DECODING.  A valid second
   byte following a held first byte is converted with DECODE_BIG5 and
   the three resulting bytes are appended; the two
   DECODE_ADD_BINARY_CHAR calls on ch and c apparently handle an invalid
   second byte by emitting both bytes as binary characters.  Otherwise a
   byte is run through EOL handling and then either recognized as a
   first byte or appended as a binary character. */
3434 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3435 unsigned_char_dynarr *dst, Lstream_data_count n)
3437 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3438 unsigned int flags = str->flags;
3439 unsigned int ch = str->ch;
3440 eol_type_t eol_type = str->eol_type;
3444 unsigned char c = *(unsigned char *)src++;
3447 /* Previous character was first byte of Big5 char. */
3448 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3450 unsigned char b1, b2, b3;
3451 DECODE_BIG5 (ch, c, b1, b2, b3);
3452 Dynarr_add (dst, b1);
3453 Dynarr_add (dst, b2);
3454 Dynarr_add (dst, b3);
3458 DECODE_ADD_BINARY_CHAR (ch, dst);
3459 DECODE_ADD_BINARY_CHAR (c, dst);
3465 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3466 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3469 DECODE_ADD_BINARY_CHAR (c, dst);
3471 label_continue_loop:;
3474 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3480 /* Convert internally-formatted data to Big5. */
/* Reads internal-format bytes from SRC and appends Big5 bytes to DST.
   The conversion state (flags and the partially assembled character in
   ch) comes from the encoding stream; the EOL type comes from the
   stream's coding system. */
3483 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3484 unsigned_char_dynarr *dst, Lstream_data_count n)
3488 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3489 unsigned int flags = str->flags;
3490 unsigned int ch = str->ch;
3491 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Newline translation: '\r' is written unless the EOL type is LF or
   autodetect, and '\n' unless it is CR. */
3498 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3499 Dynarr_add (dst, '\r');
3500 if (eol_type != EOL_CR)
3501 Dynarr_add (dst, '\n');
3503 else if (BYTE_ASCII_P (c))
3506 Dynarr_add (dst, c);
3508 else if (BUFBYTE_LEADING_BYTE_P (c))
3510 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3511 c == LEADING_BYTE_CHINESE_BIG5_2)
3513 /* A recognized leading byte. */
3515 continue; /* not done with this character. */
3517 /* otherwise just ignore this character. */
3519 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3520 ch == LEADING_BYTE_CHINESE_BIG5_2)
3522 /* Previous char was a recognized leading byte. */
3524 continue; /* not done with this character. */
3528 /* Encountering second byte of a Big5 character. */
3529 unsigned char b1, b2;
/* At this point ch carries the leading byte in bits 8 and up and the
   first position byte in its low 8 bits; c is the second position byte. */
3531 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3532 Dynarr_add (dst, b1);
3533 Dynarr_add (dst, b2);
/* Lisp primitive `decode-big5-char'.  CODE is a cons whose car and cdr
   are checked to be integers and then stored into unsigned char b1/b2
   (so only their low 8 bits are kept).  When b1 and b2 satisfy
   BYTE_BIG5_TWO_BYTE_1_P and BYTE_BIG5_TWO_BYTE_2_P, they are converted
   with DECODE_BIG5, the charset is looked up from the resulting leading
   byte, and the character built from the two position codes (high bit
   masked off) is returned. */
3545 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3546 Decode a Big5 character CODE of BIG5 coding-system.
3547 CODE is the character code in BIG5, a cons of two integers.
3548 Return the corresponding character.
3552 unsigned char c1, c2, b1, b2;
3555 CHECK_INT (XCAR (code));
3556 CHECK_INT (XCDR (code));
3557 b1 = XINT (XCAR (code));
3558 b2 = XINT (XCDR (code));
3559 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3560 BYTE_BIG5_TWO_BYTE_2_P (b2))
3562 Charset_ID leading_byte;
3563 Lisp_Object charset;
3564 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3565 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3566 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-big5-char'.  CHARACTER is split into its
   charset and position codes with BREAKUP_CHAR.  When the charset is
   Vcharset_chinese_big5_1 or Vcharset_chinese_big5_2, the position
   codes (with the high bit set) are passed through ENCODE_BIG5 together
   with the charset's leading byte, and the two resulting bytes are
   returned as a cons of integers (b1 . b2). */
3572 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3573 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3574 Return the corresponding character code in Big5.
3578 Lisp_Object charset;
3581 CHECK_CHAR_COERCE_INT (character);
3582 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3583 if (EQ (charset, Vcharset_chinese_big5_1) ||
3584 EQ (charset, Vcharset_chinese_big5_2))
3586 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3588 return Fcons (make_int (b1), make_int (b2));
3595 /************************************************************************/
3598 /* UCS-4 character codes are implemented as nonnegative integers. */
3600 /************************************************************************/
/* Lisp primitive `set-ucs-char'.  After CHARACTER and CODE pass
   CHECK_CHAR and CHECK_NATNUM, CHARACTER is stored in
   fcd->ucs_to_mule_table at index c, provided c is below the table
   size (countof).  c is presumably the integer value of CODE; its
   assignment is not adjacent to the bound check -- confirm. */
3603 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3604 Map UCS-4 code CODE to Mule character CHARACTER.
3606 Return T on success, NIL on failure.
3612 CHECK_CHAR (character);
3613 CHECK_NATNUM (code);
3616 if (c < countof (fcd->ucs_to_mule_table))
3618 fcd->ucs_to_mule_table[c] = character;
/* Map the UCS-4 code CODE to a character object.
   Codes below the size of fcd->ucs_to_mule_table are looked up in that
   table.  Codes in the range starting at 0xe00000 are decoded
   arithmetically into a left-to-right 94x94 charset: the quotient by
   94*94 selects the charset's final byte counting up from '@', and the
   remainder yields the two position codes (each offset by 33).  This
   matches the 0xe00000-based codes produced by mule_char_to_ucs4.
   NOTE(review): the upper bound is inclusive (<=), which admits one
   code beyond 14 full 94x94 blocks -- confirm whether < was intended. */
3626 ucs_to_char (unsigned long code)
3628 if (code < countof (fcd->ucs_to_mule_table))
3630 return fcd->ucs_to_mule_table[code];
3632 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3637 c = code % (94 * 94);
3639 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3640 (94, 2, code / (94 * 94) + '@',
3641 CHARSET_LEFT_TO_RIGHT),
3642 c / 94 + 33, c % 94 + 33));
/* Lisp primitive `ucs-char'.  CODE must pass CHECK_NATNUM; its integer
   value is handed to ucs_to_char and that result is returned. */
3648 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3649 Return Mule character corresponding to UCS code CODE (a positive integer).
3653 CHECK_NATNUM (code);
3654 return ucs_to_char (XINT (code));
/* Lisp primitive `set-char-ucs'.  CHARACTER must pass CHECK_CHAR and
   CODE must pass CHECK_NATNUM; CODE is then stored for CHARACTER in
   mule_to_ucs_table through Fput_char_table, whose result is returned. */
3657 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3658 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3662 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3663 Fset_char_ucs is more restrictive on index arg, but should
3664 check code arg in a char_table method. */
3665 CHECK_CHAR (character);
3666 CHECK_NATNUM (code);
3667 return Fput_char_table (character, code, mule_to_ucs_table);
/* Lisp primitive `char-ucs'.  Returns the entry stored for CHARACTER
   in mule_to_ucs_table, as looked up by Fget_char_table. */
3670 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3671 Return the UCS code (a positive integer) corresponding to CHARACTER.
3675 return Fget_char_table (character, mule_to_ucs_table);
/* Alias: with this definition in effect, every decode_ucs4 call
   expands to DECODE_ADD_UCS_CHAR. */
3680 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3682 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3683 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3684 is not found, instead.
3685 #### do something more appropriate (use blob?)
3686 Danger, Will Robinson! Data loss. Should we signal user? */
/* CH is the UCS-4 code; the resulting internal-format bytes are
   appended to DST.  The lookup itself is done by ucs_to_char. */
3688 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3690 Lisp_Object chr = ucs_to_char (ch);
/* Scratch buffer large enough for one character in internal format;
   the character is written into it and then appended to DST. */
3694 Bufbyte work[MAX_EMCHAR_LEN];
3699 simple_set_charptr_emchar (work, ch) :
3700 non_ascii_set_charptr_emchar (work, ch);
3701 Dynarr_add_many (dst, work, len);
/* Substitute character: JIS X 0208 leading byte followed by position
   codes 34 and 46, each with the high bit set. */
3705 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3706 Dynarr_add (dst, 34 + 128);
3707 Dynarr_add (dst, 46 + 128);
/* Return the UCS-4 code for the character of CHARSET whose position
   bytes are H and L (only their low 7 bits are used).
   A char-table lookup via Fget_char_table on the assembled character
   comes first.  For a two-dimensional 94-character charset whose final
   byte lies in '@' .. 0x7e, a code is synthesized instead:
   0xe00000 + (final - '@') * 94 * 94 + (h - 33) * 94 + (l - 33). */
3713 static unsigned long
3714 mule_char_to_ucs4 (Lisp_Object charset,
3715 unsigned char h, unsigned char l)
3718 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3725 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3726 (XCHARSET_CHARS (charset) == 94) )
3728 unsigned char final = XCHARSET_FINAL (charset);
3730 if ( ('@' <= final) && (final < 0x7f) )
3732 return 0xe00000 + (final - '@') * 94 * 94
3733 + ((h & 127) - 33) * 94 + (l & 127) - 33;
/* Append the UCS-4 code of the character (CHARSET, H, L), as computed
   by mule_char_to_ucs4, to DST as four bytes, most significant byte
   first. */
3747 encode_ucs4 (Lisp_Object charset,
3748 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3750 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3751 Dynarr_add (dst, code >> 24);
3752 Dynarr_add (dst, (code >> 16) & 255);
3753 Dynarr_add (dst, (code >> 8) & 255);
3754 Dynarr_add (dst, code & 255);
/* Detection pass for UCS-4 over the N bytes at SRC.  The scan switches
   on ST->ucs4.in_byte, which is kept in the detection state across
   calls and is reset to 0 at one point (presumably it tracks the byte
   position within a 4-byte unit -- confirm).  The result when nothing
   ends the scan early is CODING_CATEGORY_UCS4_MASK. */
3759 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3763 unsigned char c = *(unsigned char *)src++;
3764 switch (st->ucs4.in_byte)
3773 st->ucs4.in_byte = 0;
3779 return CODING_CATEGORY_UCS4_MASK;
/* Convert UCS-4 data to internal format.
   Reads N bytes from SRC; bytes are shifted into ch eight bits at a
   time and a completed value is handed to decode_ucs4, which appends
   the character to DST.  flags, ch and the byte counter are loaded
   from the decoding stream, and the counter is written back at the
   end so that a character split across calls can be resumed. */
3783 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3784 unsigned_char_dynarr *dst, Lstream_data_count n)
3786 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3787 unsigned int flags = str->flags;
3788 unsigned int ch = str->ch;
3789 unsigned char counter = str->counter;
3793 unsigned char c = *(unsigned char *)src++;
3801 decode_ucs4 ( ( ch << 8 ) | c, dst);
3806 ch = ( ch << 8 ) | c;
/* CODING_STATE_END is a bit of the flag word (decode_coding_utf8 tests
   it the same way).  Masking the byte counter with it instead would
   make the partial-character flush depend on how many bytes happen to
   be pending when this call returns. */
3810 if (flags & CODING_STATE_END)
3811 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3815 str->counter = counter;
/* Convert internally-formatted data to UCS-4.
   Reads internal-format bytes from SRC and appends four output bytes
   per character to DST through encode_ucs4.  flags and ch come from
   the encoding stream; the character-boundary state and the current
   charset are kept in the stream's iso2022 sub-structure and are
   written back before returning. */
3819 encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
3820 unsigned_char_dynarr *dst, Lstream_data_count n)
3823 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3824 unsigned int flags = str->flags;
3825 unsigned int ch = str->ch;
3826 unsigned char char_boundary = str->iso2022.current_char_boundary;
3827 Lisp_Object charset = str->iso2022.current_charset;
3829 #ifdef ENABLE_COMPOSITE_CHARS
3830 /* flags for handling composite chars. We do a little switcharoo
3831 on the source while we're outputting the composite char. */
3832 unsigned int saved_n = 0;
3833 const unsigned char *saved_src = NULL;
3834 int in_composite = 0;
3841 unsigned char c = *src++;
3843 if (BYTE_ASCII_P (c))
3844 { /* Processing ASCII character */
3846 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3849 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3850 { /* Processing Leading Byte */
3852 charset = CHARSET_BY_LEADING_BYTE (c);
3853 if (LEADING_BYTE_PREFIX_P(c))
3858 { /* Processing Non-ASCII character */
3860 if (EQ (charset, Vcharset_control_1))
3862 encode_ucs4 (Vcharset_control_1, c, 0, dst);
/* The handling below depends on XCHARSET_REP_BYTES of the current
   charset; single-position characters are encoded with l == 0,
   two-position characters with (ch, c). */
3866 switch (XCHARSET_REP_BYTES (charset))
3869 encode_ucs4 (charset, c, 0, dst);
3872 if (XCHARSET_PRIVATE_P (charset))
3874 encode_ucs4 (charset, c, 0, dst);
3879 #ifdef ENABLE_COMPOSITE_CHARS
3880 if (EQ (charset, Vcharset_composite))
3884 /* #### Bother! We don't know how to
3886 Dynarr_add (dst, '\0');
3887 Dynarr_add (dst, '\0');
3888 Dynarr_add (dst, '\0');
3889 Dynarr_add (dst, '~');
3893 Emchar emch = MAKE_CHAR (Vcharset_composite,
3894 ch & 0x7F, c & 0x7F);
3895 Lisp_Object lstr = composite_char_string (emch);
3899 src = XSTRING_DATA (lstr);
3900 n = XSTRING_LENGTH (lstr);
3904 #endif /* ENABLE_COMPOSITE_CHARS */
3906 encode_ucs4(charset, ch, c, dst);
3919 encode_ucs4 (charset, ch, c, dst);
3935 #ifdef ENABLE_COMPOSITE_CHARS
3941 goto back_to_square_n; /* Wheeeeeeeee ..... */
3943 #endif /* ENABLE_COMPOSITE_CHARS */
3947 str->iso2022.current_char_boundary = char_boundary;
3948 str->iso2022.current_charset = charset;
3950 /* Verbum caro factum est! */
3955 /************************************************************************/
3957 /************************************************************************/
/* Detection pass for UTF-8 over the N bytes at SRC.
   ST->utf8.in_byte is kept in the detection state across calls; it is
   set to a value from 5 down to 1 depending on the byte just read
   (presumably the number of continuation bytes still expected --
   confirm).  ESC, SI and SO are tested for explicitly, and a byte is
   checked against the 10xxxxxx continuation pattern via
   (c & 0xc0) != 0x80.  The result when nothing ends the scan early is
   CODING_CATEGORY_UTF8_MASK. */
3960 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3964 unsigned char c = *(unsigned char *)src++;
3965 switch (st->utf8.in_byte)
3968 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3971 st->utf8.in_byte = 5;
3973 st->utf8.in_byte = 4;
3975 st->utf8.in_byte = 3;
3977 st->utf8.in_byte = 2;
3979 st->utf8.in_byte = 1;
3984 if ((c & 0xc0) != 0x80)
3990 return CODING_CATEGORY_UTF8_MASK;
/* Convert UTF-8 data to internal format.
   Reads N bytes from SRC and appends decoded characters to DST via
   decode_ucs4.  flags, the partially assembled code in ch, the EOL
   type and the byte counter are loaded from the decoding stream; the
   counter is written back at the end so that a sequence split across
   calls can be resumed. */
3994 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3995 unsigned_char_dynarr *dst, Lstream_data_count n)
3997 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3998 unsigned int flags = str->flags;
3999 unsigned int ch = str->ch;
4000 eol_type_t eol_type = str->eol_type;
4001 unsigned char counter = str->counter;
4005 unsigned char c = *(unsigned char *)src++;
/* Lead bytes are classified by threshold, highest first. */
4014 else if ( c >= 0xf8 )
4019 else if ( c >= 0xf0 )
4024 else if ( c >= 0xe0 )
4029 else if ( c >= 0xc0 )
/* A byte that is a complete character by itself: EOL handling is
   applied, then the byte value is passed on as the character code. */
4036 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4037 decode_ucs4 (c, dst);
/* Each continuation byte contributes its low six bits to ch. */
4041 ch = ( ch << 6 ) | ( c & 0x3f );
4042 decode_ucs4 (ch, dst);
4047 ch = ( ch << 6 ) | ( c & 0x3f );
4050 label_continue_loop:;
/* When the END bit is set in flags, whatever is still held in ch is
   handed to DECODE_OUTPUT_PARTIAL_CHAR. */
4053 if (flags & CODING_STATE_END)
4054 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4058 str->counter = counter;
/* Append the UTF-8 encoding of the character (CHARSET, H, L) to DST.
   The UCS-4 code comes from mule_char_to_ucs4.  The original UTF-8
   scheme of up to six bytes is used: the first byte carries the length
   marker (0xc0, 0xe0, 0xf0, 0xf8 or 0xfc) plus the top bits of the
   code, and every following byte carries six bits tagged with 0x80. */
4063 encode_utf8 (Lisp_Object charset,
4064 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
4066 unsigned long code = mule_char_to_ucs4 (charset, h, l);
/* Shortest form: the code is emitted as a single byte. */
4069 Dynarr_add (dst, code);
/* Two bytes: 110xxxxx 10xxxxxx. */
4071 else if ( code <= 0x7ff )
4073 Dynarr_add (dst, (code >> 6) | 0xc0);
4074 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Three bytes: 1110xxxx followed by two continuation bytes. */
4076 else if ( code <= 0xffff )
4078 Dynarr_add (dst, (code >> 12) | 0xe0);
4079 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4080 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Four bytes: 11110xxx followed by three continuation bytes. */
4082 else if ( code <= 0x1fffff )
4084 Dynarr_add (dst, (code >> 18) | 0xf0);
4085 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4086 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4087 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Five bytes: 111110xx followed by four continuation bytes. */
4089 else if ( code <= 0x3ffffff )
4091 Dynarr_add (dst, (code >> 24) | 0xf8);
4092 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4093 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4094 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4095 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Six bytes: 1111110x followed by five continuation bytes. */
4099 Dynarr_add (dst, (code >> 30) | 0xfc);
4100 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
4101 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4102 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4103 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4104 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Convert internally-formatted data to UTF-8.
   flags, ch, the EOL type and the character-boundary state are loaded
   from the encoding stream.  The body has two preprocessor variants
   separated by the `#else' marked "not UTF2000":
   - before it, bytes are copied to DST as they are, with only newline
     translation applied and char_boundary tracking the position inside
     a multi-byte sequence;
   - after it, each character of the internal representation is
     converted through encode_utf8, in the same way encode_coding_ucs4
     uses encode_ucs4.
   The boundary state (and, in the second variant, the current charset)
   is written back to the stream before returning. */
4110 encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
4111 unsigned_char_dynarr *dst, Lstream_data_count n)
4113 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4114 unsigned int flags = str->flags;
4115 unsigned int ch = str->ch;
4116 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4117 unsigned char char_boundary = str->iso2022.current_char_boundary;
4122 unsigned char c = *src++;
4123 switch (char_boundary)
4128 Dynarr_add (dst, c);
4131 else if ( c >= 0xf8 )
4133 Dynarr_add (dst, c);
4136 else if ( c >= 0xf0 )
4138 Dynarr_add (dst, c);
4141 else if ( c >= 0xe0 )
4143 Dynarr_add (dst, c);
4146 else if ( c >= 0xc0 )
4148 Dynarr_add (dst, c);
/* Newline translation: '\r' is written unless the EOL type is LF or
   autodetect, and the byte itself unless the EOL type is CR. */
4155 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4156 Dynarr_add (dst, '\r');
4157 if (eol_type != EOL_CR)
4158 Dynarr_add (dst, c);
4161 Dynarr_add (dst, c);
4166 Dynarr_add (dst, c);
4170 Dynarr_add (dst, c);
4174 #else /* not UTF2000 */
4175 Lisp_Object charset = str->iso2022.current_charset;
4177 #ifdef ENABLE_COMPOSITE_CHARS
4178 /* flags for handling composite chars. We do a little switcharoo
4179 on the source while we're outputting the composite char. */
4180 unsigned int saved_n = 0;
4181 const unsigned char *saved_src = NULL;
4182 int in_composite = 0;
4185 #endif /* ENABLE_COMPOSITE_CHARS */
4189 unsigned char c = *src++;
4191 if (BYTE_ASCII_P (c))
4192 { /* Processing ASCII character */
4196 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4197 Dynarr_add (dst, '\r');
4198 if (eol_type != EOL_CR)
4199 Dynarr_add (dst, c);
4202 encode_utf8 (Vcharset_ascii, c, 0, dst);
4205 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4206 { /* Processing Leading Byte */
4208 charset = CHARSET_BY_LEADING_BYTE (c);
4209 if (LEADING_BYTE_PREFIX_P(c))
4214 { /* Processing Non-ASCII character */
4216 if (EQ (charset, Vcharset_control_1))
4218 encode_utf8 (Vcharset_control_1, c, 0, dst);
4222 switch (XCHARSET_REP_BYTES (charset))
4225 encode_utf8 (charset, c, 0, dst);
4228 if (XCHARSET_PRIVATE_P (charset))
4230 encode_utf8 (charset, c, 0, dst);
4235 #ifdef ENABLE_COMPOSITE_CHARS
4236 if (EQ (charset, Vcharset_composite))
4240 /* #### Bother! We don't know how to
4242 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4246 Emchar emch = MAKE_CHAR (Vcharset_composite,
4247 ch & 0x7F, c & 0x7F);
4248 Lisp_Object lstr = composite_char_string (emch);
4252 src = XSTRING_DATA (lstr);
4253 n = XSTRING_LENGTH (lstr);
4257 #endif /* ENABLE_COMPOSITE_CHARS */
4259 encode_utf8 (charset, ch, c, dst);
4272 encode_utf8 (charset, ch, c, dst);
4288 #ifdef ENABLE_COMPOSITE_CHARS
4294 goto back_to_square_n; /* Wheeeeeeeee ..... */
4298 #endif /* not UTF2000 */
4301 str->iso2022.current_char_boundary = char_boundary;
4303 str->iso2022.current_charset = charset;
4306 /* Verbum caro factum est! */
4310 /************************************************************************/
4311 /* ISO2022 methods */
4312 /************************************************************************/
4314 /* The following note describes the coding system ISO2022 briefly.
4315 Since the intention of this note is to help understand the
4316 functions in this file, some parts are NOT ACCURATE or OVERLY
4317 SIMPLIFIED. For thorough understanding, please refer to the
4318 original document of ISO2022.
4320 ISO2022 provides many mechanisms to encode several character sets
4321 in 7-bit and 8-bit environments. For 7-bit environments, all text
4322 is encoded using bytes less than 128. This may make the encoded
4323 text a little bit longer, but the text passes more easily through
4324 several gateways, some of which strip off MSB (Most Significant Bit).
4326 There are two kinds of character sets: control character set and
4327 graphic character set. The former contains control characters such
4328 as `newline' and `escape' to provide control functions (control
4329 functions are also provided by escape sequences). The latter
4330 contains graphic characters such as 'A' and '-'. Emacs recognizes
4331 two control character sets and many graphic character sets.
4333 Graphic character sets are classified into one of the following
4334 four classes, according to the number of bytes (DIMENSION) and
4335 number of characters in one dimension (CHARS) of the set:
4336 - DIMENSION1_CHARS94
4337 - DIMENSION1_CHARS96
4338 - DIMENSION2_CHARS94
4339 - DIMENSION2_CHARS96
4341 In addition, each character set is assigned an identification tag,
4342 unique for each set, called "final character" (denoted as <F>
4343 hereafter). The <F> of each character set is decided by ECMA(*)
4344 when it is registered in ISO. The code range of <F> is 0x30..0x7E
4345 (0x30..0x3F are for private use only).
4347 Note (*): ECMA = European Computer Manufacturers Association
4349 Here are examples of graphic character set [NAME(<F>)]:
4350 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4351 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4352 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4353 o DIMENSION2_CHARS96 -- none for the moment
4355 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4356 C0 [0x00..0x1F] -- control character plane 0
4357 GL [0x20..0x7F] -- graphic character plane 0
4358 C1 [0x80..0x9F] -- control character plane 1
4359 GR [0xA0..0xFF] -- graphic character plane 1
4361 A control character set is directly designated and invoked to C0 or
4362 C1 by an escape sequence. The most common case is that:
4363 - ISO646's control character set is designated/invoked to C0, and
4364 - ISO6429's control character set is designated/invoked to C1,
4365 and usually these designations/invocations are omitted in encoded
4366 text. In a 7-bit environment, only C0 can be used, and a control
4367 character for C1 is encoded by an appropriate escape sequence to
4368 fit into the environment. All control characters for C1 are
4369 defined to have corresponding escape sequences.
4371 A graphic character set is at first designated to one of four
4372 graphic registers (G0 through G3), then these graphic registers are
4373 invoked to GL or GR. These designations and invocations can be
4374 done independently. The most common case is that G0 is invoked to
4375 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4376 these invocations and designations are omitted in encoded text.
4377 In a 7-bit environment, only GL can be used.
4379 When a graphic character set of CHARS94 is invoked to GL, codes
4380 0x20 and 0x7F of the GL area work as control characters SPACE and
4381 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used.
4384 There are two ways of invocation: locking-shift and single-shift.
4385 With locking-shift, the invocation lasts until the next different
4386 invocation, whereas with single-shift, the invocation affects the
4387 following character only and doesn't affect the locking-shift
4388 state. Invocations are done by the following control characters or escape sequences:
4391 ----------------------------------------------------------------------
4392 abbrev function cntrl escape seq description
4393 ----------------------------------------------------------------------
4394 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4395 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4396 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4397 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4398 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4399 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4400 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4401 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4402 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4403 ----------------------------------------------------------------------
4404 (*) These are not used by any known coding system.
4406 Control characters for these functions are defined by macros
4407 ISO_CODE_XXX in `coding.h'.
4409 Designations are done by the following escape sequences:
4410 ----------------------------------------------------------------------
4411 escape sequence description
4412 ----------------------------------------------------------------------
4413 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4414 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4415 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4416 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4417 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4418 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4419 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4420 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4421 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4422 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4423 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4424 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4425 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4426 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4427 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4428 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4429 ----------------------------------------------------------------------
4431 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4432 of dimension 1, chars 94, and final character <F>, etc...
4434 Note (*): Although these designations are not allowed in ISO2022,
4435 Emacs accepts them on decoding, and produces them on encoding
4436 CHARS96 character sets in a coding system which is characterized as
4437 7-bit environment, non-locking-shift, and non-single-shift.
4439 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4440 '(' can be omitted. We refer to this as "short-form" hereafter.
4442 Now you may notice that there are a lot of ways for encoding the
4443 same multilingual text in ISO2022. Actually, there exist many
4444 coding systems such as Compound Text (used in X11's inter-client
4445 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4446 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4447 localized platforms), and all of these are variants of ISO2022.
4449 In addition to the above, Emacs handles two more kinds of escape
4450 sequences: ISO6429's direction specification and Emacs' private
4451 sequence for specifying character composition.
4453 ISO6429's direction specification takes the following form:
4454 o CSI ']' -- end of the current direction
4455 o CSI '0' ']' -- end of the current direction
4456 o CSI '1' ']' -- start of left-to-right text
4457 o CSI '2' ']' -- start of right-to-left text
4458 The control character CSI (0x9B: control sequence introducer) is
4459 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4461 Character composition specification takes the following form:
4462 o ESC '0' -- start character composition
4463 o ESC '1' -- end character composition
4464 Since these are not standard escape sequences of any ISO standard,
4465 their use with these meanings is restricted to Emacs only. */
/* Reset the ISO2022 decoder state in ISO.
   For each of the four graphic registers, the charset is taken from
   CODING_SYSTEM's initial charset when a coding system is given and is
   set to Qt otherwise, and the invalid-designation flag is cleared.
   The escape-sequence parser is returned to ISO_ESC_NOTHING with an
   empty byte buffer, register 0 is invoked on the left and register 1
   on the right, and the direction / literal-output flags are cleared.
   With composite characters enabled, an existing composite_chars
   dynarr is emptied. */
4468 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4472 for (i = 0; i < 4; i++)
4474 if (!NILP (coding_system))
4476 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4478 iso->charset[i] = Qt;
4479 iso->invalid_designated[i] = 0;
4481 iso->esc = ISO_ESC_NOTHING;
4482 iso->esc_bytes_index = 0;
4483 iso->register_left = 0;
4484 iso->register_right = 1;
4485 iso->switched_dir_and_no_valid_charset_yet = 0;
4486 iso->invalid_switch_dir = 0;
4487 iso->output_direction_sequence = 0;
4488 iso->output_literally = 0;
4489 #ifdef ENABLE_COMPOSITE_CHARS
4490 if (iso->composite_chars)
4491 Dynarr_reset (iso->composite_chars);
/* Predicate on a single byte C.  parse_iso2022_esc consults it, together
   with the coding system's escape-quoted property, when deciding
   whether to enter the ISO_ESC_LITERAL state. */
4496 fit_to_be_escape_quoted (unsigned char c)
4513 /* Parse one byte of an ISO2022 escape sequence.
4514 If the result is an invalid escape sequence, return 0 and
4515 do not change anything in STR. Otherwise, if the result is
4516 an incomplete escape sequence, update ISO2022.ESC and
4517 ISO2022.ESC_BYTES and return -1. Otherwise, update
4518 all the state variables (but not ISO2022.ESC_BYTES) and
4521 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4522 or invocation of an invalid character set and treat that as
4523 an unrecognized escape sequence.
4525 ********************************************************************
4527 #### Strategies for error annotation and coding orthogonalization
4529 We really want to separate out a number of things. Conceptually,
4530 there is a nested syntax.
4532 At the top level is the ISO 2022 extension syntax, including charset
4533 designation and invocation, and certain auxiliary controls such as the
4534 ISO 6429 direction specification. These are octet-oriented, with the
4535 single exception (AFAIK) of the "exit Unicode" sequence which uses the
4536 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4537 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
4538 (deprecated) special case in Unicode processing.
4540 The middle layer is ISO 2022 character interpretation. This will depend
4541 on the current state of the ISO 2022 registers, and assembles octets
4542 into the character's internal representation.
4544 The lowest level is translating system control conventions. At present
4545 this is restricted to newline translation, but one could imagine doing
4546 tab conversion or line wrapping here. "Escape from Unicode" processing
4547 would be done at this level.
4549 At each level the parser will verify the syntax. In the case of a
4550 syntax error or warning (such as a redundant escape sequence that affects
4551 no characters), the parser will take some action, typically inserting the
4552 erroneous octets directly into the output and creating an annotation
4553 which can be used by higher level I/O to mark the affected region.
4555 This should make it possible to do something sensible about separating
4556 newline convention processing from character construction, and about
4557 preventing ISO 2022 escape sequences from being recognized
4560 The basic strategy will be to have octet classification tables, and
4561 switch processing according to the table entry.
4563 It's possible that, by doing the processing with tables of functions or
4564 the like, the parser can be used for both detection and translation. */
4567 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4568 unsigned char c, unsigned int *flags,
4569 int check_invalid_charsets)
4571 /* (1) If we're at the end of a designation sequence, CS is the
4572 charset being designated and REG is the register to designate
4575 (2) If we're at the end of a locking-shift sequence, REG is
4576 the register to invoke and HALF (0 == left, 1 == right) is
4577 the half to invoke it into.
4579 (3) If we're at the end of a single-shift sequence, REG is
4580 the register to invoke. */
4581 Lisp_Object cs = Qnil;
4584 /* NOTE: This code does goto's all over the fucking place.
4585 The reason for this is that we're basically implementing
4586 a state machine here, and hierarchical languages like C
4587 don't really provide a clean way of doing this. */
4589 if (! (*flags & CODING_STATE_ESCAPE))
4590 /* At beginning of escape sequence; we need to reset our
4591 escape-state variables. */
4592 iso->esc = ISO_ESC_NOTHING;
4594 iso->output_literally = 0;
4595 iso->output_direction_sequence = 0;
4599 case ISO_ESC_NOTHING:
4600 iso->esc_bytes_index = 0;
4603 case ISO_CODE_ESC: /* Start escape sequence */
4604 *flags |= CODING_STATE_ESCAPE;
4608 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4609 *flags |= CODING_STATE_ESCAPE;
4610 iso->esc = ISO_ESC_5_11;
4613 case ISO_CODE_SO: /* locking shift 1 */
4616 case ISO_CODE_SI: /* locking shift 0 */
4620 case ISO_CODE_SS2: /* single shift */
4623 case ISO_CODE_SS3: /* single shift */
4627 default: /* Other control characters */
4634 /**** single shift ****/
4636 case 'N': /* single shift 2 */
4639 case 'O': /* single shift 3 */
4643 /**** locking shift ****/
4645 case '~': /* locking shift 1 right */
4648 case 'n': /* locking shift 2 */
4651 case '}': /* locking shift 2 right */
4654 case 'o': /* locking shift 3 */
4657 case '|': /* locking shift 3 right */
4661 #ifdef ENABLE_COMPOSITE_CHARS
4662 /**** composite ****/
4665 iso->esc = ISO_ESC_START_COMPOSITE;
4666 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4667 CODING_STATE_COMPOSITE;
4671 iso->esc = ISO_ESC_END_COMPOSITE;
4672 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4673 ~CODING_STATE_COMPOSITE;
4675 #endif /* ENABLE_COMPOSITE_CHARS */
4677 /**** directionality ****/
4680 iso->esc = ISO_ESC_5_11;
4683 /**** designation ****/
4685 case '$': /* multibyte charset prefix */
4686 iso->esc = ISO_ESC_2_4;
4690 if (0x28 <= c && c <= 0x2F)
4692 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4696 /* This function is called with CODESYS equal to nil when
4697 doing coding-system detection. */
4699 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4700 && fit_to_be_escape_quoted (c))
4702 iso->esc = ISO_ESC_LITERAL;
4703 *flags &= CODING_STATE_ISO2022_LOCK;
4713 /**** directionality ****/
4715 case ISO_ESC_5_11: /* ISO6429 direction control */
4718 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4719 goto directionality;
4721 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4722 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4723 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4727 case ISO_ESC_5_11_0:
4730 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4731 goto directionality;
4735 case ISO_ESC_5_11_1:
4738 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4739 goto directionality;
4743 case ISO_ESC_5_11_2:
4746 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4747 goto directionality;
4752 iso->esc = ISO_ESC_DIRECTIONALITY;
4753 /* Various junk here to attempt to preserve the direction sequences
4754 literally in the text if they would otherwise be swallowed due
4755 to invalid designations that don't show up as actual charset
4756 changes in the text. */
4757 if (iso->invalid_switch_dir)
4759 /* We already inserted a direction switch literally into the
4760 text. We assume (#### this may not be right) that the
4761 next direction switch is the one going the other way,
4762 and we need to output that literally as well. */
4763 iso->output_literally = 1;
4764 iso->invalid_switch_dir = 0;
4770 /* If we are in the thrall of an invalid designation,
4771 then stick the directionality sequence literally into the
4772 output stream so it ends up in the original text again. */
4773 for (jj = 0; jj < 4; jj++)
4774 if (iso->invalid_designated[jj])
4778 iso->output_literally = 1;
4779 iso->invalid_switch_dir = 1;
4782 /* Indicate that we haven't yet seen a valid designation,
4783 so that if a switch-dir is directly followed by an
4784 invalid designation, both get inserted literally. */
4785 iso->switched_dir_and_no_valid_charset_yet = 1;
4790 /**** designation ****/
4793 if (0x28 <= c && c <= 0x2F)
4795 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4798 if (0x40 <= c && c <= 0x42)
4800 cs = CHARSET_BY_ATTRIBUTES (94, 2, c,
4801 *flags & CODING_STATE_R2L ?
4802 CHARSET_RIGHT_TO_LEFT :
4803 CHARSET_LEFT_TO_RIGHT);
4815 if (c < '0' || c > '~')
4816 return 0; /* bad final byte */
4818 if (iso->esc >= ISO_ESC_2_8 &&
4819 iso->esc <= ISO_ESC_2_15)
4821 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4822 single = 1; /* single-byte */
4823 reg = (iso->esc - ISO_ESC_2_8) & 3;
4825 else if (iso->esc >= ISO_ESC_2_4_8 &&
4826 iso->esc <= ISO_ESC_2_4_15)
4828 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4829 single = -1; /* multi-byte */
4830 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4834 /* Can this ever be reached? -slb */
4839 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4840 *flags & CODING_STATE_R2L ?
4841 CHARSET_RIGHT_TO_LEFT :
4842 CHARSET_LEFT_TO_RIGHT);
4848 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4852 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4853 /* can't invoke something that ain't there. */
4855 iso->esc = ISO_ESC_SINGLE_SHIFT;
4856 *flags &= CODING_STATE_ISO2022_LOCK;
4858 *flags |= CODING_STATE_SS2;
4860 *flags |= CODING_STATE_SS3;
4864 if (check_invalid_charsets &&
4865 !CHARSETP (iso->charset[reg]))
4866 /* can't invoke something that ain't there. */
4869 iso->register_right = reg;
4871 iso->register_left = reg;
4872 *flags &= CODING_STATE_ISO2022_LOCK;
4873 iso->esc = ISO_ESC_LOCKING_SHIFT;
4877 if (NILP (cs) && check_invalid_charsets)
4879 iso->invalid_designated[reg] = 1;
4880 iso->charset[reg] = Vcharset_ascii;
4881 iso->esc = ISO_ESC_DESIGNATE;
4882 *flags &= CODING_STATE_ISO2022_LOCK;
4883 iso->output_literally = 1;
4884 if (iso->switched_dir_and_no_valid_charset_yet)
4886 /* We encountered a switch-direction followed by an
4887 invalid designation. Ensure that the switch-direction
4888 gets outputted; otherwise it will probably get eaten
4889 when the text is written out again. */
4890 iso->switched_dir_and_no_valid_charset_yet = 0;
4891 iso->output_direction_sequence = 1;
4892 /* And make sure that the switch-dir going the other
4893 way gets outputted, as well. */
4894 iso->invalid_switch_dir = 1;
4898 /* This function is called with CODESYS equal to nil when
4899 doing coding-system detection. */
4900 if (!NILP (codesys))
4902 charset_conversion_spec_dynarr *dyn =
4903 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4909 for (i = 0; i < Dynarr_length (dyn); i++)
4911 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4912 if (EQ (cs, spec->from_charset))
4913 cs = spec->to_charset;
4918 iso->charset[reg] = cs;
4919 iso->esc = ISO_ESC_DESIGNATE;
4920 *flags &= CODING_STATE_ISO2022_LOCK;
4921 if (iso->invalid_designated[reg])
4923 iso->invalid_designated[reg] = 0;
4924 iso->output_literally = 1;
4926 if (iso->switched_dir_and_no_valid_charset_yet)
4927 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Examine bytes read from SRC (N is the caller-supplied count) and
   narrow down which ISO2022 coding categories are still plausible.
   The running state -- candidate mask, parser flags, high-byte count,
   single-shift flag and the escape parser's own state -- is kept in
   ST->iso2022 and is set up the first time through, while
   ST->iso2022.initted is still zero. */
4932 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4936 /* #### There are serious deficiencies in the recognition mechanism
4937 here. This needs to be much smarter if it's going to cut it.
4938 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4939 it should be detected as Latin-1.
4940 All the ISO2022 stuff in this file should be synced up with the
4941 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4942 Perhaps we should wait till R2L works in FSF Emacs? */
/* First use of this detection state: reset the escape parser and start
   with all five ISO2022 categories as candidates. */
4944 if (!st->iso2022.initted)
4946 reset_iso2022 (Qnil, &st->iso2022.iso);
4947 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4948 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4949 CODING_CATEGORY_ISO_8_1_MASK |
4950 CODING_CATEGORY_ISO_8_2_MASK |
4951 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4952 st->iso2022.flags = 0;
4953 st->iso2022.high_byte_count = 0;
4954 st->iso2022.saw_single_shift = 0;
4955 st->iso2022.initted = 1;
/* Work on a local copy of the candidate mask. */
4958 mask = st->iso2022.mask;
4962 unsigned char c = *(unsigned char *)src++;
/* Rule out the seven-bit category and count this byte as a high byte. */
4965 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4966 st->iso2022.high_byte_count++;
/* A run of high bytes that was not introduced by a single shift is
   checked for parity, then both counters are cleared. */
4970 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4972 if (st->iso2022.high_byte_count & 1)
4973 /* odd number of high bytes; assume not iso-8-2 */
4974 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4976 st->iso2022.high_byte_count = 0;
4977 st->iso2022.saw_single_shift = 0;
4979 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4981 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4982 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4983 { /* control chars */
4986 /* Allow and ignore control characters that you might
4987 reasonably see in a text file */
4992 case 8: /* backspace */
4993 case 11: /* vertical tab */
4994 case 12: /* form feed */
4995 case 26: /* MS-DOS C-z junk */
4996 case 31: /* '^_' -- for info */
4997 goto label_continue_loop;
5004 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
/* Feed the byte to the escape-sequence parser; no coding system is
   supplied (Qnil) and the parser state lives in ST->iso2022.iso. */
5007 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5008 &st->iso2022.flags, 0))
5010 switch (st->iso2022.iso.esc)
/* A designation rules out the ISO-8-1 and ISO-8-2 categories. */
5012 case ISO_ESC_DESIGNATE:
5013 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5014 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
/* A locking shift leaves only the lock-shift category and jumps to
   ran_out_of_chars. */
5016 case ISO_ESC_LOCKING_SHIFT:
5017 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5018 goto ran_out_of_chars;
/* A single shift rules out ISO-8-DESIGNATE and is remembered so that
   the high-byte parity check above is not applied to the bytes that
   follow it. */
5019 case ISO_ESC_SINGLE_SHIFT:
5020 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5021 st->iso2022.saw_single_shift = 1;
5030 goto ran_out_of_chars;
5033 label_continue_loop:;
/* Adjust the coding-category MASK after detection (presumably the mask
   built by detect_coding_iso2022 -- confirm against callers). */
5042 postprocess_iso2022_mask (int mask)
5044 /* #### kind of cheesy */
5045 /* If seven-bit ISO is allowed, then assume that the encoding is
5046 entirely seven-bit and turn off the eight-bit ones.
   The three bits cleared are ISO-8-DESIGNATE, ISO-8-1 and ISO-8-2; the
   lock-shift bit is not touched here. */
5047 if (mask & CODING_CATEGORY_ISO_7_MASK)
5048 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5049 CODING_CATEGORY_ISO_8_1_MASK |
5050 CODING_CATEGORY_ISO_8_2_MASK);
5054 /* If FLAGS is a null pointer or specifies right-to-left motion,
5055 output a switch-dir-to-left-to-right sequence to DST.
5056 Also update FLAGS if it is not a null pointer.
5057 If INTERNAL_P is set, we are outputting in internal format and
5058 need to handle the CSI differently.

   The sequence is a CSI followed by '0' and ']'.  The CSI itself is
   written in one of three ways: as ESC '[' when CODESYS is a seven-bit
   coding system (CODING_SYSTEM_ISO2022_SEVEN), through
   DECODE_ADD_BINARY_CHAR when INTERNAL_P is set, and as the raw
   ISO_CODE_CSI byte otherwise. */
5061 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5062 unsigned_char_dynarr *dst,
5063 unsigned int *flags,
5066 if (!flags || (*flags & CODING_STATE_R2L))
5068 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5070 Dynarr_add (dst, ISO_CODE_ESC);
5071 Dynarr_add (dst, '[');
5073 else if (internal_p)
5074 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5076 Dynarr_add (dst, ISO_CODE_CSI);
5077 Dynarr_add (dst, '0');
5078 Dynarr_add (dst, ']');
/* Record that the direction is left-to-right again. */
5080 *flags &= ~CODING_STATE_R2L;
5084 /* If FLAGS is a null pointer or specifies a direction different from
5085 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5086 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5087 sequence to DST. Also update FLAGS if it is not a null pointer.
5088 If INTERNAL_P is set, we are outputting in internal format and
5089 need to handle the CSI differently.

   Switching to left-to-right is delegated to
   restore_left_to_right_direction.  Switching to right-to-left writes a
   CSI followed by '2' and ']', and is attempted only when CODESYS does
   not have CODING_SYSTEM_ISO2022_NO_ISO6429 set.  The CSI is written as
   ESC '[' for seven-bit coding systems, through DECODE_ADD_BINARY_CHAR
   when INTERNAL_P is set, and as the raw ISO_CODE_CSI byte otherwise. */
5092 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5093 unsigned_char_dynarr *dst, unsigned int *flags,
5096 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5097 direction == CHARSET_LEFT_TO_RIGHT)
5098 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5099 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5100 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5101 direction == CHARSET_RIGHT_TO_LEFT)
5103 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5105 Dynarr_add (dst, ISO_CODE_ESC);
5106 Dynarr_add (dst, '[');
5108 else if (internal_p)
5109 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5111 Dynarr_add (dst, ISO_CODE_CSI);
5112 Dynarr_add (dst, '2');
5113 Dynarr_add (dst, ']');
/* Record that the direction is now right-to-left. */
5115 *flags |= CODING_STATE_R2L;
5119 /* Convert ISO2022-format data to internal format.

   Bytes are read from SRC and the decoded text is appended to DST.
   The conversion state -- FLAGS, the partial character CH, the EOL
   type and the ISO2022 register/escape state -- is taken from the
   decoding stream's data (DECODING_STREAM_DATA (decoding)).

   Each byte is handled in one of three ways: as part of an escape
   sequence while CODING_STATE_ESCAPE is set, as a control character
   when it is in the C0 or C1 range, or as a graphic character that is
   combined with the charset currently invoked for it. */
5122 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5123 unsigned_char_dynarr *dst, Lstream_data_count n)
5125 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5126 unsigned int flags = str->flags;
5127 unsigned int ch = str->ch;
5128 eol_type_t eol_type = str->eol_type;
5129 #ifdef ENABLE_COMPOSITE_CHARS
5130 unsigned_char_dynarr *real_dst = dst;
5132 Lisp_Object coding_system;
5134 XSETCODING_SYSTEM (coding_system, str->codesys);
5136 #ifdef ENABLE_COMPOSITE_CHARS
5137 if (flags & CODING_STATE_COMPOSITE)
5138 dst = str->iso2022.composite_chars;
5139 #endif /* ENABLE_COMPOSITE_CHARS */
5143 unsigned char c = *(unsigned char *)src++;
5144 if (flags & CODING_STATE_ESCAPE)
5145 { /* Within ESC sequence */
5146 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5151 switch (str->iso2022.esc)
5153 #ifdef ENABLE_COMPOSITE_CHARS
5154 case ISO_ESC_START_COMPOSITE:
5155 if (str->iso2022.composite_chars)
5156 Dynarr_reset (str->iso2022.composite_chars);
5158 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5159 dst = str->iso2022.composite_chars;
5161 case ISO_ESC_END_COMPOSITE:
5163 Bufbyte comstr[MAX_EMCHAR_LEN];
5165 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5166 Dynarr_length (dst));
5168 len = set_charptr_emchar (comstr, emch);
5169 Dynarr_add_many (dst, comstr, len);
5172 #endif /* ENABLE_COMPOSITE_CHARS */
5174 case ISO_ESC_LITERAL:
5175 DECODE_ADD_BINARY_CHAR (c, dst);
5179 /* Everything else handled already */
5184 /* Attempted error recovery. */
5185 if (str->iso2022.output_direction_sequence)
5186 ensure_correct_direction (flags & CODING_STATE_R2L ?
5187 CHARSET_RIGHT_TO_LEFT :
5188 CHARSET_LEFT_TO_RIGHT,
5189 str->codesys, dst, 0, 1);
5190 /* More error recovery. */
5191 if (!retval || str->iso2022.output_literally)
5193 /* Output the (possibly invalid) sequence */
5195 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5196 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5197 flags &= CODING_STATE_ISO2022_LOCK;
5199 n++, src--;/* Repeat the loop with the same character. */
5202 /* No sense in reprocessing the final byte of the
5203 escape sequence; it could mess things up anyway.
5205 DECODE_ADD_BINARY_CHAR (c, dst);
5210 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5211 { /* Control characters */
5213 /***** Error-handling *****/
5215 /* If we were in the middle of a character, dump out the
5216 partial character. */
5217 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5219 /* If we just saw a single-shift character, dump it out.
5220 This may dump out the wrong sort of single-shift character,
5221 but at least it will give an indication that something went
5223 if (flags & CODING_STATE_SS2)
5225 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5226 flags &= ~CODING_STATE_SS2;
5228 if (flags & CODING_STATE_SS3)
5230 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5231 flags &= ~CODING_STATE_SS3;
5234 /***** Now handle the control characters. *****/
5237 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5239 flags &= CODING_STATE_ISO2022_LOCK;
5241 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5242 DECODE_ADD_BINARY_CHAR (c, dst);
5245 { /* Graphic characters */
5246 Lisp_Object charset;
5252 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5254 /* Now determine the charset: a pending single shift selects
   register 2 or 3; otherwise a byte with the high bit set uses the
   right-hand register and an ASCII-range byte the left-hand one. */
5255 reg = ((flags & CODING_STATE_SS2) ? 2
5256 : (flags & CODING_STATE_SS3) ? 3
5257 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5258 : str->iso2022.register_left);
5259 charset = str->iso2022.charset[reg];
5261 /* Error checking: */
5262 if (! CHARSETP (charset)
5263 || str->iso2022.invalid_designated[reg]
5264 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5265 && XCHARSET_CHARS (charset) == 94))
5266 /* Mrmph. We are trying to invoke a register that has no
5267 or an invalid charset in it, or trying to add a character
5268 outside the range of the charset. Insert that char literally
5269 to preserve it for the output. */
5271 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5272 DECODE_ADD_BINARY_CHAR (c, dst);
5277 /* Things are probably hunky-dory. */
5279 /* Fetch reverse charset, maybe. */
5280 if (((flags & CODING_STATE_R2L) &&
5281 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5283 (!(flags & CODING_STATE_R2L) &&
5284 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5286 Lisp_Object new_charset =
5287 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5288 if (!NILP (new_charset))
5289 charset = new_charset;
5293 if (XCHARSET_DIMENSION (charset) == 1)
5295 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5297 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5302 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5308 lb = XCHARSET_LEADING_BYTE (charset);
5309 switch (XCHARSET_REP_BYTES (charset))
5312 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5313 Dynarr_add (dst, c & 0x7F);
5316 case 2: /* one-byte official */
5317 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5318 Dynarr_add (dst, lb);
5319 Dynarr_add (dst, c | 0x80);
5322 case 3: /* one-byte private or two-byte official */
5323 if (XCHARSET_PRIVATE_P (charset))
5325 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5326 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5327 Dynarr_add (dst, lb);
5328 Dynarr_add (dst, c | 0x80);
5334 Dynarr_add (dst, lb);
5335 Dynarr_add (dst, ch | 0x80);
5336 Dynarr_add (dst, c | 0x80);
5344 default: /* two-byte private */
5347 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5348 Dynarr_add (dst, lb);
5349 Dynarr_add (dst, ch | 0x80);
5350 Dynarr_add (dst, c | 0x80);
5360 flags &= CODING_STATE_ISO2022_LOCK;
5363 label_continue_loop:;
5366 if (flags & CODING_STATE_END)
5367 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5374 /***** ISO2022 encoder *****/
5376 /* Designate CHARSET into register REG.

   CHARSET is always recorded in STR->iso2022.charset[REG].  The escape
   sequence appended to DST begins with ISO_CODE_ESC and ends with the
   charset's final byte; the intermediate bytes in between depend on
   the charset type (94, 96, 94x94 or 96x96) and are taken from
   inter94[] / inter96[] indexed by REG, preceded by '$' for the
   two-dimensional types. */
5379 iso2022_designate (Lisp_Object charset, unsigned char reg,
5380 struct encoding_stream *str, unsigned_char_dynarr *dst)
5382 static const char inter94[] = "()*+";
5383 static const char inter96[] = ",-./";
5385 unsigned char final;
5386 Lisp_Object old_charset = str->iso2022.charset[reg];
5388 str->iso2022.charset[reg] = charset;
5389 if (!CHARSETP (charset))
5390 /* charset might be an initial nil or t. */
5392 type = XCHARSET_TYPE (charset);
5393 final = XCHARSET_FINAL (charset);
/* Compare against the charset previously held in this register: same
   type, same final byte, and no forced designation pending. */
5394 if (!str->iso2022.force_charset_on_output[reg] &&
5395 CHARSETP (old_charset) &&
5396 XCHARSET_TYPE (old_charset) == type &&
5397 XCHARSET_FINAL (old_charset) == final)
5400 str->iso2022.force_charset_on_output[reg] = 0;
/* Apply the coding system's output charset conversions: every spec
   whose from_charset matches replaces CHARSET with its to_charset. */
5403 charset_conversion_spec_dynarr *dyn =
5404 str->codesys->iso2022.output_conv;
5410 for (i = 0; i < Dynarr_length (dyn); i++)
5412 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5413 if (EQ (charset, spec->from_charset))
5414 charset = spec->to_charset;
/* NOTE(review): FINAL was read from CHARSET before the output_conv
   remapping above, and FINAL is what gets appended at the end of the
   sequence -- confirm whether the remapped charset is meant to
   influence the escape sequence that is written. */
5419 Dynarr_add (dst, ISO_CODE_ESC);
5422 case CHARSET_TYPE_94:
5423 Dynarr_add (dst, inter94[reg]);
5425 case CHARSET_TYPE_96:
5426 Dynarr_add (dst, inter96[reg]);
5428 case CHARSET_TYPE_94X94:
5429 Dynarr_add (dst, '$');
5431 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5434 Dynarr_add (dst, inter94[reg]);
5436 case CHARSET_TYPE_96X96:
5437 Dynarr_add (dst, '$');
5438 Dynarr_add (dst, inter96[reg]);
5441 Dynarr_add (dst, final);
/* Make register 0 the one invoked on the left: if
   STR->iso2022.register_left is not already 0, append ISO_CODE_SI to
   DST and record 0 as the left register. */
5445 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5447 if (str->iso2022.register_left != 0)
5449 Dynarr_add (dst, ISO_CODE_SI);
5450 str->iso2022.register_left = 0;
/* Make register 1 the one invoked on the left: if
   STR->iso2022.register_left is not already 1, append ISO_CODE_SO to
   DST and record 1 as the left register. */
5455 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5457 if (str->iso2022.register_left != 1)
5459 Dynarr_add (dst, ISO_CODE_SO);
5460 str->iso2022.register_left = 1;
5464 /* Convert internally-formatted data to ISO2022 format.

   Bytes are read from SRC and the encoded output is appended to DST.
   FLAGS and the partial character CH come from the encoding stream's
   data (ENCODING_STREAM_DATA (encoding)); the character boundary
   counter, the current charset and the current half are loaded from
   STR->iso2022 on entry and stored back at the end.

   For each character the encoder makes sure the writing direction
   matches the charset (ensure_correct_direction), picks a register
   for the charset, designates it (iso2022_designate) and invokes it
   with a locking shift or a single shift before emitting the
   character's bytes.  ASCII goes through register 0, and at an
   end-of-line the registers may be restored to the coding system's
   initial charsets.  Two variants of the loop body are present,
   selected by the UTF2000 preprocessor conditional. */
5467 encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
5468 unsigned_char_dynarr *dst, Lstream_data_count n)
5470 unsigned char charmask, c;
5471 unsigned char char_boundary;
5472 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5473 unsigned int flags = str->flags;
5474 Emchar ch = str->ch;
5475 Lisp_Coding_System *codesys = str->codesys;
5476 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5478 Lisp_Object charset;
5481 unsigned int byte1, byte2;
5484 #ifdef ENABLE_COMPOSITE_CHARS
5485 /* flags for handling composite chars. We do a little switcharoo
5486 on the source while we're outputting the composite char. */
/* NOTE(review): saved_n is an unsigned int whereas N is an
   Lstream_data_count -- confirm the two types are compatible wherever
   N is saved and restored. */
5487 unsigned int saved_n = 0;
5488 const unsigned char *saved_src = NULL;
5489 int in_composite = 0;
5490 #endif /* ENABLE_COMPOSITE_CHARS */
5492 char_boundary = str->iso2022.current_char_boundary;
5493 charset = str->iso2022.current_charset;
5494 half = str->iso2022.current_half;
5496 #ifdef ENABLE_COMPOSITE_CHARS
5504 switch (char_boundary)
5512 else if ( c >= 0xf8 )
5517 else if ( c >= 0xf0 )
5522 else if ( c >= 0xe0 )
5527 else if ( c >= 0xc0 )
5536 restore_left_to_right_direction (codesys, dst, &flags, 0);
5538 /* Make sure G0 contains ASCII */
5539 if ((c > ' ' && c < ISO_CODE_DEL) ||
5540 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5542 ensure_normal_shift (str, dst);
5543 iso2022_designate (Vcharset_ascii, 0, str, dst);
5546 /* If necessary, restore everything to the default state
5549 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5551 restore_left_to_right_direction (codesys, dst, &flags, 0);
5553 ensure_normal_shift (str, dst);
5555 for (i = 0; i < 4; i++)
5557 Lisp_Object initial_charset =
5558 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5559 iso2022_designate (initial_charset, i, str, dst);
5564 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5565 Dynarr_add (dst, '\r');
5566 if (eol_type != EOL_CR)
5567 Dynarr_add (dst, c);
5571 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5572 && fit_to_be_escape_quoted (c))
5573 Dynarr_add (dst, ISO_CODE_ESC);
5574 Dynarr_add (dst, c);
5580 ch = ( ch << 6 ) | ( c & 0x3f );
5583 if ( (0x80 <= ch) && (ch <= 0x9f) )
5585 charmask = (half == 0 ? 0x00 : 0x80);
5587 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5588 && fit_to_be_escape_quoted (ch))
5589 Dynarr_add (dst, ISO_CODE_ESC);
5590 /* you asked for it ... */
5591 Dynarr_add (dst, ch);
5597 BREAKUP_CHAR (ch, charset, byte1, byte2);
5598 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5599 codesys, dst, &flags, 0);
5601 /* Now determine which register to use. */
5603 for (i = 0; i < 4; i++)
5605 if (EQ (charset, str->iso2022.charset[i]) ||
5607 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5616 if (XCHARSET_GRAPHIC (charset) != 0)
5618 if (!NILP (str->iso2022.charset[1]) &&
5619 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5620 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5622 else if (!NILP (str->iso2022.charset[2]))
5624 else if (!NILP (str->iso2022.charset[3]))
5633 iso2022_designate (charset, reg, str, dst);
5635 /* Now invoke that register. */
5639 ensure_normal_shift (str, dst);
5644 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5646 ensure_shift_out (str, dst);
5654 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5656 Dynarr_add (dst, ISO_CODE_ESC);
5657 Dynarr_add (dst, 'N');
5662 Dynarr_add (dst, ISO_CODE_SS2);
5668 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5670 Dynarr_add (dst, ISO_CODE_ESC);
5671 Dynarr_add (dst, 'O');
5676 Dynarr_add (dst, ISO_CODE_SS3);
5685 charmask = (half == 0 ? 0x00 : 0x80);
5687 switch (XCHARSET_DIMENSION (charset))
5690 Dynarr_add (dst, byte1 | charmask);
5693 Dynarr_add (dst, byte1 | charmask);
5694 Dynarr_add (dst, byte2 | charmask);
5703 ch = ( ch << 6 ) | ( c & 0x3f );
5707 #else /* not UTF2000 */
5713 if (BYTE_ASCII_P (c))
5714 { /* Processing ASCII character */
5717 restore_left_to_right_direction (codesys, dst, &flags, 0);
5719 /* Make sure G0 contains ASCII */
5720 if ((c > ' ' && c < ISO_CODE_DEL) ||
5721 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5723 ensure_normal_shift (str, dst);
5724 iso2022_designate (Vcharset_ascii, 0, str, dst);
5727 /* If necessary, restore everything to the default state
5730 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5732 restore_left_to_right_direction (codesys, dst, &flags, 0);
5734 ensure_normal_shift (str, dst);
5736 for (i = 0; i < 4; i++)
5738 Lisp_Object initial_charset =
5739 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5740 iso2022_designate (initial_charset, i, str, dst);
5745 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5746 Dynarr_add (dst, '\r');
5747 if (eol_type != EOL_CR)
5748 Dynarr_add (dst, c);
5752 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5753 && fit_to_be_escape_quoted (c))
5754 Dynarr_add (dst, ISO_CODE_ESC);
5755 Dynarr_add (dst, c);
5760 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5761 { /* Processing Leading Byte */
5763 charset = CHARSET_BY_LEADING_BYTE (c);
5764 if (LEADING_BYTE_PREFIX_P(c))
5766 else if (!EQ (charset, Vcharset_control_1)
5767 #ifdef ENABLE_COMPOSITE_CHARS
5768 && !EQ (charset, Vcharset_composite)
5774 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5775 codesys, dst, &flags, 0);
5777 /* Now determine which register to use. */
5779 for (i = 0; i < 4; i++)
5781 if (EQ (charset, str->iso2022.charset[i]) ||
5783 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5792 if (XCHARSET_GRAPHIC (charset) != 0)
5794 if (!NILP (str->iso2022.charset[1]) &&
5795 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5796 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5798 else if (!NILP (str->iso2022.charset[2]))
5800 else if (!NILP (str->iso2022.charset[3]))
5809 iso2022_designate (charset, reg, str, dst);
5811 /* Now invoke that register. */
5815 ensure_normal_shift (str, dst);
5820 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5822 ensure_shift_out (str, dst);
5830 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5832 Dynarr_add (dst, ISO_CODE_ESC);
5833 Dynarr_add (dst, 'N');
5838 Dynarr_add (dst, ISO_CODE_SS2);
5844 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5846 Dynarr_add (dst, ISO_CODE_ESC);
5847 Dynarr_add (dst, 'O');
5852 Dynarr_add (dst, ISO_CODE_SS3);
5864 { /* Processing Non-ASCII character */
5865 charmask = (half == 0 ? 0x7F : 0xFF);
5867 if (EQ (charset, Vcharset_control_1))
5869 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5870 && fit_to_be_escape_quoted (c))
5871 Dynarr_add (dst, ISO_CODE_ESC);
5872 /* you asked for it ... */
5873 Dynarr_add (dst, c - 0x20);
5877 switch (XCHARSET_REP_BYTES (charset))
5880 Dynarr_add (dst, c & charmask);
5883 if (XCHARSET_PRIVATE_P (charset))
5885 Dynarr_add (dst, c & charmask);
5890 #ifdef ENABLE_COMPOSITE_CHARS
5891 if (EQ (charset, Vcharset_composite))
5895 /* #### Bother! We don't know how to
5897 Dynarr_add (dst, '~');
5901 Emchar emch = MAKE_CHAR (Vcharset_composite,
5902 ch & 0x7F, c & 0x7F);
5903 Lisp_Object lstr = composite_char_string (emch);
5907 src = XSTRING_DATA (lstr);
5908 n = XSTRING_LENGTH (lstr);
5909 Dynarr_add (dst, ISO_CODE_ESC);
5910 Dynarr_add (dst, '0'); /* start composing */
5914 #endif /* ENABLE_COMPOSITE_CHARS */
5916 Dynarr_add (dst, ch & charmask);
5917 Dynarr_add (dst, c & charmask);
5930 Dynarr_add (dst, ch & charmask);
5931 Dynarr_add (dst, c & charmask);
5946 #endif /* not UTF2000 */
5948 #ifdef ENABLE_COMPOSITE_CHARS
5954 Dynarr_add (dst, ISO_CODE_ESC);
5955 Dynarr_add (dst, '1'); /* end composing */
5956 goto back_to_square_n; /* Wheeeeeeeee ..... */
5958 #endif /* ENABLE_COMPOSITE_CHARS */
5961 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5963 if (char_boundary && flags & CODING_STATE_END)
5966 restore_left_to_right_direction (codesys, dst, &flags, 0);
5967 ensure_normal_shift (str, dst);
5968 for (i = 0; i < 4; i++)
5970 Lisp_Object initial_charset =
5971 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5972 iso2022_designate (initial_charset, i, str, dst);
5978 str->iso2022.current_char_boundary = char_boundary;
5979 str->iso2022.current_charset = charset;
5980 str->iso2022.current_half = half;
5982 /* Verbum caro factum est! */
5986 /************************************************************************/
5987 /* No-conversion methods */
5988 /************************************************************************/
5990 /* This is used when reading in "binary" files -- i.e. files that may
5991 contain all 256 possible byte values and that are not to be
5992 interpreted as being in any particular decoding.

   Bytes read from SRC go through DECODE_HANDLE_EOL_TYPE and are then
   appended to DST with DECODE_ADD_BINARY_CHAR.  FLAGS, CH and the EOL
   type come from the decoding stream's data, and
   DECODE_HANDLE_END_OF_CONVERSION is applied after the per-byte
   processing. */
5994 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5995 unsigned_char_dynarr *dst, Lstream_data_count n)
5997 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5998 unsigned int flags = str->flags;
5999 unsigned int ch = str->ch;
6000 eol_type_t eol_type = str->eol_type;
6004 unsigned char c = *(unsigned char *)src++;
6006 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6007 DECODE_ADD_BINARY_CHAR (c, dst);
6008 label_continue_loop:;
6011 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encoding counterpart of the no-conversion decoder: write the
   internally-formatted bytes at SRC to DST as single bytes.

   A newline is written according to the coding system's EOL type: a
   '\r' is added unless the type is EOL_LF or EOL_AUTODETECT, and the
   newline itself unless the type is EOL_CR.  In the non-UTF2000
   variant, ASCII bytes are copied unchanged; a leading byte other than
   Latin-1 or control-1 produces a single '~'; a byte following the
   Latin-1 leading byte is copied as is and one following the control-1
   leading byte is written minus 0x20.

   NOTE(review): the character-boundary counter is kept in
   STR->iso2022.current_char_boundary even though this is not the
   ISO2022 encoder -- presumably the field is simply being reused;
   confirm. */
6018 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6019 unsigned_char_dynarr *dst, Lstream_data_count n)
6022 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6023 unsigned int flags = str->flags;
6024 unsigned int ch = str->ch;
6025 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6027 unsigned char char_boundary = str->iso2022.current_char_boundary;
6034 switch (char_boundary)
6042 else if ( c >= 0xf8 )
6047 else if ( c >= 0xf0 )
6052 else if ( c >= 0xe0 )
6057 else if ( c >= 0xc0 )
6068 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6069 Dynarr_add (dst, '\r');
6070 if (eol_type != EOL_CR)
6071 Dynarr_add (dst, c);
6074 Dynarr_add (dst, c);
6079 ch = ( ch << 6 ) | ( c & 0x3f );
6080 Dynarr_add (dst, ch & 0xff);
6084 ch = ( ch << 6 ) | ( c & 0x3f );
6087 #else /* not UTF2000 */
6090 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6091 Dynarr_add (dst, '\r');
6092 if (eol_type != EOL_CR)
6093 Dynarr_add (dst, '\n');
6096 else if (BYTE_ASCII_P (c))
6099 Dynarr_add (dst, c);
6101 else if (BUFBYTE_LEADING_BYTE_P (c))
6104 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6105 c == LEADING_BYTE_CONTROL_1)
6108 Dynarr_add (dst, '~'); /* untranslatable character */
6112 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6113 Dynarr_add (dst, c);
6114 else if (ch == LEADING_BYTE_CONTROL_1)
6117 Dynarr_add (dst, c - 0x20);
6119 /* else it should be the second or third byte of an
6120 untranslatable character, so ignore it */
6123 #endif /* not UTF2000 */
6129 str->iso2022.current_char_boundary = char_boundary;
6135 /************************************************************************/
6136 /* Initialization */
6137 /************************************************************************/
/* Symbol-level initialization for this file: set up the coding_system
   lrecord implementation, define Qcoding_system_error (via
   DEFERROR_STANDARD, with Qio_error), register the Lisp subrs defined
   here, and intern the symbols used for coding-system types,
   properties, EOL types and coding categories. */
6140 syms_of_file_coding (void)
6142 INIT_LRECORD_IMPLEMENTATION (coding_system);
6144 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6146 DEFSUBR (Fcoding_system_p);
6147 DEFSUBR (Ffind_coding_system);
6148 DEFSUBR (Fget_coding_system);
6149 DEFSUBR (Fcoding_system_list);
6150 DEFSUBR (Fcoding_system_name);
6151 DEFSUBR (Fmake_coding_system);
6152 DEFSUBR (Fcopy_coding_system);
6153 DEFSUBR (Fcoding_system_canonical_name_p);
6154 DEFSUBR (Fcoding_system_alias_p);
6155 DEFSUBR (Fcoding_system_aliasee);
6156 DEFSUBR (Fdefine_coding_system_alias);
6157 DEFSUBR (Fsubsidiary_coding_system);
6159 DEFSUBR (Fcoding_system_type);
6160 DEFSUBR (Fcoding_system_doc_string);
6162 DEFSUBR (Fcoding_system_charset);
6164 DEFSUBR (Fcoding_system_property);
6166 DEFSUBR (Fcoding_category_list);
6167 DEFSUBR (Fset_coding_priority_list);
6168 DEFSUBR (Fcoding_priority_list);
6169 DEFSUBR (Fset_coding_category_system);
6170 DEFSUBR (Fcoding_category_system);
6172 DEFSUBR (Fdetect_coding_region);
6173 DEFSUBR (Fdecode_coding_region);
6174 DEFSUBR (Fencode_coding_region);
6176 DEFSUBR (Fdecode_shift_jis_char);
6177 DEFSUBR (Fencode_shift_jis_char);
6178 DEFSUBR (Fdecode_big5_char);
6179 DEFSUBR (Fencode_big5_char);
6181 DEFSUBR (Fset_ucs_char);
6182 DEFSUBR (Fucs_char);
6183 DEFSUBR (Fset_char_ucs);
6184 DEFSUBR (Fchar_ucs);
6185 #endif /* not UTF2000 */
6187 defsymbol (&Qcoding_systemp, "coding-system-p");
6188 defsymbol (&Qno_conversion, "no-conversion");
6189 defsymbol (&Qraw_text, "raw-text");
6191 defsymbol (&Qbig5, "big5");
6192 defsymbol (&Qshift_jis, "shift-jis");
6193 defsymbol (&Qucs4, "ucs-4");
6194 defsymbol (&Qutf8, "utf-8");
6195 defsymbol (&Qccl, "ccl");
6196 defsymbol (&Qiso2022, "iso2022");
6198 defsymbol (&Qmnemonic, "mnemonic");
6199 defsymbol (&Qeol_type, "eol-type");
6200 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6201 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6203 defsymbol (&Qcr, "cr");
6204 defsymbol (&Qlf, "lf");
6205 defsymbol (&Qcrlf, "crlf");
6206 defsymbol (&Qeol_cr, "eol-cr");
6207 defsymbol (&Qeol_lf, "eol-lf");
6208 defsymbol (&Qeol_crlf, "eol-crlf");
6210 defsymbol (&Qcharset_g0, "charset-g0");
6211 defsymbol (&Qcharset_g1, "charset-g1");
6212 defsymbol (&Qcharset_g2, "charset-g2");
6213 defsymbol (&Qcharset_g3, "charset-g3");
6214 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6215 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6216 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6217 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6218 defsymbol (&Qno_iso6429, "no-iso6429");
6219 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6220 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6222 defsymbol (&Qshort, "short");
6223 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6224 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6225 defsymbol (&Qseven, "seven");
6226 defsymbol (&Qlock_shift, "lock-shift");
6227 defsymbol (&Qescape_quoted, "escape-quoted");
6229 defsymbol (&Qencode, "encode");
6230 defsymbol (&Qdecode, "decode");
6233 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6235 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6237 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6239 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6241 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6243 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6245 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6247 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6249 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6252 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare, via LSTREAM_HAS_METHOD, the lstream methods provided for the
   two stream types of this file: reader, writer, rewinder, seekable_p,
   flusher, closer and marker, first for `decoding' streams and then for
   `encoding' streams. */
6257 lstream_type_create_file_coding (void)
6259 LSTREAM_HAS_METHOD (decoding, reader);
6260 LSTREAM_HAS_METHOD (decoding, writer);
6261 LSTREAM_HAS_METHOD (decoding, rewinder);
6262 LSTREAM_HAS_METHOD (decoding, seekable_p);
6263 LSTREAM_HAS_METHOD (decoding, flusher);
6264 LSTREAM_HAS_METHOD (decoding, closer);
6265 LSTREAM_HAS_METHOD (decoding, marker);
6267 LSTREAM_HAS_METHOD (encoding, reader);
6268 LSTREAM_HAS_METHOD (encoding, writer);
6269 LSTREAM_HAS_METHOD (encoding, rewinder);
6270 LSTREAM_HAS_METHOD (encoding, seekable_p);
6271 LSTREAM_HAS_METHOD (encoding, flusher);
6272 LSTREAM_HAS_METHOD (encoding, closer);
6273 LSTREAM_HAS_METHOD (encoding, marker);
/* Variable-level initialization for this file.  Allocates the
   file_coding_dump structure FCD and registers it as a dump root,
   gives every coding category a nil coding system and an identity
   priority order, provides the `file-coding' feature, and defines the
   Lisp variables keyboard-coding-system, terminal-coding-system,
   coding-system-for-read, coding-system-for-write and
   file-name-coding-system (each set to nil here) together with
   enable-multibyte-characters (set to 1 here). */
6277 vars_of_file_coding (void)
6281 fcd = xnew (struct file_coding_dump);
6282 dump_add_root_struct_ptr (&fcd, &fcd_description);
6284 /* Initialize to something reasonable ... */
6285 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6287 fcd->coding_category_system[i] = Qnil;
6288 fcd->coding_category_by_priority[i] = i;
6291 Fprovide (intern ("file-coding"));
6293 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6294 Coding system used for TTY keyboard input.
6295 Not used under a windowing system.
6297 Vkeyboard_coding_system = Qnil;
6299 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6300 Coding system used for TTY display output.
6301 Not used under a windowing system.
6303 Vterminal_coding_system = Qnil;
6305 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6306 Overriding coding system used when reading from a file or process.
6307 You should bind this variable with `let', but do not set it globally.
6308 If this is non-nil, it specifies the coding system that will be used
6309 to decode input on read operations, such as from a file or process.
6310 It overrides `buffer-file-coding-system-for-read',
6311 `insert-file-contents-pre-hook', etc. Use those variables instead of
6312 this one for permanent changes to the environment. */ );
6313 Vcoding_system_for_read = Qnil;
6315 DEFVAR_LISP ("coding-system-for-write",
6316 &Vcoding_system_for_write /*
6317 Overriding coding system used when writing to a file or process.
6318 You should bind this variable with `let', but do not set it globally.
6319 If this is non-nil, it specifies the coding system that will be used
6320 to encode output for write operations, such as to a file or process.
6321 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6322 Use those variables instead of this one for permanent changes to the
6324 Vcoding_system_for_write = Qnil;
6326 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6327 Coding system used to convert pathnames when accessing files.
6329 Vfile_name_coding_system = Qnil;
6331 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6332 Non-nil means the buffer contents are regarded as multi-byte form
6333 of characters, not a binary code. This affects the display, file I/O,
6334 and behaviors of various editing commands.
6336 Setting this to nil does not do anything.
6338 enable_multibyte_characters = 1;
6342 complex_vars_of_file_coding (void)
6344 staticpro (&Vcoding_system_hash_table);
6345 Vcoding_system_hash_table =
6346 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6348 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6349 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6351 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6353 struct codesys_prop csp; \
6355 csp.prop_type = (Prop_Type); \
6356 Dynarr_add (the_codesys_prop_dynarr, csp); \
6359 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6360 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6361 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6362 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6363 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6364 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6365 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6367 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6368 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6369 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6370 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6371 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6372 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6373 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6374 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6375 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6376 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6377 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6378 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6379 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6380 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6381 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6382 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6383 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6385 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6386 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6388 /* Need to create this here or we're really screwed. */
6390 (Qraw_text, Qno_conversion,
6391 build_string ("Raw text, which means it converts only line-break-codes."),
6392 list2 (Qmnemonic, build_string ("Raw")));
6395 (Qbinary, Qno_conversion,
6396 build_string ("Binary, which means it does not convert anything."),
6397 list4 (Qeol_type, Qlf,
6398 Qmnemonic, build_string ("Binary")));
6403 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6404 list2 (Qmnemonic, build_string ("UTF8")));
6407 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6409 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6411 Fdefine_coding_system_alias (Qterminal, Qbinary);
6412 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6414 /* Need this for bootstrapping */
6415 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6416 Fget_coding_system (Qraw_text);
6419 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6420 = Fget_coding_system (Qutf8);
6423 #if defined(MULE) && !defined(UTF2000)
6427 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6428 fcd->ucs_to_mule_table[i] = Qnil;
6430 staticpro (&mule_to_ucs_table);
6431 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6432 #endif /* defined(MULE) && !defined(UTF2000) */