1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
69 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 },
74 static const struct struct_description fcd_description = {
75 sizeof(struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qctext, Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
180 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
183 static int detect_coding_sjis (struct detection_state *st,
184 CONST unsigned char *src,
186 static void decode_coding_sjis (Lstream *decoding,
187 CONST unsigned char *src,
188 unsigned_char_dynarr *dst,
190 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
191 unsigned_char_dynarr *dst, unsigned int *flags);
192 void char_finish_shift_jis (struct encoding_stream *str,
193 unsigned_char_dynarr *dst, unsigned int *flags);
195 static int detect_coding_big5 (struct detection_state *st,
196 CONST unsigned char *src,
198 static void decode_coding_big5 (Lstream *decoding,
199 CONST unsigned char *src,
200 unsigned_char_dynarr *dst, unsigned int n);
201 static void encode_coding_big5 (Lstream *encoding,
202 CONST unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
204 static int detect_coding_ucs4 (struct detection_state *st,
205 CONST unsigned char *src,
207 static void decode_coding_ucs4 (Lstream *decoding,
208 CONST unsigned char *src,
209 unsigned_char_dynarr *dst, unsigned int n);
210 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
211 unsigned_char_dynarr *dst, unsigned int *flags);
212 void char_finish_ucs4 (struct encoding_stream *str,
213 unsigned_char_dynarr *dst, unsigned int *flags);
215 static int detect_coding_utf8 (struct detection_state *st,
216 CONST unsigned char *src,
218 static void decode_coding_utf8 (Lstream *decoding,
219 CONST unsigned char *src,
220 unsigned_char_dynarr *dst, unsigned int n);
221 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
222 unsigned_char_dynarr *dst, unsigned int *flags);
223 void char_finish_utf8 (struct encoding_stream *str,
224 unsigned_char_dynarr *dst, unsigned int *flags);
226 static int postprocess_iso2022_mask (int mask);
227 static void reset_iso2022 (Lisp_Object coding_system,
228 struct iso2022_decoder *iso);
229 static int detect_coding_iso2022 (struct detection_state *st,
230 CONST unsigned char *src,
232 static void decode_coding_iso2022 (Lstream *decoding,
233 CONST unsigned char *src,
234 unsigned_char_dynarr *dst, unsigned int n);
235 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
236 unsigned_char_dynarr *dst, unsigned int *flags);
237 void char_finish_iso2022 (struct encoding_stream *str,
238 unsigned_char_dynarr *dst, unsigned int *flags);
240 static void decode_coding_no_conversion (Lstream *decoding,
241 CONST unsigned char *src,
242 unsigned_char_dynarr *dst,
244 static void encode_coding_no_conversion (Lstream *encoding,
245 CONST unsigned char *src,
246 unsigned_char_dynarr *dst,
248 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
249 unsigned_char_dynarr *dst, unsigned int n);
250 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
251 unsigned_char_dynarr *dst, unsigned int n);
253 typedef struct codesys_prop codesys_prop;
262 Dynarr_declare (codesys_prop);
263 } codesys_prop_dynarr;
265 static const struct lrecord_description codesys_prop_description_1[] = {
266 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
270 static const struct struct_description codesys_prop_description = {
271 sizeof(codesys_prop),
272 codesys_prop_description_1
275 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
276 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
280 static const struct struct_description codesys_prop_dynarr_description = {
281 sizeof(codesys_prop_dynarr),
282 codesys_prop_dynarr_description_1
285 codesys_prop_dynarr *the_codesys_prop_dynarr;
287 enum codesys_prop_enum
290 CODESYS_PROP_ISO2022,
295 /************************************************************************/
296 /* Coding system functions */
297 /************************************************************************/
299 static Lisp_Object mark_coding_system (Lisp_Object);
300 static void print_coding_system (Lisp_Object, Lisp_Object, int);
301 static void finalize_coding_system (void *header, int for_disksave);
304 static const struct lrecord_description ccs_description_1[] = {
305 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
309 static const struct struct_description ccs_description = {
310 sizeof(charset_conversion_spec),
314 static const struct lrecord_description ccsd_description_1[] = {
315 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
319 static const struct struct_description ccsd_description = {
320 sizeof(charset_conversion_spec_dynarr),
325 static const struct lrecord_description coding_system_description[] = {
326 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
327 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
328 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
330 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
331 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
332 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
333 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
338 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
339 mark_coding_system, print_coding_system,
340 finalize_coding_system,
341 0, 0, coding_system_description,
342 struct Lisp_Coding_System);
345 mark_coding_system (Lisp_Object obj)
347 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
349 mark_object (CODING_SYSTEM_NAME (codesys));
350 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
351 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
352 mark_object (CODING_SYSTEM_EOL_LF (codesys));
353 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
354 mark_object (CODING_SYSTEM_EOL_CR (codesys));
356 switch (CODING_SYSTEM_TYPE (codesys))
360 case CODESYS_ISO2022:
361 for (i = 0; i < 4; i++)
362 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
363 if (codesys->iso2022.input_conv)
365 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
367 struct charset_conversion_spec *ccs =
368 Dynarr_atp (codesys->iso2022.input_conv, i);
369 mark_object (ccs->from_charset);
370 mark_object (ccs->to_charset);
373 if (codesys->iso2022.output_conv)
375 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
377 struct charset_conversion_spec *ccs =
378 Dynarr_atp (codesys->iso2022.output_conv, i);
379 mark_object (ccs->from_charset);
380 mark_object (ccs->to_charset);
386 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
387 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
394 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
395 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
399 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
402 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
404 error ("printing unreadable object #<coding_system 0x%x>",
407 write_c_string ("#<coding_system ", printcharfun);
408 print_internal (c->name, printcharfun, 1);
409 write_c_string (">", printcharfun);
413 finalize_coding_system (void *header, int for_disksave)
415 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
416 /* Since coding systems never go away, this function is not
417 necessary. But it would be necessary if we changed things
418 so that coding systems could go away. */
419 if (!for_disksave) /* see comment in lstream.c */
421 switch (CODING_SYSTEM_TYPE (c))
424 case CODESYS_ISO2022:
425 if (c->iso2022.input_conv)
427 Dynarr_free (c->iso2022.input_conv);
428 c->iso2022.input_conv = 0;
430 if (c->iso2022.output_conv)
432 Dynarr_free (c->iso2022.output_conv);
433 c->iso2022.output_conv = 0;
444 symbol_to_eol_type (Lisp_Object symbol)
446 CHECK_SYMBOL (symbol);
447 if (NILP (symbol)) return EOL_AUTODETECT;
448 if (EQ (symbol, Qlf)) return EOL_LF;
449 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
450 if (EQ (symbol, Qcr)) return EOL_CR;
452 signal_simple_error ("Unrecognized eol type", symbol);
453 return EOL_AUTODETECT; /* not reached */
457 eol_type_to_symbol (enum eol_type type)
462 case EOL_LF: return Qlf;
463 case EOL_CRLF: return Qcrlf;
464 case EOL_CR: return Qcr;
465 case EOL_AUTODETECT: return Qnil;
470 setup_eol_coding_systems (Lisp_Coding_System *codesys)
472 Lisp_Object codesys_obj;
473 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
474 char *codesys_name = (char *) alloca (len + 7);
476 char *codesys_mnemonic=0;
478 Lisp_Object codesys_name_sym, sub_codesys_obj;
482 XSETCODING_SYSTEM (codesys_obj, codesys);
484 memcpy (codesys_name,
485 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
487 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
489 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
490 codesys_mnemonic = (char *) alloca (mlen + 7);
491 memcpy (codesys_mnemonic,
492 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
495 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
496 strcpy (codesys_name + len, "-" op_sys); \
498 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
499 codesys_name_sym = intern (codesys_name); \
500 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
501 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
503 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
504 build_string (codesys_mnemonic); \
505 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
508 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
509 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
510 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
513 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
514 Return t if OBJECT is a coding system.
515 A coding system is an object that defines how text containing multiple
516 character sets is encoded into a stream of (typically 8-bit) bytes.
517 The coding system is used to decode the stream into a series of
518 characters (which may be from multiple charsets) when the text is read
519 from a file or process, and is used to encode the text back into the
520 same format when it is written out to a file or process.
522 For example, many ISO2022-compliant coding systems (such as Compound
523 Text, which is used for inter-client data under the X Window System)
524 use escape sequences to switch between different charsets -- Japanese
525 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
526 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
527 `make-coding-system' for more information.
529 Coding systems are normally identified using a symbol, and the
530 symbol is accepted in place of the actual coding system object whenever
531 a coding system is called for. (This is similar to how faces work.)
535 return CODING_SYSTEMP (object) ? Qt : Qnil;
538 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
539 Retrieve the coding system of the given name.
541 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
542 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
543 If there is no such coding system, nil is returned. Otherwise the
544 associated coding system object is returned.
546 (coding_system_or_name))
548 if (CODING_SYSTEMP (coding_system_or_name))
549 return coding_system_or_name;
551 if (NILP (coding_system_or_name))
552 coding_system_or_name = Qbinary;
554 CHECK_SYMBOL (coding_system_or_name);
556 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
559 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
560 Retrieve the coding system of the given name.
561 Same as `find-coding-system' except that if there is no such
562 coding system, an error is signaled instead of returning nil.
566 Lisp_Object coding_system = Ffind_coding_system (name);
568 if (NILP (coding_system))
569 signal_simple_error ("No such coding system", name);
570 return coding_system;
573 /* We store the coding systems in hash tables with the names as the key and the
574 actual coding system object as the value. Occasionally we need to use them
575 in a list format. These routines provide us with that. */
576 struct coding_system_list_closure
578 Lisp_Object *coding_system_list;
582 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
583 void *coding_system_list_closure)
585 /* This function can GC */
586 struct coding_system_list_closure *cscl =
587 (struct coding_system_list_closure *) coding_system_list_closure;
588 Lisp_Object *coding_system_list = cscl->coding_system_list;
590 *coding_system_list = Fcons (key, *coding_system_list);
594 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
595 Return a list of the names of all defined coding systems.
599 Lisp_Object coding_system_list = Qnil;
601 struct coding_system_list_closure coding_system_list_closure;
603 GCPRO1 (coding_system_list);
604 coding_system_list_closure.coding_system_list = &coding_system_list;
605 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
606 &coding_system_list_closure);
609 return coding_system_list;
612 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
613 Return the name of the given coding system.
617 coding_system = Fget_coding_system (coding_system);
618 return XCODING_SYSTEM_NAME (coding_system);
621 static Lisp_Coding_System *
622 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
624 Lisp_Coding_System *codesys =
625 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
627 zero_lcrecord (codesys);
628 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
629 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
630 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
631 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
632 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
633 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
634 CODING_SYSTEM_TYPE (codesys) = type;
635 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
637 if (type == CODESYS_ISO2022)
640 for (i = 0; i < 4; i++)
641 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
643 else if (type == CODESYS_CCL)
645 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
646 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
649 CODING_SYSTEM_NAME (codesys) = name;
655 /* Given a list of charset conversion specs as specified in a Lisp
656 program, parse it into STORE_HERE. */
659 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
660 Lisp_Object spec_list)
664 EXTERNAL_LIST_LOOP (rest, spec_list)
666 Lisp_Object car = XCAR (rest);
667 Lisp_Object from, to;
668 struct charset_conversion_spec spec;
670 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
671 signal_simple_error ("Invalid charset conversion spec", car);
672 from = Fget_charset (XCAR (car));
673 to = Fget_charset (XCAR (XCDR (car)));
674 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
675 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
676 signal_simple_error_2
677 ("Attempted conversion between different charset types",
679 spec.from_charset = from;
680 spec.to_charset = to;
682 Dynarr_add (store_here, spec);
686 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
687 specs, return the equivalent as the Lisp programmer would see it.
689 If LOAD_HERE is 0, return Qnil. */
692 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
699 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
701 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
702 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
705 return Fnreverse (result);
710 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
711 Register symbol NAME as a coding system.
713 TYPE describes the conversion method used and should be one of
716 Automatic conversion. XEmacs attempts to detect the coding system
719 No conversion. Use this for binary files and such. On output,
720 graphic characters that are not in ASCII or Latin-1 will be
721 replaced by a ?. (For a no-conversion-encoded buffer, these
722 characters will only be present if you explicitly insert them.)
724 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
726 ISO 10646 UCS-4 encoding.
728 ISO 10646 UTF-8 encoding.
730 Any ISO2022-compliant encoding. Among other things, this includes
731 JIS (the Japanese encoding commonly used for e-mail), EUC (the
732 standard Unix encoding for Japanese and other languages), and
733 Compound Text (the encoding used in X11). You can specify more
734 specific information about the conversion with the FLAGS argument.
736 Big5 (the encoding commonly used for Taiwanese).
738 The conversion is performed using a user-written pseudo-code
739 program. CCL (Code Conversion Language) is the name of this
742 Write out or read in the raw contents of the memory representing
743 the buffer's text. This is primarily useful for debugging
744 purposes, and is only enabled when XEmacs has been compiled with
745 DEBUG_XEMACS defined (via the --debug configure option).
746 WARNING: Reading in a file using 'internal conversion can result
747 in an internal inconsistency in the memory representing a
748 buffer's text, which will produce unpredictable results and may
749 cause XEmacs to crash. Under normal circumstances you should
750 never use 'internal conversion.
752 DOC-STRING is a string describing the coding system.
754 PROPS is a property list, describing the specific nature of the
755 character set. Recognized properties are:
758 String to be displayed in the modeline when this coding system is
762 End-of-line conversion to be used. It should be one of
765 Automatically detect the end-of-line type (LF, CRLF,
766 or CR). Also generate subsidiary coding systems named
767 `NAME-unix', `NAME-dos', and `NAME-mac', that are
768 identical to this coding system but have an EOL-TYPE
769 value of 'lf, 'crlf, and 'cr, respectively.
771 The end of a line is marked externally using ASCII LF.
772 Since this is also the way that XEmacs represents an
773 end-of-line internally, specifying this option results
774 in no end-of-line conversion. This is the standard
775 format for Unix text files.
777 The end of a line is marked externally using ASCII
778 CRLF. This is the standard format for MS-DOS text
781 The end of a line is marked externally using ASCII CR.
782 This is the standard format for Macintosh text files.
784 Automatically detect the end-of-line type but do not
785 generate subsidiary coding systems. (This value is
786 converted to nil when stored internally, and
787 `coding-system-property' will return nil.)
789 'post-read-conversion
790 Function called after a file has been read in, to perform the
791 decoding. Called with two arguments, BEG and END, denoting
792 a region of the current buffer to be decoded.
794 'pre-write-conversion
795 Function called before a file is written out, to perform the
796 encoding. Called with two arguments, BEG and END, denoting
797 a region of the current buffer to be encoded.
800 The following additional properties are recognized if TYPE is 'iso2022:
806 The character set initially designated to the G0 - G3 registers.
807 The value should be one of
809 -- A charset object (designate that character set)
810 -- nil (do not ever use this register)
811 -- t (no character set is initially designated to
812 the register, but may be later on; this automatically
813 sets the corresponding `force-g*-on-output' property)
819 If non-nil, send an explicit designation sequence on output before
820 using the specified register.
823 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
824 "ESC $ B" on output in place of the full designation sequences
825 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
828 If non-nil, don't designate ASCII to G0 at each end of line on output.
829 Setting this to non-nil also suppresses other state-resetting that
830 normally happens at the end of a line.
833 If non-nil, don't designate ASCII to G0 before control chars on output.
836 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
840 If non-nil, use locking-shift (SO/SI) instead of single-shift
841 or designation by escape sequence.
844 If non-nil, don't use ISO6429's direction specification.
847 If non-nil, literal control characters that are the same as
848 the beginning of a recognized ISO2022 or ISO6429 escape sequence
849 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
850 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
851 so that they can be properly distinguished from an escape sequence.
852 (Note that doing this results in a non-portable encoding.) This
853 encoding flag is used for byte-compiled files. Note that ESC
854 is a good choice for a quoting character because there are no
855 escape sequences whose second byte is a character from the Control-0
856 or Control-1 character sets; this is explicitly disallowed by the
859 'input-charset-conversion
860 A list of conversion specifications, specifying conversion of
861 characters in one charset to another when decoding is performed.
862 Each specification is a list of two elements: the source charset,
863 and the destination charset.
865 'output-charset-conversion
866 A list of conversion specifications, specifying conversion of
867 characters in one charset to another when encoding is performed.
868 The form of each specification is the same as for
869 'input-charset-conversion.
872 The following additional properties are recognized (and required)
876 CCL program used for decoding (converting to internal format).
879 CCL program used for encoding (converting to external format).
881 (name, type, doc_string, props))
883 Lisp_Coding_System *codesys;
884 Lisp_Object rest, key, value;
885 enum coding_system_type ty;
886 int need_to_setup_eol_systems = 1;
888 /* Convert type to constant */
889 if (NILP (type) || EQ (type, Qundecided))
890 { ty = CODESYS_AUTODETECT; }
892 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
893 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
894 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
895 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
896 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
897 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
899 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
901 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
904 signal_simple_error ("Invalid coding system type", type);
908 codesys = allocate_coding_system (ty, name);
910 if (NILP (doc_string))
911 doc_string = build_string ("");
913 CHECK_STRING (doc_string);
914 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
916 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
918 if (EQ (key, Qmnemonic))
921 CHECK_STRING (value);
922 CODING_SYSTEM_MNEMONIC (codesys) = value;
925 else if (EQ (key, Qeol_type))
927 need_to_setup_eol_systems = NILP (value);
930 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
933 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
934 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
936 else if (ty == CODESYS_ISO2022)
938 #define FROB_INITIAL_CHARSET(charset_num) \
939 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
940 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
942 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
943 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
944 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
945 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
947 #define FROB_FORCE_CHARSET(charset_num) \
948 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
950 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
951 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
952 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
953 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
955 #define FROB_BOOLEAN_PROPERTY(prop) \
956 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
958 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
959 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
960 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
961 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
962 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
963 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
964 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
966 else if (EQ (key, Qinput_charset_conversion))
968 codesys->iso2022.input_conv =
969 Dynarr_new (charset_conversion_spec);
970 parse_charset_conversion_specs (codesys->iso2022.input_conv,
973 else if (EQ (key, Qoutput_charset_conversion))
975 codesys->iso2022.output_conv =
976 Dynarr_new (charset_conversion_spec);
977 parse_charset_conversion_specs (codesys->iso2022.output_conv,
981 signal_simple_error ("Unrecognized property", key);
983 else if (EQ (type, Qccl))
985 if (EQ (key, Qdecode))
987 CHECK_VECTOR (value);
988 CODING_SYSTEM_CCL_DECODE (codesys) = value;
990 else if (EQ (key, Qencode))
992 CHECK_VECTOR (value);
993 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
996 signal_simple_error ("Unrecognized property", key);
1000 signal_simple_error ("Unrecognized property", key);
1003 if (need_to_setup_eol_systems)
1004 setup_eol_coding_systems (codesys);
1007 Lisp_Object codesys_obj;
1008 XSETCODING_SYSTEM (codesys_obj, codesys);
1009 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1014 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1015 Copy OLD-CODING-SYSTEM to NEW-NAME.
1016 If NEW-NAME does not name an existing coding system, a new one will
1019 (old_coding_system, new_name))
1021 Lisp_Object new_coding_system;
1022 old_coding_system = Fget_coding_system (old_coding_system);
1023 new_coding_system = Ffind_coding_system (new_name);
1024 if (NILP (new_coding_system))
1026 XSETCODING_SYSTEM (new_coding_system,
1027 allocate_coding_system
1028 (XCODING_SYSTEM_TYPE (old_coding_system),
1030 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1034 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1035 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1036 memcpy (((char *) to ) + sizeof (to->header),
1037 ((char *) from) + sizeof (from->header),
1038 sizeof (*from) - sizeof (from->header));
1039 to->name = new_name;
1041 return new_coding_system;
1044 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1045 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1047 (alias, coding_system))
1049 CHECK_SYMBOL (alias);
1050 if (!NILP (Ffind_coding_system (alias)))
1051 signal_simple_error ("Symbol already names a coding system", alias);
1052 coding_system = Fget_coding_system (coding_system);
1053 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1055 /* Set up aliases for subsidiaries. */
1056 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1059 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1060 #define FROB(type, name) \
1062 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1063 if (!NILP (subsidiary)) \
1064 Fdefine_coding_system_alias \
1065 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1068 FROB (CRLF, "-dos");
1072 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1073 but it doesn't look intentional, so I'd rather return something
1074 meaningful or nothing at all. */
1079 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1081 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1082 Lisp_Object new_coding_system;
1084 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1085 return coding_system;
1089 case EOL_AUTODETECT: return coding_system;
1090 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1091 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1092 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1096 return NILP (new_coding_system) ? coding_system : new_coding_system;
1099 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1100 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1102 (coding_system, eol_type))
1104 coding_system = Fget_coding_system (coding_system);
1106 return subsidiary_coding_system (coding_system,
1107 symbol_to_eol_type (eol_type));
1111 /************************************************************************/
1112 /* Coding system accessors */
1113 /************************************************************************/
1115 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1116 Return the doc string for CODING-SYSTEM.
1120 coding_system = Fget_coding_system (coding_system);
1121 return XCODING_SYSTEM_DOC_STRING (coding_system);
1124 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1125 Return the type of CODING-SYSTEM.
1129 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1132 case CODESYS_AUTODETECT: return Qundecided;
1134 case CODESYS_SHIFT_JIS: return Qshift_jis;
1135 case CODESYS_ISO2022: return Qiso2022;
1136 case CODESYS_BIG5: return Qbig5;
1137 case CODESYS_UCS4: return Qucs4;
1138 case CODESYS_UTF8: return Qutf8;
1139 case CODESYS_CCL: return Qccl;
1141 case CODESYS_NO_CONVERSION: return Qno_conversion;
1143 case CODESYS_INTERNAL: return Qinternal;
1150 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1153 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1155 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1158 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1159 Return initial charset of CODING-SYSTEM designated to GNUM.
1162 (coding_system, gnum))
1164 coding_system = Fget_coding_system (coding_system);
1167 return coding_system_charset (coding_system, XINT (gnum));
1171 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1172 Return the PROP property of CODING-SYSTEM.
1174 (coding_system, prop))
1177 enum coding_system_type type;
1179 coding_system = Fget_coding_system (coding_system);
1180 CHECK_SYMBOL (prop);
1181 type = XCODING_SYSTEM_TYPE (coding_system);
1183 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1184 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1187 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1189 case CODESYS_PROP_ALL_OK:
1192 case CODESYS_PROP_ISO2022:
1193 if (type != CODESYS_ISO2022)
1195 ("Property only valid in ISO2022 coding systems",
1199 case CODESYS_PROP_CCL:
1200 if (type != CODESYS_CCL)
1202 ("Property only valid in CCL coding systems",
1212 signal_simple_error ("Unrecognized property", prop);
1214 if (EQ (prop, Qname))
1215 return XCODING_SYSTEM_NAME (coding_system);
1216 else if (EQ (prop, Qtype))
1217 return Fcoding_system_type (coding_system);
1218 else if (EQ (prop, Qdoc_string))
1219 return XCODING_SYSTEM_DOC_STRING (coding_system);
1220 else if (EQ (prop, Qmnemonic))
1221 return XCODING_SYSTEM_MNEMONIC (coding_system);
1222 else if (EQ (prop, Qeol_type))
1223 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1224 else if (EQ (prop, Qeol_lf))
1225 return XCODING_SYSTEM_EOL_LF (coding_system);
1226 else if (EQ (prop, Qeol_crlf))
1227 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1228 else if (EQ (prop, Qeol_cr))
1229 return XCODING_SYSTEM_EOL_CR (coding_system);
1230 else if (EQ (prop, Qpost_read_conversion))
1231 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1232 else if (EQ (prop, Qpre_write_conversion))
1233 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1235 else if (type == CODESYS_ISO2022)
1237 if (EQ (prop, Qcharset_g0))
1238 return coding_system_charset (coding_system, 0);
1239 else if (EQ (prop, Qcharset_g1))
1240 return coding_system_charset (coding_system, 1);
1241 else if (EQ (prop, Qcharset_g2))
1242 return coding_system_charset (coding_system, 2);
1243 else if (EQ (prop, Qcharset_g3))
1244 return coding_system_charset (coding_system, 3);
1246 #define FORCE_CHARSET(charset_num) \
1247 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1248 (coding_system, charset_num) ? Qt : Qnil)
1250 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1251 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1252 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1253 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1255 #define LISP_BOOLEAN(prop) \
1256 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1258 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1259 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1260 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1261 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1262 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1263 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1264 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1266 else if (EQ (prop, Qinput_charset_conversion))
1268 unparse_charset_conversion_specs
1269 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1270 else if (EQ (prop, Qoutput_charset_conversion))
1272 unparse_charset_conversion_specs
1273 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1277 else if (type == CODESYS_CCL)
1279 if (EQ (prop, Qdecode))
1280 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1281 else if (EQ (prop, Qencode))
1282 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1290 return Qnil; /* not reached */
1294 /************************************************************************/
1295 /* Coding category functions */
1296 /************************************************************************/
1299 decode_coding_category (Lisp_Object symbol)
1303 CHECK_SYMBOL (symbol);
1304 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1305 if (EQ (coding_category_symbol[i], symbol))
1308 signal_simple_error ("Unrecognized coding category", symbol);
1309 return 0; /* not reached */
1312 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1313 Return a list of all recognized coding categories.
1318 Lisp_Object list = Qnil;
1320 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1321 list = Fcons (coding_category_symbol[i], list);
1325 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1326 Change the priority order of the coding categories.
1327 LIST should be list of coding categories, in descending order of
1328 priority. Unspecified coding categories will be lower in priority
1329 than all specified ones, in the same relative order they were in
1334 int category_to_priority[CODING_CATEGORY_LAST + 1];
1338 /* First generate a list that maps coding categories to priorities. */
1340 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1341 category_to_priority[i] = -1;
1343 /* Highest priority comes from the specified list. */
1345 EXTERNAL_LIST_LOOP (rest, list)
1347 int cat = decode_coding_category (XCAR (rest));
1349 if (category_to_priority[cat] >= 0)
1350 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1351 category_to_priority[cat] = i++;
1354 /* Now go through the existing categories by priority to retrieve
1355 the categories not yet specified and preserve their priority
1357 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1359 int cat = fcd->coding_category_by_priority[j];
1360 if (category_to_priority[cat] < 0)
1361 category_to_priority[cat] = i++;
1364 /* Now we need to construct the inverse of the mapping we just
1367 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1368 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1370 /* Phew! That was confusing. */
1374 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1375 Return a list of coding categories in descending order of priority.
1380 Lisp_Object list = Qnil;
1382 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1383 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1388 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1389 Change the coding system associated with a coding category.
1391 (coding_category, coding_system))
1393 int cat = decode_coding_category (coding_category);
1395 coding_system = Fget_coding_system (coding_system);
1396 fcd->coding_category_system[cat] = coding_system;
1400 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1401 Return the coding system associated with a coding category.
1405 int cat = decode_coding_category (coding_category);
1406 Lisp_Object sys = fcd->coding_category_system[cat];
1409 return XCODING_SYSTEM_NAME (sys);
1414 /************************************************************************/
1415 /* Detecting the encoding of data */
1416 /************************************************************************/
1418 struct detection_state
1420 enum eol_type eol_type;
1456 struct iso2022_decoder iso;
1458 int high_byte_count;
1459 unsigned int saw_single_shift:1;
1472 acceptable_control_char_p (int c)
1476 /* Allow and ignore control characters that you might
1477 reasonably see in a text file */
1482 case 8: /* backspace */
1483 case 11: /* vertical tab */
1484 case 12: /* form feed */
1485 case 26: /* MS-DOS C-z junk */
1486 case 31: /* '^_' -- for info */
1494 mask_has_at_most_one_bit_p (int mask)
1496 /* Perhaps the only thing useful you learn from intensive Microsoft
1497 technical interviews */
1498 return (mask & (mask - 1)) == 0;
1501 static enum eol_type
1502 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1512 if (st->eol.just_saw_cr)
1514 else if (st->eol.seen_anything)
1517 else if (st->eol.just_saw_cr)
1520 st->eol.just_saw_cr = 1;
1522 st->eol.just_saw_cr = 0;
1523 st->eol.seen_anything = 1;
1526 return EOL_AUTODETECT;
1529 /* Attempt to determine the encoding and EOL type of the given text.
1530 Before calling this function for the first type, you must initialize
1531 st->eol_type as appropriate and initialize st->mask to ~0.
1533 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1536 st->mask holds the determined coding category mask, or ~0 if only
1537 ASCII has been seen so far.
1541 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1542 is present in st->mask
1543 1 == definitive answers are here for both st->eol_type and st->mask
1547 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1548 unsigned int n, int just_do_eol)
1552 if (st->eol_type == EOL_AUTODETECT)
1553 st->eol_type = detect_eol_type (st, src, n);
1556 return st->eol_type != EOL_AUTODETECT;
1558 if (!st->seen_non_ascii)
1560 for (; n; n--, src++)
1563 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1565 st->seen_non_ascii = 1;
1567 st->shift_jis.mask = ~0;
1571 st->iso2022.mask = ~0;
1581 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1582 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1583 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1584 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1585 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1586 st->big5.mask = detect_coding_big5 (st, src, n);
1587 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1588 st->utf8.mask = detect_coding_utf8 (st, src, n);
1589 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1590 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1593 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1594 | st->utf8.mask | st->ucs4.mask;
1597 int retval = mask_has_at_most_one_bit_p (st->mask);
1598 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1599 return retval && st->eol_type != EOL_AUTODETECT;
1604 coding_system_from_mask (int mask)
1608 /* If the file was entirely or basically ASCII, use the
1609 default value of `buffer-file-coding-system'. */
1610 Lisp_Object retval =
1611 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1614 retval = Ffind_coding_system (retval);
1618 (Qbad_variable, Qwarning,
1619 "Invalid `default-buffer-file-coding-system', set to nil");
1620 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1624 retval = Fget_coding_system (Qraw_text);
1632 mask = postprocess_iso2022_mask (mask);
1634 /* Look through the coding categories by priority and find
1635 the first one that is allowed. */
1636 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1638 cat = fcd->coding_category_by_priority[i];
1639 if ((mask & (1 << cat)) &&
1640 !NILP (fcd->coding_category_system[cat]))
1644 return fcd->coding_category_system[cat];
1646 return Fget_coding_system (Qraw_text);
1650 /* Given a seekable read stream and potential coding system and EOL type
1651 as specified, do any autodetection that is called for. If the
1652 coding system and/or EOL type are not `autodetect', they will be left
1653 alone; but this function will never return an autodetect coding system
1656 This function does not automatically fetch subsidiary coding systems;
1657 that should be unnecessary with the explicit eol-type argument. */
1659 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1662 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1663 enum eol_type *eol_type_in_out)
1665 struct detection_state decst;
1667 if (*eol_type_in_out == EOL_AUTODETECT)
1668 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1671 decst.eol_type = *eol_type_in_out;
1674 /* If autodetection is called for, do it now. */
1675 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1676 || *eol_type_in_out == EOL_AUTODETECT)
1679 Lisp_Object coding_system = Qnil;
1681 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1684 /* Look for initial "-*-"; mode line prefix */
1686 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1691 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1693 Extbyte *local_vars_beg = p + 3;
1694 /* Look for final "-*-"; mode line suffix */
1695 for (p = local_vars_beg,
1696 scan_end = buf + nread - LENGTH ("-*-");
1701 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1703 Extbyte *suffix = p;
1704 /* Look for "coding:" */
1705 for (p = local_vars_beg,
1706 scan_end = suffix - LENGTH ("coding:?");
1709 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1710 && (p == local_vars_beg
1711 || (*(p-1) == ' ' ||
1717 p += LENGTH ("coding:");
1718 while (*p == ' ' || *p == '\t') p++;
1720 /* Get coding system name */
1721 save = *suffix; *suffix = '\0';
1722 /* Characters valid in a MIME charset name (rfc 1521),
1723 and in a Lisp symbol name. */
1724 n = strspn ( (char *) p,
1725 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1726 "abcdefghijklmnopqrstuvwxyz"
1732 save = p[n]; p[n] = '\0';
1734 Ffind_coding_system (intern ((char *) p));
1744 if (NILP (coding_system))
1747 if (detect_coding_type (&decst, buf, nread,
1748 XCODING_SYSTEM_TYPE (*codesys_in_out)
1749 != CODESYS_AUTODETECT))
1751 nread = Lstream_read (stream, buf, sizeof (buf));
1757 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1758 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1761 if (detect_coding_type (&decst, buf, nread, 1))
1763 nread = Lstream_read (stream, buf, sizeof (buf));
1769 *eol_type_in_out = decst.eol_type;
1770 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1772 if (NILP (coding_system))
1773 *codesys_in_out = coding_system_from_mask (decst.mask);
1775 *codesys_in_out = coding_system;
1779 /* If we absolutely can't determine the EOL type, just assume LF. */
1780 if (*eol_type_in_out == EOL_AUTODETECT)
1781 *eol_type_in_out = EOL_LF;
1783 Lstream_rewind (stream);
1786 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1787 Detect coding system of the text in the region between START and END.
1788 Returned a list of possible coding systems ordered by priority.
1789 If only ASCII characters are found, it returns 'undecided or one of
1790 its subsidiary coding systems according to a detected end-of-line
1791 type. Optional arg BUFFER defaults to the current buffer.
1793 (start, end, buffer))
1795 Lisp_Object val = Qnil;
1796 struct buffer *buf = decode_buffer (buffer, 0);
1798 Lisp_Object instream, lb_instream;
1799 Lstream *istr, *lb_istr;
1800 struct detection_state decst;
1801 struct gcpro gcpro1, gcpro2;
1803 get_buffer_range_char (buf, start, end, &b, &e, 0);
1804 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1805 lb_istr = XLSTREAM (lb_instream);
1806 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1807 istr = XLSTREAM (instream);
1808 GCPRO2 (instream, lb_instream);
1810 decst.eol_type = EOL_AUTODETECT;
1814 unsigned char random_buffer[4096];
1815 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1819 if (detect_coding_type (&decst, random_buffer, nread, 0))
1823 if (decst.mask == ~0)
1824 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1832 decst.mask = postprocess_iso2022_mask (decst.mask);
1834 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1836 int sys = fcd->coding_category_by_priority[i];
1837 if (decst.mask & (1 << sys))
1839 Lisp_Object codesys = fcd->coding_category_system[sys];
1840 if (!NILP (codesys))
1841 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1842 val = Fcons (codesys, val);
1846 Lstream_close (istr);
1848 Lstream_delete (istr);
1849 Lstream_delete (lb_istr);
1854 /************************************************************************/
1855 /* Converting to internal Mule format ("decoding") */
1856 /************************************************************************/
1858 /* A decoding stream is a stream used for decoding text (i.e.
1859 converting from some external format to internal format).
1860 The decoding-stream object keeps track of the actual coding
1861 stream, the stream that is at the other end, and data that
1862 needs to be persistent across the lifetime of the stream. */
1864 /* Handle the EOL stuff related to just-read-in character C.
1865 EOL_TYPE is the EOL type of the coding stream.
1866 FLAGS is the current value of FLAGS in the coding stream, and may
1867 be modified by this macro. (The macro only looks at the
1868 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1869 bytes are to be written. You need to also define a local goto
1870 label "label_continue_loop" that is at the end of the main
1871 character-reading loop.
1873 If C is a CR character, then this macro handles it entirely and
1874 jumps to label_continue_loop. Otherwise, this macro does not add
1875 anything to DST, and continues normally. You should continue
1876 processing C normally after this macro. */
1878 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1882 if (eol_type == EOL_CR) \
1883 Dynarr_add (dst, '\n'); \
1884 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1885 Dynarr_add (dst, c); \
1887 flags |= CODING_STATE_CR; \
1888 goto label_continue_loop; \
1890 else if (flags & CODING_STATE_CR) \
1891 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1893 Dynarr_add (dst, '\r'); \
1894 flags &= ~CODING_STATE_CR; \
1898 /* C should be a binary character in the range 0 - 255; convert
1899 to internal format and add to Dynarr DST. */
1902 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1904 if (BYTE_ASCII_P (c)) \
1905 Dynarr_add (dst, c); \
1908 Dynarr_add (dst, (c >> 6) | 0xc0); \
1909 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1914 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1918 Dynarr_add (dst, c);
1920 else if ( c <= 0x7ff )
1922 Dynarr_add (dst, (c >> 6) | 0xc0);
1923 Dynarr_add (dst, (c & 0x3f) | 0x80);
1925 else if ( c <= 0xffff )
1927 Dynarr_add (dst, (c >> 12) | 0xe0);
1928 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1929 Dynarr_add (dst, (c & 0x3f) | 0x80);
1931 else if ( c <= 0x1fffff )
1933 Dynarr_add (dst, (c >> 18) | 0xf0);
1934 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1935 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1936 Dynarr_add (dst, (c & 0x3f) | 0x80);
1938 else if ( c <= 0x3ffffff )
1940 Dynarr_add (dst, (c >> 24) | 0xf8);
1941 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1942 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1943 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1944 Dynarr_add (dst, (c & 0x3f) | 0x80);
1948 Dynarr_add (dst, (c >> 30) | 0xfc);
1949 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1950 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1951 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1952 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1953 Dynarr_add (dst, (c & 0x3f) | 0x80);
1957 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1959 if (BYTE_ASCII_P (c)) \
1960 Dynarr_add (dst, c); \
1961 else if (BYTE_C1_P (c)) \
1963 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1964 Dynarr_add (dst, c + 0x20); \
1968 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1969 Dynarr_add (dst, c); \
1974 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1978 DECODE_ADD_BINARY_CHAR (ch, dst); \
1983 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1985 if (flags & CODING_STATE_END) \
1987 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1988 if (flags & CODING_STATE_CR) \
1989 Dynarr_add (dst, '\r'); \
1993 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1995 struct decoding_stream
1997 /* Coding system that governs the conversion. */
1998 Lisp_Coding_System *codesys;
2000 /* Stream that we read the encoded data from or
2001 write the decoded data to. */
2004 /* If we are reading, then we can return only a fixed amount of
2005 data, so if the conversion resulted in too much data, we store it
2006 here for retrieval the next time around. */
2007 unsigned_char_dynarr *runoff;
2009 /* FLAGS holds flags indicating the current state of the decoding.
2010 Some of these flags are dependent on the coding system. */
2013 /* CH holds a partially built-up character. Since we only deal
2014 with one- and two-byte characters at the moment, we only use
2015 this to store the first byte of a two-byte character. */
2018 /* EOL_TYPE specifies the type of end-of-line conversion that
2019 currently applies. We need to keep this separate from the
2020 EOL type stored in CODESYS because the latter might indicate
2021 automatic EOL-type detection while the former will always
2022 indicate a particular EOL type. */
2023 enum eol_type eol_type;
2025 /* Additional ISO2022 information. We define the structure above
2026 because it's also needed by the detection routines. */
2027 struct iso2022_decoder iso2022;
2029 /* Additional information (the state of the running CCL program)
2030 used by the CCL decoder. */
2031 struct ccl_program ccl;
2033 /* counter for UTF-8 or UCS-4 */
2034 unsigned char counter;
2036 struct detection_state decst;
2039 static ssize_t decoding_reader (Lstream *stream,
2040 unsigned char *data, size_t size);
2041 static ssize_t decoding_writer (Lstream *stream,
2042 CONST unsigned char *data, size_t size);
2043 static int decoding_rewinder (Lstream *stream);
2044 static int decoding_seekable_p (Lstream *stream);
2045 static int decoding_flusher (Lstream *stream);
2046 static int decoding_closer (Lstream *stream);
2048 static Lisp_Object decoding_marker (Lisp_Object stream);
2050 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2051 sizeof (struct decoding_stream));
2054 decoding_marker (Lisp_Object stream)
2056 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2057 Lisp_Object str_obj;
2059 /* We do not need to mark the coding systems or charsets stored
2060 within the stream because they are stored in a global list
2061 and automatically marked. */
2063 XSETLSTREAM (str_obj, str);
2064 mark_object (str_obj);
2065 if (str->imp->marker)
2066 return (str->imp->marker) (str_obj);
2071 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2072 so we read data from the other end, decode it, and store it into DATA. */
2075 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2077 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2078 unsigned char *orig_data = data;
2080 int error_occurred = 0;
2082 /* We need to interface to mule_decode(), which expects to take some
2083 amount of data and store the result into a Dynarr. We have
2084 mule_decode() store into str->runoff, and take data from there
2087 /* We loop until we have enough data, reading chunks from the other
2088 end and decoding it. */
2091 /* Take data from the runoff if we can. Make sure to take at
2092 most SIZE bytes, and delete the data from the runoff. */
2093 if (Dynarr_length (str->runoff) > 0)
2095 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2096 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2097 Dynarr_delete_many (str->runoff, 0, chunk);
2103 break; /* No more room for data */
2105 if (str->flags & CODING_STATE_END)
2106 /* This means that on the previous iteration, we hit the EOF on
2107 the other end. We loop once more so that mule_decode() can
2108 output any final stuff it may be holding, or any "go back
2109 to a sane state" escape sequences. (This latter makes sense
2110 during encoding.) */
2113 /* Exhausted the runoff, so get some more. DATA has at least
2114 SIZE bytes left of storage in it, so it's OK to read directly
2115 into it. (We'll be overwriting above, after we've decoded it
2116 into the runoff.) */
2117 read_size = Lstream_read (str->other_end, data, size);
2124 /* There might be some more end data produced in the translation.
2125 See the comment above. */
2126 str->flags |= CODING_STATE_END;
2127 mule_decode (stream, data, str->runoff, read_size);
2130 if (data - orig_data == 0)
2131 return error_occurred ? -1 : 0;
2133 return data - orig_data;
2137 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2139 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2142 /* Decode all our data into the runoff, and then attempt to write
2143 it all out to the other end. Remove whatever chunk we succeeded
2145 mule_decode (stream, data, str->runoff, size);
2146 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2147 Dynarr_length (str->runoff));
2149 Dynarr_delete_many (str->runoff, 0, retval);
2150 /* Do NOT return retval. The return value indicates how much
2151 of the incoming data was written, not how many bytes were
2157 reset_decoding_stream (struct decoding_stream *str)
2160 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2162 Lisp_Object coding_system;
2163 XSETCODING_SYSTEM (coding_system, str->codesys);
2164 reset_iso2022 (coding_system, &str->iso2022);
2166 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2168 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2172 str->flags = str->ch = 0;
2176 decoding_rewinder (Lstream *stream)
2178 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2179 reset_decoding_stream (str);
2180 Dynarr_reset (str->runoff);
2181 return Lstream_rewind (str->other_end);
2185 decoding_seekable_p (Lstream *stream)
2187 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2188 return Lstream_seekable_p (str->other_end);
2192 decoding_flusher (Lstream *stream)
2194 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2195 return Lstream_flush (str->other_end);
2199 decoding_closer (Lstream *stream)
2201 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2202 if (stream->flags & LSTREAM_FL_WRITE)
2204 str->flags |= CODING_STATE_END;
2205 decoding_writer (stream, 0, 0);
2207 Dynarr_free (str->runoff);
2209 #ifdef ENABLE_COMPOSITE_CHARS
2210 if (str->iso2022.composite_chars)
2211 Dynarr_free (str->iso2022.composite_chars);
2214 return Lstream_close (str->other_end);
2218 decoding_stream_coding_system (Lstream *stream)
2220 Lisp_Object coding_system;
2221 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2223 XSETCODING_SYSTEM (coding_system, str->codesys);
2224 return subsidiary_coding_system (coding_system, str->eol_type);
2228 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2230 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2231 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2233 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2234 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2235 reset_decoding_stream (str);
2238 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2239 stream for writing, no automatic code detection will be performed.
2240 The reason for this is that automatic code detection requires a
2241 seekable input. Things will also fail if you open a decoding
2242 stream for reading using a non-fully-specified coding system and
2243 a non-seekable input stream. */
2246 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2249 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2250 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2254 str->other_end = stream;
2255 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2256 str->eol_type = EOL_AUTODETECT;
2257 if (!strcmp (mode, "r")
2258 && Lstream_seekable_p (stream))
2259 /* We can determine the coding system now. */
2260 determine_real_coding_system (stream, &codesys, &str->eol_type);
2261 set_decoding_stream_coding_system (lstr, codesys);
2262 str->decst.eol_type = str->eol_type;
2263 str->decst.mask = ~0;
2264 XSETLSTREAM (obj, lstr);
2269 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2271 return make_decoding_stream_1 (stream, codesys, "r");
2275 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2277 return make_decoding_stream_1 (stream, codesys, "w");
2280 /* Note: the decode_coding_* functions all take the same
2281 arguments as mule_decode(), which is to say some SRC data of
2282 size N, which is to be stored into dynamic array DST.
2283 DECODING is the stream within which the decoding is
2284 taking place, but no data is actually read from or
2285 written to that stream; that is handled in decoding_reader()
2286 or decoding_writer(). This allows the same functions to
2287 be used for both reading and writing. */
2290 mule_decode (Lstream *decoding, CONST unsigned char *src,
2291 unsigned_char_dynarr *dst, unsigned int n)
2293 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2295 /* If necessary, do encoding-detection now. We do this when
2296 we're a writing stream or a non-seekable reading stream,
2297 meaning that we can't just process the whole input,
2298 rewind, and start over. */
2300 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2301 str->eol_type == EOL_AUTODETECT)
2303 Lisp_Object codesys;
2305 XSETCODING_SYSTEM (codesys, str->codesys);
2306 detect_coding_type (&str->decst, src, n,
2307 CODING_SYSTEM_TYPE (str->codesys) !=
2308 CODESYS_AUTODETECT);
2309 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2310 str->decst.mask != ~0)
2311 /* #### This is cheesy. What we really ought to do is
2312 buffer up a certain amount of data so as to get a
2313 less random result. */
2314 codesys = coding_system_from_mask (str->decst.mask);
2315 str->eol_type = str->decst.eol_type;
2316 if (XCODING_SYSTEM (codesys) != str->codesys)
2318 /* Preserve the CODING_STATE_END flag in case it was set.
2319 If we erase it, bad things might happen. */
2320 int was_end = str->flags & CODING_STATE_END;
2321 set_decoding_stream_coding_system (decoding, codesys);
2323 str->flags |= CODING_STATE_END;
2327 switch (CODING_SYSTEM_TYPE (str->codesys))
2330 case CODESYS_INTERNAL:
2331 Dynarr_add_many (dst, src, n);
2334 case CODESYS_AUTODETECT:
2335 /* If we got this far and still haven't decided on the coding
2336 system, then do no conversion. */
2337 case CODESYS_NO_CONVERSION:
2338 decode_coding_no_conversion (decoding, src, dst, n);
2341 case CODESYS_SHIFT_JIS:
2342 decode_coding_sjis (decoding, src, dst, n);
2345 decode_coding_big5 (decoding, src, dst, n);
2348 decode_coding_ucs4 (decoding, src, dst, n);
2351 decode_coding_utf8 (decoding, src, dst, n);
2354 str->ccl.last_block = str->flags & CODING_STATE_END;
2355 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2357 case CODESYS_ISO2022:
2358 decode_coding_iso2022 (decoding, src, dst, n);
2366 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2367 Decode the text between START and END which is encoded in CODING-SYSTEM.
2368 This is useful if you've read in encoded text from a file without decoding
2369 it (e.g. you read in a JIS-formatted file but used the `binary' or
2370 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2371 Return length of decoded text.
2372 BUFFER defaults to the current buffer if unspecified.
2374 (start, end, coding_system, buffer))
2377 struct buffer *buf = decode_buffer (buffer, 0);
2378 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2379 Lstream *istr, *ostr;
2380 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2382 get_buffer_range_char (buf, start, end, &b, &e, 0);
2384 barf_if_buffer_read_only (buf, b, e);
2386 coding_system = Fget_coding_system (coding_system);
2387 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2388 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2389 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2391 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2392 Fget_coding_system (Qbinary));
2393 istr = XLSTREAM (instream);
2394 ostr = XLSTREAM (outstream);
2395 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2397 /* The chain of streams looks like this:
2399 [BUFFER] <----- send through
2400 ------> [ENCODE AS BINARY]
2401 ------> [DECODE AS SPECIFIED]
2407 char tempbuf[1024]; /* some random amount */
2408 Bufpos newpos, even_newer_pos;
2409 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2410 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2414 newpos = lisp_buffer_stream_startpos (istr);
2415 Lstream_write (ostr, tempbuf, size_in_bytes);
2416 even_newer_pos = lisp_buffer_stream_startpos (istr);
2417 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2420 Lstream_close (istr);
2421 Lstream_close (ostr);
2423 Lstream_delete (istr);
2424 Lstream_delete (ostr);
2425 Lstream_delete (XLSTREAM (de_outstream));
2426 Lstream_delete (XLSTREAM (lb_outstream));
2431 /************************************************************************/
2432 /* Converting to an external encoding ("encoding") */
2433 /************************************************************************/
2435 /* An encoding stream is an output stream. When you create the
2436 stream, you specify the coding system that governs the encoding
2437 and another stream that the resulting encoded data is to be
2438 sent to, and then start sending data to it. */
2440 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2442 struct encoding_stream
2444 /* Coding system that governs the conversion. */
2445 Lisp_Coding_System *codesys;
2447 /* Stream that we read the encoded data from or
2448 write the decoded data to. */
2451 /* If we are reading, then we can return only a fixed amount of
2452 data, so if the conversion resulted in too much data, we store it
2453 here for retrieval the next time around. */
2454 unsigned_char_dynarr *runoff;
2456 /* FLAGS holds flags indicating the current state of the encoding.
2457 Some of these flags are dependent on the coding system. */
2460 /* CH holds a partially built-up character. Since we only deal
2461 with one- and two-byte characters at the moment, we only use
2462 this to store the first byte of a two-byte character. */
2465 /* Additional information used by the ISO2022 encoder. */
2468 /* CHARSET holds the character sets currently assigned to the G0
2469 through G3 registers. It is initialized from the array
2470 INITIAL_CHARSET in CODESYS. */
2471 Lisp_Object charset[4];
2473 /* Which registers are currently invoked into the left (GL) and
2474 right (GR) halves of the 8-bit encoding space? */
2475 int register_left, register_right;
2477 /* Whether we need to explicitly designate the charset in the
2478 G? register before using it. It is initialized from the
2479 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2480 unsigned char force_charset_on_output[4];
2482 /* Other state variables that need to be preserved across
2484 Lisp_Object current_charset;
2486 int current_char_boundary;
2489 void (*encode_char) (struct encoding_stream *str, Emchar c,
2490 unsigned_char_dynarr *dst, unsigned int *flags);
2491 void (*finish) (struct encoding_stream *str,
2492 unsigned_char_dynarr *dst, unsigned int *flags);
2494 /* Additional information (the state of the running CCL program)
2495 used by the CCL encoder. */
2496 struct ccl_program ccl;
2500 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2501 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2503 static int encoding_rewinder (Lstream *stream);
2504 static int encoding_seekable_p (Lstream *stream);
2505 static int encoding_flusher (Lstream *stream);
2506 static int encoding_closer (Lstream *stream);
2508 static Lisp_Object encoding_marker (Lisp_Object stream);
2510 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2511 sizeof (struct encoding_stream));
2514 encoding_marker (Lisp_Object stream)
2516 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2517 Lisp_Object str_obj;
2519 /* We do not need to mark the coding systems or charsets stored
2520 within the stream because they are stored in a global list
2521 and automatically marked. */
2523 XSETLSTREAM (str_obj, str);
2524 mark_object (str_obj);
2525 if (str->imp->marker)
2526 return (str->imp->marker) (str_obj);
2531 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2532 so we read data from the other end, encode it, and store it into DATA. */
2535 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2537 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2538 unsigned char *orig_data = data;
2540 int error_occurred = 0;
2542 /* We need to interface to mule_encode(), which expects to take some
2543 amount of data and store the result into a Dynarr. We have
2544 mule_encode() store into str->runoff, and take data from there
2547 /* We loop until we have enough data, reading chunks from the other
2548 end and encoding it. */
2551 /* Take data from the runoff if we can. Make sure to take at
2552 most SIZE bytes, and delete the data from the runoff. */
2553 if (Dynarr_length (str->runoff) > 0)
2555 int chunk = min ((int) size, Dynarr_length (str->runoff));
2556 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2557 Dynarr_delete_many (str->runoff, 0, chunk);
2563 break; /* No more room for data */
2565 if (str->flags & CODING_STATE_END)
2566 /* This means that on the previous iteration, we hit the EOF on
2567 the other end. We loop once more so that mule_encode() can
2568 output any final stuff it may be holding, or any "go back
2569 to a sane state" escape sequences. (This latter makes sense
2570 during encoding.) */
2573 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2574 left of storage in it, so it's OK to read directly into it.
2575 (We'll be overwriting above, after we've encoded it into the
2577 read_size = Lstream_read (str->other_end, data, size);
2584 /* There might be some more end data produced in the translation.
2585 See the comment above. */
2586 str->flags |= CODING_STATE_END;
2587 mule_encode (stream, data, str->runoff, read_size);
2590 if (data == orig_data)
2591 return error_occurred ? -1 : 0;
2593 return data - orig_data;
2597 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2599 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2602 /* Encode all our data into the runoff, and then attempt to write
2603 it all out to the other end. Remove whatever chunk we succeeded
2605 mule_encode (stream, data, str->runoff, size);
2606 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2607 Dynarr_length (str->runoff));
2609 Dynarr_delete_many (str->runoff, 0, retval);
2610 /* Do NOT return retval. The return value indicates how much
2611 of the incoming data was written, not how many bytes were
2617 reset_encoding_stream (struct encoding_stream *str)
2620 switch (CODING_SYSTEM_TYPE (str->codesys))
2622 case CODESYS_ISO2022:
2626 str->encode_char = &char_encode_iso2022;
2627 str->finish = &char_finish_iso2022;
2628 for (i = 0; i < 4; i++)
2630 str->iso2022.charset[i] =
2631 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2632 str->iso2022.force_charset_on_output[i] =
2633 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2635 str->iso2022.register_left = 0;
2636 str->iso2022.register_right = 1;
2637 str->iso2022.current_charset = Qnil;
2638 str->iso2022.current_half = 0;
2642 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2645 str->encode_char = &char_encode_utf8;
2646 str->finish = &char_finish_utf8;
2649 str->encode_char = &char_encode_ucs4;
2650 str->finish = &char_finish_ucs4;
2652 case CODESYS_SHIFT_JIS:
2653 str->encode_char = &char_encode_shift_jis;
2654 str->finish = &char_finish_shift_jis;
2660 str->iso2022.current_char_boundary = 0;
2661 str->flags = str->ch = 0;
2665 encoding_rewinder (Lstream *stream)
2667 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2668 reset_encoding_stream (str);
2669 Dynarr_reset (str->runoff);
2670 return Lstream_rewind (str->other_end);
2674 encoding_seekable_p (Lstream *stream)
2676 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2677 return Lstream_seekable_p (str->other_end);
2681 encoding_flusher (Lstream *stream)
2683 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2684 return Lstream_flush (str->other_end);
2688 encoding_closer (Lstream *stream)
2690 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2691 if (stream->flags & LSTREAM_FL_WRITE)
2693 str->flags |= CODING_STATE_END;
2694 encoding_writer (stream, 0, 0);
2696 Dynarr_free (str->runoff);
2697 return Lstream_close (str->other_end);
2701 encoding_stream_coding_system (Lstream *stream)
2703 Lisp_Object coding_system;
2704 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2706 XSETCODING_SYSTEM (coding_system, str->codesys);
2707 return coding_system;
2711 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2713 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2714 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2716 reset_encoding_stream (str);
2720 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2723 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2724 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2728 str->runoff = Dynarr_new (unsigned_char);
2729 str->other_end = stream;
2730 set_encoding_stream_coding_system (lstr, codesys);
2731 XSETLSTREAM (obj, lstr);
2736 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2738 return make_encoding_stream_1 (stream, codesys, "r");
2742 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2744 return make_encoding_stream_1 (stream, codesys, "w");
2747 /* Convert N bytes of internally-formatted data stored in SRC to an
2748 external format, according to the encoding stream ENCODING.
2749 Store the encoded data into DST. */
2752 mule_encode (Lstream *encoding, CONST unsigned char *src,
2753 unsigned_char_dynarr *dst, unsigned int n)
2755 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2757 switch (CODING_SYSTEM_TYPE (str->codesys))
2760 case CODESYS_INTERNAL:
2761 Dynarr_add_many (dst, src, n);
2764 case CODESYS_AUTODETECT:
2765 /* If we got this far and still haven't decided on the coding
2766 system, then do no conversion. */
2767 case CODESYS_NO_CONVERSION:
2768 encode_coding_no_conversion (encoding, src, dst, n);
2772 encode_coding_big5 (encoding, src, dst, n);
2775 str->ccl.last_block = str->flags & CODING_STATE_END;
2776 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2780 text_encode_generic (encoding, src, dst, n);
2784 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2785 Encode the text between START and END using CODING-SYSTEM.
2786 This will, for example, convert Japanese characters into stuff such as
2787 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2788 text. BUFFER defaults to the current buffer if unspecified.
2790 (start, end, coding_system, buffer))
2793 struct buffer *buf = decode_buffer (buffer, 0);
2794 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2795 Lstream *istr, *ostr;
2796 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2798 get_buffer_range_char (buf, start, end, &b, &e, 0);
2800 barf_if_buffer_read_only (buf, b, e);
2802 coding_system = Fget_coding_system (coding_system);
2803 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2804 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2805 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2806 Fget_coding_system (Qbinary));
2807 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2809 istr = XLSTREAM (instream);
2810 ostr = XLSTREAM (outstream);
2811 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2812 /* The chain of streams looks like this:
2814 [BUFFER] <----- send through
2815 ------> [ENCODE AS SPECIFIED]
2816 ------> [DECODE AS BINARY]
2821 char tempbuf[1024]; /* some random amount */
2822 Bufpos newpos, even_newer_pos;
2823 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2824 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2828 newpos = lisp_buffer_stream_startpos (istr);
2829 Lstream_write (ostr, tempbuf, size_in_bytes);
2830 even_newer_pos = lisp_buffer_stream_startpos (istr);
2831 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2837 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2838 Lstream_close (istr);
2839 Lstream_close (ostr);
2841 Lstream_delete (istr);
2842 Lstream_delete (ostr);
2843 Lstream_delete (XLSTREAM (de_outstream));
2844 Lstream_delete (XLSTREAM (lb_outstream));
2845 return make_int (retlen);
2852 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2853 unsigned_char_dynarr *dst, unsigned int n)
2856 unsigned char char_boundary;
2857 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2858 unsigned int flags = str->flags;
2859 Emchar ch = str->ch;
2861 char_boundary = str->iso2022.current_char_boundary;
2867 if (char_boundary == 0)
2895 (*str->encode_char) (str, c, dst, &flags);
2897 else if (char_boundary == 1)
2899 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2905 ch = (ch << 6) | (c & 0x3f);
2910 if ((char_boundary == 0) && (flags & CODING_STATE_END))
2912 (*str->finish) (str, dst, &flags);
2917 str->iso2022.current_char_boundary = char_boundary;
2921 /************************************************************************/
2922 /* Shift-JIS methods */
2923 /************************************************************************/
2925 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2926 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2927 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2928 encoded by "position-code + 0x80". A character of JISX0208
2929 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2930 position-codes are divided and shifted so that it fit in the range
2933 --- CODE RANGE of Shift-JIS ---
2934 (character set) (range)
2936 JISX0201-Kana 0xA0 .. 0xDF
2937 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2938 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2939 -------------------------------
2943 /* Is this the first byte of a Shift-JIS two-byte char? */
2945 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2946 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2948 /* Is this the second byte of a Shift-JIS two-byte char? */
2950 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2951 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2953 #define BYTE_SJIS_KATAKANA_P(c) \
2954 ((c) >= 0xA1 && (c) <= 0xDF)
2957 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2965 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2967 if (st->shift_jis.in_second_byte)
2969 st->shift_jis.in_second_byte = 0;
2973 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2974 st->shift_jis.in_second_byte = 1;
2976 return CODING_CATEGORY_SHIFT_JIS_MASK;
2979 /* Convert Shift-JIS data to internal format. */
2982 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2983 unsigned_char_dynarr *dst, unsigned int n)
2986 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2987 unsigned int flags = str->flags;
2988 unsigned int ch = str->ch;
2989 eol_type_t eol_type = str->eol_type;
2997 /* Previous character was first byte of Shift-JIS Kanji char. */
2998 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3000 unsigned char e1, e2;
3002 DECODE_SJIS (ch, c, e1, e2);
3004 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3008 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3009 Dynarr_add (dst, e1);
3010 Dynarr_add (dst, e2);
3015 DECODE_ADD_BINARY_CHAR (ch, dst);
3016 DECODE_ADD_BINARY_CHAR (c, dst);
3022 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3023 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3025 else if (BYTE_SJIS_KATAKANA_P (c))
3028 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3031 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3032 Dynarr_add (dst, c);
3037 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3041 DECODE_ADD_BINARY_CHAR (c, dst);
3043 label_continue_loop:;
3046 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3052 /* Convert internal character representation to Shift_JIS. */
3055 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3056 unsigned_char_dynarr *dst, unsigned int *flags)
3058 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3062 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3063 Dynarr_add (dst, '\r');
3064 if (eol_type != EOL_CR)
3065 Dynarr_add (dst, ch);
3069 Lisp_Object charset;
3070 unsigned int c1, c2, s1, s2;
3072 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3074 if (code_point >= 0)
3076 charset = Vcharset_latin_jisx0201;
3082 BREAKUP_CHAR (ch, charset, c1, c2);
3084 if (EQ(charset, Vcharset_katakana_jisx0201))
3086 Dynarr_add (dst, c1 | 0x80);
3090 Dynarr_add (dst, c1);
3092 else if (EQ(charset, Vcharset_japanese_jisx0208))
3094 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3095 Dynarr_add (dst, s1);
3096 Dynarr_add (dst, s2);
3099 Dynarr_add (dst, '?');
3104 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3105 unsigned int *flags)
3109 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3110 Decode a JISX0208 character of Shift-JIS coding-system.
3111 CODE is the character code in Shift-JIS as a cons of type bytes.
3112 Return the corresponding character.
3116 unsigned char c1, c2, s1, s2;
3119 CHECK_INT (XCAR (code));
3120 CHECK_INT (XCDR (code));
3121 s1 = XINT (XCAR (code));
3122 s2 = XINT (XCDR (code));
3123 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3124 BYTE_SJIS_TWO_BYTE_2_P (s2))
3126 DECODE_SJIS (s1, s2, c1, c2);
3127 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3128 c1 & 0x7F, c2 & 0x7F));
3134 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3135 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3136 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3140 Lisp_Object charset;
3143 CHECK_CHAR_COERCE_INT (ch);
3144 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3145 if (EQ (charset, Vcharset_japanese_jisx0208))
3147 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3148 return Fcons (make_int (s1), make_int (s2));
3155 /************************************************************************/
3157 /************************************************************************/
3159 /* BIG5 is a coding system encoding two character sets: ASCII and
3160 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3161 character set and is encoded in two-byte.
3163 --- CODE RANGE of BIG5 ---
3164 (character set) (range)
3166 Big5 (1st byte) 0xA1 .. 0xFE
3167 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3168 --------------------------
3170 Since the number of characters in Big5 is larger than maximum
3171 characters in Emacs' charset (96x96), it can't be handled as one
3172 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3173 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3174 contains frequently used characters and the latter contains less
3175 frequently used characters. */
3177 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3178 ((c) >= 0xA1 && (c) <= 0xFE)
3180 /* Is this the second byte of a Shift-JIS two-byte char? */
3182 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3183 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3185 /* Number of Big5 characters which have the same code in 1st byte. */
3187 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3189 /* Code conversion macros. These are macros because they are used in
3190 inner loops during code conversion.
3192 Note that temporary variables in macros introduce the classic
3193 dynamic-scoping problems with variable names. We use capital-
3194 lettered variables in the assumption that XEmacs does not use
3195 capital letters in variables except in a very formalized way
3198 /* Convert Big5 code (b1, b2) into its internal string representation
3201 /* There is a much simpler way to split the Big5 charset into two.
3202 For the moment I'm going to leave the algorithm as-is because it
3203 claims to separate out the most-used characters into a single
3204 charset, which perhaps will lead to optimizations in various
3207 The way the algorithm works is something like this:
3209 Big5 can be viewed as a 94x157 charset, where the row is
3210 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3211 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3212 the split between low and high column numbers is apparently
3213 meaningless; ascending rows produce less and less frequent chars.
3214 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3215 the first charset, and the upper half (0xC9 .. 0xFE) to the
3216 second. To do the conversion, we convert the character into
3217 a single number where 0 .. 156 is the first row, 157 .. 313
3218 is the second, etc. That way, the characters are ordered by
3219 decreasing frequency. Then we just chop the space in two
3220 and coerce the result into a 94x94 space.
3223 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3225 int B1 = b1, B2 = b2; \
3227 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3231 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3235 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3236 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3238 c1 = I / (0xFF - 0xA1) + 0xA1; \
3239 c2 = I % (0xFF - 0xA1) + 0xA1; \
3242 /* Convert the internal string representation of a Big5 character
3243 (lb, c1, c2) into Big5 code (b1, b2). */
3245 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3247 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3249 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3251 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3253 b1 = I / BIG5_SAME_ROW + 0xA1; \
3254 b2 = I % BIG5_SAME_ROW; \
3255 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3259 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3267 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3268 (c >= 0x80 && c <= 0xA0))
3270 if (st->big5.in_second_byte)
3272 st->big5.in_second_byte = 0;
3273 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3277 st->big5.in_second_byte = 1;
3279 return CODING_CATEGORY_BIG5_MASK;
3282 /* Convert Big5 data to internal format. */
3285 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3286 unsigned_char_dynarr *dst, unsigned int n)
3289 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3290 unsigned int flags = str->flags;
3291 unsigned int ch = str->ch;
3292 eol_type_t eol_type = str->eol_type;
3299 /* Previous character was first byte of Big5 char. */
3300 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3302 unsigned char b1, b2, b3;
3303 DECODE_BIG5 (ch, c, b1, b2, b3);
3304 Dynarr_add (dst, b1);
3305 Dynarr_add (dst, b2);
3306 Dynarr_add (dst, b3);
3310 DECODE_ADD_BINARY_CHAR (ch, dst);
3311 DECODE_ADD_BINARY_CHAR (c, dst);
3317 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3318 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3321 DECODE_ADD_BINARY_CHAR (c, dst);
3323 label_continue_loop:;
3326 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3332 /* Convert internally-formatted data to Big5. */
3335 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3336 unsigned_char_dynarr *dst, unsigned int n)
3340 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3341 unsigned int flags = str->flags;
3342 unsigned int ch = str->ch;
3343 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3350 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3351 Dynarr_add (dst, '\r');
3352 if (eol_type != EOL_CR)
3353 Dynarr_add (dst, '\n');
3355 else if (BYTE_ASCII_P (c))
3358 Dynarr_add (dst, c);
3360 else if (BUFBYTE_LEADING_BYTE_P (c))
3362 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3363 c == LEADING_BYTE_CHINESE_BIG5_2)
3365 /* A recognized leading byte. */
3367 continue; /* not done with this character. */
3369 /* otherwise just ignore this character. */
3371 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3372 ch == LEADING_BYTE_CHINESE_BIG5_2)
3374 /* Previous char was a recognized leading byte. */
3376 continue; /* not done with this character. */
3380 /* Encountering second byte of a Big5 character. */
3381 unsigned char b1, b2;
3383 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3384 Dynarr_add (dst, b1);
3385 Dynarr_add (dst, b2);
3397 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3398 Decode a Big5 character CODE of BIG5 coding-system.
3399 CODE is the character code in BIG5, a cons of two integers.
3400 Return the corresponding character.
3404 unsigned char c1, c2, b1, b2;
3407 CHECK_INT (XCAR (code));
3408 CHECK_INT (XCDR (code));
3409 b1 = XINT (XCAR (code));
3410 b2 = XINT (XCDR (code));
3411 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3412 BYTE_BIG5_TWO_BYTE_2_P (b2))
3414 Charset_ID leading_byte;
3415 Lisp_Object charset;
3416 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3417 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3418 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3424 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3425 Encode the Big5 character CH to BIG5 coding-system.
3426 Return the corresponding character code in Big5.
3430 Lisp_Object charset;
3433 CHECK_CHAR_COERCE_INT (ch);
3434 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3435 if (EQ (charset, Vcharset_chinese_big5_1) ||
3436 EQ (charset, Vcharset_chinese_big5_2))
3438 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3440 return Fcons (make_int (b1), make_int (b2));
3447 /************************************************************************/
3449 /************************************************************************/
3452 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3458 switch (st->ucs4.in_byte)
3467 st->ucs4.in_byte = 0;
3473 return CODING_CATEGORY_UCS4_MASK;
3477 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3478 unsigned_char_dynarr *dst, unsigned int n)
3480 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3481 unsigned int flags = str->flags;
3482 unsigned int ch = str->ch;
3483 unsigned char counter = str->counter;
3487 unsigned char c = *src++;
3495 DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3500 ch = ( ch << 8 ) | c;
3504 if (counter & CODING_STATE_END)
3505 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3509 str->counter = counter;
3513 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3514 unsigned_char_dynarr *dst, unsigned int *flags)
3516 Dynarr_add (dst, ch >> 24);
3517 Dynarr_add (dst, ch >> 16);
3518 Dynarr_add (dst, ch >> 8);
3519 Dynarr_add (dst, ch );
3523 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3524 unsigned int *flags)
3529 /************************************************************************/
3531 /************************************************************************/
3534 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3539 unsigned char c = *src++;
3540 switch (st->utf8.in_byte)
3543 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3546 st->utf8.in_byte = 5;
3548 st->utf8.in_byte = 4;
3550 st->utf8.in_byte = 3;
3552 st->utf8.in_byte = 2;
3554 st->utf8.in_byte = 1;
3559 if ((c & 0xc0) != 0x80)
3565 return CODING_CATEGORY_UTF8_MASK;
3569 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3570 unsigned_char_dynarr *dst, unsigned int n)
3572 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3573 unsigned int flags = str->flags;
3574 unsigned int ch = str->ch;
3575 eol_type_t eol_type = str->eol_type;
3576 unsigned char counter = str->counter;
3580 unsigned char c = *src++;
3589 else if ( c >= 0xf8 )
3594 else if ( c >= 0xf0 )
3599 else if ( c >= 0xe0 )
3604 else if ( c >= 0xc0 )
3611 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3612 DECODE_ADD_UCS_CHAR (c, dst);
3616 ch = ( ch << 6 ) | ( c & 0x3f );
3617 DECODE_ADD_UCS_CHAR (ch, dst);
3622 ch = ( ch << 6 ) | ( c & 0x3f );
3625 label_continue_loop:;
3628 if (flags & CODING_STATE_END)
3629 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3633 str->counter = counter;
3637 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3638 unsigned_char_dynarr *dst, unsigned int *flags)
3640 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3644 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3645 Dynarr_add (dst, '\r');
3646 if (eol_type != EOL_CR)
3647 Dynarr_add (dst, ch);
3649 else if (ch <= 0x7f)
3651 Dynarr_add (dst, ch);
3653 else if (ch <= 0x7ff)
3655 Dynarr_add (dst, (ch >> 6) | 0xc0);
3656 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3658 else if (ch <= 0xffff)
3660 Dynarr_add (dst, (ch >> 12) | 0xe0);
3661 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3662 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3664 else if (ch <= 0x1fffff)
3666 Dynarr_add (dst, (ch >> 18) | 0xf0);
3667 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3668 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3669 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3671 else if (ch <= 0x3ffffff)
3673 Dynarr_add (dst, (ch >> 24) | 0xf8);
3674 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3675 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3676 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3677 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3681 Dynarr_add (dst, (ch >> 30) | 0xfc);
3682 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3683 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3684 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3685 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3686 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3691 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3692 unsigned int *flags)
3697 /************************************************************************/
3698 /* ISO2022 methods */
3699 /************************************************************************/
3701 /* The following note describes the coding system ISO2022 briefly.
3702 Since the intention of this note is to help understand the
3703 functions in this file, some parts are NOT ACCURATE or OVERLY
3704 SIMPLIFIED. For thorough understanding, please refer to the
3705 original document of ISO2022.
3707 ISO2022 provides many mechanisms to encode several character sets
3708 in 7-bit and 8-bit environments. For 7-bit environments, all text
3709 is encoded using bytes less than 128. This may make the encoded
3710 text a little bit longer, but the text passes more easily through
3711 several gateways, some of which strip off MSB (Most Signigant Bit).
3713 There are two kinds of character sets: control character set and
3714 graphic character set. The former contains control characters such
3715 as `newline' and `escape' to provide control functions (control
3716 functions are also provided by escape sequences). The latter
3717 contains graphic characters such as 'A' and '-'. Emacs recognizes
3718 two control character sets and many graphic character sets.
3720 Graphic character sets are classified into one of the following
3721 four classes, according to the number of bytes (DIMENSION) and
3722 number of characters in one dimension (CHARS) of the set:
3723 - DIMENSION1_CHARS94
3724 - DIMENSION1_CHARS96
3725 - DIMENSION2_CHARS94
3726 - DIMENSION2_CHARS96
3728 In addition, each character set is assigned an identification tag,
3729 unique for each set, called "final character" (denoted as <F>
3730 hereafter). The <F> of each character set is decided by ECMA(*)
3731 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3732 (0x30..0x3F are for private use only).
3734 Note (*): ECMA = European Computer Manufacturers Association
3736 Here are examples of graphic character set [NAME(<F>)]:
3737 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3738 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3739 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3740 o DIMENSION2_CHARS96 -- none for the moment
3742 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3743 C0 [0x00..0x1F] -- control character plane 0
3744 GL [0x20..0x7F] -- graphic character plane 0
3745 C1 [0x80..0x9F] -- control character plane 1
3746 GR [0xA0..0xFF] -- graphic character plane 1
3748 A control character set is directly designated and invoked to C0 or
3749 C1 by an escape sequence. The most common case is that:
3750 - ISO646's control character set is designated/invoked to C0, and
3751 - ISO6429's control character set is designated/invoked to C1,
3752 and usually these designations/invocations are omitted in encoded
3753 text. In a 7-bit environment, only C0 can be used, and a control
3754 character for C1 is encoded by an appropriate escape sequence to
3755 fit into the environment. All control characters for C1 are
3756 defined to have corresponding escape sequences.
3758 A graphic character set is at first designated to one of four
3759 graphic registers (G0 through G3), then these graphic registers are
3760 invoked to GL or GR. These designations and invocations can be
3761 done independently. The most common case is that G0 is invoked to
3762 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3763 these invocations and designations are omitted in encoded text.
3764 In a 7-bit environment, only GL can be used.
3766 When a graphic character set of CHARS94 is invoked to GL, codes
3767 0x20 and 0x7F of the GL area work as control characters SPACE and
3768 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3771 There are two ways of invocation: locking-shift and single-shift.
3772 With locking-shift, the invocation lasts until the next different
3773 invocation, whereas with single-shift, the invocation affects the
3774 following character only and doesn't affect the locking-shift
3775 state. Invocations are done by the following control characters or
3778 ----------------------------------------------------------------------
3779 abbrev function cntrl escape seq description
3780 ----------------------------------------------------------------------
3781 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3782 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3783 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3784 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3785 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3786 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3787 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3788 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3789 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3790 ----------------------------------------------------------------------
3791 (*) These are not used by any known coding system.
3793 Control characters for these functions are defined by macros
3794 ISO_CODE_XXX in `coding.h'.
3796 Designations are done by the following escape sequences:
3797 ----------------------------------------------------------------------
3798 escape sequence description
3799 ----------------------------------------------------------------------
3800 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3801 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3802 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3803 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3804 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3805 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3806 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3807 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3808 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3809 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3810 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3811 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3812 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3813 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3814 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3815 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3816 ----------------------------------------------------------------------
3818 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3819 of dimension 1, chars 94, and final character <F>, etc...
3821 Note (*): Although these designations are not allowed in ISO2022,
3822 Emacs accepts them on decoding, and produces them on encoding
3823 CHARS96 character sets in a coding system which is characterized as
3824 7-bit environment, non-locking-shift, and non-single-shift.
3826 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3827 '(' can be omitted. We refer to this as "short-form" hereafter.
3829 Now you may notice that there are a lot of ways for encoding the
3830 same multilingual text in ISO2022. Actually, there exist many
3831 coding systems such as Compound Text (used in X11's inter client
3832 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3833 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3834 localized platforms), and all of these are variants of ISO2022.
3836 In addition to the above, Emacs handles two more kinds of escape
3837 sequences: ISO6429's direction specification and Emacs' private
3838 sequence for specifying character composition.
3840 ISO6429's direction specification takes the following form:
3841 o CSI ']' -- end of the current direction
3842 o CSI '0' ']' -- end of the current direction
3843 o CSI '1' ']' -- start of left-to-right text
3844 o CSI '2' ']' -- start of right-to-left text
3845 The control character CSI (0x9B: control sequence introducer) is
3846 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3848 Character composition specification takes the following form:
3849 o ESC '0' -- start character composition
3850 o ESC '1' -- end character composition
3851 Since these are not standard escape sequences of any ISO standard,
3852 their use with these meanings is restricted to Emacs only. */
3855 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3859 for (i = 0; i < 4; i++)
3861 if (!NILP (coding_system))
3863 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3865 iso->charset[i] = Qt;
3866 iso->invalid_designated[i] = 0;
3868 iso->esc = ISO_ESC_NOTHING;
3869 iso->esc_bytes_index = 0;
3870 iso->register_left = 0;
3871 iso->register_right = 1;
3872 iso->switched_dir_and_no_valid_charset_yet = 0;
3873 iso->invalid_switch_dir = 0;
3874 iso->output_direction_sequence = 0;
3875 iso->output_literally = 0;
3876 #ifdef ENABLE_COMPOSITE_CHARS
3877 if (iso->composite_chars)
3878 Dynarr_reset (iso->composite_chars);
3883 fit_to_be_escape_quoted (unsigned char c)
3900 /* Parse one byte of an ISO2022 escape sequence.
3901 If the result is an invalid escape sequence, return 0 and
3902 do not change anything in STR. Otherwise, if the result is
3903 an incomplete escape sequence, update ISO2022.ESC and
3904 ISO2022.ESC_BYTES and return -1. Otherwise, update
3905 all the state variables (but not ISO2022.ESC_BYTES) and
3908 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3909 or invocation of an invalid character set and treat that as
3910 an unrecognized escape sequence. */
3913 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3914 unsigned char c, unsigned int *flags,
3915 int check_invalid_charsets)
3917 /* (1) If we're at the end of a designation sequence, CS is the
3918 charset being designated and REG is the register to designate
3921 (2) If we're at the end of a locking-shift sequence, REG is
3922 the register to invoke and HALF (0 == left, 1 == right) is
3923 the half to invoke it into.
3925 (3) If we're at the end of a single-shift sequence, REG is
3926 the register to invoke. */
3927 Lisp_Object cs = Qnil;
3930 /* NOTE: This code does goto's all over the fucking place.
3931 The reason for this is that we're basically implementing
3932 a state machine here, and hierarchical languages like C
3933 don't really provide a clean way of doing this. */
3935 if (! (*flags & CODING_STATE_ESCAPE))
3936 /* At beginning of escape sequence; we need to reset our
3937 escape-state variables. */
3938 iso->esc = ISO_ESC_NOTHING;
3940 iso->output_literally = 0;
3941 iso->output_direction_sequence = 0;
3945 case ISO_ESC_NOTHING:
3946 iso->esc_bytes_index = 0;
3949 case ISO_CODE_ESC: /* Start escape sequence */
3950 *flags |= CODING_STATE_ESCAPE;
3954 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3955 *flags |= CODING_STATE_ESCAPE;
3956 iso->esc = ISO_ESC_5_11;
3959 case ISO_CODE_SO: /* locking shift 1 */
3962 case ISO_CODE_SI: /* locking shift 0 */
3966 case ISO_CODE_SS2: /* single shift */
3969 case ISO_CODE_SS3: /* single shift */
3973 default: /* Other control characters */
3980 /**** single shift ****/
3982 case 'N': /* single shift 2 */
3985 case 'O': /* single shift 3 */
3989 /**** locking shift ****/
3991 case '~': /* locking shift 1 right */
3994 case 'n': /* locking shift 2 */
3997 case '}': /* locking shift 2 right */
4000 case 'o': /* locking shift 3 */
4003 case '|': /* locking shift 3 right */
4007 #ifdef ENABLE_COMPOSITE_CHARS
4008 /**** composite ****/
4011 iso->esc = ISO_ESC_START_COMPOSITE;
4012 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4013 CODING_STATE_COMPOSITE;
4017 iso->esc = ISO_ESC_END_COMPOSITE;
4018 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4019 ~CODING_STATE_COMPOSITE;
4021 #endif /* ENABLE_COMPOSITE_CHARS */
4023 /**** directionality ****/
4026 iso->esc = ISO_ESC_5_11;
4029 /**** designation ****/
4031 case '$': /* multibyte charset prefix */
4032 iso->esc = ISO_ESC_2_4;
4036 if (0x28 <= c && c <= 0x2F)
4038 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4042 /* This function is called with CODESYS equal to nil when
4043 doing coding-system detection. */
4045 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4046 && fit_to_be_escape_quoted (c))
4048 iso->esc = ISO_ESC_LITERAL;
4049 *flags &= CODING_STATE_ISO2022_LOCK;
4059 /**** directionality ****/
4061 case ISO_ESC_5_11: /* ISO6429 direction control */
4064 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4065 goto directionality;
4067 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4068 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4069 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4073 case ISO_ESC_5_11_0:
4076 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4077 goto directionality;
4081 case ISO_ESC_5_11_1:
4084 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4085 goto directionality;
4089 case ISO_ESC_5_11_2:
4092 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4093 goto directionality;
4098 iso->esc = ISO_ESC_DIRECTIONALITY;
4099 /* Various junk here to attempt to preserve the direction sequences
4100 literally in the text if they would otherwise be swallowed due
4101 to invalid designations that don't show up as actual charset
4102 changes in the text. */
4103 if (iso->invalid_switch_dir)
4105 /* We already inserted a direction switch literally into the
4106 text. We assume (#### this may not be right) that the
4107 next direction switch is the one going the other way,
4108 and we need to output that literally as well. */
4109 iso->output_literally = 1;
4110 iso->invalid_switch_dir = 0;
4116 /* If we are in the thrall of an invalid designation,
4117 then stick the directionality sequence literally into the
4118 output stream so it ends up in the original text again. */
4119 for (jj = 0; jj < 4; jj++)
4120 if (iso->invalid_designated[jj])
4124 iso->output_literally = 1;
4125 iso->invalid_switch_dir = 1;
4128 /* Indicate that we haven't yet seen a valid designation,
4129 so that if a switch-dir is directly followed by an
4130 invalid designation, both get inserted literally. */
4131 iso->switched_dir_and_no_valid_charset_yet = 1;
4136 /**** designation ****/
4139 if (0x28 <= c && c <= 0x2F)
4141 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4144 if (0x40 <= c && c <= 0x42)
4146 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4147 *flags & CODING_STATE_R2L ?
4148 CHARSET_RIGHT_TO_LEFT :
4149 CHARSET_LEFT_TO_RIGHT);
4159 if (c < '0' || c > '~')
4160 return 0; /* bad final byte */
4162 if (iso->esc >= ISO_ESC_2_8 &&
4163 iso->esc <= ISO_ESC_2_15)
4165 type = ((iso->esc >= ISO_ESC_2_12) ?
4166 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4167 reg = (iso->esc - ISO_ESC_2_8) & 3;
4169 else if (iso->esc >= ISO_ESC_2_4_8 &&
4170 iso->esc <= ISO_ESC_2_4_15)
4172 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4173 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4174 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4178 /* Can this ever be reached? -slb */
4182 cs = CHARSET_BY_ATTRIBUTES (type, c,
4183 *flags & CODING_STATE_R2L ?
4184 CHARSET_RIGHT_TO_LEFT :
4185 CHARSET_LEFT_TO_RIGHT);
4191 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4195 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4196 /* can't invoke something that ain't there. */
4198 iso->esc = ISO_ESC_SINGLE_SHIFT;
4199 *flags &= CODING_STATE_ISO2022_LOCK;
4201 *flags |= CODING_STATE_SS2;
4203 *flags |= CODING_STATE_SS3;
4207 if (check_invalid_charsets &&
4208 !CHARSETP (iso->charset[reg]))
4209 /* can't invoke something that ain't there. */
4212 iso->register_right = reg;
4214 iso->register_left = reg;
4215 *flags &= CODING_STATE_ISO2022_LOCK;
4216 iso->esc = ISO_ESC_LOCKING_SHIFT;
4220 if (NILP (cs) && check_invalid_charsets)
4222 iso->invalid_designated[reg] = 1;
4223 iso->charset[reg] = Vcharset_ascii;
4224 iso->esc = ISO_ESC_DESIGNATE;
4225 *flags &= CODING_STATE_ISO2022_LOCK;
4226 iso->output_literally = 1;
4227 if (iso->switched_dir_and_no_valid_charset_yet)
4229 /* We encountered a switch-direction followed by an
4230 invalid designation. Ensure that the switch-direction
4231 gets outputted; otherwise it will probably get eaten
4232 when the text is written out again. */
4233 iso->switched_dir_and_no_valid_charset_yet = 0;
4234 iso->output_direction_sequence = 1;
4235 /* And make sure that the switch-dir going the other
4236 way gets outputted, as well. */
4237 iso->invalid_switch_dir = 1;
4241 /* This function is called with CODESYS equal to nil when
4242 doing coding-system detection. */
4243 if (!NILP (codesys))
4245 charset_conversion_spec_dynarr *dyn =
4246 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4252 for (i = 0; i < Dynarr_length (dyn); i++)
4254 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4255 if (EQ (cs, spec->from_charset))
4256 cs = spec->to_charset;
4261 iso->charset[reg] = cs;
4262 iso->esc = ISO_ESC_DESIGNATE;
4263 *flags &= CODING_STATE_ISO2022_LOCK;
4264 if (iso->invalid_designated[reg])
4266 iso->invalid_designated[reg] = 0;
4267 iso->output_literally = 1;
4269 if (iso->switched_dir_and_no_valid_charset_yet)
4270 iso->switched_dir_and_no_valid_charset_yet = 0;
4275 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4280 /* #### There are serious deficiencies in the recognition mechanism
4281 here. This needs to be much smarter if it's going to cut it.
4282 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4283 it should be detected as Latin-1.
4284 All the ISO2022 stuff in this file should be synced up with the
4285 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4286 Perhaps we should wait till R2L works in FSF Emacs? */
4288 if (!st->iso2022.initted)
4290 reset_iso2022 (Qnil, &st->iso2022.iso);
4291 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4292 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4293 CODING_CATEGORY_ISO_8_1_MASK |
4294 CODING_CATEGORY_ISO_8_2_MASK |
4295 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4296 st->iso2022.flags = 0;
4297 st->iso2022.high_byte_count = 0;
4298 st->iso2022.saw_single_shift = 0;
4299 st->iso2022.initted = 1;
4302 mask = st->iso2022.mask;
4309 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4310 st->iso2022.high_byte_count++;
4314 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4316 if (st->iso2022.high_byte_count & 1)
4317 /* odd number of high bytes; assume not iso-8-2 */
4318 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4320 st->iso2022.high_byte_count = 0;
4321 st->iso2022.saw_single_shift = 0;
4323 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4325 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4326 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4327 { /* control chars */
4330 /* Allow and ignore control characters that you might
4331 reasonably see in a text file */
4336 case 8: /* backspace */
4337 case 11: /* vertical tab */
4338 case 12: /* form feed */
4339 case 26: /* MS-DOS C-z junk */
4340 case 31: /* '^_' -- for info */
4341 goto label_continue_loop;
4348 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4351 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4352 &st->iso2022.flags, 0))
4354 switch (st->iso2022.iso.esc)
4356 case ISO_ESC_DESIGNATE:
4357 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4358 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4360 case ISO_ESC_LOCKING_SHIFT:
4361 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4362 goto ran_out_of_chars;
4363 case ISO_ESC_SINGLE_SHIFT:
4364 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4365 st->iso2022.saw_single_shift = 1;
4374 goto ran_out_of_chars;
4377 label_continue_loop:;
4386 postprocess_iso2022_mask (int mask)
4388 /* #### kind of cheesy */
4389 /* If seven-bit ISO is allowed, then assume that the encoding is
4390 entirely seven-bit and turn off the eight-bit ones. */
4391 if (mask & CODING_CATEGORY_ISO_7_MASK)
4392 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4393 CODING_CATEGORY_ISO_8_1_MASK |
4394 CODING_CATEGORY_ISO_8_2_MASK);
4398 /* If FLAGS is a null pointer or specifies right-to-left motion,
4399 output a switch-dir-to-left-to-right sequence to DST.
4400 Also update FLAGS if it is not a null pointer.
4401 If INTERNAL_P is set, we are outputting in internal format and
4402 need to handle the CSI differently. */
4405 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4406 unsigned_char_dynarr *dst,
4407 unsigned int *flags,
4410 if (!flags || (*flags & CODING_STATE_R2L))
4412 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4414 Dynarr_add (dst, ISO_CODE_ESC);
4415 Dynarr_add (dst, '[');
4417 else if (internal_p)
4418 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4420 Dynarr_add (dst, ISO_CODE_CSI);
4421 Dynarr_add (dst, '0');
4422 Dynarr_add (dst, ']');
4424 *flags &= ~CODING_STATE_R2L;
4428 /* If FLAGS is a null pointer or specifies a direction different from
4429 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4430 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4431 sequence to DST. Also update FLAGS if it is not a null pointer.
4432 If INTERNAL_P is set, we are outputting in internal format and
4433 need to handle the CSI differently. */
4436 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4437 unsigned_char_dynarr *dst, unsigned int *flags,
4440 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4441 direction == CHARSET_LEFT_TO_RIGHT)
4442 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4443 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4444 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4445 direction == CHARSET_RIGHT_TO_LEFT)
4447 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4449 Dynarr_add (dst, ISO_CODE_ESC);
4450 Dynarr_add (dst, '[');
4452 else if (internal_p)
4453 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4455 Dynarr_add (dst, ISO_CODE_CSI);
4456 Dynarr_add (dst, '2');
4457 Dynarr_add (dst, ']');
4459 *flags |= CODING_STATE_R2L;
4463 /* Convert ISO2022-format data to internal format. */
4466 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4467 unsigned_char_dynarr *dst, unsigned int n)
4469 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4470 unsigned int flags = str->flags;
4471 unsigned int ch = str->ch;
4472 eol_type_t eol_type = str->eol_type;
4473 #ifdef ENABLE_COMPOSITE_CHARS
4474 unsigned_char_dynarr *real_dst = dst;
4476 Lisp_Object coding_system;
4478 XSETCODING_SYSTEM (coding_system, str->codesys);
4480 #ifdef ENABLE_COMPOSITE_CHARS
4481 if (flags & CODING_STATE_COMPOSITE)
4482 dst = str->iso2022.composite_chars;
4483 #endif /* ENABLE_COMPOSITE_CHARS */
4487 unsigned char c = *src++;
4488 if (flags & CODING_STATE_ESCAPE)
4489 { /* Within ESC sequence */
4490 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4495 switch (str->iso2022.esc)
4497 #ifdef ENABLE_COMPOSITE_CHARS
4498 case ISO_ESC_START_COMPOSITE:
4499 if (str->iso2022.composite_chars)
4500 Dynarr_reset (str->iso2022.composite_chars);
4502 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4503 dst = str->iso2022.composite_chars;
4505 case ISO_ESC_END_COMPOSITE:
4507 Bufbyte comstr[MAX_EMCHAR_LEN];
4509 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4510 Dynarr_length (dst));
4512 len = set_charptr_emchar (comstr, emch);
4513 Dynarr_add_many (dst, comstr, len);
4516 #endif /* ENABLE_COMPOSITE_CHARS */
4518 case ISO_ESC_LITERAL:
4519 DECODE_ADD_BINARY_CHAR (c, dst);
4523 /* Everything else handled already */
4528 /* Attempted error recovery. */
4529 if (str->iso2022.output_direction_sequence)
4530 ensure_correct_direction (flags & CODING_STATE_R2L ?
4531 CHARSET_RIGHT_TO_LEFT :
4532 CHARSET_LEFT_TO_RIGHT,
4533 str->codesys, dst, 0, 1);
4534 /* More error recovery. */
4535 if (!retval || str->iso2022.output_literally)
4537 /* Output the (possibly invalid) sequence */
4539 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4540 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4541 flags &= CODING_STATE_ISO2022_LOCK;
4543 n++, src--;/* Repeat the loop with the same character. */
4546 /* No sense in reprocessing the final byte of the
4547 escape sequence; it could mess things up anyway.
4549 DECODE_ADD_BINARY_CHAR (c, dst);
4554 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4555 { /* Control characters */
4557 /***** Error-handling *****/
4559 /* If we were in the middle of a character, dump out the
4560 partial character. */
4561 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4563 /* If we just saw a single-shift character, dump it out.
4564 This may dump out the wrong sort of single-shift character,
4565 but least it will give an indication that something went
4567 if (flags & CODING_STATE_SS2)
4569 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4570 flags &= ~CODING_STATE_SS2;
4572 if (flags & CODING_STATE_SS3)
4574 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4575 flags &= ~CODING_STATE_SS3;
4578 /***** Now handle the control characters. *****/
4581 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4583 flags &= CODING_STATE_ISO2022_LOCK;
4585 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4586 DECODE_ADD_BINARY_CHAR (c, dst);
4589 { /* Graphic characters */
4590 Lisp_Object charset;
4596 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4598 /* Now determine the charset. */
4599 reg = ((flags & CODING_STATE_SS2) ? 2
4600 : (flags & CODING_STATE_SS3) ? 3
4601 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4602 : str->iso2022.register_left);
4603 charset = str->iso2022.charset[reg];
4605 /* Error checking: */
4606 if (! CHARSETP (charset)
4607 || str->iso2022.invalid_designated[reg]
4608 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4609 && XCHARSET_CHARS (charset) == 94))
4610 /* Mrmph. We are trying to invoke a register that has no
4611 or an invalid charset in it, or trying to add a character
4612 outside the range of the charset. Insert that char literally
4613 to preserve it for the output. */
4615 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4616 DECODE_ADD_BINARY_CHAR (c, dst);
4621 /* Things are probably hunky-dorey. */
4623 /* Fetch reverse charset, maybe. */
4624 if (((flags & CODING_STATE_R2L) &&
4625 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4627 (!(flags & CODING_STATE_R2L) &&
4628 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4630 Lisp_Object new_charset =
4631 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4632 if (!NILP (new_charset))
4633 charset = new_charset;
4637 if (XCHARSET_DIMENSION (charset) == 1)
4639 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4641 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4646 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
4652 lb = XCHARSET_LEADING_BYTE (charset);
4653 switch (XCHARSET_REP_BYTES (charset))
4656 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4657 Dynarr_add (dst, c & 0x7F);
4660 case 2: /* one-byte official */
4661 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4662 Dynarr_add (dst, lb);
4663 Dynarr_add (dst, c | 0x80);
4666 case 3: /* one-byte private or two-byte official */
4667 if (XCHARSET_PRIVATE_P (charset))
4669 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4670 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4671 Dynarr_add (dst, lb);
4672 Dynarr_add (dst, c | 0x80);
4678 Dynarr_add (dst, lb);
4679 Dynarr_add (dst, ch | 0x80);
4680 Dynarr_add (dst, c | 0x80);
4688 default: /* two-byte private */
4691 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4692 Dynarr_add (dst, lb);
4693 Dynarr_add (dst, ch | 0x80);
4694 Dynarr_add (dst, c | 0x80);
4704 flags &= CODING_STATE_ISO2022_LOCK;
4707 label_continue_loop:;
4710 if (flags & CODING_STATE_END)
4711 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4718 /***** ISO2022 encoder *****/
4720 /* Designate CHARSET into register REG. */
4723 iso2022_designate (Lisp_Object charset, unsigned char reg,
4724 struct encoding_stream *str, unsigned_char_dynarr *dst)
4726 static CONST char inter94[] = "()*+";
4727 static CONST char inter96[] = ",-./";
4728 unsigned short chars;
4729 unsigned char dimension;
4730 unsigned char final;
4731 Lisp_Object old_charset = str->iso2022.charset[reg];
4733 str->iso2022.charset[reg] = charset;
4734 if (!CHARSETP (charset))
4735 /* charset might be an initial nil or t. */
4737 chars = XCHARSET_CHARS (charset);
4738 dimension = XCHARSET_DIMENSION (charset);
4739 final = XCHARSET_FINAL (charset);
4740 if (!str->iso2022.force_charset_on_output[reg] &&
4741 CHARSETP (old_charset) &&
4742 XCHARSET_CHARS (old_charset) == chars &&
4743 XCHARSET_DIMENSION (old_charset) == dimension &&
4744 XCHARSET_FINAL (old_charset) == final)
4747 str->iso2022.force_charset_on_output[reg] = 0;
4750 charset_conversion_spec_dynarr *dyn =
4751 str->codesys->iso2022.output_conv;
4757 for (i = 0; i < Dynarr_length (dyn); i++)
4759 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4760 if (EQ (charset, spec->from_charset))
4761 charset = spec->to_charset;
4766 Dynarr_add (dst, ISO_CODE_ESC);
4771 Dynarr_add (dst, inter94[reg]);
4774 Dynarr_add (dst, '$');
4776 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4779 Dynarr_add (dst, inter94[reg]);
4784 Dynarr_add (dst, inter96[reg]);
4787 Dynarr_add (dst, '$');
4788 Dynarr_add (dst, inter96[reg]);
4792 Dynarr_add (dst, final);
4796 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4798 if (str->iso2022.register_left != 0)
4800 Dynarr_add (dst, ISO_CODE_SI);
4801 str->iso2022.register_left = 0;
4806 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4808 if (str->iso2022.register_left != 1)
4810 Dynarr_add (dst, ISO_CODE_SO);
4811 str->iso2022.register_left = 1;
4816 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4817 unsigned_char_dynarr *dst, unsigned int *flags)
4819 unsigned char charmask;
4820 Lisp_Coding_System* codesys = str->codesys;
4821 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4823 Lisp_Object charset = str->iso2022.current_charset;
4824 int half = str->iso2022.current_half;
4825 unsigned int byte1, byte2;
4829 restore_left_to_right_direction (codesys, dst, flags, 0);
4831 /* Make sure G0 contains ASCII */
4832 if ((ch > ' ' && ch < ISO_CODE_DEL)
4833 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4835 ensure_normal_shift (str, dst);
4836 iso2022_designate (Vcharset_ascii, 0, str, dst);
4839 /* If necessary, restore everything to the default state
4841 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4843 restore_left_to_right_direction (codesys, dst, flags, 0);
4845 ensure_normal_shift (str, dst);
4847 for (i = 0; i < 4; i++)
4849 Lisp_Object initial_charset =
4850 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4851 iso2022_designate (initial_charset, i, str, dst);
4856 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4857 Dynarr_add (dst, '\r');
4858 if (eol_type != EOL_CR)
4859 Dynarr_add (dst, ch);
4863 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4864 && fit_to_be_escape_quoted (ch))
4865 Dynarr_add (dst, ISO_CODE_ESC);
4866 Dynarr_add (dst, ch);
4869 else if ( (0x80 <= ch) && (ch <= 0x9f) )
4871 charmask = (half == 0 ? 0x00 : 0x80);
4873 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4874 && fit_to_be_escape_quoted (ch))
4875 Dynarr_add (dst, ISO_CODE_ESC);
4876 /* you asked for it ... */
4877 Dynarr_add (dst, ch);
4883 /* Now determine which register to use. */
4885 for (i = 0; i < 4; i++)
4889 if ((CHARSETP (charset = str->iso2022.charset[i])
4890 && ((code_point = charset_code_point (charset, ch)) >= 0))
4894 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
4895 && ((code_point = charset_code_point (charset, ch)) >= 0)))
4897 if (XCHARSET_DIMENSION (charset) == 1)
4902 else /* if (XCHARSET_DIMENSION (charset) == 2) */
4904 byte1 = code_point >> 8;
4905 byte2 = code_point & 255;
4913 Lisp_Object original_default_coded_charset_priority_list
4914 = Vdefault_coded_charset_priority_list;
4916 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
4918 BREAKUP_CHAR (ch, charset, byte1, byte2);
4919 if (XCHARSET_FINAL (charset))
4921 Vdefault_coded_charset_priority_list
4922 = Fcdr (Fmemq (XCHARSET_NAME (charset),
4923 Vdefault_coded_charset_priority_list));
4925 BREAKUP_CHAR (ch, charset, byte1, byte2);
4926 if (!XCHARSET_FINAL (charset))
4928 charset = Vcharset_ascii;
4932 Vdefault_coded_charset_priority_list
4933 = original_default_coded_charset_priority_list;
4935 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4936 codesys, dst, flags, 0);
4940 if (XCHARSET_GRAPHIC (charset) != 0)
4942 if (!NILP (str->iso2022.charset[1]) &&
4943 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4944 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4946 else if (!NILP (str->iso2022.charset[2]))
4948 else if (!NILP (str->iso2022.charset[3]))
4957 iso2022_designate (charset, reg, str, dst);
4959 /* Now invoke that register. */
4963 ensure_normal_shift (str, dst);
4967 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4969 ensure_shift_out (str, dst);
4976 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4978 Dynarr_add (dst, ISO_CODE_ESC);
4979 Dynarr_add (dst, 'N');
4984 Dynarr_add (dst, ISO_CODE_SS2);
4989 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4991 Dynarr_add (dst, ISO_CODE_ESC);
4992 Dynarr_add (dst, 'O');
4997 Dynarr_add (dst, ISO_CODE_SS3);
5005 charmask = (half == 0 ? 0x00 : 0x80);
5007 switch (XCHARSET_DIMENSION (charset))
5010 Dynarr_add (dst, byte1 | charmask);
5013 Dynarr_add (dst, byte1 | charmask);
5014 Dynarr_add (dst, byte2 | charmask);
5020 str->iso2022.current_charset = charset;
5021 str->iso2022.current_half = half;
5025 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5026 unsigned int *flags)
5028 Lisp_Coding_System* codesys = str->codesys;
5031 restore_left_to_right_direction (codesys, dst, flags, 0);
5032 ensure_normal_shift (str, dst);
5033 for (i = 0; i < 4; i++)
5035 Lisp_Object initial_charset
5036 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5037 iso2022_designate (initial_charset, i, str, dst);
5042 /************************************************************************/
5043 /* No-conversion methods */
5044 /************************************************************************/
5046 /* This is used when reading in "binary" files -- i.e. files that may
5047 contain all 256 possible byte values and that are not to be
5048 interpreted as being in any particular decoding. */
5050 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5051 unsigned_char_dynarr *dst, unsigned int n)
5054 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5055 unsigned int flags = str->flags;
5056 unsigned int ch = str->ch;
5057 eol_type_t eol_type = str->eol_type;
5063 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5064 DECODE_ADD_BINARY_CHAR (c, dst);
5065 label_continue_loop:;
5068 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5075 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5076 unsigned_char_dynarr *dst, unsigned int n)
5079 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5080 unsigned int flags = str->flags;
5081 unsigned int ch = str->ch;
5082 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5084 unsigned char char_boundary = str->iso2022.current_char_boundary;
5091 if (char_boundary == 0)
5097 else if ( c >= 0xf8 )
5102 else if ( c >= 0xf0 )
5107 else if ( c >= 0xe0 )
5112 else if ( c >= 0xc0 )
5122 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5123 Dynarr_add (dst, '\r');
5124 if (eol_type != EOL_CR)
5125 Dynarr_add (dst, c);
5128 Dynarr_add (dst, c);
5131 else if (char_boundary == 1)
5133 ch = ( ch << 6 ) | ( c & 0x3f );
5134 Dynarr_add (dst, ch & 0xff);
5139 ch = ( ch << 6 ) | ( c & 0x3f );
5142 #else /* not UTF2000 */
5145 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5146 Dynarr_add (dst, '\r');
5147 if (eol_type != EOL_CR)
5148 Dynarr_add (dst, '\n');
5151 else if (BYTE_ASCII_P (c))
5154 Dynarr_add (dst, c);
5156 else if (BUFBYTE_LEADING_BYTE_P (c))
5159 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5160 c == LEADING_BYTE_CONTROL_1)
5163 Dynarr_add (dst, '~'); /* untranslatable character */
5167 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5168 Dynarr_add (dst, c);
5169 else if (ch == LEADING_BYTE_CONTROL_1)
5172 Dynarr_add (dst, c - 0x20);
5174 /* else it should be the second or third byte of an
5175 untranslatable character, so ignore it */
5178 #endif /* not UTF2000 */
5184 str->iso2022.current_char_boundary = char_boundary;
5189 /************************************************************************/
5190 /* Simple internal/external functions */
5191 /************************************************************************/
5193 static Extbyte_dynarr *conversion_out_dynarr;
5194 static Bufbyte_dynarr *conversion_in_dynarr;
5196 /* Determine coding system from coding format */
5198 /* #### not correct for all values of `fmt'! */
5200 external_data_format_to_coding_system (enum external_data_format fmt)
5204 case FORMAT_FILENAME:
5205 case FORMAT_TERMINAL:
5206 if (EQ (Vfile_name_coding_system, Qnil) ||
5207 EQ (Vfile_name_coding_system, Qbinary))
5210 return Fget_coding_system (Vfile_name_coding_system);
5213 return Fget_coding_system (Qctext);
5221 convert_to_external_format (CONST Bufbyte *ptr,
5224 enum external_data_format fmt)
5226 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5228 if (!conversion_out_dynarr)
5229 conversion_out_dynarr = Dynarr_new (Extbyte);
5231 Dynarr_reset (conversion_out_dynarr);
5233 if (NILP (coding_system))
5235 CONST Bufbyte *end = ptr + len;
5241 (*ptr < 0xc0) ? *ptr :
5242 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5245 (BYTE_ASCII_P (*ptr)) ? *ptr :
5246 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5247 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5250 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5254 #ifdef ERROR_CHECK_BUFPOS
5255 assert (ptr == end);
5260 Lisp_Object instream, outstream, da_outstream;
5261 Lstream *istr, *ostr;
5262 struct gcpro gcpro1, gcpro2, gcpro3;
5263 char tempbuf[1024]; /* some random amount */
5265 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5266 da_outstream = make_dynarr_output_stream
5267 ((unsigned_char_dynarr *) conversion_out_dynarr);
5269 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5270 istr = XLSTREAM (instream);
5271 ostr = XLSTREAM (outstream);
5272 GCPRO3 (instream, outstream, da_outstream);
5275 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5278 Lstream_write (ostr, tempbuf, size_in_bytes);
5280 Lstream_close (istr);
5281 Lstream_close (ostr);
5283 Lstream_delete (istr);
5284 Lstream_delete (ostr);
5285 Lstream_delete (XLSTREAM (da_outstream));
5288 *len_out = Dynarr_length (conversion_out_dynarr);
5289 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5290 return Dynarr_atp (conversion_out_dynarr, 0);
5294 convert_from_external_format (CONST Extbyte *ptr,
5297 enum external_data_format fmt)
5299 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5301 if (!conversion_in_dynarr)
5302 conversion_in_dynarr = Dynarr_new (Bufbyte);
5304 Dynarr_reset (conversion_in_dynarr);
5306 if (NILP (coding_system))
5308 CONST Extbyte *end = ptr + len;
5309 for (; ptr < end; ptr++)
5312 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5317 Lisp_Object instream, outstream, da_outstream;
5318 Lstream *istr, *ostr;
5319 struct gcpro gcpro1, gcpro2, gcpro3;
5320 char tempbuf[1024]; /* some random amount */
5322 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5323 da_outstream = make_dynarr_output_stream
5324 ((unsigned_char_dynarr *) conversion_in_dynarr);
5326 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5327 istr = XLSTREAM (instream);
5328 ostr = XLSTREAM (outstream);
5329 GCPRO3 (instream, outstream, da_outstream);
5332 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5335 Lstream_write (ostr, tempbuf, size_in_bytes);
5337 Lstream_close (istr);
5338 Lstream_close (ostr);
5340 Lstream_delete (istr);
5341 Lstream_delete (ostr);
5342 Lstream_delete (XLSTREAM (da_outstream));
5345 *len_out = Dynarr_length (conversion_in_dynarr);
5346 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5347 return Dynarr_atp (conversion_in_dynarr, 0);
5351 /************************************************************************/
5352 /* Initialization */
5353 /************************************************************************/
5356 syms_of_file_coding (void)
5358 deferror (&Qcoding_system_error, "coding-system-error",
5359 "Coding-system error", Qio_error);
5361 DEFSUBR (Fcoding_system_p);
5362 DEFSUBR (Ffind_coding_system);
5363 DEFSUBR (Fget_coding_system);
5364 DEFSUBR (Fcoding_system_list);
5365 DEFSUBR (Fcoding_system_name);
5366 DEFSUBR (Fmake_coding_system);
5367 DEFSUBR (Fcopy_coding_system);
5368 DEFSUBR (Fdefine_coding_system_alias);
5369 DEFSUBR (Fsubsidiary_coding_system);
5371 DEFSUBR (Fcoding_system_type);
5372 DEFSUBR (Fcoding_system_doc_string);
5374 DEFSUBR (Fcoding_system_charset);
5376 DEFSUBR (Fcoding_system_property);
5378 DEFSUBR (Fcoding_category_list);
5379 DEFSUBR (Fset_coding_priority_list);
5380 DEFSUBR (Fcoding_priority_list);
5381 DEFSUBR (Fset_coding_category_system);
5382 DEFSUBR (Fcoding_category_system);
5384 DEFSUBR (Fdetect_coding_region);
5385 DEFSUBR (Fdecode_coding_region);
5386 DEFSUBR (Fencode_coding_region);
5388 DEFSUBR (Fdecode_shift_jis_char);
5389 DEFSUBR (Fencode_shift_jis_char);
5390 DEFSUBR (Fdecode_big5_char);
5391 DEFSUBR (Fencode_big5_char);
5393 defsymbol (&Qcoding_systemp, "coding-system-p");
5394 defsymbol (&Qno_conversion, "no-conversion");
5395 defsymbol (&Qraw_text, "raw-text");
5397 defsymbol (&Qbig5, "big5");
5398 defsymbol (&Qshift_jis, "shift-jis");
5399 defsymbol (&Qucs4, "ucs-4");
5400 defsymbol (&Qutf8, "utf-8");
5401 defsymbol (&Qccl, "ccl");
5402 defsymbol (&Qiso2022, "iso2022");
5404 defsymbol (&Qmnemonic, "mnemonic");
5405 defsymbol (&Qeol_type, "eol-type");
5406 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5407 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5409 defsymbol (&Qcr, "cr");
5410 defsymbol (&Qlf, "lf");
5411 defsymbol (&Qcrlf, "crlf");
5412 defsymbol (&Qeol_cr, "eol-cr");
5413 defsymbol (&Qeol_lf, "eol-lf");
5414 defsymbol (&Qeol_crlf, "eol-crlf");
5416 defsymbol (&Qcharset_g0, "charset-g0");
5417 defsymbol (&Qcharset_g1, "charset-g1");
5418 defsymbol (&Qcharset_g2, "charset-g2");
5419 defsymbol (&Qcharset_g3, "charset-g3");
5420 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5421 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5422 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5423 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5424 defsymbol (&Qno_iso6429, "no-iso6429");
5425 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5426 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5428 defsymbol (&Qshort, "short");
5429 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5430 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5431 defsymbol (&Qseven, "seven");
5432 defsymbol (&Qlock_shift, "lock-shift");
5433 defsymbol (&Qescape_quoted, "escape-quoted");
5435 defsymbol (&Qencode, "encode");
5436 defsymbol (&Qdecode, "decode");
5439 defsymbol (&Qctext, "ctext");
5440 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5442 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5444 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5446 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5448 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5450 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5452 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5454 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5456 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5459 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5464 lstream_type_create_file_coding (void)
5466 LSTREAM_HAS_METHOD (decoding, reader);
5467 LSTREAM_HAS_METHOD (decoding, writer);
5468 LSTREAM_HAS_METHOD (decoding, rewinder);
5469 LSTREAM_HAS_METHOD (decoding, seekable_p);
5470 LSTREAM_HAS_METHOD (decoding, flusher);
5471 LSTREAM_HAS_METHOD (decoding, closer);
5472 LSTREAM_HAS_METHOD (decoding, marker);
5474 LSTREAM_HAS_METHOD (encoding, reader);
5475 LSTREAM_HAS_METHOD (encoding, writer);
5476 LSTREAM_HAS_METHOD (encoding, rewinder);
5477 LSTREAM_HAS_METHOD (encoding, seekable_p);
5478 LSTREAM_HAS_METHOD (encoding, flusher);
5479 LSTREAM_HAS_METHOD (encoding, closer);
5480 LSTREAM_HAS_METHOD (encoding, marker);
5484 vars_of_file_coding (void)
5488 fcd = xnew (struct file_coding_dump);
5489 dumpstruct (&fcd, &fcd_description);
5491 /* Initialize to something reasonable ... */
5492 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5494 fcd->coding_category_system[i] = Qnil;
5495 fcd->coding_category_by_priority[i] = i;
5498 Fprovide (intern ("file-coding"));
5500 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5501 Coding system used for TTY keyboard input.
5502 Not used under a windowing system.
5504 Vkeyboard_coding_system = Qnil;
5506 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5507 Coding system used for TTY display output.
5508 Not used under a windowing system.
5510 Vterminal_coding_system = Qnil;
5512 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5513 Overriding coding system used when writing a file or process.
5514 You should *bind* this, not set it. If this is non-nil, it specifies
5515 the coding system that will be used when a file or process is read
5516 in, and overrides `buffer-file-coding-system-for-read',
5517 `insert-file-contents-pre-hook', etc. Use those variables instead of
5518 this one for permanent changes to the environment.
5520 Vcoding_system_for_read = Qnil;
5522 DEFVAR_LISP ("coding-system-for-write",
5523 &Vcoding_system_for_write /*
5524 Overriding coding system used when writing a file or process.
5525 You should *bind* this, not set it. If this is non-nil, it specifies
5526 the coding system that will be used when a file or process is wrote
5527 in, and overrides `buffer-file-coding-system',
5528 `write-region-pre-hook', etc. Use those variables instead of this one
5529 for permanent changes to the environment.
5531 Vcoding_system_for_write = Qnil;
5533 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5534 Coding system used to convert pathnames when accessing files.
5536 Vfile_name_coding_system = Qnil;
5538 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5539 Non-nil means the buffer contents are regarded as multi-byte form
5540 of characters, not a binary code. This affects the display, file I/O,
5541 and behaviors of various editing commands.
5543 Setting this to nil does not do anything.
5545 enable_multibyte_characters = 1;
5549 complex_vars_of_file_coding (void)
5551 staticpro (&Vcoding_system_hash_table);
5552 Vcoding_system_hash_table =
5553 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5555 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5556 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5558 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5560 struct codesys_prop csp; \
5562 csp.prop_type = (Prop_Type); \
5563 Dynarr_add (the_codesys_prop_dynarr, csp); \
5566 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5567 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5568 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5569 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5570 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5571 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5572 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5574 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5575 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5576 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5577 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5578 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5579 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5580 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5581 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5582 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5583 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5584 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5585 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5586 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5587 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5588 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5589 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5590 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5592 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5593 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5595 /* Need to create this here or we're really screwed. */
5597 (Qraw_text, Qno_conversion,
5598 build_string ("Raw text, which means it converts only line-break-codes."),
5599 list2 (Qmnemonic, build_string ("Raw")));
5602 (Qbinary, Qno_conversion,
5603 build_string ("Binary, which means it does not convert anything."),
5604 list4 (Qeol_type, Qlf,
5605 Qmnemonic, build_string ("Binary")));
5610 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5611 list2 (Qmnemonic, build_string ("UTF8")));
5614 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5616 /* Need this for bootstrapping */
5617 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5618 Fget_coding_system (Qraw_text);
5621 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5622 = Fget_coding_system (Qutf8);