1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
69 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 },
74 static const struct struct_description fcd_description = {
75 sizeof(struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qctext, Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
180 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
183 static int detect_coding_sjis (struct detection_state *st,
184 CONST unsigned char *src,
186 static void decode_coding_sjis (Lstream *decoding,
187 CONST unsigned char *src,
188 unsigned_char_dynarr *dst,
190 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
191 unsigned_char_dynarr *dst, unsigned int *flags);
192 void char_finish_shift_jis (struct encoding_stream *str,
193 unsigned_char_dynarr *dst, unsigned int *flags);
195 static int detect_coding_big5 (struct detection_state *st,
196 CONST unsigned char *src,
198 static void decode_coding_big5 (Lstream *decoding,
199 CONST unsigned char *src,
200 unsigned_char_dynarr *dst, unsigned int n);
201 static void encode_coding_big5 (Lstream *encoding,
202 CONST unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
204 static int detect_coding_ucs4 (struct detection_state *st,
205 CONST unsigned char *src,
207 static void decode_coding_ucs4 (Lstream *decoding,
208 CONST unsigned char *src,
209 unsigned_char_dynarr *dst, unsigned int n);
210 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
211 unsigned_char_dynarr *dst, unsigned int *flags);
212 void char_finish_ucs4 (struct encoding_stream *str,
213 unsigned_char_dynarr *dst, unsigned int *flags);
215 static int detect_coding_utf8 (struct detection_state *st,
216 CONST unsigned char *src,
218 static void decode_coding_utf8 (Lstream *decoding,
219 CONST unsigned char *src,
220 unsigned_char_dynarr *dst, unsigned int n);
221 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
222 unsigned_char_dynarr *dst, unsigned int *flags);
223 void char_finish_utf8 (struct encoding_stream *str,
224 unsigned_char_dynarr *dst, unsigned int *flags);
226 static int postprocess_iso2022_mask (int mask);
227 static void reset_iso2022 (Lisp_Object coding_system,
228 struct iso2022_decoder *iso);
229 static int detect_coding_iso2022 (struct detection_state *st,
230 CONST unsigned char *src,
232 static void decode_coding_iso2022 (Lstream *decoding,
233 CONST unsigned char *src,
234 unsigned_char_dynarr *dst, unsigned int n);
235 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
236 unsigned_char_dynarr *dst, unsigned int *flags);
237 void char_finish_iso2022 (struct encoding_stream *str,
238 unsigned_char_dynarr *dst, unsigned int *flags);
240 static void decode_coding_no_conversion (Lstream *decoding,
241 CONST unsigned char *src,
242 unsigned_char_dynarr *dst,
244 static void encode_coding_no_conversion (Lstream *encoding,
245 CONST unsigned char *src,
246 unsigned_char_dynarr *dst,
248 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
249 unsigned_char_dynarr *dst, unsigned int n);
250 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
251 unsigned_char_dynarr *dst, unsigned int n);
253 typedef struct codesys_prop codesys_prop;
262 Dynarr_declare (codesys_prop);
263 } codesys_prop_dynarr;
265 static const struct lrecord_description codesys_prop_description_1[] = {
266 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
270 static const struct struct_description codesys_prop_description = {
271 sizeof(codesys_prop),
272 codesys_prop_description_1
275 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
276 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
280 static const struct struct_description codesys_prop_dynarr_description = {
281 sizeof(codesys_prop_dynarr),
282 codesys_prop_dynarr_description_1
285 codesys_prop_dynarr *the_codesys_prop_dynarr;
287 enum codesys_prop_enum
290 CODESYS_PROP_ISO2022,
295 /************************************************************************/
296 /* Coding system functions */
297 /************************************************************************/
299 static Lisp_Object mark_coding_system (Lisp_Object);
300 static void print_coding_system (Lisp_Object, Lisp_Object, int);
301 static void finalize_coding_system (void *header, int for_disksave);
304 static const struct lrecord_description ccs_description_1[] = {
305 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
309 static const struct struct_description ccs_description = {
310 sizeof(charset_conversion_spec),
314 static const struct lrecord_description ccsd_description_1[] = {
315 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
319 static const struct struct_description ccsd_description = {
320 sizeof(charset_conversion_spec_dynarr),
325 static const struct lrecord_description coding_system_description[] = {
326 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
327 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
328 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
330 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
331 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
332 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
333 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
338 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
339 mark_coding_system, print_coding_system,
340 finalize_coding_system,
341 0, 0, coding_system_description,
342 struct Lisp_Coding_System);
345 mark_coding_system (Lisp_Object obj)
347 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
349 mark_object (CODING_SYSTEM_NAME (codesys));
350 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
351 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
352 mark_object (CODING_SYSTEM_EOL_LF (codesys));
353 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
354 mark_object (CODING_SYSTEM_EOL_CR (codesys));
356 switch (CODING_SYSTEM_TYPE (codesys))
360 case CODESYS_ISO2022:
361 for (i = 0; i < 4; i++)
362 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
363 if (codesys->iso2022.input_conv)
365 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
367 struct charset_conversion_spec *ccs =
368 Dynarr_atp (codesys->iso2022.input_conv, i);
369 mark_object (ccs->from_charset);
370 mark_object (ccs->to_charset);
373 if (codesys->iso2022.output_conv)
375 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
377 struct charset_conversion_spec *ccs =
378 Dynarr_atp (codesys->iso2022.output_conv, i);
379 mark_object (ccs->from_charset);
380 mark_object (ccs->to_charset);
386 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
387 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
394 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
395 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
399 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
402 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
404 error ("printing unreadable object #<coding_system 0x%x>",
407 write_c_string ("#<coding_system ", printcharfun);
408 print_internal (c->name, printcharfun, 1);
409 write_c_string (">", printcharfun);
413 finalize_coding_system (void *header, int for_disksave)
415 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
416 /* Since coding systems never go away, this function is not
417 necessary. But it would be necessary if we changed things
418 so that coding systems could go away. */
419 if (!for_disksave) /* see comment in lstream.c */
421 switch (CODING_SYSTEM_TYPE (c))
424 case CODESYS_ISO2022:
425 if (c->iso2022.input_conv)
427 Dynarr_free (c->iso2022.input_conv);
428 c->iso2022.input_conv = 0;
430 if (c->iso2022.output_conv)
432 Dynarr_free (c->iso2022.output_conv);
433 c->iso2022.output_conv = 0;
444 symbol_to_eol_type (Lisp_Object symbol)
446 CHECK_SYMBOL (symbol);
447 if (NILP (symbol)) return EOL_AUTODETECT;
448 if (EQ (symbol, Qlf)) return EOL_LF;
449 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
450 if (EQ (symbol, Qcr)) return EOL_CR;
452 signal_simple_error ("Unrecognized eol type", symbol);
453 return EOL_AUTODETECT; /* not reached */
457 eol_type_to_symbol (enum eol_type type)
462 case EOL_LF: return Qlf;
463 case EOL_CRLF: return Qcrlf;
464 case EOL_CR: return Qcr;
465 case EOL_AUTODETECT: return Qnil;
470 setup_eol_coding_systems (Lisp_Coding_System *codesys)
472 Lisp_Object codesys_obj;
473 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
474 char *codesys_name = (char *) alloca (len + 7);
476 char *codesys_mnemonic=0;
478 Lisp_Object codesys_name_sym, sub_codesys_obj;
482 XSETCODING_SYSTEM (codesys_obj, codesys);
484 memcpy (codesys_name,
485 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
487 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
489 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
490 codesys_mnemonic = (char *) alloca (mlen + 7);
491 memcpy (codesys_mnemonic,
492 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
495 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
496 strcpy (codesys_name + len, "-" op_sys); \
498 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
499 codesys_name_sym = intern (codesys_name); \
500 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
501 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
503 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
504 build_string (codesys_mnemonic); \
505 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
508 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
509 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
510 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
513 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
514 Return t if OBJECT is a coding system.
515 A coding system is an object that defines how text containing multiple
516 character sets is encoded into a stream of (typically 8-bit) bytes.
517 The coding system is used to decode the stream into a series of
518 characters (which may be from multiple charsets) when the text is read
519 from a file or process, and is used to encode the text back into the
520 same format when it is written out to a file or process.
522 For example, many ISO2022-compliant coding systems (such as Compound
523 Text, which is used for inter-client data under the X Window System)
524 use escape sequences to switch between different charsets -- Japanese
525 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
526 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
527 `make-coding-system' for more information.
529 Coding systems are normally identified using a symbol, and the
530 symbol is accepted in place of the actual coding system object whenever
531 a coding system is called for. (This is similar to how faces work.)
535 return CODING_SYSTEMP (object) ? Qt : Qnil;
538 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
539 Retrieve the coding system of the given name.
541 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
542 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
543 If there is no such coding system, nil is returned. Otherwise the
544 associated coding system object is returned.
546 (coding_system_or_name))
548 if (CODING_SYSTEMP (coding_system_or_name))
549 return coding_system_or_name;
551 if (NILP (coding_system_or_name))
552 coding_system_or_name = Qbinary;
554 CHECK_SYMBOL (coding_system_or_name);
556 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
559 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
560 Retrieve the coding system of the given name.
561 Same as `find-coding-system' except that if there is no such
562 coding system, an error is signaled instead of returning nil.
566 Lisp_Object coding_system = Ffind_coding_system (name);
568 if (NILP (coding_system))
569 signal_simple_error ("No such coding system", name);
570 return coding_system;
573 /* We store the coding systems in hash tables with the names as the key and the
574 actual coding system object as the value. Occasionally we need to use them
575 in a list format. These routines provide us with that. */
576 struct coding_system_list_closure
578 Lisp_Object *coding_system_list;
582 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
583 void *coding_system_list_closure)
585 /* This function can GC */
586 struct coding_system_list_closure *cscl =
587 (struct coding_system_list_closure *) coding_system_list_closure;
588 Lisp_Object *coding_system_list = cscl->coding_system_list;
590 *coding_system_list = Fcons (key, *coding_system_list);
594 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
595 Return a list of the names of all defined coding systems.
599 Lisp_Object coding_system_list = Qnil;
601 struct coding_system_list_closure coding_system_list_closure;
603 GCPRO1 (coding_system_list);
604 coding_system_list_closure.coding_system_list = &coding_system_list;
605 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
606 &coding_system_list_closure);
609 return coding_system_list;
612 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
613 Return the name of the given coding system.
617 coding_system = Fget_coding_system (coding_system);
618 return XCODING_SYSTEM_NAME (coding_system);
621 static Lisp_Coding_System *
622 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
624 Lisp_Coding_System *codesys =
625 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
627 zero_lcrecord (codesys);
628 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
629 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
630 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
631 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
632 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
633 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
634 CODING_SYSTEM_TYPE (codesys) = type;
635 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
637 if (type == CODESYS_ISO2022)
640 for (i = 0; i < 4; i++)
641 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
643 else if (type == CODESYS_CCL)
645 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
646 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
649 CODING_SYSTEM_NAME (codesys) = name;
655 /* Given a list of charset conversion specs as specified in a Lisp
656 program, parse it into STORE_HERE. */
659 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
660 Lisp_Object spec_list)
664 EXTERNAL_LIST_LOOP (rest, spec_list)
666 Lisp_Object car = XCAR (rest);
667 Lisp_Object from, to;
668 struct charset_conversion_spec spec;
670 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
671 signal_simple_error ("Invalid charset conversion spec", car);
672 from = Fget_charset (XCAR (car));
673 to = Fget_charset (XCAR (XCDR (car)));
674 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
675 signal_simple_error_2
676 ("Attempted conversion between different charset types",
678 spec.from_charset = from;
679 spec.to_charset = to;
681 Dynarr_add (store_here, spec);
685 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
686 specs, return the equivalent as the Lisp programmer would see it.
688 If LOAD_HERE is 0, return Qnil. */
691 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
698 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
700 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
701 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
704 return Fnreverse (result);
709 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
710 Register symbol NAME as a coding system.
712 TYPE describes the conversion method used and should be one of
715 Automatic conversion. XEmacs attempts to detect the coding system
718 No conversion. Use this for binary files and such. On output,
719 graphic characters that are not in ASCII or Latin-1 will be
720 replaced by a ?. (For a no-conversion-encoded buffer, these
721 characters will only be present if you explicitly insert them.)
723 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
725 ISO 10646 UCS-4 encoding.
727 ISO 10646 UTF-8 encoding.
729 Any ISO2022-compliant encoding. Among other things, this includes
730 JIS (the Japanese encoding commonly used for e-mail), EUC (the
731 standard Unix encoding for Japanese and other languages), and
732 Compound Text (the encoding used in X11). You can specify more
733 specific information about the conversion with the FLAGS argument.
735 Big5 (the encoding commonly used for Taiwanese).
737 The conversion is performed using a user-written pseudo-code
738 program. CCL (Code Conversion Language) is the name of this
741 Write out or read in the raw contents of the memory representing
742 the buffer's text. This is primarily useful for debugging
743 purposes, and is only enabled when XEmacs has been compiled with
744 DEBUG_XEMACS defined (via the --debug configure option).
745 WARNING: Reading in a file using 'internal conversion can result
746 in an internal inconsistency in the memory representing a
747 buffer's text, which will produce unpredictable results and may
748 cause XEmacs to crash. Under normal circumstances you should
749 never use 'internal conversion.
751 DOC-STRING is a string describing the coding system.
753 PROPS is a property list, describing the specific nature of the
754 character set. Recognized properties are:
757 String to be displayed in the modeline when this coding system is
761 End-of-line conversion to be used. It should be one of
764 Automatically detect the end-of-line type (LF, CRLF,
765 or CR). Also generate subsidiary coding systems named
766 `NAME-unix', `NAME-dos', and `NAME-mac', that are
767 identical to this coding system but have an EOL-TYPE
768 value of 'lf, 'crlf, and 'cr, respectively.
770 The end of a line is marked externally using ASCII LF.
771 Since this is also the way that XEmacs represents an
772 end-of-line internally, specifying this option results
773 in no end-of-line conversion. This is the standard
774 format for Unix text files.
776 The end of a line is marked externally using ASCII
777 CRLF. This is the standard format for MS-DOS text
780 The end of a line is marked externally using ASCII CR.
781 This is the standard format for Macintosh text files.
783 Automatically detect the end-of-line type but do not
784 generate subsidiary coding systems. (This value is
785 converted to nil when stored internally, and
786 `coding-system-property' will return nil.)
788 'post-read-conversion
789 Function called after a file has been read in, to perform the
790 decoding. Called with two arguments, BEG and END, denoting
791 a region of the current buffer to be decoded.
793 'pre-write-conversion
794 Function called before a file is written out, to perform the
795 encoding. Called with two arguments, BEG and END, denoting
796 a region of the current buffer to be encoded.
799 The following additional properties are recognized if TYPE is 'iso2022:
805 The character set initially designated to the G0 - G3 registers.
806 The value should be one of
808 -- A charset object (designate that character set)
809 -- nil (do not ever use this register)
810 -- t (no character set is initially designated to
811 the register, but may be later on; this automatically
812 sets the corresponding `force-g*-on-output' property)
818 If non-nil, send an explicit designation sequence on output before
819 using the specified register.
822 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
823 "ESC $ B" on output in place of the full designation sequences
824 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
827 If non-nil, don't designate ASCII to G0 at each end of line on output.
828 Setting this to non-nil also suppresses other state-resetting that
829 normally happens at the end of a line.
832 If non-nil, don't designate ASCII to G0 before control chars on output.
835 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
839 If non-nil, use locking-shift (SO/SI) instead of single-shift
840 or designation by escape sequence.
843 If non-nil, don't use ISO6429's direction specification.
846 If non-nil, literal control characters that are the same as
847 the beginning of a recognized ISO2022 or ISO6429 escape sequence
848 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
849 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
850 so that they can be properly distinguished from an escape sequence.
851 (Note that doing this results in a non-portable encoding.) This
852 encoding flag is used for byte-compiled files. Note that ESC
853 is a good choice for a quoting character because there are no
854 escape sequences whose second byte is a character from the Control-0
855 or Control-1 character sets; this is explicitly disallowed by the
858 'input-charset-conversion
859 A list of conversion specifications, specifying conversion of
860 characters in one charset to another when decoding is performed.
861 Each specification is a list of two elements: the source charset,
862 and the destination charset.
864 'output-charset-conversion
865 A list of conversion specifications, specifying conversion of
866 characters in one charset to another when encoding is performed.
867 The form of each specification is the same as for
868 'input-charset-conversion.
871 The following additional properties are recognized (and required)
875 CCL program used for decoding (converting to internal format).
878 CCL program used for encoding (converting to external format).
880 (name, type, doc_string, props))
882 Lisp_Coding_System *codesys;
883 Lisp_Object rest, key, value;
884 enum coding_system_type ty;
885 int need_to_setup_eol_systems = 1;
887 /* Convert type to constant */
888 if (NILP (type) || EQ (type, Qundecided))
889 { ty = CODESYS_AUTODETECT; }
891 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
892 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
893 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
894 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
895 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
896 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
898 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
900 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
903 signal_simple_error ("Invalid coding system type", type);
907 codesys = allocate_coding_system (ty, name);
909 if (NILP (doc_string))
910 doc_string = build_string ("");
912 CHECK_STRING (doc_string);
913 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
915 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
917 if (EQ (key, Qmnemonic))
920 CHECK_STRING (value);
921 CODING_SYSTEM_MNEMONIC (codesys) = value;
924 else if (EQ (key, Qeol_type))
926 need_to_setup_eol_systems = NILP (value);
929 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
932 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
933 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
935 else if (ty == CODESYS_ISO2022)
937 #define FROB_INITIAL_CHARSET(charset_num) \
938 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
939 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
941 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
942 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
943 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
944 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
946 #define FROB_FORCE_CHARSET(charset_num) \
947 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
949 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
950 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
951 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
952 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
954 #define FROB_BOOLEAN_PROPERTY(prop) \
955 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
957 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
958 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
959 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
960 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
961 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
962 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
963 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
965 else if (EQ (key, Qinput_charset_conversion))
967 codesys->iso2022.input_conv =
968 Dynarr_new (charset_conversion_spec);
969 parse_charset_conversion_specs (codesys->iso2022.input_conv,
972 else if (EQ (key, Qoutput_charset_conversion))
974 codesys->iso2022.output_conv =
975 Dynarr_new (charset_conversion_spec);
976 parse_charset_conversion_specs (codesys->iso2022.output_conv,
980 signal_simple_error ("Unrecognized property", key);
982 else if (EQ (type, Qccl))
984 if (EQ (key, Qdecode))
986 CHECK_VECTOR (value);
987 CODING_SYSTEM_CCL_DECODE (codesys) = value;
989 else if (EQ (key, Qencode))
991 CHECK_VECTOR (value);
992 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
995 signal_simple_error ("Unrecognized property", key);
999 signal_simple_error ("Unrecognized property", key);
1002 if (need_to_setup_eol_systems)
1003 setup_eol_coding_systems (codesys);
1006 Lisp_Object codesys_obj;
1007 XSETCODING_SYSTEM (codesys_obj, codesys);
1008 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1013 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1014 Copy OLD-CODING-SYSTEM to NEW-NAME.
1015 If NEW-NAME does not name an existing coding system, a new one will
1018 (old_coding_system, new_name))
1020 Lisp_Object new_coding_system;
1021 old_coding_system = Fget_coding_system (old_coding_system);
1022 new_coding_system = Ffind_coding_system (new_name);
1023 if (NILP (new_coding_system))
1025 XSETCODING_SYSTEM (new_coding_system,
1026 allocate_coding_system
1027 (XCODING_SYSTEM_TYPE (old_coding_system),
1029 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1033 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1034 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1035 memcpy (((char *) to ) + sizeof (to->header),
1036 ((char *) from) + sizeof (from->header),
1037 sizeof (*from) - sizeof (from->header));
1038 to->name = new_name;
1040 return new_coding_system;
1043 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1044 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1046 (alias, coding_system))
1048 CHECK_SYMBOL (alias);
1049 if (!NILP (Ffind_coding_system (alias)))
1050 signal_simple_error ("Symbol already names a coding system", alias);
1051 coding_system = Fget_coding_system (coding_system);
1052 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1054 /* Set up aliases for subsidiaries. */
1055 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1058 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1059 #define FROB(type, name) \
1061 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1062 if (!NILP (subsidiary)) \
1063 Fdefine_coding_system_alias \
1064 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1067 FROB (CRLF, "-dos");
1071 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1072 but it doesn't look intentional, so I'd rather return something
1073 meaningful or nothing at all. */
1078 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1080 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1081 Lisp_Object new_coding_system;
1083 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1084 return coding_system;
1088 case EOL_AUTODETECT: return coding_system;
1089 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1090 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1091 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1095 return NILP (new_coding_system) ? coding_system : new_coding_system;
1098 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1099 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1101 (coding_system, eol_type))
1103 coding_system = Fget_coding_system (coding_system);
1105 return subsidiary_coding_system (coding_system,
1106 symbol_to_eol_type (eol_type));
1110 /************************************************************************/
1111 /* Coding system accessors */
1112 /************************************************************************/
1114 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1115 Return the doc string for CODING-SYSTEM.
1119 coding_system = Fget_coding_system (coding_system);
1120 return XCODING_SYSTEM_DOC_STRING (coding_system);
1123 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1124 Return the type of CODING-SYSTEM.
1128 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1131 case CODESYS_AUTODETECT: return Qundecided;
1133 case CODESYS_SHIFT_JIS: return Qshift_jis;
1134 case CODESYS_ISO2022: return Qiso2022;
1135 case CODESYS_BIG5: return Qbig5;
1136 case CODESYS_UCS4: return Qucs4;
1137 case CODESYS_UTF8: return Qutf8;
1138 case CODESYS_CCL: return Qccl;
1140 case CODESYS_NO_CONVERSION: return Qno_conversion;
1142 case CODESYS_INTERNAL: return Qinternal;
1149 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1152 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1154 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1157 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1158 Return initial charset of CODING-SYSTEM designated to GNUM.
1161 (coding_system, gnum))
1163 coding_system = Fget_coding_system (coding_system);
1166 return coding_system_charset (coding_system, XINT (gnum));
1170 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1171 Return the PROP property of CODING-SYSTEM.
1173 (coding_system, prop))
1176 enum coding_system_type type;
1178 coding_system = Fget_coding_system (coding_system);
1179 CHECK_SYMBOL (prop);
1180 type = XCODING_SYSTEM_TYPE (coding_system);
1182 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1183 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1186 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1188 case CODESYS_PROP_ALL_OK:
1191 case CODESYS_PROP_ISO2022:
1192 if (type != CODESYS_ISO2022)
1194 ("Property only valid in ISO2022 coding systems",
1198 case CODESYS_PROP_CCL:
1199 if (type != CODESYS_CCL)
1201 ("Property only valid in CCL coding systems",
1211 signal_simple_error ("Unrecognized property", prop);
1213 if (EQ (prop, Qname))
1214 return XCODING_SYSTEM_NAME (coding_system);
1215 else if (EQ (prop, Qtype))
1216 return Fcoding_system_type (coding_system);
1217 else if (EQ (prop, Qdoc_string))
1218 return XCODING_SYSTEM_DOC_STRING (coding_system);
1219 else if (EQ (prop, Qmnemonic))
1220 return XCODING_SYSTEM_MNEMONIC (coding_system);
1221 else if (EQ (prop, Qeol_type))
1222 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1223 else if (EQ (prop, Qeol_lf))
1224 return XCODING_SYSTEM_EOL_LF (coding_system);
1225 else if (EQ (prop, Qeol_crlf))
1226 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1227 else if (EQ (prop, Qeol_cr))
1228 return XCODING_SYSTEM_EOL_CR (coding_system);
1229 else if (EQ (prop, Qpost_read_conversion))
1230 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1231 else if (EQ (prop, Qpre_write_conversion))
1232 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1234 else if (type == CODESYS_ISO2022)
1236 if (EQ (prop, Qcharset_g0))
1237 return coding_system_charset (coding_system, 0);
1238 else if (EQ (prop, Qcharset_g1))
1239 return coding_system_charset (coding_system, 1);
1240 else if (EQ (prop, Qcharset_g2))
1241 return coding_system_charset (coding_system, 2);
1242 else if (EQ (prop, Qcharset_g3))
1243 return coding_system_charset (coding_system, 3);
1245 #define FORCE_CHARSET(charset_num) \
1246 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1247 (coding_system, charset_num) ? Qt : Qnil)
1249 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1250 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1251 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1252 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1254 #define LISP_BOOLEAN(prop) \
1255 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1257 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1258 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1259 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1260 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1261 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1262 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1263 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1265 else if (EQ (prop, Qinput_charset_conversion))
1267 unparse_charset_conversion_specs
1268 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1269 else if (EQ (prop, Qoutput_charset_conversion))
1271 unparse_charset_conversion_specs
1272 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1276 else if (type == CODESYS_CCL)
1278 if (EQ (prop, Qdecode))
1279 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1280 else if (EQ (prop, Qencode))
1281 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1289 return Qnil; /* not reached */
1293 /************************************************************************/
1294 /* Coding category functions */
1295 /************************************************************************/
1298 decode_coding_category (Lisp_Object symbol)
1302 CHECK_SYMBOL (symbol);
1303 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1304 if (EQ (coding_category_symbol[i], symbol))
1307 signal_simple_error ("Unrecognized coding category", symbol);
1308 return 0; /* not reached */
1311 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1312 Return a list of all recognized coding categories.
1317 Lisp_Object list = Qnil;
1319 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1320 list = Fcons (coding_category_symbol[i], list);
1324 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1325 Change the priority order of the coding categories.
1326 LIST should be list of coding categories, in descending order of
1327 priority. Unspecified coding categories will be lower in priority
1328 than all specified ones, in the same relative order they were in
1333 int category_to_priority[CODING_CATEGORY_LAST + 1];
1337 /* First generate a list that maps coding categories to priorities. */
1339 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1340 category_to_priority[i] = -1;
1342 /* Highest priority comes from the specified list. */
1344 EXTERNAL_LIST_LOOP (rest, list)
1346 int cat = decode_coding_category (XCAR (rest));
1348 if (category_to_priority[cat] >= 0)
1349 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1350 category_to_priority[cat] = i++;
1353 /* Now go through the existing categories by priority to retrieve
1354 the categories not yet specified and preserve their priority
1356 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1358 int cat = fcd->coding_category_by_priority[j];
1359 if (category_to_priority[cat] < 0)
1360 category_to_priority[cat] = i++;
1363 /* Now we need to construct the inverse of the mapping we just
1366 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1367 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1369 /* Phew! That was confusing. */
1373 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1374 Return a list of coding categories in descending order of priority.
1379 Lisp_Object list = Qnil;
1381 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1382 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1387 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1388 Change the coding system associated with a coding category.
1390 (coding_category, coding_system))
1392 int cat = decode_coding_category (coding_category);
1394 coding_system = Fget_coding_system (coding_system);
1395 fcd->coding_category_system[cat] = coding_system;
1399 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1400 Return the coding system associated with a coding category.
1404 int cat = decode_coding_category (coding_category);
1405 Lisp_Object sys = fcd->coding_category_system[cat];
1408 return XCODING_SYSTEM_NAME (sys);
1413 /************************************************************************/
1414 /* Detecting the encoding of data */
1415 /************************************************************************/
1417 struct detection_state
1419 enum eol_type eol_type;
1455 struct iso2022_decoder iso;
1457 int high_byte_count;
1458 unsigned int saw_single_shift:1;
1471 acceptable_control_char_p (int c)
1475 /* Allow and ignore control characters that you might
1476 reasonably see in a text file */
1481 case 8: /* backspace */
1482 case 11: /* vertical tab */
1483 case 12: /* form feed */
1484 case 26: /* MS-DOS C-z junk */
1485 case 31: /* '^_' -- for info */
1493 mask_has_at_most_one_bit_p (int mask)
1495 /* Perhaps the only thing useful you learn from intensive Microsoft
1496 technical interviews */
1497 return (mask & (mask - 1)) == 0;
1500 static enum eol_type
1501 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1511 if (st->eol.just_saw_cr)
1513 else if (st->eol.seen_anything)
1516 else if (st->eol.just_saw_cr)
1519 st->eol.just_saw_cr = 1;
1521 st->eol.just_saw_cr = 0;
1522 st->eol.seen_anything = 1;
1525 return EOL_AUTODETECT;
1528 /* Attempt to determine the encoding and EOL type of the given text.
1529 Before calling this function for the first type, you must initialize
1530 st->eol_type as appropriate and initialize st->mask to ~0.
1532 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1535 st->mask holds the determined coding category mask, or ~0 if only
1536 ASCII has been seen so far.
1540 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1541 is present in st->mask
1542 1 == definitive answers are here for both st->eol_type and st->mask
1546 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1547 unsigned int n, int just_do_eol)
1551 if (st->eol_type == EOL_AUTODETECT)
1552 st->eol_type = detect_eol_type (st, src, n);
1555 return st->eol_type != EOL_AUTODETECT;
1557 if (!st->seen_non_ascii)
1559 for (; n; n--, src++)
1562 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1564 st->seen_non_ascii = 1;
1566 st->shift_jis.mask = ~0;
1570 st->iso2022.mask = ~0;
1580 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1581 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1582 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1583 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1584 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1585 st->big5.mask = detect_coding_big5 (st, src, n);
1586 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1587 st->utf8.mask = detect_coding_utf8 (st, src, n);
1588 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1589 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1592 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1593 | st->utf8.mask | st->ucs4.mask;
1596 int retval = mask_has_at_most_one_bit_p (st->mask);
1597 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1598 return retval && st->eol_type != EOL_AUTODETECT;
1603 coding_system_from_mask (int mask)
1607 /* If the file was entirely or basically ASCII, use the
1608 default value of `buffer-file-coding-system'. */
1609 Lisp_Object retval =
1610 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1613 retval = Ffind_coding_system (retval);
1617 (Qbad_variable, Qwarning,
1618 "Invalid `default-buffer-file-coding-system', set to nil");
1619 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1623 retval = Fget_coding_system (Qraw_text);
1631 mask = postprocess_iso2022_mask (mask);
1633 /* Look through the coding categories by priority and find
1634 the first one that is allowed. */
1635 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1637 cat = fcd->coding_category_by_priority[i];
1638 if ((mask & (1 << cat)) &&
1639 !NILP (fcd->coding_category_system[cat]))
1643 return fcd->coding_category_system[cat];
1645 return Fget_coding_system (Qraw_text);
1649 /* Given a seekable read stream and potential coding system and EOL type
1650 as specified, do any autodetection that is called for. If the
1651 coding system and/or EOL type are not `autodetect', they will be left
1652 alone; but this function will never return an autodetect coding system
1655 This function does not automatically fetch subsidiary coding systems;
1656 that should be unnecessary with the explicit eol-type argument. */
1658 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1661 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1662 enum eol_type *eol_type_in_out)
1664 struct detection_state decst;
1666 if (*eol_type_in_out == EOL_AUTODETECT)
1667 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1670 decst.eol_type = *eol_type_in_out;
1673 /* If autodetection is called for, do it now. */
1674 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1675 || *eol_type_in_out == EOL_AUTODETECT)
1678 Lisp_Object coding_system = Qnil;
1680 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1683 /* Look for initial "-*-"; mode line prefix */
1685 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1690 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1692 Extbyte *local_vars_beg = p + 3;
1693 /* Look for final "-*-"; mode line suffix */
1694 for (p = local_vars_beg,
1695 scan_end = buf + nread - LENGTH ("-*-");
1700 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1702 Extbyte *suffix = p;
1703 /* Look for "coding:" */
1704 for (p = local_vars_beg,
1705 scan_end = suffix - LENGTH ("coding:?");
1708 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1709 && (p == local_vars_beg
1710 || (*(p-1) == ' ' ||
1716 p += LENGTH ("coding:");
1717 while (*p == ' ' || *p == '\t') p++;
1719 /* Get coding system name */
1720 save = *suffix; *suffix = '\0';
1721 /* Characters valid in a MIME charset name (rfc 1521),
1722 and in a Lisp symbol name. */
1723 n = strspn ( (char *) p,
1724 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1725 "abcdefghijklmnopqrstuvwxyz"
1731 save = p[n]; p[n] = '\0';
1733 Ffind_coding_system (intern ((char *) p));
1743 if (NILP (coding_system))
1746 if (detect_coding_type (&decst, buf, nread,
1747 XCODING_SYSTEM_TYPE (*codesys_in_out)
1748 != CODESYS_AUTODETECT))
1750 nread = Lstream_read (stream, buf, sizeof (buf));
1756 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1757 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1760 if (detect_coding_type (&decst, buf, nread, 1))
1762 nread = Lstream_read (stream, buf, sizeof (buf));
1768 *eol_type_in_out = decst.eol_type;
1769 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1771 if (NILP (coding_system))
1772 *codesys_in_out = coding_system_from_mask (decst.mask);
1774 *codesys_in_out = coding_system;
1778 /* If we absolutely can't determine the EOL type, just assume LF. */
1779 if (*eol_type_in_out == EOL_AUTODETECT)
1780 *eol_type_in_out = EOL_LF;
1782 Lstream_rewind (stream);
1785 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1786 Detect coding system of the text in the region between START and END.
1787 Returned a list of possible coding systems ordered by priority.
1788 If only ASCII characters are found, it returns 'undecided or one of
1789 its subsidiary coding systems according to a detected end-of-line
1790 type. Optional arg BUFFER defaults to the current buffer.
1792 (start, end, buffer))
1794 Lisp_Object val = Qnil;
1795 struct buffer *buf = decode_buffer (buffer, 0);
1797 Lisp_Object instream, lb_instream;
1798 Lstream *istr, *lb_istr;
1799 struct detection_state decst;
1800 struct gcpro gcpro1, gcpro2;
1802 get_buffer_range_char (buf, start, end, &b, &e, 0);
1803 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1804 lb_istr = XLSTREAM (lb_instream);
1805 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1806 istr = XLSTREAM (instream);
1807 GCPRO2 (instream, lb_instream);
1809 decst.eol_type = EOL_AUTODETECT;
1813 unsigned char random_buffer[4096];
1814 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1818 if (detect_coding_type (&decst, random_buffer, nread, 0))
1822 if (decst.mask == ~0)
1823 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1831 decst.mask = postprocess_iso2022_mask (decst.mask);
1833 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1835 int sys = fcd->coding_category_by_priority[i];
1836 if (decst.mask & (1 << sys))
1838 Lisp_Object codesys = fcd->coding_category_system[sys];
1839 if (!NILP (codesys))
1840 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1841 val = Fcons (codesys, val);
1845 Lstream_close (istr);
1847 Lstream_delete (istr);
1848 Lstream_delete (lb_istr);
1853 /************************************************************************/
1854 /* Converting to internal Mule format ("decoding") */
1855 /************************************************************************/
1857 /* A decoding stream is a stream used for decoding text (i.e.
1858 converting from some external format to internal format).
1859 The decoding-stream object keeps track of the actual coding
1860 stream, the stream that is at the other end, and data that
1861 needs to be persistent across the lifetime of the stream. */
1863 /* Handle the EOL stuff related to just-read-in character C.
1864 EOL_TYPE is the EOL type of the coding stream.
1865 FLAGS is the current value of FLAGS in the coding stream, and may
1866 be modified by this macro. (The macro only looks at the
1867 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1868 bytes are to be written. You need to also define a local goto
1869 label "label_continue_loop" that is at the end of the main
1870 character-reading loop.
1872 If C is a CR character, then this macro handles it entirely and
1873 jumps to label_continue_loop. Otherwise, this macro does not add
1874 anything to DST, and continues normally. You should continue
1875 processing C normally after this macro. */
1877 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1881 if (eol_type == EOL_CR) \
1882 Dynarr_add (dst, '\n'); \
1883 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1884 Dynarr_add (dst, c); \
1886 flags |= CODING_STATE_CR; \
1887 goto label_continue_loop; \
1889 else if (flags & CODING_STATE_CR) \
1890 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1892 Dynarr_add (dst, '\r'); \
1893 flags &= ~CODING_STATE_CR; \
1897 /* C should be a binary character in the range 0 - 255; convert
1898 to internal format and add to Dynarr DST. */
1901 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1903 if (BYTE_ASCII_P (c)) \
1904 Dynarr_add (dst, c); \
1907 Dynarr_add (dst, (c >> 6) | 0xc0); \
1908 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1913 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1917 Dynarr_add (dst, c);
1919 else if ( c <= 0x7ff )
1921 Dynarr_add (dst, (c >> 6) | 0xc0);
1922 Dynarr_add (dst, (c & 0x3f) | 0x80);
1924 else if ( c <= 0xffff )
1926 Dynarr_add (dst, (c >> 12) | 0xe0);
1927 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1928 Dynarr_add (dst, (c & 0x3f) | 0x80);
1930 else if ( c <= 0x1fffff )
1932 Dynarr_add (dst, (c >> 18) | 0xf0);
1933 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1934 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1935 Dynarr_add (dst, (c & 0x3f) | 0x80);
1937 else if ( c <= 0x3ffffff )
1939 Dynarr_add (dst, (c >> 24) | 0xf8);
1940 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1941 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1942 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1943 Dynarr_add (dst, (c & 0x3f) | 0x80);
1947 Dynarr_add (dst, (c >> 30) | 0xfc);
1948 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1949 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1950 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1951 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1952 Dynarr_add (dst, (c & 0x3f) | 0x80);
1956 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1958 if (BYTE_ASCII_P (c)) \
1959 Dynarr_add (dst, c); \
1960 else if (BYTE_C1_P (c)) \
1962 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1963 Dynarr_add (dst, c + 0x20); \
1967 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1968 Dynarr_add (dst, c); \
1973 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1977 DECODE_ADD_BINARY_CHAR (ch, dst); \
1982 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1984 if (flags & CODING_STATE_END) \
1986 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1987 if (flags & CODING_STATE_CR) \
1988 Dynarr_add (dst, '\r'); \
1992 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1994 struct decoding_stream
1996 /* Coding system that governs the conversion. */
1997 Lisp_Coding_System *codesys;
1999 /* Stream that we read the encoded data from or
2000 write the decoded data to. */
2003 /* If we are reading, then we can return only a fixed amount of
2004 data, so if the conversion resulted in too much data, we store it
2005 here for retrieval the next time around. */
2006 unsigned_char_dynarr *runoff;
2008 /* FLAGS holds flags indicating the current state of the decoding.
2009 Some of these flags are dependent on the coding system. */
2012 /* CH holds a partially built-up character. Since we only deal
2013 with one- and two-byte characters at the moment, we only use
2014 this to store the first byte of a two-byte character. */
2017 /* EOL_TYPE specifies the type of end-of-line conversion that
2018 currently applies. We need to keep this separate from the
2019 EOL type stored in CODESYS because the latter might indicate
2020 automatic EOL-type detection while the former will always
2021 indicate a particular EOL type. */
2022 enum eol_type eol_type;
2024 /* Additional ISO2022 information. We define the structure above
2025 because it's also needed by the detection routines. */
2026 struct iso2022_decoder iso2022;
2028 /* Additional information (the state of the running CCL program)
2029 used by the CCL decoder. */
2030 struct ccl_program ccl;
2032 /* counter for UTF-8 or UCS-4 */
2033 unsigned char counter;
2035 struct detection_state decst;
2038 static ssize_t decoding_reader (Lstream *stream,
2039 unsigned char *data, size_t size);
2040 static ssize_t decoding_writer (Lstream *stream,
2041 CONST unsigned char *data, size_t size);
2042 static int decoding_rewinder (Lstream *stream);
2043 static int decoding_seekable_p (Lstream *stream);
2044 static int decoding_flusher (Lstream *stream);
2045 static int decoding_closer (Lstream *stream);
2047 static Lisp_Object decoding_marker (Lisp_Object stream);
2049 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2050 sizeof (struct decoding_stream));
2053 decoding_marker (Lisp_Object stream)
2055 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2056 Lisp_Object str_obj;
2058 /* We do not need to mark the coding systems or charsets stored
2059 within the stream because they are stored in a global list
2060 and automatically marked. */
2062 XSETLSTREAM (str_obj, str);
2063 mark_object (str_obj);
2064 if (str->imp->marker)
2065 return (str->imp->marker) (str_obj);
2070 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2071 so we read data from the other end, decode it, and store it into DATA. */
2074 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2076 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2077 unsigned char *orig_data = data;
2079 int error_occurred = 0;
2081 /* We need to interface to mule_decode(), which expects to take some
2082 amount of data and store the result into a Dynarr. We have
2083 mule_decode() store into str->runoff, and take data from there
2086 /* We loop until we have enough data, reading chunks from the other
2087 end and decoding it. */
2090 /* Take data from the runoff if we can. Make sure to take at
2091 most SIZE bytes, and delete the data from the runoff. */
2092 if (Dynarr_length (str->runoff) > 0)
2094 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2095 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2096 Dynarr_delete_many (str->runoff, 0, chunk);
2102 break; /* No more room for data */
2104 if (str->flags & CODING_STATE_END)
2105 /* This means that on the previous iteration, we hit the EOF on
2106 the other end. We loop once more so that mule_decode() can
2107 output any final stuff it may be holding, or any "go back
2108 to a sane state" escape sequences. (This latter makes sense
2109 during encoding.) */
2112 /* Exhausted the runoff, so get some more. DATA has at least
2113 SIZE bytes left of storage in it, so it's OK to read directly
2114 into it. (We'll be overwriting above, after we've decoded it
2115 into the runoff.) */
2116 read_size = Lstream_read (str->other_end, data, size);
2123 /* There might be some more end data produced in the translation.
2124 See the comment above. */
2125 str->flags |= CODING_STATE_END;
2126 mule_decode (stream, data, str->runoff, read_size);
2129 if (data - orig_data == 0)
2130 return error_occurred ? -1 : 0;
2132 return data - orig_data;
2136 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2138 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2141 /* Decode all our data into the runoff, and then attempt to write
2142 it all out to the other end. Remove whatever chunk we succeeded
2144 mule_decode (stream, data, str->runoff, size);
2145 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2146 Dynarr_length (str->runoff));
2148 Dynarr_delete_many (str->runoff, 0, retval);
2149 /* Do NOT return retval. The return value indicates how much
2150 of the incoming data was written, not how many bytes were
2156 reset_decoding_stream (struct decoding_stream *str)
2159 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2161 Lisp_Object coding_system;
2162 XSETCODING_SYSTEM (coding_system, str->codesys);
2163 reset_iso2022 (coding_system, &str->iso2022);
2165 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2167 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2171 str->flags = str->ch = 0;
2175 decoding_rewinder (Lstream *stream)
2177 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2178 reset_decoding_stream (str);
2179 Dynarr_reset (str->runoff);
2180 return Lstream_rewind (str->other_end);
2184 decoding_seekable_p (Lstream *stream)
2186 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2187 return Lstream_seekable_p (str->other_end);
2191 decoding_flusher (Lstream *stream)
2193 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2194 return Lstream_flush (str->other_end);
2198 decoding_closer (Lstream *stream)
2200 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2201 if (stream->flags & LSTREAM_FL_WRITE)
2203 str->flags |= CODING_STATE_END;
2204 decoding_writer (stream, 0, 0);
2206 Dynarr_free (str->runoff);
2208 #ifdef ENABLE_COMPOSITE_CHARS
2209 if (str->iso2022.composite_chars)
2210 Dynarr_free (str->iso2022.composite_chars);
2213 return Lstream_close (str->other_end);
2217 decoding_stream_coding_system (Lstream *stream)
2219 Lisp_Object coding_system;
2220 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2222 XSETCODING_SYSTEM (coding_system, str->codesys);
2223 return subsidiary_coding_system (coding_system, str->eol_type);
2227 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2229 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2230 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2232 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2233 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2234 reset_decoding_stream (str);
2237 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2238 stream for writing, no automatic code detection will be performed.
2239 The reason for this is that automatic code detection requires a
2240 seekable input. Things will also fail if you open a decoding
2241 stream for reading using a non-fully-specified coding system and
2242 a non-seekable input stream. */
2245 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2248 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2249 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2253 str->other_end = stream;
2254 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2255 str->eol_type = EOL_AUTODETECT;
2256 if (!strcmp (mode, "r")
2257 && Lstream_seekable_p (stream))
2258 /* We can determine the coding system now. */
2259 determine_real_coding_system (stream, &codesys, &str->eol_type);
2260 set_decoding_stream_coding_system (lstr, codesys);
2261 str->decst.eol_type = str->eol_type;
2262 str->decst.mask = ~0;
2263 XSETLSTREAM (obj, lstr);
2268 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2270 return make_decoding_stream_1 (stream, codesys, "r");
2274 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2276 return make_decoding_stream_1 (stream, codesys, "w");
2279 /* Note: the decode_coding_* functions all take the same
2280 arguments as mule_decode(), which is to say some SRC data of
2281 size N, which is to be stored into dynamic array DST.
2282 DECODING is the stream within which the decoding is
2283 taking place, but no data is actually read from or
2284 written to that stream; that is handled in decoding_reader()
2285 or decoding_writer(). This allows the same functions to
2286 be used for both reading and writing. */
2289 mule_decode (Lstream *decoding, CONST unsigned char *src,
2290 unsigned_char_dynarr *dst, unsigned int n)
2292 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2294 /* If necessary, do encoding-detection now. We do this when
2295 we're a writing stream or a non-seekable reading stream,
2296 meaning that we can't just process the whole input,
2297 rewind, and start over. */
2299 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2300 str->eol_type == EOL_AUTODETECT)
2302 Lisp_Object codesys;
2304 XSETCODING_SYSTEM (codesys, str->codesys);
2305 detect_coding_type (&str->decst, src, n,
2306 CODING_SYSTEM_TYPE (str->codesys) !=
2307 CODESYS_AUTODETECT);
2308 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2309 str->decst.mask != ~0)
2310 /* #### This is cheesy. What we really ought to do is
2311 buffer up a certain amount of data so as to get a
2312 less random result. */
2313 codesys = coding_system_from_mask (str->decst.mask);
2314 str->eol_type = str->decst.eol_type;
2315 if (XCODING_SYSTEM (codesys) != str->codesys)
2317 /* Preserve the CODING_STATE_END flag in case it was set.
2318 If we erase it, bad things might happen. */
2319 int was_end = str->flags & CODING_STATE_END;
2320 set_decoding_stream_coding_system (decoding, codesys);
2322 str->flags |= CODING_STATE_END;
2326 switch (CODING_SYSTEM_TYPE (str->codesys))
2329 case CODESYS_INTERNAL:
2330 Dynarr_add_many (dst, src, n);
2333 case CODESYS_AUTODETECT:
2334 /* If we got this far and still haven't decided on the coding
2335 system, then do no conversion. */
2336 case CODESYS_NO_CONVERSION:
2337 decode_coding_no_conversion (decoding, src, dst, n);
2340 case CODESYS_SHIFT_JIS:
2341 decode_coding_sjis (decoding, src, dst, n);
2344 decode_coding_big5 (decoding, src, dst, n);
2347 decode_coding_ucs4 (decoding, src, dst, n);
2350 decode_coding_utf8 (decoding, src, dst, n);
2353 str->ccl.last_block = str->flags & CODING_STATE_END;
2354 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2356 case CODESYS_ISO2022:
2357 decode_coding_iso2022 (decoding, src, dst, n);
2365 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2366 Decode the text between START and END which is encoded in CODING-SYSTEM.
2367 This is useful if you've read in encoded text from a file without decoding
2368 it (e.g. you read in a JIS-formatted file but used the `binary' or
2369 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2370 Return length of decoded text.
2371 BUFFER defaults to the current buffer if unspecified.
2373 (start, end, coding_system, buffer))
2376 struct buffer *buf = decode_buffer (buffer, 0);
2377 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2378 Lstream *istr, *ostr;
2379 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2381 get_buffer_range_char (buf, start, end, &b, &e, 0);
2383 barf_if_buffer_read_only (buf, b, e);
2385 coding_system = Fget_coding_system (coding_system);
2386 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2387 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2388 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2390 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2391 Fget_coding_system (Qbinary));
2392 istr = XLSTREAM (instream);
2393 ostr = XLSTREAM (outstream);
2394 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2396 /* The chain of streams looks like this:
2398 [BUFFER] <----- send through
2399 ------> [ENCODE AS BINARY]
2400 ------> [DECODE AS SPECIFIED]
2406 char tempbuf[1024]; /* some random amount */
2407 Bufpos newpos, even_newer_pos;
2408 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2409 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2413 newpos = lisp_buffer_stream_startpos (istr);
2414 Lstream_write (ostr, tempbuf, size_in_bytes);
2415 even_newer_pos = lisp_buffer_stream_startpos (istr);
2416 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2419 Lstream_close (istr);
2420 Lstream_close (ostr);
2422 Lstream_delete (istr);
2423 Lstream_delete (ostr);
2424 Lstream_delete (XLSTREAM (de_outstream));
2425 Lstream_delete (XLSTREAM (lb_outstream));
2430 /************************************************************************/
2431 /* Converting to an external encoding ("encoding") */
2432 /************************************************************************/
2434 /* An encoding stream is an output stream. When you create the
2435 stream, you specify the coding system that governs the encoding
2436 and another stream that the resulting encoded data is to be
2437 sent to, and then start sending data to it. */
2439 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2441 struct encoding_stream
2443 /* Coding system that governs the conversion. */
2444 Lisp_Coding_System *codesys;
2446 /* Stream that we read the encoded data from or
2447 write the decoded data to. */
2450 /* If we are reading, then we can return only a fixed amount of
2451 data, so if the conversion resulted in too much data, we store it
2452 here for retrieval the next time around. */
2453 unsigned_char_dynarr *runoff;
2455 /* FLAGS holds flags indicating the current state of the encoding.
2456 Some of these flags are dependent on the coding system. */
2459 /* CH holds a partially built-up character. Since we only deal
2460 with one- and two-byte characters at the moment, we only use
2461 this to store the first byte of a two-byte character. */
2464 /* Additional information used by the ISO2022 encoder. */
2467 /* CHARSET holds the character sets currently assigned to the G0
2468 through G3 registers. It is initialized from the array
2469 INITIAL_CHARSET in CODESYS. */
2470 Lisp_Object charset[4];
2472 /* Which registers are currently invoked into the left (GL) and
2473 right (GR) halves of the 8-bit encoding space? */
2474 int register_left, register_right;
2476 /* Whether we need to explicitly designate the charset in the
2477 G? register before using it. It is initialized from the
2478 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2479 unsigned char force_charset_on_output[4];
2481 /* Other state variables that need to be preserved across
2483 Lisp_Object current_charset;
2485 int current_char_boundary;
2488 void (*encode_char) (struct encoding_stream *str, Emchar c,
2489 unsigned_char_dynarr *dst, unsigned int *flags);
2490 void (*finish) (struct encoding_stream *str,
2491 unsigned_char_dynarr *dst, unsigned int *flags);
2493 /* Additional information (the state of the running CCL program)
2494 used by the CCL encoder. */
2495 struct ccl_program ccl;
2499 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2500 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2502 static int encoding_rewinder (Lstream *stream);
2503 static int encoding_seekable_p (Lstream *stream);
2504 static int encoding_flusher (Lstream *stream);
2505 static int encoding_closer (Lstream *stream);
2507 static Lisp_Object encoding_marker (Lisp_Object stream);
2509 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2510 sizeof (struct encoding_stream));
2513 encoding_marker (Lisp_Object stream)
2515 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2516 Lisp_Object str_obj;
2518 /* We do not need to mark the coding systems or charsets stored
2519 within the stream because they are stored in a global list
2520 and automatically marked. */
2522 XSETLSTREAM (str_obj, str);
2523 mark_object (str_obj);
2524 if (str->imp->marker)
2525 return (str->imp->marker) (str_obj);
2530 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2531 so we read data from the other end, encode it, and store it into DATA. */
2534 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2536 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2537 unsigned char *orig_data = data;
2539 int error_occurred = 0;
2541 /* We need to interface to mule_encode(), which expects to take some
2542 amount of data and store the result into a Dynarr. We have
2543 mule_encode() store into str->runoff, and take data from there
2546 /* We loop until we have enough data, reading chunks from the other
2547 end and encoding it. */
2550 /* Take data from the runoff if we can. Make sure to take at
2551 most SIZE bytes, and delete the data from the runoff. */
2552 if (Dynarr_length (str->runoff) > 0)
2554 int chunk = min ((int) size, Dynarr_length (str->runoff));
2555 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2556 Dynarr_delete_many (str->runoff, 0, chunk);
2562 break; /* No more room for data */
2564 if (str->flags & CODING_STATE_END)
2565 /* This means that on the previous iteration, we hit the EOF on
2566 the other end. We loop once more so that mule_encode() can
2567 output any final stuff it may be holding, or any "go back
2568 to a sane state" escape sequences. (This latter makes sense
2569 during encoding.) */
2572 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2573 left of storage in it, so it's OK to read directly into it.
2574 (We'll be overwriting above, after we've encoded it into the
2576 read_size = Lstream_read (str->other_end, data, size);
2583 /* There might be some more end data produced in the translation.
2584 See the comment above. */
2585 str->flags |= CODING_STATE_END;
2586 mule_encode (stream, data, str->runoff, read_size);
2589 if (data == orig_data)
2590 return error_occurred ? -1 : 0;
2592 return data - orig_data;
2596 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2598 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2601 /* Encode all our data into the runoff, and then attempt to write
2602 it all out to the other end. Remove whatever chunk we succeeded
2604 mule_encode (stream, data, str->runoff, size);
2605 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2606 Dynarr_length (str->runoff));
2608 Dynarr_delete_many (str->runoff, 0, retval);
2609 /* Do NOT return retval. The return value indicates how much
2610 of the incoming data was written, not how many bytes were
2616 reset_encoding_stream (struct encoding_stream *str)
2619 switch (CODING_SYSTEM_TYPE (str->codesys))
2621 case CODESYS_ISO2022:
2625 str->encode_char = &char_encode_iso2022;
2626 str->finish = &char_finish_iso2022;
2627 for (i = 0; i < 4; i++)
2629 str->iso2022.charset[i] =
2630 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2631 str->iso2022.force_charset_on_output[i] =
2632 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2634 str->iso2022.register_left = 0;
2635 str->iso2022.register_right = 1;
2636 str->iso2022.current_charset = Qnil;
2637 str->iso2022.current_half = 0;
2641 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2644 str->encode_char = &char_encode_utf8;
2645 str->finish = &char_finish_utf8;
2648 str->encode_char = &char_encode_ucs4;
2649 str->finish = &char_finish_ucs4;
2651 case CODESYS_SHIFT_JIS:
2652 str->encode_char = &char_encode_shift_jis;
2653 str->finish = &char_finish_shift_jis;
2659 str->iso2022.current_char_boundary = 0;
2660 str->flags = str->ch = 0;
2664 encoding_rewinder (Lstream *stream)
2666 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2667 reset_encoding_stream (str);
2668 Dynarr_reset (str->runoff);
2669 return Lstream_rewind (str->other_end);
2673 encoding_seekable_p (Lstream *stream)
2675 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2676 return Lstream_seekable_p (str->other_end);
2680 encoding_flusher (Lstream *stream)
2682 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2683 return Lstream_flush (str->other_end);
2687 encoding_closer (Lstream *stream)
2689 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2690 if (stream->flags & LSTREAM_FL_WRITE)
2692 str->flags |= CODING_STATE_END;
2693 encoding_writer (stream, 0, 0);
2695 Dynarr_free (str->runoff);
2696 return Lstream_close (str->other_end);
2700 encoding_stream_coding_system (Lstream *stream)
2702 Lisp_Object coding_system;
2703 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2705 XSETCODING_SYSTEM (coding_system, str->codesys);
2706 return coding_system;
2710 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2712 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2713 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2715 reset_encoding_stream (str);
2719 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2722 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2723 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2727 str->runoff = Dynarr_new (unsigned_char);
2728 str->other_end = stream;
2729 set_encoding_stream_coding_system (lstr, codesys);
2730 XSETLSTREAM (obj, lstr);
2735 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2737 return make_encoding_stream_1 (stream, codesys, "r");
2741 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2743 return make_encoding_stream_1 (stream, codesys, "w");
2746 /* Convert N bytes of internally-formatted data stored in SRC to an
2747 external format, according to the encoding stream ENCODING.
2748 Store the encoded data into DST. */
2751 mule_encode (Lstream *encoding, CONST unsigned char *src,
2752 unsigned_char_dynarr *dst, unsigned int n)
2754 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2756 switch (CODING_SYSTEM_TYPE (str->codesys))
2759 case CODESYS_INTERNAL:
2760 Dynarr_add_many (dst, src, n);
2763 case CODESYS_AUTODETECT:
2764 /* If we got this far and still haven't decided on the coding
2765 system, then do no conversion. */
2766 case CODESYS_NO_CONVERSION:
2767 encode_coding_no_conversion (encoding, src, dst, n);
2771 encode_coding_big5 (encoding, src, dst, n);
2774 str->ccl.last_block = str->flags & CODING_STATE_END;
2775 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2779 text_encode_generic (encoding, src, dst, n);
2783 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2784 Encode the text between START and END using CODING-SYSTEM.
2785 This will, for example, convert Japanese characters into stuff such as
2786 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2787 text. BUFFER defaults to the current buffer if unspecified.
2789 (start, end, coding_system, buffer))
2792 struct buffer *buf = decode_buffer (buffer, 0);
2793 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2794 Lstream *istr, *ostr;
2795 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2797 get_buffer_range_char (buf, start, end, &b, &e, 0);
2799 barf_if_buffer_read_only (buf, b, e);
2801 coding_system = Fget_coding_system (coding_system);
2802 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2803 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2804 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2805 Fget_coding_system (Qbinary));
2806 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2808 istr = XLSTREAM (instream);
2809 ostr = XLSTREAM (outstream);
2810 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2811 /* The chain of streams looks like this:
2813 [BUFFER] <----- send through
2814 ------> [ENCODE AS SPECIFIED]
2815 ------> [DECODE AS BINARY]
2820 char tempbuf[1024]; /* some random amount */
2821 Bufpos newpos, even_newer_pos;
2822 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2823 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2827 newpos = lisp_buffer_stream_startpos (istr);
2828 Lstream_write (ostr, tempbuf, size_in_bytes);
2829 even_newer_pos = lisp_buffer_stream_startpos (istr);
2830 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2836 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2837 Lstream_close (istr);
2838 Lstream_close (ostr);
2840 Lstream_delete (istr);
2841 Lstream_delete (ostr);
2842 Lstream_delete (XLSTREAM (de_outstream));
2843 Lstream_delete (XLSTREAM (lb_outstream));
2844 return make_int (retlen);
2851 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2852 unsigned_char_dynarr *dst, unsigned int n)
2855 unsigned char char_boundary;
2856 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2857 unsigned int flags = str->flags;
2858 Emchar ch = str->ch;
2860 char_boundary = str->iso2022.current_char_boundary;
2866 if (char_boundary == 0)
2894 (*str->encode_char) (str, c, dst, &flags);
2896 else if (char_boundary == 1)
2898 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2904 ch = (ch << 6) | (c & 0x3f);
2909 if ((char_boundary == 0) && (flags & CODING_STATE_END))
2911 (*str->finish) (str, dst, &flags);
2916 str->iso2022.current_char_boundary = char_boundary;
2920 /************************************************************************/
2921 /* Shift-JIS methods */
2922 /************************************************************************/
2924 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2925 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2926 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2927 encoded by "position-code + 0x80". A character of JISX0208
2928 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2929 position-codes are divided and shifted so that it fit in the range
2932 --- CODE RANGE of Shift-JIS ---
2933 (character set) (range)
2935 JISX0201-Kana 0xA0 .. 0xDF
2936 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2937 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2938 -------------------------------
2942 /* Is this the first byte of a Shift-JIS two-byte char? */
2944 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2945 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2947 /* Is this the second byte of a Shift-JIS two-byte char? */
2949 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2950 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2952 #define BYTE_SJIS_KATAKANA_P(c) \
2953 ((c) >= 0xA1 && (c) <= 0xDF)
2956 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2964 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2966 if (st->shift_jis.in_second_byte)
2968 st->shift_jis.in_second_byte = 0;
2972 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2973 st->shift_jis.in_second_byte = 1;
2975 return CODING_CATEGORY_SHIFT_JIS_MASK;
2978 /* Convert Shift-JIS data to internal format. */
2981 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2982 unsigned_char_dynarr *dst, unsigned int n)
2985 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2986 unsigned int flags = str->flags;
2987 unsigned int ch = str->ch;
2988 eol_type_t eol_type = str->eol_type;
2996 /* Previous character was first byte of Shift-JIS Kanji char. */
2997 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2999 unsigned char e1, e2;
3001 DECODE_SJIS (ch, c, e1, e2);
3003 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3007 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3008 Dynarr_add (dst, e1);
3009 Dynarr_add (dst, e2);
3014 DECODE_ADD_BINARY_CHAR (ch, dst);
3015 DECODE_ADD_BINARY_CHAR (c, dst);
3021 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3022 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3024 else if (BYTE_SJIS_KATAKANA_P (c))
3027 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3030 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3031 Dynarr_add (dst, c);
3036 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3040 DECODE_ADD_BINARY_CHAR (c, dst);
3042 label_continue_loop:;
3045 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3051 /* Convert internal character representation to Shift_JIS. */
3054 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3055 unsigned_char_dynarr *dst, unsigned int *flags)
3057 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3061 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3062 Dynarr_add (dst, '\r');
3063 if (eol_type != EOL_CR)
3064 Dynarr_add (dst, ch);
3068 Lisp_Object charset;
3069 unsigned int c1, c2, s1, s2;
3071 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3073 if (code_point >= 0)
3075 charset = Vcharset_latin_jisx0201;
3081 BREAKUP_CHAR (ch, charset, c1, c2);
3083 if (EQ(charset, Vcharset_katakana_jisx0201))
3085 Dynarr_add (dst, c1 | 0x80);
3089 Dynarr_add (dst, c1);
3091 else if (EQ(charset, Vcharset_japanese_jisx0208))
3093 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3094 Dynarr_add (dst, s1);
3095 Dynarr_add (dst, s2);
3098 Dynarr_add (dst, '?');
3103 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3104 unsigned int *flags)
3108 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3109 Decode a JISX0208 character of Shift-JIS coding-system.
3110 CODE is the character code in Shift-JIS as a cons of type bytes.
3111 Return the corresponding character.
3115 unsigned char c1, c2, s1, s2;
3118 CHECK_INT (XCAR (code));
3119 CHECK_INT (XCDR (code));
3120 s1 = XINT (XCAR (code));
3121 s2 = XINT (XCDR (code));
3122 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3123 BYTE_SJIS_TWO_BYTE_2_P (s2))
3125 DECODE_SJIS (s1, s2, c1, c2);
3126 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3127 c1 & 0x7F, c2 & 0x7F));
3133 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3134 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3135 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3139 Lisp_Object charset;
3142 CHECK_CHAR_COERCE_INT (ch);
3143 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3144 if (EQ (charset, Vcharset_japanese_jisx0208))
3146 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3147 return Fcons (make_int (s1), make_int (s2));
3154 /************************************************************************/
3156 /************************************************************************/
3158 /* BIG5 is a coding system encoding two character sets: ASCII and
3159 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3160 character set and is encoded in two-byte.
3162 --- CODE RANGE of BIG5 ---
3163 (character set) (range)
3165 Big5 (1st byte) 0xA1 .. 0xFE
3166 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3167 --------------------------
3169 Since the number of characters in Big5 is larger than maximum
3170 characters in Emacs' charset (96x96), it can't be handled as one
3171 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3172 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3173 contains frequently used characters and the latter contains less
3174 frequently used characters. */
3176 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3177 ((c) >= 0xA1 && (c) <= 0xFE)
3179 /* Is this the second byte of a Shift-JIS two-byte char? */
3181 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3182 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3184 /* Number of Big5 characters which have the same code in 1st byte. */
3186 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3188 /* Code conversion macros. These are macros because they are used in
3189 inner loops during code conversion.
3191 Note that temporary variables in macros introduce the classic
3192 dynamic-scoping problems with variable names. We use capital-
3193 lettered variables in the assumption that XEmacs does not use
3194 capital letters in variables except in a very formalized way
3197 /* Convert Big5 code (b1, b2) into its internal string representation
3200 /* There is a much simpler way to split the Big5 charset into two.
3201 For the moment I'm going to leave the algorithm as-is because it
3202 claims to separate out the most-used characters into a single
3203 charset, which perhaps will lead to optimizations in various
3206 The way the algorithm works is something like this:
3208 Big5 can be viewed as a 94x157 charset, where the row is
3209 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3210 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3211 the split between low and high column numbers is apparently
3212 meaningless; ascending rows produce less and less frequent chars.
3213 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3214 the first charset, and the upper half (0xC9 .. 0xFE) to the
3215 second. To do the conversion, we convert the character into
3216 a single number where 0 .. 156 is the first row, 157 .. 313
3217 is the second, etc. That way, the characters are ordered by
3218 decreasing frequency. Then we just chop the space in two
3219 and coerce the result into a 94x94 space.
3222 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3224 int B1 = b1, B2 = b2; \
3226 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3230 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3234 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3235 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3237 c1 = I / (0xFF - 0xA1) + 0xA1; \
3238 c2 = I % (0xFF - 0xA1) + 0xA1; \
3241 /* Convert the internal string representation of a Big5 character
3242 (lb, c1, c2) into Big5 code (b1, b2). */
3244 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3246 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3248 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3250 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3252 b1 = I / BIG5_SAME_ROW + 0xA1; \
3253 b2 = I % BIG5_SAME_ROW; \
3254 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3258 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3266 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3267 (c >= 0x80 && c <= 0xA0))
3269 if (st->big5.in_second_byte)
3271 st->big5.in_second_byte = 0;
3272 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3276 st->big5.in_second_byte = 1;
3278 return CODING_CATEGORY_BIG5_MASK;
3281 /* Convert Big5 data to internal format. */
3284 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3285 unsigned_char_dynarr *dst, unsigned int n)
3288 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3289 unsigned int flags = str->flags;
3290 unsigned int ch = str->ch;
3291 eol_type_t eol_type = str->eol_type;
3298 /* Previous character was first byte of Big5 char. */
3299 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3301 unsigned char b1, b2, b3;
3302 DECODE_BIG5 (ch, c, b1, b2, b3);
3303 Dynarr_add (dst, b1);
3304 Dynarr_add (dst, b2);
3305 Dynarr_add (dst, b3);
3309 DECODE_ADD_BINARY_CHAR (ch, dst);
3310 DECODE_ADD_BINARY_CHAR (c, dst);
3316 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3317 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3320 DECODE_ADD_BINARY_CHAR (c, dst);
3322 label_continue_loop:;
3325 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3331 /* Convert internally-formatted data to Big5. */
3334 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3335 unsigned_char_dynarr *dst, unsigned int n)
3339 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3340 unsigned int flags = str->flags;
3341 unsigned int ch = str->ch;
3342 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3349 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3350 Dynarr_add (dst, '\r');
3351 if (eol_type != EOL_CR)
3352 Dynarr_add (dst, '\n');
3354 else if (BYTE_ASCII_P (c))
3357 Dynarr_add (dst, c);
3359 else if (BUFBYTE_LEADING_BYTE_P (c))
3361 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3362 c == LEADING_BYTE_CHINESE_BIG5_2)
3364 /* A recognized leading byte. */
3366 continue; /* not done with this character. */
3368 /* otherwise just ignore this character. */
3370 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3371 ch == LEADING_BYTE_CHINESE_BIG5_2)
3373 /* Previous char was a recognized leading byte. */
3375 continue; /* not done with this character. */
3379 /* Encountering second byte of a Big5 character. */
3380 unsigned char b1, b2;
3382 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3383 Dynarr_add (dst, b1);
3384 Dynarr_add (dst, b2);
3396 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3397 Decode a Big5 character CODE of BIG5 coding-system.
3398 CODE is the character code in BIG5, a cons of two integers.
3399 Return the corresponding character.
3403 unsigned char c1, c2, b1, b2;
3406 CHECK_INT (XCAR (code));
3407 CHECK_INT (XCDR (code));
3408 b1 = XINT (XCAR (code));
3409 b2 = XINT (XCDR (code));
3410 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3411 BYTE_BIG5_TWO_BYTE_2_P (b2))
3413 Charset_ID leading_byte;
3414 Lisp_Object charset;
3415 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3416 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3417 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3423 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3424 Encode the Big5 character CH to BIG5 coding-system.
3425 Return the corresponding character code in Big5.
3429 Lisp_Object charset;
3432 CHECK_CHAR_COERCE_INT (ch);
3433 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3434 if (EQ (charset, Vcharset_chinese_big5_1) ||
3435 EQ (charset, Vcharset_chinese_big5_2))
3437 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3439 return Fcons (make_int (b1), make_int (b2));
3446 /************************************************************************/
3448 /************************************************************************/
3451 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3457 switch (st->ucs4.in_byte)
3466 st->ucs4.in_byte = 0;
3472 return CODING_CATEGORY_UCS4_MASK;
3476 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3477 unsigned_char_dynarr *dst, unsigned int n)
3479 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3480 unsigned int flags = str->flags;
3481 unsigned int ch = str->ch;
3482 unsigned char counter = str->counter;
3486 unsigned char c = *src++;
3494 DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3499 ch = ( ch << 8 ) | c;
3503 if (counter & CODING_STATE_END)
3504 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3508 str->counter = counter;
3512 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3513 unsigned_char_dynarr *dst, unsigned int *flags)
3515 Dynarr_add (dst, ch >> 24);
3516 Dynarr_add (dst, ch >> 16);
3517 Dynarr_add (dst, ch >> 8);
3518 Dynarr_add (dst, ch );
3522 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3523 unsigned int *flags)
3528 /************************************************************************/
3530 /************************************************************************/
3533 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3538 unsigned char c = *src++;
3539 switch (st->utf8.in_byte)
3542 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3545 st->utf8.in_byte = 5;
3547 st->utf8.in_byte = 4;
3549 st->utf8.in_byte = 3;
3551 st->utf8.in_byte = 2;
3553 st->utf8.in_byte = 1;
3558 if ((c & 0xc0) != 0x80)
3564 return CODING_CATEGORY_UTF8_MASK;
3568 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3569 unsigned_char_dynarr *dst, unsigned int n)
3571 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3572 unsigned int flags = str->flags;
3573 unsigned int ch = str->ch;
3574 eol_type_t eol_type = str->eol_type;
3575 unsigned char counter = str->counter;
3579 unsigned char c = *src++;
3588 else if ( c >= 0xf8 )
3593 else if ( c >= 0xf0 )
3598 else if ( c >= 0xe0 )
3603 else if ( c >= 0xc0 )
3610 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3611 DECODE_ADD_UCS_CHAR (c, dst);
3615 ch = ( ch << 6 ) | ( c & 0x3f );
3616 DECODE_ADD_UCS_CHAR (ch, dst);
3621 ch = ( ch << 6 ) | ( c & 0x3f );
3624 label_continue_loop:;
3627 if (flags & CODING_STATE_END)
3628 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3632 str->counter = counter;
3636 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3637 unsigned_char_dynarr *dst, unsigned int *flags)
3639 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3643 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3644 Dynarr_add (dst, '\r');
3645 if (eol_type != EOL_CR)
3646 Dynarr_add (dst, ch);
3648 else if (ch <= 0x7f)
3650 Dynarr_add (dst, ch);
3652 else if (ch <= 0x7ff)
3654 Dynarr_add (dst, (ch >> 6) | 0xc0);
3655 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3657 else if (ch <= 0xffff)
3659 Dynarr_add (dst, (ch >> 12) | 0xe0);
3660 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3661 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3663 else if (ch <= 0x1fffff)
3665 Dynarr_add (dst, (ch >> 18) | 0xf0);
3666 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3667 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3668 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3670 else if (ch <= 0x3ffffff)
3672 Dynarr_add (dst, (ch >> 24) | 0xf8);
3673 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3674 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3675 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3676 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3680 Dynarr_add (dst, (ch >> 30) | 0xfc);
3681 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3682 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3683 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3684 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3685 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3690 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3691 unsigned int *flags)
3696 /************************************************************************/
3697 /* ISO2022 methods */
3698 /************************************************************************/
3700 /* The following note describes the coding system ISO2022 briefly.
3701 Since the intention of this note is to help understand the
3702 functions in this file, some parts are NOT ACCURATE or OVERLY
3703 SIMPLIFIED. For thorough understanding, please refer to the
3704 original document of ISO2022.
3706 ISO2022 provides many mechanisms to encode several character sets
3707 in 7-bit and 8-bit environments. For 7-bit environments, all text
3708 is encoded using bytes less than 128. This may make the encoded
3709 text a little bit longer, but the text passes more easily through
3710 several gateways, some of which strip off MSB (Most Signigant Bit).
3712 There are two kinds of character sets: control character set and
3713 graphic character set. The former contains control characters such
3714 as `newline' and `escape' to provide control functions (control
3715 functions are also provided by escape sequences). The latter
3716 contains graphic characters such as 'A' and '-'. Emacs recognizes
3717 two control character sets and many graphic character sets.
3719 Graphic character sets are classified into one of the following
3720 four classes, according to the number of bytes (DIMENSION) and
3721 number of characters in one dimension (CHARS) of the set:
3722 - DIMENSION1_CHARS94
3723 - DIMENSION1_CHARS96
3724 - DIMENSION2_CHARS94
3725 - DIMENSION2_CHARS96
3727 In addition, each character set is assigned an identification tag,
3728 unique for each set, called "final character" (denoted as <F>
3729 hereafter). The <F> of each character set is decided by ECMA(*)
3730 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3731 (0x30..0x3F are for private use only).
3733 Note (*): ECMA = European Computer Manufacturers Association
3735 Here are examples of graphic character set [NAME(<F>)]:
3736 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3737 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3738 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3739 o DIMENSION2_CHARS96 -- none for the moment
3741 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3742 C0 [0x00..0x1F] -- control character plane 0
3743 GL [0x20..0x7F] -- graphic character plane 0
3744 C1 [0x80..0x9F] -- control character plane 1
3745 GR [0xA0..0xFF] -- graphic character plane 1
3747 A control character set is directly designated and invoked to C0 or
3748 C1 by an escape sequence. The most common case is that:
3749 - ISO646's control character set is designated/invoked to C0, and
3750 - ISO6429's control character set is designated/invoked to C1,
3751 and usually these designations/invocations are omitted in encoded
3752 text. In a 7-bit environment, only C0 can be used, and a control
3753 character for C1 is encoded by an appropriate escape sequence to
3754 fit into the environment. All control characters for C1 are
3755 defined to have corresponding escape sequences.
3757 A graphic character set is at first designated to one of four
3758 graphic registers (G0 through G3), then these graphic registers are
3759 invoked to GL or GR. These designations and invocations can be
3760 done independently. The most common case is that G0 is invoked to
3761 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3762 these invocations and designations are omitted in encoded text.
3763 In a 7-bit environment, only GL can be used.
3765 When a graphic character set of CHARS94 is invoked to GL, codes
3766 0x20 and 0x7F of the GL area work as control characters SPACE and
3767 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3770 There are two ways of invocation: locking-shift and single-shift.
3771 With locking-shift, the invocation lasts until the next different
3772 invocation, whereas with single-shift, the invocation affects the
3773 following character only and doesn't affect the locking-shift
3774 state. Invocations are done by the following control characters or
3777 ----------------------------------------------------------------------
3778 abbrev function cntrl escape seq description
3779 ----------------------------------------------------------------------
3780 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3781 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3782 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3783 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3784 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3785 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3786 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3787 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3788 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3789 ----------------------------------------------------------------------
3790 (*) These are not used by any known coding system.
3792 Control characters for these functions are defined by macros
3793 ISO_CODE_XXX in `coding.h'.
3795 Designations are done by the following escape sequences:
3796 ----------------------------------------------------------------------
3797 escape sequence description
3798 ----------------------------------------------------------------------
3799 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3800 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3801 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3802 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3803 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3804 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3805 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3806 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3807 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3808 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3809 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3810 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3811 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3812 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3813 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3814 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3815 ----------------------------------------------------------------------
3817 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3818 of dimension 1, chars 94, and final character <F>, etc...
3820 Note (*): Although these designations are not allowed in ISO2022,
3821 Emacs accepts them on decoding, and produces them on encoding
3822 CHARS96 character sets in a coding system which is characterized as
3823 7-bit environment, non-locking-shift, and non-single-shift.
3825 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3826 '(' can be omitted. We refer to this as "short-form" hereafter.
3828 Now you may notice that there are a lot of ways for encoding the
3829 same multilingual text in ISO2022. Actually, there exist many
3830 coding systems such as Compound Text (used in X11's inter client
3831 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3832 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3833 localized platforms), and all of these are variants of ISO2022.
3835 In addition to the above, Emacs handles two more kinds of escape
3836 sequences: ISO6429's direction specification and Emacs' private
3837 sequence for specifying character composition.
3839 ISO6429's direction specification takes the following form:
3840 o CSI ']' -- end of the current direction
3841 o CSI '0' ']' -- end of the current direction
3842 o CSI '1' ']' -- start of left-to-right text
3843 o CSI '2' ']' -- start of right-to-left text
3844 The control character CSI (0x9B: control sequence introducer) is
3845 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3847 Character composition specification takes the following form:
3848 o ESC '0' -- start character composition
3849 o ESC '1' -- end character composition
3850 Since these are not standard escape sequences of any ISO standard,
3851 their use with these meanings is restricted to Emacs only. */
3854 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3858 for (i = 0; i < 4; i++)
3860 if (!NILP (coding_system))
3862 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3864 iso->charset[i] = Qt;
3865 iso->invalid_designated[i] = 0;
3867 iso->esc = ISO_ESC_NOTHING;
3868 iso->esc_bytes_index = 0;
3869 iso->register_left = 0;
3870 iso->register_right = 1;
3871 iso->switched_dir_and_no_valid_charset_yet = 0;
3872 iso->invalid_switch_dir = 0;
3873 iso->output_direction_sequence = 0;
3874 iso->output_literally = 0;
3875 #ifdef ENABLE_COMPOSITE_CHARS
3876 if (iso->composite_chars)
3877 Dynarr_reset (iso->composite_chars);
3882 fit_to_be_escape_quoted (unsigned char c)
3899 /* Parse one byte of an ISO2022 escape sequence.
3900 If the result is an invalid escape sequence, return 0 and
3901 do not change anything in STR. Otherwise, if the result is
3902 an incomplete escape sequence, update ISO2022.ESC and
3903 ISO2022.ESC_BYTES and return -1. Otherwise, update
3904 all the state variables (but not ISO2022.ESC_BYTES) and
3907 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3908 or invocation of an invalid character set and treat that as
3909 an unrecognized escape sequence. */
3912 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3913 unsigned char c, unsigned int *flags,
3914 int check_invalid_charsets)
3916 /* (1) If we're at the end of a designation sequence, CS is the
3917 charset being designated and REG is the register to designate
3920 (2) If we're at the end of a locking-shift sequence, REG is
3921 the register to invoke and HALF (0 == left, 1 == right) is
3922 the half to invoke it into.
3924 (3) If we're at the end of a single-shift sequence, REG is
3925 the register to invoke. */
3926 Lisp_Object cs = Qnil;
3929 /* NOTE: This code does goto's all over the fucking place.
3930 The reason for this is that we're basically implementing
3931 a state machine here, and hierarchical languages like C
3932 don't really provide a clean way of doing this. */
3934 if (! (*flags & CODING_STATE_ESCAPE))
3935 /* At beginning of escape sequence; we need to reset our
3936 escape-state variables. */
3937 iso->esc = ISO_ESC_NOTHING;
3939 iso->output_literally = 0;
3940 iso->output_direction_sequence = 0;
3944 case ISO_ESC_NOTHING:
3945 iso->esc_bytes_index = 0;
3948 case ISO_CODE_ESC: /* Start escape sequence */
3949 *flags |= CODING_STATE_ESCAPE;
3953 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3954 *flags |= CODING_STATE_ESCAPE;
3955 iso->esc = ISO_ESC_5_11;
3958 case ISO_CODE_SO: /* locking shift 1 */
3961 case ISO_CODE_SI: /* locking shift 0 */
3965 case ISO_CODE_SS2: /* single shift */
3968 case ISO_CODE_SS3: /* single shift */
3972 default: /* Other control characters */
3979 /**** single shift ****/
3981 case 'N': /* single shift 2 */
3984 case 'O': /* single shift 3 */
3988 /**** locking shift ****/
3990 case '~': /* locking shift 1 right */
3993 case 'n': /* locking shift 2 */
3996 case '}': /* locking shift 2 right */
3999 case 'o': /* locking shift 3 */
4002 case '|': /* locking shift 3 right */
4006 #ifdef ENABLE_COMPOSITE_CHARS
4007 /**** composite ****/
4010 iso->esc = ISO_ESC_START_COMPOSITE;
4011 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4012 CODING_STATE_COMPOSITE;
4016 iso->esc = ISO_ESC_END_COMPOSITE;
4017 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4018 ~CODING_STATE_COMPOSITE;
4020 #endif /* ENABLE_COMPOSITE_CHARS */
4022 /**** directionality ****/
4025 iso->esc = ISO_ESC_5_11;
4028 /**** designation ****/
4030 case '$': /* multibyte charset prefix */
4031 iso->esc = ISO_ESC_2_4;
4035 if (0x28 <= c && c <= 0x2F)
4037 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4041 /* This function is called with CODESYS equal to nil when
4042 doing coding-system detection. */
4044 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4045 && fit_to_be_escape_quoted (c))
4047 iso->esc = ISO_ESC_LITERAL;
4048 *flags &= CODING_STATE_ISO2022_LOCK;
4058 /**** directionality ****/
4060 case ISO_ESC_5_11: /* ISO6429 direction control */
4063 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4064 goto directionality;
4066 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4067 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4068 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4072 case ISO_ESC_5_11_0:
4075 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4076 goto directionality;
4080 case ISO_ESC_5_11_1:
4083 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4084 goto directionality;
4088 case ISO_ESC_5_11_2:
4091 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4092 goto directionality;
4097 iso->esc = ISO_ESC_DIRECTIONALITY;
4098 /* Various junk here to attempt to preserve the direction sequences
4099 literally in the text if they would otherwise be swallowed due
4100 to invalid designations that don't show up as actual charset
4101 changes in the text. */
4102 if (iso->invalid_switch_dir)
4104 /* We already inserted a direction switch literally into the
4105 text. We assume (#### this may not be right) that the
4106 next direction switch is the one going the other way,
4107 and we need to output that literally as well. */
4108 iso->output_literally = 1;
4109 iso->invalid_switch_dir = 0;
4115 /* If we are in the thrall of an invalid designation,
4116 then stick the directionality sequence literally into the
4117 output stream so it ends up in the original text again. */
4118 for (jj = 0; jj < 4; jj++)
4119 if (iso->invalid_designated[jj])
4123 iso->output_literally = 1;
4124 iso->invalid_switch_dir = 1;
4127 /* Indicate that we haven't yet seen a valid designation,
4128 so that if a switch-dir is directly followed by an
4129 invalid designation, both get inserted literally. */
4130 iso->switched_dir_and_no_valid_charset_yet = 1;
4135 /**** designation ****/
4138 if (0x28 <= c && c <= 0x2F)
4140 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4143 if (0x40 <= c && c <= 0x42)
4145 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4146 *flags & CODING_STATE_R2L ?
4147 CHARSET_RIGHT_TO_LEFT :
4148 CHARSET_LEFT_TO_RIGHT);
4158 if (c < '0' || c > '~')
4159 return 0; /* bad final byte */
4161 if (iso->esc >= ISO_ESC_2_8 &&
4162 iso->esc <= ISO_ESC_2_15)
4164 type = ((iso->esc >= ISO_ESC_2_12) ?
4165 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4166 reg = (iso->esc - ISO_ESC_2_8) & 3;
4168 else if (iso->esc >= ISO_ESC_2_4_8 &&
4169 iso->esc <= ISO_ESC_2_4_15)
4171 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4172 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4173 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4177 /* Can this ever be reached? -slb */
4181 cs = CHARSET_BY_ATTRIBUTES (type, c,
4182 *flags & CODING_STATE_R2L ?
4183 CHARSET_RIGHT_TO_LEFT :
4184 CHARSET_LEFT_TO_RIGHT);
4190 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4194 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4195 /* can't invoke something that ain't there. */
4197 iso->esc = ISO_ESC_SINGLE_SHIFT;
4198 *flags &= CODING_STATE_ISO2022_LOCK;
4200 *flags |= CODING_STATE_SS2;
4202 *flags |= CODING_STATE_SS3;
4206 if (check_invalid_charsets &&
4207 !CHARSETP (iso->charset[reg]))
4208 /* can't invoke something that ain't there. */
4211 iso->register_right = reg;
4213 iso->register_left = reg;
4214 *flags &= CODING_STATE_ISO2022_LOCK;
4215 iso->esc = ISO_ESC_LOCKING_SHIFT;
4219 if (NILP (cs) && check_invalid_charsets)
4221 iso->invalid_designated[reg] = 1;
4222 iso->charset[reg] = Vcharset_ascii;
4223 iso->esc = ISO_ESC_DESIGNATE;
4224 *flags &= CODING_STATE_ISO2022_LOCK;
4225 iso->output_literally = 1;
4226 if (iso->switched_dir_and_no_valid_charset_yet)
4228 /* We encountered a switch-direction followed by an
4229 invalid designation. Ensure that the switch-direction
4230 gets outputted; otherwise it will probably get eaten
4231 when the text is written out again. */
4232 iso->switched_dir_and_no_valid_charset_yet = 0;
4233 iso->output_direction_sequence = 1;
4234 /* And make sure that the switch-dir going the other
4235 way gets outputted, as well. */
4236 iso->invalid_switch_dir = 1;
4240 /* This function is called with CODESYS equal to nil when
4241 doing coding-system detection. */
4242 if (!NILP (codesys))
4244 charset_conversion_spec_dynarr *dyn =
4245 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4251 for (i = 0; i < Dynarr_length (dyn); i++)
4253 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4254 if (EQ (cs, spec->from_charset))
4255 cs = spec->to_charset;
4260 iso->charset[reg] = cs;
4261 iso->esc = ISO_ESC_DESIGNATE;
4262 *flags &= CODING_STATE_ISO2022_LOCK;
4263 if (iso->invalid_designated[reg])
4265 iso->invalid_designated[reg] = 0;
4266 iso->output_literally = 1;
4268 if (iso->switched_dir_and_no_valid_charset_yet)
4269 iso->switched_dir_and_no_valid_charset_yet = 0;
4274 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4279 /* #### There are serious deficiencies in the recognition mechanism
4280 here. This needs to be much smarter if it's going to cut it.
4281 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4282 it should be detected as Latin-1.
4283 All the ISO2022 stuff in this file should be synced up with the
4284 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4285 Perhaps we should wait till R2L works in FSF Emacs? */
4287 if (!st->iso2022.initted)
4289 reset_iso2022 (Qnil, &st->iso2022.iso);
4290 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4291 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4292 CODING_CATEGORY_ISO_8_1_MASK |
4293 CODING_CATEGORY_ISO_8_2_MASK |
4294 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4295 st->iso2022.flags = 0;
4296 st->iso2022.high_byte_count = 0;
4297 st->iso2022.saw_single_shift = 0;
4298 st->iso2022.initted = 1;
4301 mask = st->iso2022.mask;
4308 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4309 st->iso2022.high_byte_count++;
4313 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4315 if (st->iso2022.high_byte_count & 1)
4316 /* odd number of high bytes; assume not iso-8-2 */
4317 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4319 st->iso2022.high_byte_count = 0;
4320 st->iso2022.saw_single_shift = 0;
4322 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4324 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4325 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4326 { /* control chars */
4329 /* Allow and ignore control characters that you might
4330 reasonably see in a text file */
4335 case 8: /* backspace */
4336 case 11: /* vertical tab */
4337 case 12: /* form feed */
4338 case 26: /* MS-DOS C-z junk */
4339 case 31: /* '^_' -- for info */
4340 goto label_continue_loop;
4347 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4350 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4351 &st->iso2022.flags, 0))
4353 switch (st->iso2022.iso.esc)
4355 case ISO_ESC_DESIGNATE:
4356 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4357 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4359 case ISO_ESC_LOCKING_SHIFT:
4360 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4361 goto ran_out_of_chars;
4362 case ISO_ESC_SINGLE_SHIFT:
4363 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4364 st->iso2022.saw_single_shift = 1;
4373 goto ran_out_of_chars;
4376 label_continue_loop:;
4385 postprocess_iso2022_mask (int mask)
4387 /* #### kind of cheesy */
4388 /* If seven-bit ISO is allowed, then assume that the encoding is
4389 entirely seven-bit and turn off the eight-bit ones. */
4390 if (mask & CODING_CATEGORY_ISO_7_MASK)
4391 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4392 CODING_CATEGORY_ISO_8_1_MASK |
4393 CODING_CATEGORY_ISO_8_2_MASK);
4397 /* If FLAGS is a null pointer or specifies right-to-left motion,
4398 output a switch-dir-to-left-to-right sequence to DST.
4399 Also update FLAGS if it is not a null pointer.
4400 If INTERNAL_P is set, we are outputting in internal format and
4401 need to handle the CSI differently. */
4404 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4405 unsigned_char_dynarr *dst,
4406 unsigned int *flags,
4409 if (!flags || (*flags & CODING_STATE_R2L))
4411 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4413 Dynarr_add (dst, ISO_CODE_ESC);
4414 Dynarr_add (dst, '[');
4416 else if (internal_p)
4417 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4419 Dynarr_add (dst, ISO_CODE_CSI);
4420 Dynarr_add (dst, '0');
4421 Dynarr_add (dst, ']');
4423 *flags &= ~CODING_STATE_R2L;
4427 /* If FLAGS is a null pointer or specifies a direction different from
4428 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4429 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4430 sequence to DST. Also update FLAGS if it is not a null pointer.
4431 If INTERNAL_P is set, we are outputting in internal format and
4432 need to handle the CSI differently. */
4435 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4436 unsigned_char_dynarr *dst, unsigned int *flags,
4439 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4440 direction == CHARSET_LEFT_TO_RIGHT)
4441 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4442 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4443 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4444 direction == CHARSET_RIGHT_TO_LEFT)
4446 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4448 Dynarr_add (dst, ISO_CODE_ESC);
4449 Dynarr_add (dst, '[');
4451 else if (internal_p)
4452 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4454 Dynarr_add (dst, ISO_CODE_CSI);
4455 Dynarr_add (dst, '2');
4456 Dynarr_add (dst, ']');
4458 *flags |= CODING_STATE_R2L;
4462 /* Convert ISO2022-format data to internal format. */
4465 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4466 unsigned_char_dynarr *dst, unsigned int n)
4468 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4469 unsigned int flags = str->flags;
4470 unsigned int ch = str->ch;
4471 eol_type_t eol_type = str->eol_type;
4472 #ifdef ENABLE_COMPOSITE_CHARS
4473 unsigned_char_dynarr *real_dst = dst;
4475 Lisp_Object coding_system;
4477 XSETCODING_SYSTEM (coding_system, str->codesys);
4479 #ifdef ENABLE_COMPOSITE_CHARS
4480 if (flags & CODING_STATE_COMPOSITE)
4481 dst = str->iso2022.composite_chars;
4482 #endif /* ENABLE_COMPOSITE_CHARS */
4486 unsigned char c = *src++;
4487 if (flags & CODING_STATE_ESCAPE)
4488 { /* Within ESC sequence */
4489 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4494 switch (str->iso2022.esc)
4496 #ifdef ENABLE_COMPOSITE_CHARS
4497 case ISO_ESC_START_COMPOSITE:
4498 if (str->iso2022.composite_chars)
4499 Dynarr_reset (str->iso2022.composite_chars);
4501 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4502 dst = str->iso2022.composite_chars;
4504 case ISO_ESC_END_COMPOSITE:
4506 Bufbyte comstr[MAX_EMCHAR_LEN];
4508 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4509 Dynarr_length (dst));
4511 len = set_charptr_emchar (comstr, emch);
4512 Dynarr_add_many (dst, comstr, len);
4515 #endif /* ENABLE_COMPOSITE_CHARS */
4517 case ISO_ESC_LITERAL:
4518 DECODE_ADD_BINARY_CHAR (c, dst);
4522 /* Everything else handled already */
4527 /* Attempted error recovery. */
4528 if (str->iso2022.output_direction_sequence)
4529 ensure_correct_direction (flags & CODING_STATE_R2L ?
4530 CHARSET_RIGHT_TO_LEFT :
4531 CHARSET_LEFT_TO_RIGHT,
4532 str->codesys, dst, 0, 1);
4533 /* More error recovery. */
4534 if (!retval || str->iso2022.output_literally)
4536 /* Output the (possibly invalid) sequence */
4538 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4539 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4540 flags &= CODING_STATE_ISO2022_LOCK;
4542 n++, src--;/* Repeat the loop with the same character. */
4545 /* No sense in reprocessing the final byte of the
4546 escape sequence; it could mess things up anyway.
4548 DECODE_ADD_BINARY_CHAR (c, dst);
4553 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4554 { /* Control characters */
4556 /***** Error-handling *****/
4558 /* If we were in the middle of a character, dump out the
4559 partial character. */
4560 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4562 /* If we just saw a single-shift character, dump it out.
4563 This may dump out the wrong sort of single-shift character,
4564 but least it will give an indication that something went
4566 if (flags & CODING_STATE_SS2)
4568 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4569 flags &= ~CODING_STATE_SS2;
4571 if (flags & CODING_STATE_SS3)
4573 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4574 flags &= ~CODING_STATE_SS3;
4577 /***** Now handle the control characters. *****/
4580 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4582 flags &= CODING_STATE_ISO2022_LOCK;
4584 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4585 DECODE_ADD_BINARY_CHAR (c, dst);
4588 { /* Graphic characters */
4589 Lisp_Object charset;
4595 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4597 /* Now determine the charset. */
4598 reg = ((flags & CODING_STATE_SS2) ? 2
4599 : (flags & CODING_STATE_SS3) ? 3
4600 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4601 : str->iso2022.register_left);
4602 charset = str->iso2022.charset[reg];
4604 /* Error checking: */
4605 if (! CHARSETP (charset)
4606 || str->iso2022.invalid_designated[reg]
4607 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4608 && XCHARSET_CHARS (charset) == 94))
4609 /* Mrmph. We are trying to invoke a register that has no
4610 or an invalid charset in it, or trying to add a character
4611 outside the range of the charset. Insert that char literally
4612 to preserve it for the output. */
4614 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4615 DECODE_ADD_BINARY_CHAR (c, dst);
4620 /* Things are probably hunky-dorey. */
4622 /* Fetch reverse charset, maybe. */
4623 if (((flags & CODING_STATE_R2L) &&
4624 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4626 (!(flags & CODING_STATE_R2L) &&
4627 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4629 Lisp_Object new_charset =
4630 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4631 if (!NILP (new_charset))
4632 charset = new_charset;
4636 if (XCHARSET_DIMENSION (charset) == 1)
4638 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4640 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4645 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
4651 lb = XCHARSET_LEADING_BYTE (charset);
4652 switch (XCHARSET_REP_BYTES (charset))
4655 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4656 Dynarr_add (dst, c & 0x7F);
4659 case 2: /* one-byte official */
4660 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4661 Dynarr_add (dst, lb);
4662 Dynarr_add (dst, c | 0x80);
4665 case 3: /* one-byte private or two-byte official */
4666 if (XCHARSET_PRIVATE_P (charset))
4668 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4669 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4670 Dynarr_add (dst, lb);
4671 Dynarr_add (dst, c | 0x80);
4677 Dynarr_add (dst, lb);
4678 Dynarr_add (dst, ch | 0x80);
4679 Dynarr_add (dst, c | 0x80);
4687 default: /* two-byte private */
4690 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4691 Dynarr_add (dst, lb);
4692 Dynarr_add (dst, ch | 0x80);
4693 Dynarr_add (dst, c | 0x80);
4703 flags &= CODING_STATE_ISO2022_LOCK;
4706 label_continue_loop:;
4709 if (flags & CODING_STATE_END)
4710 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4717 /***** ISO2022 encoder *****/
4719 /* Designate CHARSET into register REG. */
4722 iso2022_designate (Lisp_Object charset, unsigned char reg,
4723 struct encoding_stream *str, unsigned_char_dynarr *dst)
4725 static CONST char inter94[] = "()*+";
4726 static CONST char inter96[] = ",-./";
4728 unsigned char final;
4729 Lisp_Object old_charset = str->iso2022.charset[reg];
4731 str->iso2022.charset[reg] = charset;
4732 if (!CHARSETP (charset))
4733 /* charset might be an initial nil or t. */
4735 type = XCHARSET_TYPE (charset);
4736 final = XCHARSET_FINAL (charset);
4737 if (!str->iso2022.force_charset_on_output[reg] &&
4738 CHARSETP (old_charset) &&
4739 XCHARSET_TYPE (old_charset) == type &&
4740 XCHARSET_FINAL (old_charset) == final)
4743 str->iso2022.force_charset_on_output[reg] = 0;
4746 charset_conversion_spec_dynarr *dyn =
4747 str->codesys->iso2022.output_conv;
4753 for (i = 0; i < Dynarr_length (dyn); i++)
4755 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4756 if (EQ (charset, spec->from_charset))
4757 charset = spec->to_charset;
4762 Dynarr_add (dst, ISO_CODE_ESC);
4765 case CHARSET_TYPE_94:
4766 Dynarr_add (dst, inter94[reg]);
4768 case CHARSET_TYPE_96:
4769 Dynarr_add (dst, inter96[reg]);
4771 case CHARSET_TYPE_94X94:
4772 Dynarr_add (dst, '$');
4774 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4777 Dynarr_add (dst, inter94[reg]);
4779 case CHARSET_TYPE_96X96:
4780 Dynarr_add (dst, '$');
4781 Dynarr_add (dst, inter96[reg]);
4784 Dynarr_add (dst, final);
4788 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4790 if (str->iso2022.register_left != 0)
4792 Dynarr_add (dst, ISO_CODE_SI);
4793 str->iso2022.register_left = 0;
4798 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4800 if (str->iso2022.register_left != 1)
4802 Dynarr_add (dst, ISO_CODE_SO);
4803 str->iso2022.register_left = 1;
4808 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4809 unsigned_char_dynarr *dst, unsigned int *flags)
4811 unsigned char charmask;
4812 Lisp_Coding_System* codesys = str->codesys;
4813 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4815 Lisp_Object charset = str->iso2022.current_charset;
4816 int half = str->iso2022.current_half;
4817 unsigned int byte1, byte2;
4821 restore_left_to_right_direction (codesys, dst, flags, 0);
4823 /* Make sure G0 contains ASCII */
4824 if ((ch > ' ' && ch < ISO_CODE_DEL)
4825 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4827 ensure_normal_shift (str, dst);
4828 iso2022_designate (Vcharset_ascii, 0, str, dst);
4831 /* If necessary, restore everything to the default state
4833 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4835 restore_left_to_right_direction (codesys, dst, flags, 0);
4837 ensure_normal_shift (str, dst);
4839 for (i = 0; i < 4; i++)
4841 Lisp_Object initial_charset =
4842 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4843 iso2022_designate (initial_charset, i, str, dst);
4848 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4849 Dynarr_add (dst, '\r');
4850 if (eol_type != EOL_CR)
4851 Dynarr_add (dst, ch);
4855 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4856 && fit_to_be_escape_quoted (ch))
4857 Dynarr_add (dst, ISO_CODE_ESC);
4858 Dynarr_add (dst, ch);
4861 else if ( (0x80 <= ch) && (ch <= 0x9f) )
4863 charmask = (half == 0 ? 0x00 : 0x80);
4865 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4866 && fit_to_be_escape_quoted (ch))
4867 Dynarr_add (dst, ISO_CODE_ESC);
4868 /* you asked for it ... */
4869 Dynarr_add (dst, ch);
4875 /* Now determine which register to use. */
4877 for (i = 0; i < 4; i++)
4881 if ((CHARSETP (charset = str->iso2022.charset[i])
4882 && ((code_point = charset_code_point (charset, ch)) >= 0))
4886 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
4887 && ((code_point = charset_code_point (charset, ch)) >= 0)))
4889 if (XCHARSET_DIMENSION (charset) == 1)
4894 else /* if (XCHARSET_DIMENSION (charset) == 2) */
4896 byte1 = code_point >> 8;
4897 byte2 = code_point & 255;
4905 Lisp_Object original_default_coded_charset_priority_list
4906 = Vdefault_coded_charset_priority_list;
4908 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
4910 BREAKUP_CHAR (ch, charset, byte1, byte2);
4911 if (XCHARSET_FINAL (charset))
4913 Vdefault_coded_charset_priority_list
4914 = Fcdr (Fmemq (XCHARSET_NAME (charset),
4915 Vdefault_coded_charset_priority_list));
4917 BREAKUP_CHAR (ch, charset, byte1, byte2);
4918 if (!XCHARSET_FINAL (charset))
4920 charset = Vcharset_ascii;
4924 Vdefault_coded_charset_priority_list
4925 = original_default_coded_charset_priority_list;
4927 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4928 codesys, dst, flags, 0);
4932 if (XCHARSET_GRAPHIC (charset) != 0)
4934 if (!NILP (str->iso2022.charset[1]) &&
4935 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4936 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4938 else if (!NILP (str->iso2022.charset[2]))
4940 else if (!NILP (str->iso2022.charset[3]))
4949 iso2022_designate (charset, reg, str, dst);
4951 /* Now invoke that register. */
4955 ensure_normal_shift (str, dst);
4959 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4961 ensure_shift_out (str, dst);
4968 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4970 Dynarr_add (dst, ISO_CODE_ESC);
4971 Dynarr_add (dst, 'N');
4976 Dynarr_add (dst, ISO_CODE_SS2);
4981 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4983 Dynarr_add (dst, ISO_CODE_ESC);
4984 Dynarr_add (dst, 'O');
4989 Dynarr_add (dst, ISO_CODE_SS3);
4997 charmask = (half == 0 ? 0x00 : 0x80);
4999 switch (XCHARSET_DIMENSION (charset))
5002 Dynarr_add (dst, byte1 | charmask);
5005 Dynarr_add (dst, byte1 | charmask);
5006 Dynarr_add (dst, byte2 | charmask);
5012 str->iso2022.current_charset = charset;
5013 str->iso2022.current_half = half;
5017 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5018 unsigned int *flags)
5020 Lisp_Coding_System* codesys = str->codesys;
5023 restore_left_to_right_direction (codesys, dst, flags, 0);
5024 ensure_normal_shift (str, dst);
5025 for (i = 0; i < 4; i++)
5027 Lisp_Object initial_charset
5028 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5029 iso2022_designate (initial_charset, i, str, dst);
5034 /************************************************************************/
5035 /* No-conversion methods */
5036 /************************************************************************/
5038 /* This is used when reading in "binary" files -- i.e. files that may
5039 contain all 256 possible byte values and that are not to be
5040 interpreted as being in any particular decoding. */
5042 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5043 unsigned_char_dynarr *dst, unsigned int n)
5046 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5047 unsigned int flags = str->flags;
5048 unsigned int ch = str->ch;
5049 eol_type_t eol_type = str->eol_type;
5055 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5056 DECODE_ADD_BINARY_CHAR (c, dst);
5057 label_continue_loop:;
5060 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5067 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5068 unsigned_char_dynarr *dst, unsigned int n)
5071 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5072 unsigned int flags = str->flags;
5073 unsigned int ch = str->ch;
5074 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5076 unsigned char char_boundary = str->iso2022.current_char_boundary;
5083 if (char_boundary == 0)
5089 else if ( c >= 0xf8 )
5094 else if ( c >= 0xf0 )
5099 else if ( c >= 0xe0 )
5104 else if ( c >= 0xc0 )
5114 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5115 Dynarr_add (dst, '\r');
5116 if (eol_type != EOL_CR)
5117 Dynarr_add (dst, c);
5120 Dynarr_add (dst, c);
5123 else if (char_boundary == 1)
5125 ch = ( ch << 6 ) | ( c & 0x3f );
5126 Dynarr_add (dst, ch & 0xff);
5131 ch = ( ch << 6 ) | ( c & 0x3f );
5134 #else /* not UTF2000 */
5137 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5138 Dynarr_add (dst, '\r');
5139 if (eol_type != EOL_CR)
5140 Dynarr_add (dst, '\n');
5143 else if (BYTE_ASCII_P (c))
5146 Dynarr_add (dst, c);
5148 else if (BUFBYTE_LEADING_BYTE_P (c))
5151 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5152 c == LEADING_BYTE_CONTROL_1)
5155 Dynarr_add (dst, '~'); /* untranslatable character */
5159 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5160 Dynarr_add (dst, c);
5161 else if (ch == LEADING_BYTE_CONTROL_1)
5164 Dynarr_add (dst, c - 0x20);
5166 /* else it should be the second or third byte of an
5167 untranslatable character, so ignore it */
5170 #endif /* not UTF2000 */
5176 str->iso2022.current_char_boundary = char_boundary;
5181 /************************************************************************/
5182 /* Simple internal/external functions */
5183 /************************************************************************/
5185 static Extbyte_dynarr *conversion_out_dynarr;
5186 static Bufbyte_dynarr *conversion_in_dynarr;
5188 /* Determine coding system from coding format */
5190 /* #### not correct for all values of `fmt'! */
5192 external_data_format_to_coding_system (enum external_data_format fmt)
5196 case FORMAT_FILENAME:
5197 case FORMAT_TERMINAL:
5198 if (EQ (Vfile_name_coding_system, Qnil) ||
5199 EQ (Vfile_name_coding_system, Qbinary))
5202 return Fget_coding_system (Vfile_name_coding_system);
5205 return Fget_coding_system (Qctext);
5213 convert_to_external_format (CONST Bufbyte *ptr,
5216 enum external_data_format fmt)
5218 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5220 if (!conversion_out_dynarr)
5221 conversion_out_dynarr = Dynarr_new (Extbyte);
5223 Dynarr_reset (conversion_out_dynarr);
5225 if (NILP (coding_system))
5227 CONST Bufbyte *end = ptr + len;
5233 (*ptr < 0xc0) ? *ptr :
5234 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5237 (BYTE_ASCII_P (*ptr)) ? *ptr :
5238 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5239 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5242 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5246 #ifdef ERROR_CHECK_BUFPOS
5247 assert (ptr == end);
5252 Lisp_Object instream, outstream, da_outstream;
5253 Lstream *istr, *ostr;
5254 struct gcpro gcpro1, gcpro2, gcpro3;
5255 char tempbuf[1024]; /* some random amount */
5257 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5258 da_outstream = make_dynarr_output_stream
5259 ((unsigned_char_dynarr *) conversion_out_dynarr);
5261 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5262 istr = XLSTREAM (instream);
5263 ostr = XLSTREAM (outstream);
5264 GCPRO3 (instream, outstream, da_outstream);
5267 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5270 Lstream_write (ostr, tempbuf, size_in_bytes);
5272 Lstream_close (istr);
5273 Lstream_close (ostr);
5275 Lstream_delete (istr);
5276 Lstream_delete (ostr);
5277 Lstream_delete (XLSTREAM (da_outstream));
5280 *len_out = Dynarr_length (conversion_out_dynarr);
5281 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5282 return Dynarr_atp (conversion_out_dynarr, 0);
5286 convert_from_external_format (CONST Extbyte *ptr,
5289 enum external_data_format fmt)
5291 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5293 if (!conversion_in_dynarr)
5294 conversion_in_dynarr = Dynarr_new (Bufbyte);
5296 Dynarr_reset (conversion_in_dynarr);
5298 if (NILP (coding_system))
5300 CONST Extbyte *end = ptr + len;
5301 for (; ptr < end; ptr++)
5304 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5309 Lisp_Object instream, outstream, da_outstream;
5310 Lstream *istr, *ostr;
5311 struct gcpro gcpro1, gcpro2, gcpro3;
5312 char tempbuf[1024]; /* some random amount */
5314 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5315 da_outstream = make_dynarr_output_stream
5316 ((unsigned_char_dynarr *) conversion_in_dynarr);
5318 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5319 istr = XLSTREAM (instream);
5320 ostr = XLSTREAM (outstream);
5321 GCPRO3 (instream, outstream, da_outstream);
5324 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5327 Lstream_write (ostr, tempbuf, size_in_bytes);
5329 Lstream_close (istr);
5330 Lstream_close (ostr);
5332 Lstream_delete (istr);
5333 Lstream_delete (ostr);
5334 Lstream_delete (XLSTREAM (da_outstream));
5337 *len_out = Dynarr_length (conversion_in_dynarr);
5338 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5339 return Dynarr_atp (conversion_in_dynarr, 0);
5343 /************************************************************************/
5344 /* Initialization */
5345 /************************************************************************/
5348 syms_of_file_coding (void)
5350 deferror (&Qcoding_system_error, "coding-system-error",
5351 "Coding-system error", Qio_error);
5353 DEFSUBR (Fcoding_system_p);
5354 DEFSUBR (Ffind_coding_system);
5355 DEFSUBR (Fget_coding_system);
5356 DEFSUBR (Fcoding_system_list);
5357 DEFSUBR (Fcoding_system_name);
5358 DEFSUBR (Fmake_coding_system);
5359 DEFSUBR (Fcopy_coding_system);
5360 DEFSUBR (Fdefine_coding_system_alias);
5361 DEFSUBR (Fsubsidiary_coding_system);
5363 DEFSUBR (Fcoding_system_type);
5364 DEFSUBR (Fcoding_system_doc_string);
5366 DEFSUBR (Fcoding_system_charset);
5368 DEFSUBR (Fcoding_system_property);
5370 DEFSUBR (Fcoding_category_list);
5371 DEFSUBR (Fset_coding_priority_list);
5372 DEFSUBR (Fcoding_priority_list);
5373 DEFSUBR (Fset_coding_category_system);
5374 DEFSUBR (Fcoding_category_system);
5376 DEFSUBR (Fdetect_coding_region);
5377 DEFSUBR (Fdecode_coding_region);
5378 DEFSUBR (Fencode_coding_region);
5380 DEFSUBR (Fdecode_shift_jis_char);
5381 DEFSUBR (Fencode_shift_jis_char);
5382 DEFSUBR (Fdecode_big5_char);
5383 DEFSUBR (Fencode_big5_char);
5385 defsymbol (&Qcoding_systemp, "coding-system-p");
5386 defsymbol (&Qno_conversion, "no-conversion");
5387 defsymbol (&Qraw_text, "raw-text");
5389 defsymbol (&Qbig5, "big5");
5390 defsymbol (&Qshift_jis, "shift-jis");
5391 defsymbol (&Qucs4, "ucs-4");
5392 defsymbol (&Qutf8, "utf-8");
5393 defsymbol (&Qccl, "ccl");
5394 defsymbol (&Qiso2022, "iso2022");
5396 defsymbol (&Qmnemonic, "mnemonic");
5397 defsymbol (&Qeol_type, "eol-type");
5398 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5399 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5401 defsymbol (&Qcr, "cr");
5402 defsymbol (&Qlf, "lf");
5403 defsymbol (&Qcrlf, "crlf");
5404 defsymbol (&Qeol_cr, "eol-cr");
5405 defsymbol (&Qeol_lf, "eol-lf");
5406 defsymbol (&Qeol_crlf, "eol-crlf");
5408 defsymbol (&Qcharset_g0, "charset-g0");
5409 defsymbol (&Qcharset_g1, "charset-g1");
5410 defsymbol (&Qcharset_g2, "charset-g2");
5411 defsymbol (&Qcharset_g3, "charset-g3");
5412 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5413 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5414 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5415 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5416 defsymbol (&Qno_iso6429, "no-iso6429");
5417 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5418 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5420 defsymbol (&Qshort, "short");
5421 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5422 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5423 defsymbol (&Qseven, "seven");
5424 defsymbol (&Qlock_shift, "lock-shift");
5425 defsymbol (&Qescape_quoted, "escape-quoted");
5427 defsymbol (&Qencode, "encode");
5428 defsymbol (&Qdecode, "decode");
5431 defsymbol (&Qctext, "ctext");
5432 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5434 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5436 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5438 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5440 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5442 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5444 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5446 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5448 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5451 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5456 lstream_type_create_file_coding (void)
5458 LSTREAM_HAS_METHOD (decoding, reader);
5459 LSTREAM_HAS_METHOD (decoding, writer);
5460 LSTREAM_HAS_METHOD (decoding, rewinder);
5461 LSTREAM_HAS_METHOD (decoding, seekable_p);
5462 LSTREAM_HAS_METHOD (decoding, flusher);
5463 LSTREAM_HAS_METHOD (decoding, closer);
5464 LSTREAM_HAS_METHOD (decoding, marker);
5466 LSTREAM_HAS_METHOD (encoding, reader);
5467 LSTREAM_HAS_METHOD (encoding, writer);
5468 LSTREAM_HAS_METHOD (encoding, rewinder);
5469 LSTREAM_HAS_METHOD (encoding, seekable_p);
5470 LSTREAM_HAS_METHOD (encoding, flusher);
5471 LSTREAM_HAS_METHOD (encoding, closer);
5472 LSTREAM_HAS_METHOD (encoding, marker);
5476 vars_of_file_coding (void)
5480 fcd = xnew (struct file_coding_dump);
5481 dumpstruct (&fcd, &fcd_description);
5483 /* Initialize to something reasonable ... */
5484 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5486 fcd->coding_category_system[i] = Qnil;
5487 fcd->coding_category_by_priority[i] = i;
5490 Fprovide (intern ("file-coding"));
5492 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5493 Coding system used for TTY keyboard input.
5494 Not used under a windowing system.
5496 Vkeyboard_coding_system = Qnil;
5498 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5499 Coding system used for TTY display output.
5500 Not used under a windowing system.
5502 Vterminal_coding_system = Qnil;
5504 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5505 Overriding coding system used when writing a file or process.
5506 You should *bind* this, not set it. If this is non-nil, it specifies
5507 the coding system that will be used when a file or process is read
5508 in, and overrides `buffer-file-coding-system-for-read',
5509 `insert-file-contents-pre-hook', etc. Use those variables instead of
5510 this one for permanent changes to the environment.
5512 Vcoding_system_for_read = Qnil;
5514 DEFVAR_LISP ("coding-system-for-write",
5515 &Vcoding_system_for_write /*
5516 Overriding coding system used when writing a file or process.
5517 You should *bind* this, not set it. If this is non-nil, it specifies
5518 the coding system that will be used when a file or process is wrote
5519 in, and overrides `buffer-file-coding-system',
5520 `write-region-pre-hook', etc. Use those variables instead of this one
5521 for permanent changes to the environment.
5523 Vcoding_system_for_write = Qnil;
5525 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5526 Coding system used to convert pathnames when accessing files.
5528 Vfile_name_coding_system = Qnil;
5530 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5531 Non-nil means the buffer contents are regarded as multi-byte form
5532 of characters, not a binary code. This affects the display, file I/O,
5533 and behaviors of various editing commands.
5535 Setting this to nil does not do anything.
5537 enable_multibyte_characters = 1;
5541 complex_vars_of_file_coding (void)
5543 staticpro (&Vcoding_system_hash_table);
5544 Vcoding_system_hash_table =
5545 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5547 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5548 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5550 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5552 struct codesys_prop csp; \
5554 csp.prop_type = (Prop_Type); \
5555 Dynarr_add (the_codesys_prop_dynarr, csp); \
5558 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5559 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5560 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5561 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5562 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5563 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5564 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5566 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5567 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5568 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5569 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5570 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5571 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5572 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5573 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5574 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5575 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5576 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5577 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5578 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5579 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5580 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5581 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5582 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5584 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5585 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5587 /* Need to create this here or we're really screwed. */
5589 (Qraw_text, Qno_conversion,
5590 build_string ("Raw text, which means it converts only line-break-codes."),
5591 list2 (Qmnemonic, build_string ("Raw")));
5594 (Qbinary, Qno_conversion,
5595 build_string ("Binary, which means it does not convert anything."),
5596 list4 (Qeol_type, Qlf,
5597 Qmnemonic, build_string ("Binary")));
5602 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5603 list2 (Qmnemonic, build_string ("UTF8")));
5606 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5608 /* Need this for bootstrapping */
5609 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5610 Fget_coding_system (Qraw_text);
5613 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5614 = Fget_coding_system (Qutf8);