1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any later version.
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
/* File-scope Lisp_Object globals used by the coding-system code, plus
   the per-category symbol table.
   NOTE(review): where and how these are initialized is not visible in
   this part of the file. */
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
/* State grouped into a single struct, followed by the description
   tables for it.  The description lists the two Lisp_Object arrays
   (offset and element count); coding_category_by_priority is a plain
   int array and has no entry in the visible description lines. */
52 struct file_coding_dump {
53 /* Coding system currently associated with each coding category. */
54 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
56 /* Table of all coding categories in decreasing order of priority.
57 This describes a permutation of the possible coding categories. */
58 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
/* Lisp_Object table with 65536 entries.
   NOTE(review): presumably indexed by UCS code point -- confirm. */
61 Lisp_Object ucs_to_mule_table[65536];
65 static const struct lrecord_description fcd_description_1[] = {
66 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
68 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 },
73 static const struct struct_description fcd_description = {
74 sizeof(struct file_coding_dump),
/* Lisp_Object globals that the rest of this file compares against with
   EQ: coding-system type names (Qiso2022, Qccl, ...), property keys
   accepted by make-coding-system (Qmnemonic, Qeol_type, Qcharset_g0,
   ...) and eol-type names (Qlf, Qcrlf, Qcr). */
78 Lisp_Object mule_to_ucs_table;
80 Lisp_Object Qcoding_systemp;
82 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
83 /* Qinternal in general.c */
85 Lisp_Object Qmnemonic, Qeol_type;
86 Lisp_Object Qcr, Qcrlf, Qlf;
87 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
88 Lisp_Object Qpost_read_conversion;
89 Lisp_Object Qpre_write_conversion;
92 Lisp_Object Qucs4, Qutf8;
93 Lisp_Object Qbig5, Qshift_jis;
94 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
95 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
96 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
97 Lisp_Object Qno_iso6429;
98 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
99 Lisp_Object Qctext, Qescape_quoted;
100 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
102 Lisp_Object Qencode, Qdecode;
/* Table keyed by coding-system name (a symbol) whose values are the
   coding-system objects; this file fills it with Fputhash and reads it
   with Fgethash and elisp_maphash. */
104 Lisp_Object Vcoding_system_hash_table;
106 int enable_multibyte_characters;
/* struct iso2022_decoder: state carried while reading ISO2022 text.
   Going by the member comments below it holds the charsets designated
   to G0-G3, which registers are invoked into GL and GR, the bytes of a
   partially read escape sequence, and flags used to reproduce invalid
   designation / direction-switch sequences literally. */
109 /* Additional information used by the ISO2022 decoder and detector. */
110 struct iso2022_decoder
112 /* CHARSET holds the character sets currently assigned to the G0
113 through G3 variables. It is initialized from the array
114 INITIAL_CHARSET in CODESYS. */
115 Lisp_Object charset[4];
117 /* Which registers are currently invoked into the left (GL) and
118 right (GR) halves of the 8-bit encoding space? */
119 int register_left, register_right;
121 /* ISO_ESC holds a value indicating part of an escape sequence
122 that has already been seen. */
123 enum iso_esc_flag esc;
125 /* This records the bytes we've seen so far in an escape sequence,
126 in case the sequence is invalid (we spit out the bytes unchanged). */
127 unsigned char esc_bytes[8];
129 /* Index for next byte to store in ISO escape sequence. */
132 #ifdef ENABLE_COMPOSITE_CHARS
133 /* Stuff seen so far when composing a string. */
134 unsigned_char_dynarr *composite_chars;
137 /* If we saw an invalid designation sequence for a particular
138 register, we flag it here and switch to ASCII. The next time we
139 see a valid designation for this register, we turn off the flag
140 and do the designation normally, but pretend the sequence was
141 invalid. The effect of all this is that (most of the time) the
142 escape sequences for both the switch to the unknown charset, and
143 the switch back to the known charset, get inserted literally into
144 the buffer and saved out as such. The hope is that we can
145 preserve the escape sequences so that the resulting written out
146 file makes sense. If we don't do any of this, the designation
147 to the invalid charset will be preserved but that switch back
148 to the known charset will probably get eaten because it was
149 the same charset that was already present in the register. */
150 unsigned char invalid_designated[4];
152 /* We try to do similar things as above for direction-switching
153 sequences. If we encountered a direction switch while an
154 invalid designation was present, or an invalid designation
155 just after a direction switch (i.e. no valid designation
156 encountered yet), we insert the direction-switch escape
157 sequence literally into the output stream, and later on
158 insert the corresponding direction-restoring escape sequence
160 unsigned int switched_dir_and_no_valid_charset_yet :1;
161 unsigned int invalid_switch_dir :1;
163 /* Tells the decoder to output the escape sequence literally
164 even though it was valid. Used in the games we play to
165 avoid lossage when we encounter invalid designations. */
166 unsigned int output_literally :1;
167 /* We encountered a direction switch followed by an invalid
168 designation. We didn't output the direction switch
169 literally because we didn't know about the invalid designation;
170 but we have to do so now. */
171 unsigned int output_direction_sequence :1;
/* Forward declarations.  For each supported format (Shift-JIS, Big5,
   UCS-4, UTF-8, ISO2022, no-conversion) the detect / decode / encode
   entry points are declared here so they can be referenced before they
   are defined.  The decode routines take an Lstream, a source byte
   pointer and a destination unsigned_char_dynarr; the char_encode_* /
   char_finish_* routines take an encoding_stream and a flags pointer.
   NOTE(review): the definitions are not in this part of the file. */
174 EXFUN (Fcopy_coding_system, 2);
176 struct detection_state;
179 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
180 unsigned_char_dynarr *dst, unsigned int n);
182 static int detect_coding_sjis (struct detection_state *st,
183 CONST unsigned char *src,
185 static void decode_coding_sjis (Lstream *decoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst,
189 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
190 unsigned_char_dynarr *dst, unsigned int *flags);
191 void char_finish_shift_jis (struct encoding_stream *str,
192 unsigned_char_dynarr *dst, unsigned int *flags);
194 static int detect_coding_big5 (struct detection_state *st,
195 CONST unsigned char *src,
197 static void decode_coding_big5 (Lstream *decoding,
198 CONST unsigned char *src,
199 unsigned_char_dynarr *dst, unsigned int n);
200 static void encode_coding_big5 (Lstream *encoding,
201 CONST unsigned char *src,
202 unsigned_char_dynarr *dst, unsigned int n);
203 static int detect_coding_ucs4 (struct detection_state *st,
204 CONST unsigned char *src,
206 static void decode_coding_ucs4 (Lstream *decoding,
207 CONST unsigned char *src,
208 unsigned_char_dynarr *dst, unsigned int n);
209 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
210 unsigned_char_dynarr *dst, unsigned int *flags);
211 void char_finish_ucs4 (struct encoding_stream *str,
212 unsigned_char_dynarr *dst, unsigned int *flags);
214 static int detect_coding_utf8 (struct detection_state *st,
215 CONST unsigned char *src,
217 static void decode_coding_utf8 (Lstream *decoding,
218 CONST unsigned char *src,
219 unsigned_char_dynarr *dst, unsigned int n);
220 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
221 unsigned_char_dynarr *dst, unsigned int *flags);
222 void char_finish_utf8 (struct encoding_stream *str,
223 unsigned_char_dynarr *dst, unsigned int *flags);
225 static int postprocess_iso2022_mask (int mask);
226 static void reset_iso2022 (Lisp_Object coding_system,
227 struct iso2022_decoder *iso);
228 static int detect_coding_iso2022 (struct detection_state *st,
229 CONST unsigned char *src,
231 static void decode_coding_iso2022 (Lstream *decoding,
232 CONST unsigned char *src,
233 unsigned_char_dynarr *dst, unsigned int n);
234 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
235 unsigned_char_dynarr *dst, unsigned int *flags);
236 void char_finish_iso2022 (struct encoding_stream *str,
237 unsigned_char_dynarr *dst, unsigned int *flags);
239 static void decode_coding_no_conversion (Lstream *decoding,
240 CONST unsigned char *src,
241 unsigned_char_dynarr *dst,
243 static void encode_coding_no_conversion (Lstream *encoding,
244 CONST unsigned char *src,
245 unsigned_char_dynarr *dst,
247 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
248 unsigned_char_dynarr *dst, unsigned int n);
249 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
250 unsigned_char_dynarr *dst, unsigned int n);
/* The codesys_prop record type, the dynarr type that holds such
   records, the description tables for both, and the global dynarr
   pointer.  The record description lists one Lisp_Object member, `sym';
   the dynarr description is built with XD_DYNARR_DESC from the record
   description.
   NOTE(review): the body of struct codesys_prop and most of
   enum codesys_prop_enum are not visible here. */
252 typedef struct codesys_prop codesys_prop;
261 Dynarr_declare (codesys_prop);
262 } codesys_prop_dynarr;
264 static const struct lrecord_description codesys_prop_description_1[] = {
265 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
269 static const struct struct_description codesys_prop_description = {
270 sizeof(codesys_prop),
271 codesys_prop_description_1
274 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
275 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
279 static const struct struct_description codesys_prop_dynarr_description = {
280 sizeof(codesys_prop_dynarr),
281 codesys_prop_dynarr_description_1
284 codesys_prop_dynarr *the_codesys_prop_dynarr;
286 enum codesys_prop_enum
289 CODESYS_PROP_ISO2022,
294 /************************************************************************/
295 /* Coding system functions */
296 /************************************************************************/
/* Declarations of the three coding-system lrecord methods, then the
   description tables: one for a charset_conversion_spec (two
   Lisp_Objects starting at from_charset), one for a dynarr of those
   specs, and one for struct Lisp_Coding_System itself (runs of
   Lisp_Object members plus the two iso2022 conversion dynarr pointers).
   DEFINE_LRECORD_IMPLEMENTATION then registers the "coding-system"
   lrecord type with the mark, print and finalize methods and the
   description table. */
298 static Lisp_Object mark_coding_system (Lisp_Object);
299 static void print_coding_system (Lisp_Object, Lisp_Object, int);
300 static void finalize_coding_system (void *header, int for_disksave);
303 static const struct lrecord_description ccs_description_1[] = {
304 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
308 static const struct struct_description ccs_description = {
309 sizeof(charset_conversion_spec),
313 static const struct lrecord_description ccsd_description_1[] = {
314 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
318 static const struct struct_description ccsd_description = {
319 sizeof(charset_conversion_spec_dynarr),
324 static const struct lrecord_description coding_system_description[] = {
325 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
326 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
327 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
329 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
330 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
331 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
332 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
337 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
338 mark_coding_system, print_coding_system,
339 finalize_coding_system,
340 0, 0, coding_system_description,
341 struct Lisp_Coding_System);
/* Mark method for coding-system lrecords (registered through
   DEFINE_LRECORD_IMPLEMENTATION).  Calls mark_object on the Lisp
   objects OBJ references: name, doc string, mnemonic, the three eol
   subsidiaries, the type-specific members, and the pre-write
   conversion.  The post-read conversion object is returned instead of
   being marked here.
   NOTE(review): presumably the caller marks the returned object --
   confirm against the lrecord mark protocol. */
344 mark_coding_system (Lisp_Object obj)
346 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
/* Members present for every coding-system type. */
348 mark_object (CODING_SYSTEM_NAME (codesys));
349 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
350 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
351 mark_object (CODING_SYSTEM_EOL_LF (codesys));
352 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
353 mark_object (CODING_SYSTEM_EOL_CR (codesys));
355 switch (CODING_SYSTEM_TYPE (codesys))
359 case CODESYS_ISO2022:
/* ISO2022: the four initial charsets, then both charsets of every
   input and output conversion spec (either dynarr may be absent). */
360 for (i = 0; i < 4; i++)
361 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
362 if (codesys->iso2022.input_conv)
364 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
366 struct charset_conversion_spec *ccs =
367 Dynarr_atp (codesys->iso2022.input_conv, i);
368 mark_object (ccs->from_charset);
369 mark_object (ccs->to_charset);
372 if (codesys->iso2022.output_conv)
374 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
376 struct charset_conversion_spec *ccs =
377 Dynarr_atp (codesys->iso2022.output_conv, i);
378 mark_object (ccs->from_charset);
379 mark_object (ccs->to_charset);
/* CCL decode and encode programs.
   NOTE(review): the case label for these two lines is not visible. */
385 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
386 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
393 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
394 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding-system lrecords.  Writes "#<coding_system ",
   the coding system's name (via print_internal) and ">" to
   PRINTCHARFUN.  The error() call reports "printing unreadable object".
   NOTE(review): the condition guarding the error() call and its
   trailing argument are on lines not visible here. */
398 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
401 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
403 error ("printing unreadable object #<coding_system 0x%x>",
406 write_c_string ("#<coding_system ", printcharfun);
407 print_internal (c->name, printcharfun, 1);
408 write_c_string (">", printcharfun);
/* Finalize method for coding-system lrecords.  HEADER is the record
   being finalized; nothing is released when FOR_DISKSAVE is nonzero.
   Otherwise, for an ISO2022 coding system, each conversion dynarr that
   exists is freed with Dynarr_free and its pointer is cleared. */
412 finalize_coding_system (void *header, int for_disksave)
414 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
415 /* Since coding systems never go away, this function is not
416 necessary. But it would be necessary if we changed things
417 so that coding systems could go away. */
418 if (!for_disksave) /* see comment in lstream.c */
420 switch (CODING_SYSTEM_TYPE (c))
423 case CODESYS_ISO2022:
/* Clearing the pointer after the free keeps a later pass from seeing a
   stale dynarr. */
424 if (c->iso2022.input_conv)
426 Dynarr_free (c->iso2022.input_conv);
427 c->iso2022.input_conv = 0;
429 if (c->iso2022.output_conv)
431 Dynarr_free (c->iso2022.output_conv);
432 c->iso2022.output_conv = 0;
/* Translate SYMBOL into an eol type.
   SYMBOL is first checked with CHECK_SYMBOL.  nil maps to
   EOL_AUTODETECT, Qlf to EOL_LF, Qcrlf to EOL_CRLF and Qcr to EOL_CR;
   any other symbol signals "Unrecognized eol type". */
443 symbol_to_eol_type (Lisp_Object symbol)
445 CHECK_SYMBOL (symbol);
446 if (NILP (symbol)) return EOL_AUTODETECT;
447 if (EQ (symbol, Qlf)) return EOL_LF;
448 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
449 if (EQ (symbol, Qcr)) return EOL_CR;
451 signal_simple_error ("Unrecognized eol type", symbol);
452 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: EOL_LF, EOL_CRLF and EOL_CR map to
   Qlf, Qcrlf and Qcr, and EOL_AUTODETECT maps to Qnil. */
456 eol_type_to_symbol (enum eol_type type)
461 case EOL_LF: return Qlf;
462 case EOL_CRLF: return Qcrlf;
463 case EOL_CR: return Qcr;
464 case EOL_AUTODETECT: return Qnil;
/* Create the three eol-specific subsidiaries of CODESYS.
   For each of "unix", "dos" and "mac", the name NAME-<suffix> is
   interned, CODESYS is copied to it with Fcopy_coding_system, the
   copy's eol type is set to EOL_LF / EOL_CRLF / EOL_CR, and the copy is
   stored in the matching CODING_SYSTEM_EOL_* slot of CODESYS.  The
   copy's mnemonic is built from the parent's mnemonic text plus "",
   ":T" or ":t".
   NOTE(review): the lines that guard the mnemonic handling when the
   parent has no string mnemonic are not visible here. */
469 setup_eol_coding_systems (Lisp_Coding_System *codesys)
471 Lisp_Object codesys_obj;
472 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* len + 7 bytes leaves room for the longest suffix, "-unix", and the
   terminating NUL. */
473 char *codesys_name = (char *) alloca (len + 7);
475 char *codesys_mnemonic=0;
477 Lisp_Object codesys_name_sym, sub_codesys_obj;
481 XSETCODING_SYSTEM (codesys_obj, codesys);
/* Only the first len bytes are copied; each macro expansion below
   writes the suffix (and NUL) at offset len. */
483 memcpy (codesys_name,
484 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
486 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
488 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
489 codesys_mnemonic = (char *) alloca (mlen + 7);
490 memcpy (codesys_mnemonic,
491 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
/* DEFINE_SUB_CODESYS (suffix, mnemonic suffix, eol type): builds one
   subsidiary as described in the function header. */
494 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
495 strcpy (codesys_name + len, "-" op_sys); \
497 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
498 codesys_name_sym = intern (codesys_name); \
499 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
500 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
502 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
503 build_string (codesys_mnemonic); \
504 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
507 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
508 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
509 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp primitive `coding-system-p' (exactly one argument): returns Qt
   when OBJECT satisfies CODING_SYSTEMP and Qnil otherwise. */
512 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
513 Return t if OBJECT is a coding system.
514 A coding system is an object that defines how text containing multiple
515 character sets is encoded into a stream of (typically 8-bit) bytes.
516 The coding system is used to decode the stream into a series of
517 characters (which may be from multiple charsets) when the text is read
518 from a file or process, and is used to encode the text back into the
519 same format when it is written out to a file or process.
521 For example, many ISO2022-compliant coding systems (such as Compound
522 Text, which is used for inter-client data under the X Window System)
523 use escape sequences to switch between different charsets -- Japanese
524 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
525 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
526 `make-coding-system' for more information.
528 Coding systems are normally identified using a symbol, and the
529 symbol is accepted in place of the actual coding system object whenever
530 a coding system is called for. (This is similar to how faces work.)
534 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* Lisp primitive `find-coding-system'.
   A coding-system object is returned unchanged.  nil is replaced by
   Qbinary before the lookup.  Any other argument must be a symbol
   (CHECK_SYMBOL) and is looked up in Vcoding_system_hash_table; the
   result is Qnil when the table has no entry for it. */
537 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
538 Retrieve the coding system of the given name.
540 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
541 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
542 If there is no such coding system, nil is returned. Otherwise the
543 associated coding system object is returned.
545 (coding_system_or_name))
547 if (CODING_SYSTEMP (coding_system_or_name))
548 return coding_system_or_name;
550 if (NILP (coding_system_or_name))
551 coding_system_or_name = Qbinary;
553 CHECK_SYMBOL (coding_system_or_name);
555 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
/* Lisp primitive `get-coding-system': calls Ffind_coding_system on
   NAME and signals "No such coding system" (with NAME as the datum)
   when that returns nil; otherwise returns the coding system found. */
558 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
559 Retrieve the coding system of the given name.
560 Same as `find-coding-system' except that if there is no such
561 coding system, an error is signaled instead of returning nil.
565 Lisp_Object coding_system = Ffind_coding_system (name);
567 if (NILP (coding_system))
568 signal_simple_error ("No such coding system", name);
569 return coding_system;
572 /* We store the coding systems in hash tables with the names as the key and the
573 actual coding system object as the value. Occasionally we need to use them
574 in a list format. These routines provide us with that. */
/* Closure handed to the mapper: a pointer to the list being built. */
575 struct coding_system_list_closure
577 Lisp_Object *coding_system_list;
/* Hash-table mapper: conses KEY (the coding-system name) onto the front
   of the list the closure points to.  VALUE is not used in the visible
   lines.  The third argument is the closure, passed as void *. */
581 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
582 void *coding_system_list_closure)
584 /* This function can GC */
585 struct coding_system_list_closure *cscl =
586 (struct coding_system_list_closure *) coding_system_list_closure;
587 Lisp_Object *coding_system_list = cscl->coding_system_list;
589 *coding_system_list = Fcons (key, *coding_system_list);
/* Lisp primitive `coding-system-list' (no arguments).  Starts from an
   empty list protected with GCPRO1, maps
   add_coding_system_to_list_mapper over Vcoding_system_hash_table so
   every key is consed onto it, and returns the list.
   NOTE(review): the matching UNGCPRO is on a line not visible here. */
593 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
594 Return a list of the names of all defined coding systems.
598 Lisp_Object coding_system_list = Qnil;
600 struct coding_system_list_closure coding_system_list_closure;
602 GCPRO1 (coding_system_list);
603 coding_system_list_closure.coding_system_list = &coding_system_list;
604 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
605 &coding_system_list_closure);
608 return coding_system_list;
/* Lisp primitive `coding-system-name': resolves the argument with
   Fget_coding_system (which signals an error for an unknown name) and
   returns the name stored in the resulting coding system. */
611 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
612 Return the name of the given coding system.
616 coding_system = Fget_coding_system (coding_system);
617 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a coding-system lrecord of the given TYPE and NAME.
   The record is zeroed, its Lisp_Object slots (pre-write and post-read
   conversion, the three eol subsidiaries, mnemonic) are set to Qnil,
   and its eol type starts as EOL_AUTODETECT.  For ISO2022 the four
   initial charsets are set to Qnil; for CCL the decode and encode
   programs are.  The doc string is not set here. */
620 static Lisp_Coding_System *
621 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
623 Lisp_Coding_System *codesys =
624 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
626 zero_lcrecord (codesys);
/* Lisp_Object slots are assigned Qnil explicitly after the zeroing.
   NOTE(review): presumably because Qnil need not be the all-zero bit
   pattern -- confirm. */
627 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
628 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
629 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
630 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
631 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
632 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
633 CODING_SYSTEM_TYPE (codesys) = type;
634 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
636 if (type == CODESYS_ISO2022)
639 for (i = 0; i < 4; i++)
640 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
642 else if (type == CODESYS_CCL)
644 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
645 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
648 CODING_SYSTEM_NAME (codesys) = name;
654 /* Given a list of charset conversion specs as specified in a Lisp
655 program, parse it into STORE_HERE. */
/* Each element of SPEC_LIST must be a two-element list (FROM TO); both
   elements are resolved with Fget_charset and appended to STORE_HERE
   as one charset_conversion_spec.  A malformed element signals
   "Invalid charset conversion spec"; charsets of differing type signal
   "Attempted conversion between different charset types". */
658 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
659 Lisp_Object spec_list)
663 EXTERNAL_LIST_LOOP (rest, spec_list)
665 Lisp_Object car = XCAR (rest);
666 Lisp_Object from, to;
667 struct charset_conversion_spec spec;
/* Reject anything that is not a proper list of exactly two elements. */
669 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
670 signal_simple_error ("Invalid charset conversion spec", car);
671 from = Fget_charset (XCAR (car));
672 to = Fget_charset (XCAR (XCDR (car)));
/* Source and destination charsets must have the same charset type. */
673 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
674 signal_simple_error_2
675 ("Attempted conversion between different charset types",
677 spec.from_charset = from;
678 spec.to_charset = to;
680 Dynarr_add (store_here, spec);
684 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
685 specs, return the equivalent as the Lisp programmer would see it.
687 If LOAD_HERE is 0, return Qnil. */
/* Each spec becomes a two-element list (FROM TO).  The list is built by
   consing onto the front and reversed at the end, so the result keeps
   the dynarr's order. */
690 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
697 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
699 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
700 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
703 return Fnreverse (result);
/* Lisp primitive `make-coding-system'.
   Flow of the visible code:
   - TYPE is mapped to an enum coding_system_type; a value matching
     none of the known types signals "Invalid coding system type".
   - A coding system is allocated under NAME.  DOC-STRING defaults to
     "" and must otherwise be a string.
   - PROPS is walked as a property list.  `mnemonic', `eol-type',
     `post-read-conversion' and `pre-write-conversion' are accepted for
     every type; the charset, force-on-output, boolean and
     charset-conversion keys only for ISO2022; `decode' and `encode'
     (vectors) only for CCL.  Any other key signals
     "Unrecognized property".
   - need_to_setup_eol_systems starts as 1 and is overwritten with
     NILP (value) whenever an `eol-type' key is seen; when it is set at
     the end, setup_eol_coding_systems creates the subsidiaries.
   - The new object is stored in Vcoding_system_hash_table under NAME.
   NOTE(review): each `input-charset-conversion' or
   `output-charset-conversion' key allocates a fresh dynarr and
   overwrites the stored pointer, so a repeated key appears to leak the
   earlier dynarr -- confirm. */
708 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
709 Register symbol NAME as a coding system.
711 TYPE describes the conversion method used and should be one of
714 Automatic conversion. XEmacs attempts to detect the coding system
717 No conversion. Use this for binary files and such. On output,
718 graphic characters that are not in ASCII or Latin-1 will be
719 replaced by a ?. (For a no-conversion-encoded buffer, these
720 characters will only be present if you explicitly insert them.)
722 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
724 ISO 10646 UCS-4 encoding.
726 ISO 10646 UTF-8 encoding.
728 Any ISO2022-compliant encoding. Among other things, this includes
729 JIS (the Japanese encoding commonly used for e-mail), EUC (the
730 standard Unix encoding for Japanese and other languages), and
731 Compound Text (the encoding used in X11). You can specify more
732 specific information about the conversion with the FLAGS argument.
734 Big5 (the encoding commonly used for Taiwanese).
736 The conversion is performed using a user-written pseudo-code
737 program. CCL (Code Conversion Language) is the name of this
740 Write out or read in the raw contents of the memory representing
741 the buffer's text. This is primarily useful for debugging
742 purposes, and is only enabled when XEmacs has been compiled with
743 DEBUG_XEMACS defined (via the --debug configure option).
744 WARNING: Reading in a file using 'internal conversion can result
745 in an internal inconsistency in the memory representing a
746 buffer's text, which will produce unpredictable results and may
747 cause XEmacs to crash. Under normal circumstances you should
748 never use 'internal conversion.
750 DOC-STRING is a string describing the coding system.
752 PROPS is a property list, describing the specific nature of the
753 character set. Recognized properties are:
756 String to be displayed in the modeline when this coding system is
760 End-of-line conversion to be used. It should be one of
763 Automatically detect the end-of-line type (LF, CRLF,
764 or CR). Also generate subsidiary coding systems named
765 `NAME-unix', `NAME-dos', and `NAME-mac', that are
766 identical to this coding system but have an EOL-TYPE
767 value of 'lf, 'crlf, and 'cr, respectively.
769 The end of a line is marked externally using ASCII LF.
770 Since this is also the way that XEmacs represents an
771 end-of-line internally, specifying this option results
772 in no end-of-line conversion. This is the standard
773 format for Unix text files.
775 The end of a line is marked externally using ASCII
776 CRLF. This is the standard format for MS-DOS text
779 The end of a line is marked externally using ASCII CR.
780 This is the standard format for Macintosh text files.
782 Automatically detect the end-of-line type but do not
783 generate subsidiary coding systems. (This value is
784 converted to nil when stored internally, and
785 `coding-system-property' will return nil.)
787 'post-read-conversion
788 Function called after a file has been read in, to perform the
789 decoding. Called with two arguments, BEG and END, denoting
790 a region of the current buffer to be decoded.
792 'pre-write-conversion
793 Function called before a file is written out, to perform the
794 encoding. Called with two arguments, BEG and END, denoting
795 a region of the current buffer to be encoded.
798 The following additional properties are recognized if TYPE is 'iso2022:
804 The character set initially designated to the G0 - G3 registers.
805 The value should be one of
807 -- A charset object (designate that character set)
808 -- nil (do not ever use this register)
809 -- t (no character set is initially designated to
810 the register, but may be later on; this automatically
811 sets the corresponding `force-g*-on-output' property)
817 If non-nil, send an explicit designation sequence on output before
818 using the specified register.
821 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
822 "ESC $ B" on output in place of the full designation sequences
823 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
826 If non-nil, don't designate ASCII to G0 at each end of line on output.
827 Setting this to non-nil also suppresses other state-resetting that
828 normally happens at the end of a line.
831 If non-nil, don't designate ASCII to G0 before control chars on output.
834 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
838 If non-nil, use locking-shift (SO/SI) instead of single-shift
839 or designation by escape sequence.
842 If non-nil, don't use ISO6429's direction specification.
845 If non-nil, literal control characters that are the same as
846 the beginning of a recognized ISO2022 or ISO6429 escape sequence
847 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
848 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
849 so that they can be properly distinguished from an escape sequence.
850 (Note that doing this results in a non-portable encoding.) This
851 encoding flag is used for byte-compiled files. Note that ESC
852 is a good choice for a quoting character because there are no
853 escape sequences whose second byte is a character from the Control-0
854 or Control-1 character sets; this is explicitly disallowed by the
857 'input-charset-conversion
858 A list of conversion specifications, specifying conversion of
859 characters in one charset to another when decoding is performed.
860 Each specification is a list of two elements: the source charset,
861 and the destination charset.
863 'output-charset-conversion
864 A list of conversion specifications, specifying conversion of
865 characters in one charset to another when encoding is performed.
866 The form of each specification is the same as for
867 'input-charset-conversion.
870 The following additional properties are recognized (and required)
874 CCL program used for decoding (converting to internal format).
877 CCL program used for encoding (converting to external format).
879 (name, type, doc_string, props))
881 Lisp_Coding_System *codesys;
882 Lisp_Object rest, key, value;
883 enum coding_system_type ty;
884 int need_to_setup_eol_systems = 1;
886 /* Convert type to constant */
887 if (NILP (type) || EQ (type, Qundecided))
888 { ty = CODESYS_AUTODETECT; }
890 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
891 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
892 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
893 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
894 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
895 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
897 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
899 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
902 signal_simple_error ("Invalid coding system type", type);
906 codesys = allocate_coding_system (ty, name);
908 if (NILP (doc_string))
909 doc_string = build_string ("");
911 CHECK_STRING (doc_string);
912 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
914 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
916 if (EQ (key, Qmnemonic))
919 CHECK_STRING (value);
920 CODING_SYSTEM_MNEMONIC (codesys) = value;
923 else if (EQ (key, Qeol_type))
925 need_to_setup_eol_systems = NILP (value);
928 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
931 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
932 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
934 else if (ty == CODESYS_ISO2022)
936 #define FROB_INITIAL_CHARSET(charset_num) \
937 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
938 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
940 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
941 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
942 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
943 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
945 #define FROB_FORCE_CHARSET(charset_num) \
946 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
948 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
949 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
950 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
951 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
953 #define FROB_BOOLEAN_PROPERTY(prop) \
954 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
956 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
957 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
958 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
959 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
960 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
961 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
962 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
964 else if (EQ (key, Qinput_charset_conversion))
966 codesys->iso2022.input_conv =
967 Dynarr_new (charset_conversion_spec);
968 parse_charset_conversion_specs (codesys->iso2022.input_conv,
971 else if (EQ (key, Qoutput_charset_conversion))
973 codesys->iso2022.output_conv =
974 Dynarr_new (charset_conversion_spec);
975 parse_charset_conversion_specs (codesys->iso2022.output_conv,
979 signal_simple_error ("Unrecognized property", key);
981 else if (EQ (type, Qccl))
983 if (EQ (key, Qdecode))
985 CHECK_VECTOR (value);
986 CODING_SYSTEM_CCL_DECODE (codesys) = value;
988 else if (EQ (key, Qencode))
990 CHECK_VECTOR (value);
991 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
994 signal_simple_error ("Unrecognized property", key);
998 signal_simple_error ("Unrecognized property", key);
1001 if (need_to_setup_eol_systems)
1002 setup_eol_coding_systems (codesys);
1005 Lisp_Object codesys_obj;
1006 XSETCODING_SYSTEM (codesys_obj, codesys);
1007 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* Lisp primitive `copy-coding-system'.
   OLD-CODING-SYSTEM is resolved with Fget_coding_system.  When NEW-NAME
   does not yet name a coding system, one of the same type is allocated
   and entered in Vcoding_system_hash_table.  Everything after the
   lrecord header is then copied with memcpy from the old object to the
   new one, the name is reset to NEW-NAME, and the new object is
   returned.
   NOTE(review): the copy is shallow, so the iso2022.input_conv and
   output_conv dynarr pointers end up shared by both objects, while
   finalize_coding_system frees them per object -- confirm this is safe
   (the finalizer's own comment says coding systems never go away). */
1012 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1013 Copy OLD-CODING-SYSTEM to NEW-NAME.
1014 If NEW-NAME does not name an existing coding system, a new one will
1017 (old_coding_system, new_name))
1019 Lisp_Object new_coding_system;
1020 old_coding_system = Fget_coding_system (old_coding_system);
1021 new_coding_system = Ffind_coding_system (new_name);
1022 if (NILP (new_coding_system))
1024 XSETCODING_SYSTEM (new_coding_system,
1025 allocate_coding_system
1026 (XCODING_SYSTEM_TYPE (old_coding_system),
1028 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1032 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1033 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1034 memcpy (((char *) to ) + sizeof (to->header),
1035 ((char *) from) + sizeof (from->header),
1036 sizeof (*from) - sizeof (from->header));
1037 to->name = new_name;
1039 return new_coding_system;
/* Lisp primitive `define-coding-system-alias'.
   Signals an error when ALIAS already names a coding system; otherwise
   the resolved CODING-SYSTEM is stored under ALIAS in
   Vcoding_system_hash_table.  When the coding system's EOL type is
   EOL_AUTODETECT, the local FROB macro calls this function again for each
   non-nil subsidiary, using ALIAS's name plus a suffix (e.g. "-dos" for
   the CRLF subsidiary) as the new alias. */
1042 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1043 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1045 (alias, coding_system))
1047 CHECK_SYMBOL (alias);
1048 if (!NILP (Ffind_coding_system (alias)))
1049 signal_simple_error ("Symbol already names a coding system", alias);
1050 coding_system = Fget_coding_system (coding_system);
1051 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1053 /* Set up aliases for subsidiaries. */
1054 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
/* STR is a Lisp string made from ALIAS's symbol name; the suffixes
   are concatenated onto it by FROB. */
1057 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1058 #define FROB(type, name) \
1060 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1061 if (!NILP (subsidiary)) \
1062 Fdefine_coding_system_alias \
1063 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1066 FROB (CRLF, "-dos");
1070 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1071 but it doesn't look intentional, so I'd rather return something
1072 meaningful or nothing at all. */
/* Return the variant of CODING_SYSTEM that uses end-of-line convention
   TYPE.  CODING_SYSTEM itself is returned when its own EOL type is not
   EOL_AUTODETECT (it is already specific), when TYPE is EOL_AUTODETECT,
   or when the subsidiary slot selected by TYPE is nil. */
1077 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1079 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1080 Lisp_Object new_coding_system;
1082 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1083 return coding_system;
/* Pick the subsidiary slot that corresponds to the requested EOL type. */
1087 case EOL_AUTODETECT: return coding_system;
1088 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1089 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1090 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1094 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* Lisp primitive `subsidiary-coding-system': resolves CODING-SYSTEM with
   Fget_coding_system, converts the EOL-TYPE symbol with
   symbol_to_eol_type, and delegates to subsidiary_coding_system. */
1097 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1098 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1100 (coding_system, eol_type))
1102 coding_system = Fget_coding_system (coding_system);
1104 return subsidiary_coding_system (coding_system,
1105 symbol_to_eol_type (eol_type));
1109 /************************************************************************/
1110 /* Coding system accessors */
1111 /************************************************************************/
/* Lisp primitive `coding-system-doc-string': resolves CODING-SYSTEM with
   Fget_coding_system and returns its doc-string slot. */
1113 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1114 Return the doc string for CODING-SYSTEM.
1118 coding_system = Fget_coding_system (coding_system);
1119 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* Lisp primitive `coding-system-type': resolves CODING-SYSTEM with
   Fget_coding_system and maps its internal CODESYS_* type code to the
   corresponding Lisp symbol (e.g. CODESYS_AUTODETECT -> `undecided',
   CODESYS_NO_CONVERSION -> `no-conversion'). */
1122 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1123 Return the type of CODING-SYSTEM.
1127 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1130 case CODESYS_AUTODETECT: return Qundecided;
1132 case CODESYS_SHIFT_JIS: return Qshift_jis;
1133 case CODESYS_ISO2022: return Qiso2022;
1134 case CODESYS_BIG5: return Qbig5;
1135 case CODESYS_UCS4: return Qucs4;
1136 case CODESYS_UTF8: return Qutf8;
1137 case CODESYS_CCL: return Qccl;
1139 case CODESYS_NO_CONVERSION: return Qno_conversion;
1141 case CODESYS_INTERNAL: return Qinternal;
1148 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1151 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1153 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp primitive `coding-system-charset': resolves CODING-SYSTEM with
   Fget_coding_system and returns the result of coding_system_charset
   for the integer value of GNUM.
   NOTE(review): no range check on GNUM is visible in this function --
   confirm that out-of-range register numbers are rejected somewhere. */
1156 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1157 Return initial charset of CODING-SYSTEM designated to GNUM.
1160 (coding_system, gnum))
1162 coding_system = Fget_coding_system (coding_system);
1165 return coding_system_charset (coding_system, XINT (gnum));
/* Lisp primitive `coding-system-property'.
   Works in two phases: first PROP is validated against
   the_codesys_prop_dynarr (properties registered as ISO2022-only or
   CCL-only signal an error when the coding system has a different
   type, and an unknown PROP signals "Unrecognized property"); then the
   value is fetched by a chain of EQ comparisons -- properties common to
   every coding system first, followed by the ISO2022-specific and the
   CCL-specific ones. */
1169 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1170 Return the PROP property of CODING-SYSTEM.
1172 (coding_system, prop))
1175 enum coding_system_type type;
1177 coding_system = Fget_coding_system (coding_system);
1178 CHECK_SYMBOL (prop);
1179 type = XCODING_SYSTEM_TYPE (coding_system);
/* Phase 1: look PROP up in the table of known properties; the loop
   stops as soon as `ok' becomes nonzero. */
1181 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1182 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1185 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1187 case CODESYS_PROP_ALL_OK:
1190 case CODESYS_PROP_ISO2022:
1191 if (type != CODESYS_ISO2022)
1193 ("Property only valid in ISO2022 coding systems",
1197 case CODESYS_PROP_CCL:
1198 if (type != CODESYS_CCL)
1200 ("Property only valid in CCL coding systems",
1210 signal_simple_error ("Unrecognized property", prop);
/* Phase 2: properties that exist for every type of coding system. */
1212 if (EQ (prop, Qname))
1213 return XCODING_SYSTEM_NAME (coding_system);
1214 else if (EQ (prop, Qtype))
1215 return Fcoding_system_type (coding_system);
1216 else if (EQ (prop, Qdoc_string))
1217 return XCODING_SYSTEM_DOC_STRING (coding_system);
1218 else if (EQ (prop, Qmnemonic))
1219 return XCODING_SYSTEM_MNEMONIC (coding_system);
1220 else if (EQ (prop, Qeol_type))
1221 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1222 else if (EQ (prop, Qeol_lf))
1223 return XCODING_SYSTEM_EOL_LF (coding_system);
1224 else if (EQ (prop, Qeol_crlf))
1225 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1226 else if (EQ (prop, Qeol_cr))
1227 return XCODING_SYSTEM_EOL_CR (coding_system);
1228 else if (EQ (prop, Qpost_read_conversion))
1229 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1230 else if (EQ (prop, Qpre_write_conversion))
1231 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1233 else if (type == CODESYS_ISO2022)
1235 if (EQ (prop, Qcharset_g0))
1236 return coding_system_charset (coding_system, 0);
1237 else if (EQ (prop, Qcharset_g1))
1238 return coding_system_charset (coding_system, 1);
1239 else if (EQ (prop, Qcharset_g2))
1240 return coding_system_charset (coding_system, 2);
1241 else if (EQ (prop, Qcharset_g3))
1242 return coding_system_charset (coding_system, 3);
1244 #define FORCE_CHARSET(charset_num) \
1245 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1246 (coding_system, charset_num) ? Qt : Qnil)
1248 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1249 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1250 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1251 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1253 #define LISP_BOOLEAN(prop) \
1254 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1256 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1257 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1258 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1259 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1260 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1261 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1262 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1264 else if (EQ (prop, Qinput_charset_conversion))
1266 unparse_charset_conversion_specs
1267 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1268 else if (EQ (prop, Qoutput_charset_conversion))
1270 unparse_charset_conversion_specs
1271 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1275 else if (type == CODESYS_CCL)
1277 if (EQ (prop, Qdecode))
1278 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1279 else if (EQ (prop, Qencode))
1280 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1288 return Qnil; /* not reached */
1292 /************************************************************************/
1293 /* Coding category functions */
1294 /************************************************************************/
/* Translate the Lisp symbol SYMBOL into a coding-category index by
   searching coding_category_symbol[0 .. CODING_CATEGORY_LAST] for an EQ
   entry (presumably the matching index is returned from inside the
   loop -- that line is not visible here).  Signals "Unrecognized coding
   category" when no entry matches. */
1297 decode_coding_category (Lisp_Object symbol)
1301 CHECK_SYMBOL (symbol);
1302 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1303 if (EQ (coding_category_symbol[i], symbol))
1306 signal_simple_error ("Unrecognized coding category", symbol);
1307 return 0; /* not reached */
/* Lisp primitive `coding-category-list'.  The list is built by consing
   from index CODING_CATEGORY_LAST down to 0, so the resulting list has
   the category symbols in ascending index order. */
1310 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1311 Return a list of all recognized coding categories.
1316 Lisp_Object list = Qnil;
1318 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1319 list = Fcons (coding_category_symbol[i], list);
/* Lisp primitive `set-coding-priority-list'.
   Builds a temporary category -> priority table: categories named in
   LIST get priorities 0, 1, ... in list order (a repeated category
   signals an error), and the remaining categories follow in the order
   they currently have in fcd->coding_category_by_priority.  The table is
   then inverted back into fcd->coding_category_by_priority. */
1323 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1324 Change the priority order of the coding categories.
1325 LIST should be list of coding categories, in descending order of
1326 priority. Unspecified coding categories will be lower in priority
1327 than all specified ones, in the same relative order they were in
1332 int category_to_priority[CODING_CATEGORY_LAST + 1];
1336 /* First generate a list that maps coding categories to priorities. */
1338 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1339 category_to_priority[i] = -1;
1341 /* Highest priority comes from the specified list. */
1343 EXTERNAL_LIST_LOOP (rest, list)
1345 int cat = decode_coding_category (XCAR (rest));
1347 if (category_to_priority[cat] >= 0)
1348 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1349 category_to_priority[cat] = i++;
1352 /* Now go through the existing categories by priority to retrieve
1353 the categories not yet specified and preserve their priority
1355 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1357 int cat = fcd->coding_category_by_priority[j];
1358 if (category_to_priority[cat] < 0)
1359 category_to_priority[cat] = i++;
1362 /* Now we need to construct the inverse of the mapping we just
1365 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1366 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1368 /* Phew! That was confusing. */
/* Lisp primitive `coding-priority-list'.  Walks
   fcd->coding_category_by_priority from the last entry to the first and
   conses the corresponding category symbol onto the front of the list,
   so the entry at priority index 0 ends up first. */
1372 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1373 Return a list of coding categories in descending order of priority.
1378 Lisp_Object list = Qnil;
1380 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1381 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
/* Lisp primitive `set-coding-category-system': decodes CODING-CATEGORY
   to an index (signalling on an unknown category), resolves
   CODING-SYSTEM with Fget_coding_system, and stores it in
   fcd->coding_category_system for that category. */
1386 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1387 Change the coding system associated with a coding category.
1389 (coding_category, coding_system))
1391 int cat = decode_coding_category (coding_category);
1393 coding_system = Fget_coding_system (coding_system);
1394 fcd->coding_category_system[cat] = coding_system;
/* Lisp primitive `coding-category-system': decodes CODING-CATEGORY to an
   index and returns the name of the coding system stored for it in
   fcd->coding_category_system.
   NOTE(review): the handling of an unset (nil) entry is not visible in
   this excerpt -- confirm SYS is checked before its name is taken. */
1398 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1399 Return the coding system associated with a coding category.
1403 int cat = decode_coding_category (coding_category);
1404 Lisp_Object sys = fcd->coding_category_system[cat];
1407 return XCODING_SYSTEM_NAME (sys);
1412 /************************************************************************/
1413 /* Detecting the encoding of data */
1414 /************************************************************************/
/* State carried between successive calls of the detection routines
   (detect_eol_type, detect_coding_type and the per-category detectors),
   so that a text can be examined chunk by chunk.  Only some of the
   members are visible in this excerpt. */
1416 struct detection_state
/* EOL convention determined so far; EOL_AUTODETECT while undecided. */
1418 enum eol_type eol_type;
1454 struct iso2022_decoder iso;
1456 int high_byte_count;
1457 unsigned int saw_single_shift:1;
/* Predicate on the control character C, used by detect_coding_type to
   decide whether a byte below 0x20 should count as evidence of
   non-ASCII (binary) data.  The cases listed are the control codes
   tolerated in plain text; presumably they yield nonzero and everything
   else zero (the return statements are not visible here). */
1470 acceptable_control_char_p (int c)
1474 /* Allow and ignore control characters that you might
1475 reasonably see in a text file */
1480 case 8: /* backspace */
1481 case 11: /* vertical tab */
1482 case 12: /* form feed */
1483 case 26: /* MS-DOS C-z junk */
1484 case 31: /* '^_' -- for info */
/* Return nonzero if MASK has zero or exactly one bit set, and zero if
   two or more bits are set.

   Subtracting one from a value clears its lowest set bit and sets all
   bits below it, so ANDing the two is zero exactly when there was at
   most one bit to begin with.  The arithmetic is done on the unsigned
   representation so that a mask consisting of only the sign bit does
   not trigger signed overflow in `mask - 1'. */
static int
mask_has_at_most_one_bit_p (int mask)
{
  unsigned int bits = (unsigned int) mask;

  return (bits & (bits - 1)) == 0;
}
/* Scan a chunk of text starting at SRC for end-of-line evidence, keeping
   track in ST->eol of whether the previous byte was a CR
   (`just_saw_cr') and whether any byte has been seen at all
   (`seen_anything'), so that a CR/LF pair split across two chunks can
   still be recognized.  Returns EOL_AUTODETECT when the chunk gave no
   decision; the returns for the decided cases are not visible in this
   excerpt. */
1499 static enum eol_type
1500 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1510 if (st->eol.just_saw_cr)
1512 else if (st->eol.seen_anything)
1515 else if (st->eol.just_saw_cr)
1518 st->eol.just_saw_cr = 1;
1520 st->eol.just_saw_cr = 0;
1521 st->eol.seen_anything = 1;
1524 return EOL_AUTODETECT;
1527 /* Attempt to determine the encoding and EOL type of the given text.
1528 Before calling this function for the first time, you must initialize
1529 st->eol_type as appropriate and initialize st->mask to ~0.
1531 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1534 st->mask holds the determined coding category mask, or ~0 if only
1535 ASCII has been seen so far.
1539 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1540 is present in st->mask
1541 1 == definitive answers are here for both st->eol_type and st->mask
/* Feed N bytes at SRC to the detectors, updating ST.  With JUST_DO_EOL
   only the end-of-line type is examined.  The return value is nonzero
   once nothing remains to be decided. */
1545 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1546 unsigned int n, int just_do_eol)
1550 if (st->eol_type == EOL_AUTODETECT)
1551 st->eol_type = detect_eol_type (st, src, n);
1554 return st->eol_type != EOL_AUTODETECT;
/* While only plain ASCII has been seen, skip ahead to the first byte
   that is either >= 0x80 or a control character rejected by
   acceptable_control_char_p; at that point the per-category masks are
   opened up (set to ~0) so the detectors below start running. */
1556 if (!st->seen_non_ascii)
1558 for (; n; n--, src++)
1561 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1563 st->seen_non_ascii = 1;
1565 st->shift_jis.mask = ~0;
1569 st->iso2022.mask = ~0;
/* A detector is re-run only while its mask is still ambiguous, i.e.
   has more than one bit set. */
1579 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1580 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1581 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1582 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1583 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1584 st->big5.mask = detect_coding_big5 (st, src, n);
1585 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1586 st->utf8.mask = detect_coding_utf8 (st, src, n);
1587 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1588 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1591 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1592 | st->utf8.mask | st->ucs4.mask;
/* The "definitive" test is made on the combined mask before the
   no-conversion category is added as an always-possible candidate. */
1595 int retval = mask_has_at_most_one_bit_p (st->mask);
1596 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1597 return retval && st->eol_type != EOL_AUTODETECT;
/* Choose a coding system for the category bit mask MASK produced by
   detection.  For (basically) ASCII input the default value of
   `buffer-file-coding-system' is used; if that value does not name a
   coding system a warning is issued, the default is reset to nil, and
   `raw-text' is used instead.  Otherwise the categories are tried in
   priority order and the coding system of the first category that is
   both present in MASK and has a non-nil coding system is returned,
   falling back to `raw-text' when none qualifies. */
1602 coding_system_from_mask (int mask)
1606 /* If the file was entirely or basically ASCII, use the
1607 default value of `buffer-file-coding-system'. */
1608 Lisp_Object retval =
1609 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1612 retval = Ffind_coding_system (retval);
1616 (Qbad_variable, Qwarning,
1617 "Invalid `default-buffer-file-coding-system', set to nil");
1618 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1622 retval = Fget_coding_system (Qraw_text);
1630 mask = postprocess_iso2022_mask (mask);
1632 /* Look through the coding categories by priority and find
1633 the first one that is allowed. */
1634 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1636 cat = fcd->coding_category_by_priority[i];
1637 if ((mask & (1 << cat)) &&
1638 !NILP (fcd->coding_category_system[cat]))
1642 return fcd->coding_category_system[cat];
1644 return Fget_coding_system (Qraw_text);
1648 /* Given a seekable read stream and potential coding system and EOL type
1649 as specified, do any autodetection that is called for. If the
1650 coding system and/or EOL type are not `autodetect', they will be left
1651 alone; but this function will never return an autodetect coding system
1654 This function does not automatically fetch subsidiary coding systems;
1655 that should be unnecessary with the explicit eol-type argument. */
/* Number of characters in a string literal, not counting the
   terminating NUL.  Only meaningful for actual string constants/arrays,
   since it relies on sizeof. */
1657 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
/* Resolve *CODESYS_IN_OUT and *EOL_TYPE_IN_OUT for the data on STREAM.
   Outline of the implementation:
     1. An autodetect EOL type is first replaced by the EOL type of the
        given coding system.
     2. If anything is still to be detected, a buffer is read from
        STREAM and searched for a "-*- ... coding: NAME ... -*-" cookie;
        NAME is looked up with Ffind_coding_system.
     3. Without a usable cookie, detect_coding_type is run over
        successive reads; with a cookie whose coding system still has an
        autodetect EOL type, only EOL detection is run.
     4. The results are stored back, an undetermined EOL type defaults
        to EOL_LF, and STREAM is rewound. */
1660 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1661 enum eol_type *eol_type_in_out)
1663 struct detection_state decst;
1665 if (*eol_type_in_out == EOL_AUTODETECT)
1666 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1669 decst.eol_type = *eol_type_in_out;
1672 /* If autodetection is called for, do it now. */
1673 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1674 || *eol_type_in_out == EOL_AUTODETECT)
1677 Lisp_Object coding_system = Qnil;
1679 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1682 /* Look for initial "-*-"; mode line prefix */
/* NOTE(review): scan_end is derived from nread minus a fixed length;
   for a very short (or failed) read it would lie before BUF.  The loop
   condition is not visible here -- confirm that case is guarded. */
1684 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1689 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1691 Extbyte *local_vars_beg = p + 3;
1692 /* Look for final "-*-"; mode line suffix */
1693 for (p = local_vars_beg,
1694 scan_end = buf + nread - LENGTH ("-*-");
1699 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1701 Extbyte *suffix = p;
1702 /* Look for "coding:" */
1703 for (p = local_vars_beg,
1704 scan_end = suffix - LENGTH ("coding:?");
1707 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1708 && (p == local_vars_beg
1709 || (*(p-1) == ' ' ||
1715 p += LENGTH ("coding:");
1716 while (*p == ' ' || *p == '\t') p++;
1718 /* Get coding system name */
/* The buffer is temporarily NUL-terminated in place (the overwritten
   byte is kept in `save') so that strspn and intern can treat the
   name as a C string. */
1719 save = *suffix; *suffix = '\0';
1720 /* Characters valid in a MIME charset name (rfc 1521),
1721 and in a Lisp symbol name. */
1722 n = strspn ( (char *) p,
1723 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1724 "abcdefghijklmnopqrstuvwxyz"
1730 save = p[n]; p[n] = '\0';
1732 Ffind_coding_system (intern ((char *) p));
/* No coding cookie (or it named no known coding system): fall back to
   content-based detection, reading further chunks as needed. */
1742 if (NILP (coding_system))
1745 if (detect_coding_type (&decst, buf, nread,
1746 XCODING_SYSTEM_TYPE (*codesys_in_out)
1747 != CODESYS_AUTODETECT))
1749 nread = Lstream_read (stream, buf, sizeof (buf));
1755 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1756 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1759 if (detect_coding_type (&decst, buf, nread, 1))
1761 nread = Lstream_read (stream, buf, sizeof (buf));
1767 *eol_type_in_out = decst.eol_type;
1768 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1770 if (NILP (coding_system))
1771 *codesys_in_out = coding_system_from_mask (decst.mask);
1773 *codesys_in_out = coding_system;
1777 /* If we absolutely can't determine the EOL type, just assume LF. */
1778 if (*eol_type_in_out == EOL_AUTODETECT)
1779 *eol_type_in_out = EOL_LF;
1781 Lstream_rewind (stream);
/* Lisp primitive `detect-coding-region'.
   The region is read through a Lisp-buffer input stream wrapped in an
   encoding stream for the `binary' coding system, and fed to
   detect_coding_type in chunks of up to 4096 bytes.  A mask that is
   still ~0 afterwards (only ASCII seen) yields a subsidiary of
   `undecided'; otherwise a list is built of the coding systems of all
   categories present in the mask, each replaced by its subsidiary for
   the detected EOL type.  Both streams are closed/deleted before
   returning. */
1784 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1785 Detect coding system of the text in the region between START and END.
1786 Returned a list of possible coding systems ordered by priority.
1787 If only ASCII characters are found, it returns 'undecided or one of
1788 its subsidiary coding systems according to a detected end-of-line
1789 type. Optional arg BUFFER defaults to the current buffer.
1791 (start, end, buffer))
1793 Lisp_Object val = Qnil;
1794 struct buffer *buf = decode_buffer (buffer, 0);
1796 Lisp_Object instream, lb_instream;
1797 Lstream *istr, *lb_istr;
1798 struct detection_state decst;
1799 struct gcpro gcpro1, gcpro2;
1801 get_buffer_range_char (buf, start, end, &b, &e, 0);
1802 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1803 lb_istr = XLSTREAM (lb_instream);
1804 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1805 istr = XLSTREAM (instream);
1806 GCPRO2 (instream, lb_instream);
1808 decst.eol_type = EOL_AUTODETECT;
1812 unsigned char random_buffer[4096];
1813 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1817 if (detect_coding_type (&decst, random_buffer, nread, 0))
1821 if (decst.mask == ~0)
1822 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1830 decst.mask = postprocess_iso2022_mask (decst.mask);
/* Iterating from lowest to highest priority while consing onto the
   front leaves the highest-priority candidate first in VAL. */
1832 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1834 int sys = fcd->coding_category_by_priority[i];
1835 if (decst.mask & (1 << sys))
1837 Lisp_Object codesys = fcd->coding_category_system[sys];
1838 if (!NILP (codesys))
1839 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1840 val = Fcons (codesys, val);
1844 Lstream_close (istr);
1846 Lstream_delete (istr);
1847 Lstream_delete (lb_istr);
1852 /************************************************************************/
1853 /* Converting to internal Mule format ("decoding") */
1854 /************************************************************************/
1856 /* A decoding stream is a stream used for decoding text (i.e.
1857 converting from some external format to internal format).
1858 The decoding-stream object keeps track of the actual coding
1859 stream, the stream that is at the other end, and data that
1860 needs to be persistent across the lifetime of the stream. */
1862 /* Handle the EOL stuff related to just-read-in character C.
1863 EOL_TYPE is the EOL type of the coding stream.
1864 FLAGS is the current value of FLAGS in the coding stream, and may
1865 be modified by this macro. (The macro only looks at the
1866 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1867 bytes are to be written. You need to also define a local goto
1868 label "label_continue_loop" that is at the end of the main
1869 character-reading loop.
1871 If C is a CR character, then this macro handles it entirely and
1872 jumps to label_continue_loop. Otherwise, this macro does not add
1873 anything to DST, and continues normally. You should continue
1874 processing C normally after this macro. */
/* Summary of the expansion: for a CR under EOL_CR a '\n' is emitted;
   when the EOL type is not EOL_CRLF, or a CR is already pending, C is
   emitted as is; otherwise the CR is held back by setting
   CODING_STATE_CR.  When a later character arrives with a CR still
   pending, the held-back '\r' is emitted and the flag is cleared.
   NOTE(review): the macro arguments are used unparenthesized (e.g.
   `flags & CODING_STATE_CR'), so pass only simple expressions. */
1876 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1880 if (eol_type == EOL_CR) \
1881 Dynarr_add (dst, '\n'); \
1882 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1883 Dynarr_add (dst, c); \
1885 flags |= CODING_STATE_CR; \
1886 goto label_continue_loop; \
1888 else if (flags & CODING_STATE_CR) \
1889 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1891 Dynarr_add (dst, '\r'); \
1892 flags &= ~CODING_STATE_CR; \
1896 /* C should be a binary character in the range 0 - 255; convert
1897 to internal format and add to Dynarr DST. */
/* In this variant an ASCII byte is appended unchanged, and any other
   byte is appended as the two-byte sequence
   0xc0 | (c >> 6), 0x80 | (c & 0x3f). */
1900 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1902 if (BYTE_ASCII_P (c)) \
1903 Dynarr_add (dst, c); \
1906 Dynarr_add (dst, (c >> 6) | 0xc0); \
1907 Dynarr_add (dst, (c & 0x3f) | 0x80); \
/* Append the character code C to DST as a variable-length byte
   sequence.  The length is chosen by magnitude: the smallest codes are
   stored as a single byte, then 2 bytes up to 0x7ff, 3 up to 0xffff,
   4 up to 0x1fffff, 5 up to 0x3ffffff, and 6 bytes beyond that.  The
   lead byte carries the prefix 0xc0/0xe0/0xf0/0xf8/0xfc and each
   following byte carries six payload bits under the prefix 0x80
   (the layout of the original, up-to-six-byte UTF-8 scheme). */
1912 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1916 Dynarr_add (dst, c);
1918 else if ( c <= 0x7ff )
1920 Dynarr_add (dst, (c >> 6) | 0xc0);
1921 Dynarr_add (dst, (c & 0x3f) | 0x80);
1923 else if ( c <= 0xffff )
1925 Dynarr_add (dst, (c >> 12) | 0xe0);
1926 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1927 Dynarr_add (dst, (c & 0x3f) | 0x80);
1929 else if ( c <= 0x1fffff )
1931 Dynarr_add (dst, (c >> 18) | 0xf0);
1932 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1933 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1934 Dynarr_add (dst, (c & 0x3f) | 0x80);
1936 else if ( c <= 0x3ffffff )
1938 Dynarr_add (dst, (c >> 24) | 0xf8);
1939 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1940 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1941 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1942 Dynarr_add (dst, (c & 0x3f) | 0x80);
1946 Dynarr_add (dst, (c >> 30) | 0xfc);
1947 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1948 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1949 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1950 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1951 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Alternative definition of DECODE_ADD_BINARY_CHAR: an ASCII byte is
   appended unchanged; a byte satisfying BYTE_C1_P is appended as
   LEADING_BYTE_CONTROL_1 followed by c + 0x20; any other byte is
   appended as LEADING_BYTE_LATIN_ISO8859_1 followed by the byte
   itself. */
1955 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1957 if (BYTE_ASCII_P (c)) \
1958 Dynarr_add (dst, c); \
1959 else if (BYTE_C1_P (c)) \
1961 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1962 Dynarr_add (dst, c + 0x20); \
1966 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1967 Dynarr_add (dst, c); \
/* Flush a partially assembled character CH by emitting it with
   DECODE_ADD_BINARY_CHAR.  Note that the expansion refers to a variable
   named `dst' that is not a macro parameter, so one must be in scope at
   the point of use. */
1972 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1976 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* End-of-input handling: when CODING_STATE_END is set in FLAGS, flush
   any partial character CH and, if a CR is still pending
   (CODING_STATE_CR), emit it as a literal '\r'. */
1981 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1983 if (flags & CODING_STATE_END) \
1985 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1986 if (flags & CODING_STATE_CR) \
1987 Dynarr_add (dst, '\r'); \
/* Access the struct decoding_stream payload of a decoding lstream. */
1991 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Private data of a "decoding" lstream; obtained from an Lstream with
   DECODING_STREAM_DATA.  Several member declarations are not visible
   in this excerpt; their describing comments remain. */
1993 struct decoding_stream
1995 /* Coding system that governs the conversion. */
1996 Lisp_Coding_System *codesys;
1998 /* Stream that we read the encoded data from or
1999 write the decoded data to. */
2002 /* If we are reading, then we can return only a fixed amount of
2003 data, so if the conversion resulted in too much data, we store it
2004 here for retrieval the next time around. */
2005 unsigned_char_dynarr *runoff;
2007 /* FLAGS holds flags indicating the current state of the decoding.
2008 Some of these flags are dependent on the coding system. */
2011 /* CH holds a partially built-up character. Since we only deal
2012 with one- and two-byte characters at the moment, we only use
2013 this to store the first byte of a two-byte character. */
2016 /* EOL_TYPE specifies the type of end-of-line conversion that
2017 currently applies. We need to keep this separate from the
2018 EOL type stored in CODESYS because the latter might indicate
2019 automatic EOL-type detection while the former will always
2020 indicate a particular EOL type. */
2021 enum eol_type eol_type;
2023 /* Additional ISO2022 information. We define the structure above
2024 because it's also needed by the detection routines. */
2025 struct iso2022_decoder iso2022;
2027 /* Additional information (the state of the running CCL program)
2028 used by the CCL decoder. */
2029 struct ccl_program ccl;
2031 /* counter for UTF-8 or UCS-4 */
2032 unsigned char counter;
/* Detection state used when the coding system or EOL type has to be
   determined while decoding (see mule_decode). */
2034 struct detection_state decst;
/* Forward declarations of the lstream methods for decoding streams
   (defined below), followed by the definition of the "decoding" lstream
   type, whose per-stream data is a struct decoding_stream. */
2037 static ssize_t decoding_reader (Lstream *stream,
2038 unsigned char *data, size_t size);
2039 static ssize_t decoding_writer (Lstream *stream,
2040 CONST unsigned char *data, size_t size);
2041 static int decoding_rewinder (Lstream *stream);
2042 static int decoding_seekable_p (Lstream *stream);
2043 static int decoding_flusher (Lstream *stream);
2044 static int decoding_closer (Lstream *stream);
2046 static Lisp_Object decoding_marker (Lisp_Object stream);
2048 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2049 sizeof (struct decoding_stream));
/* GC mark method for decoding streams: marks the stream at the other
   end and, when that stream's implementation has a marker method of
   its own, returns the result of calling it on that stream. */
2052 decoding_marker (Lisp_Object stream)
2054 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2055 Lisp_Object str_obj;
2057 /* We do not need to mark the coding systems or charsets stored
2058 within the stream because they are stored in a global list
2059 and automatically marked. */
2061 XSETLSTREAM (str_obj, str);
2062 mark_object (str_obj);
2063 if (str->imp->marker)
2064 return (str->imp->marker) (str_obj);
2069 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2070 so we read data from the other end, decode it, and store it into DATA. */
/* Returns the number of bytes stored into DATA.  When nothing could be
   stored, the result is 0, or -1 if an error occurred. */
2073 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2075 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2076 unsigned char *orig_data = data;
2078 int error_occurred = 0;
2080 /* We need to interface to mule_decode(), which expects to take some
2081 amount of data and store the result into a Dynarr. We have
2082 mule_decode() store into str->runoff, and take data from there
2085 /* We loop until we have enough data, reading chunks from the other
2086 end and decoding it. */
2089 /* Take data from the runoff if we can. Make sure to take at
2090 most SIZE bytes, and delete the data from the runoff. */
2091 if (Dynarr_length (str->runoff) > 0)
2093 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2094 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2095 Dynarr_delete_many (str->runoff, 0, chunk);
2101 break; /* No more room for data */
2103 if (str->flags & CODING_STATE_END)
2104 /* This means that on the previous iteration, we hit the EOF on
2105 the other end. We loop once more so that mule_decode() can
2106 output any final stuff it may be holding, or any "go back
2107 to a sane state" escape sequences. (This latter makes sense
2108 during encoding.) */
2111 /* Exhausted the runoff, so get some more. DATA has at least
2112 SIZE bytes left of storage in it, so it's OK to read directly
2113 into it. (We'll be overwriting above, after we've decoded it
2114 into the runoff.) */
2115 read_size = Lstream_read (str->other_end, data, size);
2122 /* There might be some more end data produced in the translation.
2123 See the comment above. */
2124 str->flags |= CODING_STATE_END;
2125 mule_decode (stream, data, str->runoff, read_size);
/* DATA has been advanced past everything delivered, so the distance
   from ORIG_DATA is the byte count to report. */
2128 if (data - orig_data == 0)
2129 return error_occurred ? -1 : 0;
2131 return data - orig_data;
/* Write method for decoding streams: decodes SIZE bytes at DATA into
   the runoff buffer and pushes the runoff to the stream at the other
   end, dropping from the runoff whatever was written.  As the comment
   at the end notes, the value returned to the caller is expressed in
   terms of the incoming data, not the number of decoded bytes. */
2135 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2137 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2140 /* Decode all our data into the runoff, and then attempt to write
2141 it all out to the other end. Remove whatever chunk we succeeded
2143 mule_decode (stream, data, str->runoff, size);
2144 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2145 Dynarr_length (str->runoff));
2147 Dynarr_delete_many (str->runoff, 0, retval);
2148 /* Do NOT return retval. The return value indicates how much
2149 of the incoming data was written, not how many bytes were
/* Put the decoder state of STR back to its initial condition for the
   current coding system: ISO2022 streams reset their ISO2022 decoder,
   CCL streams reload the coding system's CCL decode program, and in
   all cases the flags and the partial character are cleared. */
2155 reset_decoding_stream (struct decoding_stream *str)
2158 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2160 Lisp_Object coding_system;
2161 XSETCODING_SYSTEM (coding_system, str->codesys);
2162 reset_iso2022 (coding_system, &str->iso2022);
2164 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2166 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2170 str->flags = str->ch = 0;
2174 decoding_rewinder (Lstream *stream)
2176 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2177 reset_decoding_stream (str);
2178 Dynarr_reset (str->runoff);
2179 return Lstream_rewind (str->other_end);
2183 decoding_seekable_p (Lstream *stream)
2185 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2186 return Lstream_seekable_p (str->other_end);
2190 decoding_flusher (Lstream *stream)
2192 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2193 return Lstream_flush (str->other_end);
/* Close method for decoding streams.  For a stream opened for writing,
   CODING_STATE_END is set and the writer is invoked with no data so
   that any pending output is produced.  The runoff buffer (and, with
   ENABLE_COMPOSITE_CHARS, the composite-character buffer) is freed, and
   finally the stream at the other end is closed. */
2197 decoding_closer (Lstream *stream)
2199 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2200 if (stream->flags & LSTREAM_FL_WRITE)
2202 str->flags |= CODING_STATE_END;
2203 decoding_writer (stream, 0, 0);
2205 Dynarr_free (str->runoff);
2207 #ifdef ENABLE_COMPOSITE_CHARS
2208 if (str->iso2022.composite_chars)
2209 Dynarr_free (str->iso2022.composite_chars);
2212 return Lstream_close (str->other_end);
2216 decoding_stream_coding_system (Lstream *stream)
2218 Lisp_Object coding_system;
2219 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2221 XSETCODING_SYSTEM (coding_system, str->codesys);
2222 return subsidiary_coding_system (coding_system, str->eol_type);
/* Switch the decoding stream LSTR to coding system CODESYS.  The
   stream's EOL type is taken from CODESYS only when CODESYS specifies a
   concrete one; an autodetect EOL type leaves the stream's current
   setting alone.  The decoder state is reset afterwards. */
2226 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2228 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2229 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2231 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2232 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2233 reset_decoding_stream (str);
2236 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2237 stream for writing, no automatic code detection will be performed.
2238 The reason for this is that automatic code detection requires a
2239 seekable input. Things will also fail if you open a decoding
2240 stream for reading using a non-fully-specified coding system and
2241 a non-seekable input stream. */
/* Common constructor behind make_decoding_input_stream and
   make_decoding_output_stream.  Creates a decoding lstream over STREAM
   in the given mode, with an empty runoff buffer and an autodetect EOL
   type.  Only for mode "r" on a seekable STREAM are the coding system
   and EOL type determined up front; in every case the stream's
   detection state is seeded with the resulting EOL type and a mask of
   ~0. */
2244 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2247 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2248 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2252 str->other_end = stream;
2253 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2254 str->eol_type = EOL_AUTODETECT;
2255 if (!strcmp (mode, "r")
2256 && Lstream_seekable_p (stream))
2257 /* We can determine the coding system now. */
2258 determine_real_coding_system (stream, &codesys, &str->eol_type);
2259 set_decoding_stream_coding_system (lstr, codesys);
2260 str->decst.eol_type = str->eol_type;
2261 str->decst.mask = ~0;
2262 XSETLSTREAM (obj, lstr);
2267 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2269 return make_decoding_stream_1 (stream, codesys, "r");
2273 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2275 return make_decoding_stream_1 (stream, codesys, "w");
2278 /* Note: the decode_coding_* functions all take the same
2279 arguments as mule_decode(), which is to say some SRC data of
2280 size N, which is to be stored into dynamic array DST.
2281 DECODING is the stream within which the decoding is
2282 taking place, but no data is actually read from or
2283 written to that stream; that is handled in decoding_reader()
2284 or decoding_writer(). This allows the same functions to
2285 be used for both reading and writing. */
2288 mule_decode (Lstream *decoding, CONST unsigned char *src,
2289 unsigned_char_dynarr *dst, unsigned int n)
2291 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2293 /* If necessary, do encoding-detection now. We do this when
2294 we're a writing stream or a non-seekable reading stream,
2295 meaning that we can't just process the whole input,
2296 rewind, and start over. */
2298 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2299 str->eol_type == EOL_AUTODETECT)
2301 Lisp_Object codesys;
2303 XSETCODING_SYSTEM (codesys, str->codesys);
2304 detect_coding_type (&str->decst, src, n,
2305 CODING_SYSTEM_TYPE (str->codesys) !=
2306 CODESYS_AUTODETECT);
2307 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2308 str->decst.mask != ~0)
2309 /* #### This is cheesy. What we really ought to do is
2310 buffer up a certain amount of data so as to get a
2311 less random result. */
2312 codesys = coding_system_from_mask (str->decst.mask);
2313 str->eol_type = str->decst.eol_type;
2314 if (XCODING_SYSTEM (codesys) != str->codesys)
2316 /* Preserve the CODING_STATE_END flag in case it was set.
2317 If we erase it, bad things might happen. */
2318 int was_end = str->flags & CODING_STATE_END;
2319 set_decoding_stream_coding_system (decoding, codesys);
2321 str->flags |= CODING_STATE_END;
2325 switch (CODING_SYSTEM_TYPE (str->codesys))
2328 case CODESYS_INTERNAL:
2329 Dynarr_add_many (dst, src, n);
2332 case CODESYS_AUTODETECT:
2333 /* If we got this far and still haven't decided on the coding
2334 system, then do no conversion. */
2335 case CODESYS_NO_CONVERSION:
2336 decode_coding_no_conversion (decoding, src, dst, n);
2339 case CODESYS_SHIFT_JIS:
2340 decode_coding_sjis (decoding, src, dst, n);
2343 decode_coding_big5 (decoding, src, dst, n);
2346 decode_coding_ucs4 (decoding, src, dst, n);
2349 decode_coding_utf8 (decoding, src, dst, n);
2352 str->ccl.last_block = str->flags & CODING_STATE_END;
2353 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2355 case CODESYS_ISO2022:
2356 decode_coding_iso2022 (decoding, src, dst, n);
2364 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2365 Decode the text between START and END which is encoded in CODING-SYSTEM.
2366 This is useful if you've read in encoded text from a file without decoding
2367 it (e.g. you read in a JIS-formatted file but used the `binary' or
2368 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2369 Return length of decoded text.
2370 BUFFER defaults to the current buffer if unspecified.
2372 (start, end, coding_system, buffer))
2375 struct buffer *buf = decode_buffer (buffer, 0);
2376 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2377 Lstream *istr, *ostr;
2378 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2380 get_buffer_range_char (buf, start, end, &b, &e, 0);
2382 barf_if_buffer_read_only (buf, b, e);
2384 coding_system = Fget_coding_system (coding_system);
2385 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2386 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2387 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2389 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2390 Fget_coding_system (Qbinary));
2391 istr = XLSTREAM (instream);
2392 ostr = XLSTREAM (outstream);
2393 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2395 /* The chain of streams looks like this:
2397 [BUFFER] <----- send through
2398 ------> [ENCODE AS BINARY]
2399 ------> [DECODE AS SPECIFIED]
2405 char tempbuf[1024]; /* some random amount */
2406 Bufpos newpos, even_newer_pos;
2407 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2408 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2412 newpos = lisp_buffer_stream_startpos (istr);
2413 Lstream_write (ostr, tempbuf, size_in_bytes);
2414 even_newer_pos = lisp_buffer_stream_startpos (istr);
2415 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2418 Lstream_close (istr);
2419 Lstream_close (ostr);
2421 Lstream_delete (istr);
2422 Lstream_delete (ostr);
2423 Lstream_delete (XLSTREAM (de_outstream));
2424 Lstream_delete (XLSTREAM (lb_outstream));
2429 /************************************************************************/
2430 /* Converting to an external encoding ("encoding") */
2431 /************************************************************************/
2433 /* An encoding stream is an output stream. When you create the
2434 stream, you specify the coding system that governs the encoding
2435 and another stream that the resulting encoded data is to be
2436 sent to, and then start sending data to it. */
/* Fetch the `struct encoding_stream' private data attached to the
   encoding lstream STREAM.  */
#define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2440 struct encoding_stream
2442 /* Coding system that governs the conversion. */
2443 Lisp_Coding_System *codesys;
2445 /* Stream that we read the unencoded data from or
2446 write the encoded data to. */
2449 /* If we are reading, then we can return only a fixed amount of
2450 data, so if the conversion resulted in too much data, we store it
2451 here for retrieval the next time around. */
2452 unsigned_char_dynarr *runoff;
2454 /* FLAGS holds flags indicating the current state of the encoding.
2455 Some of these flags are dependent on the coding system. */
2458 /* CH holds a partially built-up character. Since we only deal
2459 with one- and two-byte characters at the moment, we only use
2460 this to store the first byte of a two-byte character. */
2463 /* Additional information used by the ISO2022 encoder. */
2466 /* CHARSET holds the character sets currently assigned to the G0
2467 through G3 registers. It is initialized from the array
2468 INITIAL_CHARSET in CODESYS. */
2469 Lisp_Object charset[4];
2471 /* Which registers are currently invoked into the left (GL) and
2472 right (GR) halves of the 8-bit encoding space? */
2473 int register_left, register_right;
2475 /* Whether we need to explicitly designate the charset in the
2476 G? register before using it. It is initialized from the
2477 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2478 unsigned char force_charset_on_output[4];
2480 /* Other state variables that need to be preserved across
2482 Lisp_Object current_charset;
2484 int current_char_boundary;
2487 void (*encode_char) (struct encoding_stream *str, Emchar c,
2488 unsigned_char_dynarr *dst, unsigned int *flags);
2489 void (*finish) (struct encoding_stream *str,
2490 unsigned_char_dynarr *dst, unsigned int *flags);
2492 /* Additional information (the state of the running CCL program)
2493 used by the CCL encoder. */
2494 struct ccl_program ccl;
2498 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2499 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2501 static int encoding_rewinder (Lstream *stream);
2502 static int encoding_seekable_p (Lstream *stream);
2503 static int encoding_flusher (Lstream *stream);
2504 static int encoding_closer (Lstream *stream);
2506 static Lisp_Object encoding_marker (Lisp_Object stream);
2508 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2509 sizeof (struct encoding_stream));
/* Mark method for encoding streams.  STREAM is the encoding stream as
   a Lisp object.  Marks the stream at the other end and, when that
   stream's implementation supplies its own marker, chains to it and
   returns its result.  */
2512 encoding_marker (Lisp_Object stream)
2514 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2515 Lisp_Object str_obj;
2517 /* We do not need to mark the coding systems or charsets stored
2518 within the stream because they are stored in a global list
2519 and automatically marked. */
2521 XSETLSTREAM (str_obj, str);
2522 mark_object (str_obj);
2523 if (str->imp->marker)
2524 return (str->imp->marker) (str_obj);
2529 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2530 so we read data from the other end, encode it, and store it into DATA. */
2533 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2535 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2536 unsigned char *orig_data = data;
2538 int error_occurred = 0;
2540 /* We need to interface to mule_encode(), which expects to take some
2541 amount of data and store the result into a Dynarr. We have
2542 mule_encode() store into str->runoff, and take data from there
2545 /* We loop until we have enough data, reading chunks from the other
2546 end and encoding it. */
2549 /* Take data from the runoff if we can. Make sure to take at
2550 most SIZE bytes, and delete the data from the runoff. */
2551 if (Dynarr_length (str->runoff) > 0)
2553 int chunk = min ((int) size, Dynarr_length (str->runoff));
2554 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2555 Dynarr_delete_many (str->runoff, 0, chunk);
2561 break; /* No more room for data */
2563 if (str->flags & CODING_STATE_END)
2564 /* This means that on the previous iteration, we hit the EOF on
2565 the other end. We loop once more so that mule_encode() can
2566 output any final stuff it may be holding, or any "go back
2567 to a sane state" escape sequences. (This latter makes sense
2568 during encoding.) */
2571 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2572 left of storage in it, so it's OK to read directly into it.
2573 (We'll be overwriting above, after we've encoded it into the
2575 read_size = Lstream_read (str->other_end, data, size);
2582 /* There might be some more end data produced in the translation.
2583 See the comment above. */
2584 str->flags |= CODING_STATE_END;
2585 mule_encode (stream, data, str->runoff, read_size);
2588 if (data == orig_data)
2589 return error_occurred ? -1 : 0;
2591 return data - orig_data;
2595 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2597 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2600 /* Encode all our data into the runoff, and then attempt to write
2601 it all out to the other end. Remove whatever chunk we succeeded
2603 mule_encode (stream, data, str->runoff, size);
2604 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2605 Dynarr_length (str->runoff));
2607 Dynarr_delete_many (str->runoff, 0, retval);
2608 /* Do NOT return retval. The return value indicates how much
2609 of the incoming data was written, not how many bytes were
2615 reset_encoding_stream (struct encoding_stream *str)
2618 switch (CODING_SYSTEM_TYPE (str->codesys))
2620 case CODESYS_ISO2022:
2624 str->encode_char = &char_encode_iso2022;
2625 str->finish = &char_finish_iso2022;
2626 for (i = 0; i < 4; i++)
2628 str->iso2022.charset[i] =
2629 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2630 str->iso2022.force_charset_on_output[i] =
2631 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2633 str->iso2022.register_left = 0;
2634 str->iso2022.register_right = 1;
2635 str->iso2022.current_charset = Qnil;
2636 str->iso2022.current_half = 0;
2640 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2643 str->encode_char = &char_encode_utf8;
2644 str->finish = &char_finish_utf8;
2647 str->encode_char = &char_encode_ucs4;
2648 str->finish = &char_finish_ucs4;
2650 case CODESYS_SHIFT_JIS:
2651 str->encode_char = &char_encode_shift_jis;
2652 str->finish = &char_finish_shift_jis;
2658 str->iso2022.current_char_boundary = 0;
2659 str->flags = str->ch = 0;
2663 encoding_rewinder (Lstream *stream)
2665 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2666 reset_encoding_stream (str);
2667 Dynarr_reset (str->runoff);
2668 return Lstream_rewind (str->other_end);
2672 encoding_seekable_p (Lstream *stream)
2674 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2675 return Lstream_seekable_p (str->other_end);
2679 encoding_flusher (Lstream *stream)
2681 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2682 return Lstream_flush (str->other_end);
2686 encoding_closer (Lstream *stream)
2688 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2689 if (stream->flags & LSTREAM_FL_WRITE)
2691 str->flags |= CODING_STATE_END;
2692 encoding_writer (stream, 0, 0);
2694 Dynarr_free (str->runoff);
2695 return Lstream_close (str->other_end);
2699 encoding_stream_coding_system (Lstream *stream)
2701 Lisp_Object coding_system;
2702 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2704 XSETCODING_SYSTEM (coding_system, str->codesys);
2705 return coding_system;
/* Switch encoding stream LSTR over to coding system CODESYS and reset
   the encoder state with reset_encoding_stream().
   NOTE(review): presumably the stream's codesys field is set to CS on
   a line not visible in this listing -- confirm.  */
2709 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2711 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2712 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2714 reset_encoding_stream (str);
/* Common constructor behind make_encoding_input_stream() and
   make_encoding_output_stream().  Creates a new lstream of type
   lstream_encoding in the given MODE, gives it a fresh runoff buffer,
   wires it to STREAM as its other end, and installs CODESYS via
   set_encoding_stream_coding_system().  */
2718 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2721 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2722 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2726 str->runoff = Dynarr_new (unsigned_char);
2727 str->other_end = stream;
2728 set_encoding_stream_coding_system (lstr, codesys);
2729 XSETLSTREAM (obj, lstr);
2734 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2736 return make_encoding_stream_1 (stream, codesys, "r");
2740 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2742 return make_encoding_stream_1 (stream, codesys, "w");
2745 /* Convert N bytes of internally-formatted data stored in SRC to an
2746 external format, according to the encoding stream ENCODING.
2747 Store the encoded data into DST. */
2750 mule_encode (Lstream *encoding, CONST unsigned char *src,
2751 unsigned_char_dynarr *dst, unsigned int n)
2753 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2755 switch (CODING_SYSTEM_TYPE (str->codesys))
2758 case CODESYS_INTERNAL:
2759 Dynarr_add_many (dst, src, n);
2762 case CODESYS_AUTODETECT:
2763 /* If we got this far and still haven't decided on the coding
2764 system, then do no conversion. */
2765 case CODESYS_NO_CONVERSION:
2766 encode_coding_no_conversion (encoding, src, dst, n);
2770 encode_coding_big5 (encoding, src, dst, n);
2773 str->ccl.last_block = str->flags & CODING_STATE_END;
2774 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2778 text_encode_generic (encoding, src, dst, n);
2782 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2783 Encode the text between START and END using CODING-SYSTEM.
2784 This will, for example, convert Japanese characters into stuff such as
2785 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2786 text. BUFFER defaults to the current buffer if unspecified.
2788 (start, end, coding_system, buffer))
2791 struct buffer *buf = decode_buffer (buffer, 0);
2792 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2793 Lstream *istr, *ostr;
2794 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2796 get_buffer_range_char (buf, start, end, &b, &e, 0);
2798 barf_if_buffer_read_only (buf, b, e);
2800 coding_system = Fget_coding_system (coding_system);
2801 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2802 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2803 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2804 Fget_coding_system (Qbinary));
2805 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2807 istr = XLSTREAM (instream);
2808 ostr = XLSTREAM (outstream);
2809 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2810 /* The chain of streams looks like this:
2812 [BUFFER] <----- send through
2813 ------> [ENCODE AS SPECIFIED]
2814 ------> [DECODE AS BINARY]
2819 char tempbuf[1024]; /* some random amount */
2820 Bufpos newpos, even_newer_pos;
2821 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2822 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2826 newpos = lisp_buffer_stream_startpos (istr);
2827 Lstream_write (ostr, tempbuf, size_in_bytes);
2828 even_newer_pos = lisp_buffer_stream_startpos (istr);
2829 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2835 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2836 Lstream_close (istr);
2837 Lstream_close (ostr);
2839 Lstream_delete (istr);
2840 Lstream_delete (ostr);
2841 Lstream_delete (XLSTREAM (de_outstream));
2842 Lstream_delete (XLSTREAM (lb_outstream));
2843 return make_int (retlen);
2850 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2851 unsigned_char_dynarr *dst, unsigned int n)
2854 unsigned char char_boundary;
2855 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2856 unsigned int flags = str->flags;
2857 Emchar ch = str->ch;
2859 char_boundary = str->iso2022.current_char_boundary;
2865 if (char_boundary == 0)
2893 (*str->encode_char) (str, c, dst, &flags);
2895 else if (char_boundary == 1)
2897 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2903 ch = (ch << 6) | (c & 0x3f);
2908 if ((char_boundary == 0) && (flags & CODING_STATE_END))
2910 (*str->finish) (str, dst, &flags);
2915 str->iso2022.current_char_boundary = char_boundary;
2919 /************************************************************************/
2920 /* Shift-JIS methods */
2921 /************************************************************************/
2923 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2924 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2925 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2926 encoded by "position-code + 0x80". A character of JISX0208
2927 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
2928 position-codes are divided and shifted so that they fit in the range
2931 --- CODE RANGE of Shift-JIS ---
2932 (character set) (range)
2934 JISX0201-Kana 0xA1 .. 0xDF
2935 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2936 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2937 -------------------------------
/* Is this the first byte of a Shift-JIS two-byte char?  True for
   0x81 .. 0x9F and 0xE0 .. 0xEF.  C is evaluated more than once, so it
   must not have side effects.  */

#define BYTE_SJIS_TWO_BYTE_1_P(c) \
  ((0x81 <= (c) && (c) <= 0x9F) || (0xE0 <= (c) && (c) <= 0xEF))
/* Is this the second byte of a Shift-JIS two-byte char?  True for
   0x40 .. 0x7E and 0x80 .. 0xFC.  C is evaluated more than once, so it
   must not have side effects.  */

#define BYTE_SJIS_TWO_BYTE_2_P(c) \
  ((0x40 <= (c) && (c) <= 0x7E) || (0x80 <= (c) && (c) <= 0xFC))
/* Is this a Shift-JIS single-byte katakana code?  True for
   0xA1 .. 0xDF.  C is evaluated more than once, so it must not have
   side effects.  */

#define BYTE_SJIS_KATAKANA_P(c) \
  (0xA1 <= (c) && (c) <= 0xDF)
2955 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2963 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2965 if (st->shift_jis.in_second_byte)
2967 st->shift_jis.in_second_byte = 0;
2971 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2972 st->shift_jis.in_second_byte = 1;
2974 return CODING_CATEGORY_SHIFT_JIS_MASK;
2977 /* Convert Shift-JIS data to internal format. */
2980 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2981 unsigned_char_dynarr *dst, unsigned int n)
2984 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2985 unsigned int flags = str->flags;
2986 unsigned int ch = str->ch;
2987 eol_type_t eol_type = str->eol_type;
2995 /* Previous character was first byte of Shift-JIS Kanji char. */
2996 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2998 unsigned char e1, e2;
3000 DECODE_SJIS (ch, c, e1, e2);
3002 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3006 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3007 Dynarr_add (dst, e1);
3008 Dynarr_add (dst, e2);
3013 DECODE_ADD_BINARY_CHAR (ch, dst);
3014 DECODE_ADD_BINARY_CHAR (c, dst);
3020 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3021 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3023 else if (BYTE_SJIS_KATAKANA_P (c))
3026 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3029 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3030 Dynarr_add (dst, c);
3035 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3039 DECODE_ADD_BINARY_CHAR (c, dst);
3041 label_continue_loop:;
3044 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3050 /* Convert internal character representation to Shift_JIS. */
3053 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3054 unsigned_char_dynarr *dst, unsigned int *flags)
3056 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3060 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3061 Dynarr_add (dst, '\r');
3062 if (eol_type != EOL_CR)
3063 Dynarr_add (dst, ch);
3067 Lisp_Object charset;
3068 unsigned int c1, c2, s1, s2;
3070 Lisp_Object value = charset_code_point (Vcharset_latin_jisx0201, ch);
3071 Lisp_Object ret = Fcar (value);
3075 charset = Vcharset_latin_jisx0201;
3081 BREAKUP_CHAR (ch, charset, c1, c2);
3083 if (EQ(charset, Vcharset_katakana_jisx0201))
3085 Dynarr_add (dst, c1 | 0x80);
3089 Dynarr_add (dst, c1);
3091 else if (EQ(charset, Vcharset_japanese_jisx0208))
3093 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3094 Dynarr_add (dst, s1);
3095 Dynarr_add (dst, s2);
3098 Dynarr_add (dst, '?');
3103 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3104 unsigned int *flags)
3108 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3109 Decode a JISX0208 character of Shift-JIS coding-system.
3110 CODE is the character code in Shift-JIS as a cons of two bytes.
3111 Return the corresponding character.
3115 unsigned char c1, c2, s1, s2;
3118 CHECK_INT (XCAR (code));
3119 CHECK_INT (XCDR (code));
3120 s1 = XINT (XCAR (code));
3121 s2 = XINT (XCDR (code));
3122 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3123 BYTE_SJIS_TWO_BYTE_2_P (s2))
3125 DECODE_SJIS (s1, s2, c1, c2);
3126 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3127 c1 & 0x7F, c2 & 0x7F));
3133 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3134 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3135 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3139 Lisp_Object charset;
3142 CHECK_CHAR_COERCE_INT (ch);
3143 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3144 if (EQ (charset, Vcharset_japanese_jisx0208))
3146 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3147 return Fcons (make_int (s1), make_int (s2));
3154 /************************************************************************/
3156 /************************************************************************/
3158 /* BIG5 is a coding system encoding two character sets: ASCII and
3159 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3160 character set and is encoded in two bytes.
3162 --- CODE RANGE of BIG5 ---
3163 (character set) (range)
3165 Big5 (1st byte) 0xA1 .. 0xFE
3166 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3167 --------------------------
3169 Since the number of characters in Big5 is larger than maximum
3170 characters in Emacs' charset (96x96), it can't be handled as one
3171 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3172 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3173 contains frequently used characters and the latter contains less
3174 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char?  True for
   0xA1 .. 0xFE.  C is evaluated more than once, so it must not have
   side effects.  */

#define BYTE_BIG5_TWO_BYTE_1_P(c) \
  (0xA1 <= (c) && (c) <= 0xFE)
/* Is this the second byte of a Big5 two-byte char?  True for
   0x40 .. 0x7E and 0xA1 .. 0xFE.  C is evaluated more than once, so it
   must not have side effects.  */

#define BYTE_BIG5_TWO_BYTE_2_P(c) \
  ((0x40 <= (c) && (c) <= 0x7E) || (0xA1 <= (c) && (c) <= 0xFE))
/* Number of Big5 characters which have the same code in 1st byte:
   the 94 second-byte values 0xA1 .. 0xFE plus the 63 values
   0x40 .. 0x7E, i.e. 157.  */

#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3188 /* Code conversion macros. These are macros because they are used in
3189 inner loops during code conversion.
3191 Note that temporary variables in macros introduce the classic
3192 dynamic-scoping problems with variable names. We use capital-
3193 lettered variables in the assumption that XEmacs does not use
3194 capital letters in variables except in a very formalized way
3197 /* Convert Big5 code (b1, b2) into its internal string representation
3200 /* There is a much simpler way to split the Big5 charset into two.
3201 For the moment I'm going to leave the algorithm as-is because it
3202 claims to separate out the most-used characters into a single
3203 charset, which perhaps will lead to optimizations in various
3206 The way the algorithm works is something like this:
3208 Big5 can be viewed as a 94x157 charset, where the row is
3209 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3210 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3211 the split between low and high column numbers is apparently
3212 meaningless; ascending rows produce less and less frequent chars.
3213 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3214 the first charset, and the upper half (0xC9 .. 0xFE) to the
3215 second. To do the conversion, we convert the character into
3216 a single number where 0 .. 156 is the first row, 157 .. 313
3217 is the second, etc. That way, the characters are ordered by
3218 decreasing frequency. Then we just chop the space in two
3219 and coerce the result into a 94x94 space.
3222 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3224 int B1 = b1, B2 = b2; \
3226 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3230 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3234 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3235 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3237 c1 = I / (0xFF - 0xA1) + 0xA1; \
3238 c2 = I % (0xFF - 0xA1) + 0xA1; \
3241 /* Convert the internal string representation of a Big5 character
3242 (lb, c1, c2) into Big5 code (b1, b2). */
3244 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3246 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3248 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3250 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3252 b1 = I / BIG5_SAME_ROW + 0xA1; \
3253 b2 = I % BIG5_SAME_ROW; \
3254 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3258 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3266 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3267 (c >= 0x80 && c <= 0xA0))
3269 if (st->big5.in_second_byte)
3271 st->big5.in_second_byte = 0;
3272 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3276 st->big5.in_second_byte = 1;
3278 return CODING_CATEGORY_BIG5_MASK;
3281 /* Convert Big5 data to internal format. */
3284 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3285 unsigned_char_dynarr *dst, unsigned int n)
3288 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3289 unsigned int flags = str->flags;
3290 unsigned int ch = str->ch;
3291 eol_type_t eol_type = str->eol_type;
3298 /* Previous character was first byte of Big5 char. */
3299 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3301 unsigned char b1, b2, b3;
3302 DECODE_BIG5 (ch, c, b1, b2, b3);
3303 Dynarr_add (dst, b1);
3304 Dynarr_add (dst, b2);
3305 Dynarr_add (dst, b3);
3309 DECODE_ADD_BINARY_CHAR (ch, dst);
3310 DECODE_ADD_BINARY_CHAR (c, dst);
3316 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3317 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3320 DECODE_ADD_BINARY_CHAR (c, dst);
3322 label_continue_loop:;
3325 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3331 /* Convert internally-formatted data to Big5. */
3334 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3335 unsigned_char_dynarr *dst, unsigned int n)
3339 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3340 unsigned int flags = str->flags;
3341 unsigned int ch = str->ch;
3342 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3349 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3350 Dynarr_add (dst, '\r');
3351 if (eol_type != EOL_CR)
3352 Dynarr_add (dst, '\n');
3354 else if (BYTE_ASCII_P (c))
3357 Dynarr_add (dst, c);
3359 else if (BUFBYTE_LEADING_BYTE_P (c))
3361 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3362 c == LEADING_BYTE_CHINESE_BIG5_2)
3364 /* A recognized leading byte. */
3366 continue; /* not done with this character. */
3368 /* otherwise just ignore this character. */
3370 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3371 ch == LEADING_BYTE_CHINESE_BIG5_2)
3373 /* Previous char was a recognized leading byte. */
3375 continue; /* not done with this character. */
3379 /* Encountering second byte of a Big5 character. */
3380 unsigned char b1, b2;
3382 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3383 Dynarr_add (dst, b1);
3384 Dynarr_add (dst, b2);
3396 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3397 Decode a Big5 character CODE of BIG5 coding-system.
3398 CODE is the character code in BIG5, a cons of two integers.
3399 Return the corresponding character.
3403 unsigned char c1, c2, b1, b2;
3406 CHECK_INT (XCAR (code));
3407 CHECK_INT (XCDR (code));
3408 b1 = XINT (XCAR (code));
3409 b2 = XINT (XCDR (code));
3410 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3411 BYTE_BIG5_TWO_BYTE_2_P (b2))
3413 Charset_ID leading_byte;
3414 Lisp_Object charset;
3415 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3416 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3417 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3423 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3424 Encode the Big5 character CH to BIG5 coding-system.
3425 Return the corresponding character code in Big5.
3429 Lisp_Object charset;
3432 CHECK_CHAR_COERCE_INT (ch);
3433 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3434 if (EQ (charset, Vcharset_chinese_big5_1) ||
3435 EQ (charset, Vcharset_chinese_big5_2))
3437 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3439 return Fcons (make_int (b1), make_int (b2));
3446 /************************************************************************/
3448 /************************************************************************/
3451 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3457 switch (st->ucs4.in_byte)
3466 st->ucs4.in_byte = 0;
3472 return CODING_CATEGORY_UCS4_MASK;
3476 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3477 unsigned_char_dynarr *dst, unsigned int n)
3479 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3480 unsigned int flags = str->flags;
3481 unsigned int ch = str->ch;
3482 unsigned char counter = str->counter;
3486 unsigned char c = *src++;
3494 DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3499 ch = ( ch << 8 ) | c;
3503 if (counter & CODING_STATE_END)
3504 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3508 str->counter = counter;
/* Encode the single character CH as UCS-4: four values are appended to
   DST, taken from CH shifted right by 24, 16, 8 and 0 bits, i.e. most
   significant byte first.  STR and FLAGS are not used by the visible
   statements.
   NOTE(review): relies on DST (an unsigned_char_dynarr) keeping only
   the low 8 bits of each value added -- confirm.  */
3512 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3513 unsigned_char_dynarr *dst, unsigned int *flags)
3515 Dynarr_add (dst, ch >> 24);
3516 Dynarr_add (dst, ch >> 16);
3517 Dynarr_add (dst, ch >> 8);
3518 Dynarr_add (dst, ch );
3522 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3523 unsigned int *flags)
3528 /************************************************************************/
3530 /************************************************************************/
3533 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3538 unsigned char c = *src++;
3539 switch (st->utf8.in_byte)
3542 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3545 st->utf8.in_byte = 5;
3547 st->utf8.in_byte = 4;
3549 st->utf8.in_byte = 3;
3551 st->utf8.in_byte = 2;
3553 st->utf8.in_byte = 1;
3558 if ((c & 0xc0) != 0x80)
3564 return CODING_CATEGORY_UTF8_MASK;
3568 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3569 unsigned_char_dynarr *dst, unsigned int n)
3571 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3572 unsigned int flags = str->flags;
3573 unsigned int ch = str->ch;
3574 eol_type_t eol_type = str->eol_type;
3575 unsigned char counter = str->counter;
3579 unsigned char c = *src++;
3588 else if ( c >= 0xf8 )
3593 else if ( c >= 0xf0 )
3598 else if ( c >= 0xe0 )
3603 else if ( c >= 0xc0 )
3610 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3611 DECODE_ADD_UCS_CHAR (c, dst);
3615 ch = ( ch << 6 ) | ( c & 0x3f );
3616 DECODE_ADD_UCS_CHAR (ch, dst);
3621 ch = ( ch << 6 ) | ( c & 0x3f );
3624 label_continue_loop:;
3627 if (flags & CODING_STATE_END)
3628 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3632 str->counter = counter;
/* Encode the single character CH as UTF-8, appending the bytes to DST.

   The first branch handles end-of-line conversion using the EOL type
   of the stream's coding system: a '\r' is emitted unless the EOL type
   is EOL_LF or EOL_AUTODETECT, and CH itself is emitted unless the EOL
   type is EOL_CR.
   NOTE(review): presumably that branch is guarded by a test for
   CH == '\n' on a line not visible in this listing -- confirm.

   Otherwise the length of the sequence is chosen by the value of CH:

     CH <= 0x7f		1 byte
     CH <= 0x7ff	2 bytes, lead byte 0xc0 | ...
     CH <= 0xffff	3 bytes, lead byte 0xe0 | ...
     CH <= 0x1fffff	4 bytes, lead byte 0xf0 | ...
     CH <= 0x3ffffff	5 bytes, lead byte 0xf8 | ...
     anything larger	6 bytes, lead byte 0xfc | ...

   Every byte after the lead byte carries six bits of CH and has the
   form 0x80 | bits.  FLAGS is not used by the visible statements.  */
3636 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3637 unsigned_char_dynarr *dst, unsigned int *flags)
3639 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3643 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3644 Dynarr_add (dst, '\r');
3645 if (eol_type != EOL_CR)
3646 Dynarr_add (dst, ch);
3648 else if (ch <= 0x7f)
3650 Dynarr_add (dst, ch);
3652 else if (ch <= 0x7ff)
3654 Dynarr_add (dst, (ch >> 6) | 0xc0);
3655 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3657 else if (ch <= 0xffff)
3659 Dynarr_add (dst, (ch >> 12) | 0xe0);
3660 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3661 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3663 else if (ch <= 0x1fffff)
3665 Dynarr_add (dst, (ch >> 18) | 0xf0);
3666 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3667 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3668 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3670 else if (ch <= 0x3ffffff)
3672 Dynarr_add (dst, (ch >> 24) | 0xf8);
3673 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3674 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3675 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3676 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3680 Dynarr_add (dst, (ch >> 30) | 0xfc);
3681 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3682 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3683 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3684 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3685 Dynarr_add (dst, (ch & 0x3f) | 0x80);
/* Finish hook of the UTF-8 encoder.  Takes the encoding stream STR,
   the output dynarr DST and a pointer to the state FLAGS.
   NOTE(review): no statement of the body is visible here; presumably
   the UTF-8 encoder has no pending state to flush -- confirm.  */
3690 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3691 unsigned int *flags)
3696 /************************************************************************/
3697 /* ISO2022 methods */
3698 /************************************************************************/
3700 /* The following note describes the coding system ISO2022 briefly.
3701 Since the intention of this note is to help understand the
3702 functions in this file, some parts are NOT ACCURATE or OVERLY
3703 SIMPLIFIED. For thorough understanding, please refer to the
3704 original document of ISO2022.
3706 ISO2022 provides many mechanisms to encode several character sets
3707 in 7-bit and 8-bit environments. For 7-bit environments, all text
3708 is encoded using bytes less than 128. This may make the encoded
3709 text a little bit longer, but the text passes more easily through
3710 several gateways, some of which strip off MSB (Most Significant Bit).
3712 There are two kinds of character sets: control character set and
3713 graphic character set. The former contains control characters such
3714 as `newline' and `escape' to provide control functions (control
3715 functions are also provided by escape sequences). The latter
3716 contains graphic characters such as 'A' and '-'. Emacs recognizes
3717 two control character sets and many graphic character sets.
3719 Graphic character sets are classified into one of the following
3720 four classes, according to the number of bytes (DIMENSION) and
3721 number of characters in one dimension (CHARS) of the set:
3722 - DIMENSION1_CHARS94
3723 - DIMENSION1_CHARS96
3724 - DIMENSION2_CHARS94
3725 - DIMENSION2_CHARS96
3727 In addition, each character set is assigned an identification tag,
3728 unique for each set, called "final character" (denoted as <F>
3729 hereafter). The <F> of each character set is decided by ECMA(*)
3730 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3731 (0x30..0x3F are for private use only).
3733 Note (*): ECMA = European Computer Manufacturers Association
3735 Here are examples of graphic character set [NAME(<F>)]:
3736 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3737 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3738 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3739 o DIMENSION2_CHARS96 -- none for the moment
3741 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3742 C0 [0x00..0x1F] -- control character plane 0
3743 GL [0x20..0x7F] -- graphic character plane 0
3744 C1 [0x80..0x9F] -- control character plane 1
3745 GR [0xA0..0xFF] -- graphic character plane 1
3747 A control character set is directly designated and invoked to C0 or
3748 C1 by an escape sequence. The most common case is that:
3749 - ISO646's control character set is designated/invoked to C0, and
3750 - ISO6429's control character set is designated/invoked to C1,
3751 and usually these designations/invocations are omitted in encoded
3752 text. In a 7-bit environment, only C0 can be used, and a control
3753 character for C1 is encoded by an appropriate escape sequence to
3754 fit into the environment. All control characters for C1 are
3755 defined to have corresponding escape sequences.
3757 A graphic character set is at first designated to one of four
3758 graphic registers (G0 through G3), then these graphic registers are
3759 invoked to GL or GR. These designations and invocations can be
3760 done independently. The most common case is that G0 is invoked to
3761 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3762 these invocations and designations are omitted in encoded text.
3763 In a 7-bit environment, only GL can be used.
3765 When a graphic character set of CHARS94 is invoked to GL, codes
3766 0x20 and 0x7F of the GL area work as control characters SPACE and
3767 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3770 There are two ways of invocation: locking-shift and single-shift.
3771 With locking-shift, the invocation lasts until the next different
3772 invocation, whereas with single-shift, the invocation affects the
3773 following character only and doesn't affect the locking-shift
3774 state. Invocations are done by the following control characters or
3777 ----------------------------------------------------------------------
3778 abbrev function cntrl escape seq description
3779 ----------------------------------------------------------------------
3780 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3781 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3782 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3783 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3784 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3785 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3786 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3787 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3788 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3789 ----------------------------------------------------------------------
3790 (*) These are not used by any known coding system.
3792 Control characters for these functions are defined by macros
3793 ISO_CODE_XXX in `coding.h'.
3795 Designations are done by the following escape sequences:
3796 ----------------------------------------------------------------------
3797 escape sequence description
3798 ----------------------------------------------------------------------
3799 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3800 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3801 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3802 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3803 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3804 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3805 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3806 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3807 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3808 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3809 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3810 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3811 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3812 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3813 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3814 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3815 ----------------------------------------------------------------------
3817 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3818 of dimension 1, chars 94, and final character <F>, etc...
3820 Note (*): Although these designations are not allowed in ISO2022,
3821 Emacs accepts them on decoding, and produces them on encoding
3822 CHARS96 character sets in a coding system which is characterized as
3823 7-bit environment, non-locking-shift, and non-single-shift.
3825 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3826 '(' can be omitted. We refer to this as "short-form" hereafter.
3828 Now you may notice that there are a lot of ways for encoding the
3829 same multilingual text in ISO2022. Actually, there exist many
3830 coding systems such as Compound Text (used in X11's inter client
3831 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3832 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3833 localized platforms), and all of these are variants of ISO2022.
3835 In addition to the above, Emacs handles two more kinds of escape
3836 sequences: ISO6429's direction specification and Emacs' private
3837 sequence for specifying character composition.
3839 ISO6429's direction specification takes the following form:
3840 o CSI ']' -- end of the current direction
3841 o CSI '0' ']' -- end of the current direction
3842 o CSI '1' ']' -- start of left-to-right text
3843 o CSI '2' ']' -- start of right-to-left text
3844 The control character CSI (0x9B: control sequence introducer) is
3845 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3847 Character composition specification takes the following form:
3848 o ESC '0' -- start character composition
3849 o ESC '1' -- end character composition
3850 Since these are not standard escape sequences of any ISO standard,
3851 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its initial condition.
   For each of the four registers G0..G3 the charset is taken from the
   initial charset of CODING_SYSTEM when CODING_SYSTEM is non-nil; the
   assignment of Qt is presumably the fallback for a nil CODING_SYSTEM
   (detect_coding_iso2022 calls this with Qnil) -- confirm.  The
   invalid-designation marker of every register is cleared.

   The escape-parsing state is reset to ISO_ESC_NOTHING with no
   buffered escape bytes, register 0 is invoked on the left and
   register 1 on the right, and all the error-recovery flags are
   cleared.  When ENABLE_COMPOSITE_CHARS is defined, an already
   allocated composite_chars dynarr is emptied with Dynarr_reset.  */
3854 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3858 for (i = 0; i < 4; i++)
3860 if (!NILP (coding_system))
3862 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3864 iso->charset[i] = Qt;
3865 iso->invalid_designated[i] = 0;
3867 iso->esc = ISO_ESC_NOTHING;
3868 iso->esc_bytes_index = 0;
3869 iso->register_left = 0;
3870 iso->register_right = 1;
3871 iso->switched_dir_and_no_valid_charset_yet = 0;
3872 iso->invalid_switch_dir = 0;
3873 iso->output_direction_sequence = 0;
3874 iso->output_literally = 0;
3875 #ifdef ENABLE_COMPOSITE_CHARS
3876 if (iso->composite_chars)
3877 Dynarr_reset (iso->composite_chars);
/* Predicate on the byte C, used together with the coding system's
   escape-quoted property: parse_iso2022_esc treats ESC followed by a
   byte accepted here as ISO_ESC_LITERAL, and char_encode_iso2022 adds
   an ESC in front of such a byte.
   NOTE(review): the body is not visible here, so which bytes are
   accepted cannot be stated -- confirm against the definition.  */
3882 fit_to_be_escape_quoted (unsigned char c)
3899 /* Parse one byte of an ISO2022 escape sequence.
3900 If the result is an invalid escape sequence, return 0 and
3901 do not change anything in STR. Otherwise, if the result is
3902 an incomplete escape sequence, update ISO2022.ESC and
3903 ISO2022.ESC_BYTES and return -1. Otherwise, update
3904 all the state variables (but not ISO2022.ESC_BYTES) and
3907 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3908 or invocation of an invalid character set and treat that as
3909 an unrecognized escape sequence. */
/* Arguments:
     CODESYS  coding system object; nil while doing coding-system
              detection, in which case the escape-quoted check and the
              input charset conversion are skipped.
     ISO      decoder state (escape state, buffered escape bytes,
              designated charsets, invoked registers, recovery flags).
     C        the byte to parse.
     FLAGS    pointer to the CODING_STATE_* bits; updated in place.
              When a sequence completes, everything outside
              CODING_STATE_ISO2022_LOCK is cleared.
     CHECK_INVALID_CHARSETS  non-zero to treat designation or
              invocation of a missing charset as an error.

   Every byte consumed is appended to ISO->esc_bytes so that callers
   can reproduce an invalid sequence literally.  */
3912 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3913 unsigned char c, unsigned int *flags,
3914 int check_invalid_charsets)
3916 /* (1) If we're at the end of a designation sequence, CS is the
3917 charset being designated and REG is the register to designate
3920 (2) If we're at the end of a locking-shift sequence, REG is
3921 the register to invoke and HALF (0 == left, 1 == right) is
3922 the half to invoke it into.
3924 (3) If we're at the end of a single-shift sequence, REG is
3925 the register to invoke. */
3926 Lisp_Object cs = Qnil;
3929 /* NOTE: This code does goto's all over the place.
3930 The reason for this is that we're basically implementing
3931 a state machine here, and hierarchical languages like C
3932 don't really provide a clean way of doing this. */
3934 if (! (*flags & CODING_STATE_ESCAPE))
3935 /* At beginning of escape sequence; we need to reset our
3936 escape-state variables. */
3937 iso->esc = ISO_ESC_NOTHING;
3939 iso->output_literally = 0;
3940 iso->output_direction_sequence = 0;
3944 case ISO_ESC_NOTHING:
3945 iso->esc_bytes_index = 0;
3948 case ISO_CODE_ESC: /* Start escape sequence */
3949 *flags |= CODING_STATE_ESCAPE;
3953 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3954 *flags |= CODING_STATE_ESCAPE;
3955 iso->esc = ISO_ESC_5_11;
3958 case ISO_CODE_SO: /* locking shift 1 */
3961 case ISO_CODE_SI: /* locking shift 0 */
3965 case ISO_CODE_SS2: /* single shift */
3968 case ISO_CODE_SS3: /* single shift */
3972 default: /* Other control characters */
3979 /**** single shift ****/
3981 case 'N': /* single shift 2 */
3984 case 'O': /* single shift 3 */
3988 /**** locking shift ****/
3990 case '~': /* locking shift 1 right */
3993 case 'n': /* locking shift 2 */
3996 case '}': /* locking shift 2 right */
3999 case 'o': /* locking shift 3 */
4002 case '|': /* locking shift 3 right */
4006 #ifdef ENABLE_COMPOSITE_CHARS
4007 /**** composite ****/
4010 iso->esc = ISO_ESC_START_COMPOSITE;
4011 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4012 CODING_STATE_COMPOSITE;
4016 iso->esc = ISO_ESC_END_COMPOSITE;
4017 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4018 ~CODING_STATE_COMPOSITE;
4020 #endif /* ENABLE_COMPOSITE_CHARS */
4022 /**** directionality ****/
4025 iso->esc = ISO_ESC_5_11;
4028 /**** designation ****/
4030 case '$': /* multibyte charset prefix */
4031 iso->esc = ISO_ESC_2_4;
4035 if (0x28 <= c && c <= 0x2F)
4037 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4041 /* This function is called with CODESYS equal to nil when
4042 doing coding-system detection. */
4044 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4045 && fit_to_be_escape_quoted (c))
4047 iso->esc = ISO_ESC_LITERAL;
4048 *flags &= CODING_STATE_ISO2022_LOCK;
4058 /**** directionality ****/
4060 case ISO_ESC_5_11: /* ISO6429 direction control */
4063 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4064 goto directionality;
4066 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4067 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4068 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4072 case ISO_ESC_5_11_0:
4075 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4076 goto directionality;
4080 case ISO_ESC_5_11_1:
/* Start of left-to-right text: keep the lock bits that are currently
   set and drop R2L, exactly as the two "end of direction" cases do.
   (A plain assignment here would turn ON every lock bit other than
   R2L regardless of the previous state.) */
4083 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4084 goto directionality;
4088 case ISO_ESC_5_11_2:
4091 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4092 goto directionality;
4097 iso->esc = ISO_ESC_DIRECTIONALITY;
4098 /* Various junk here to attempt to preserve the direction sequences
4099 literally in the text if they would otherwise be swallowed due
4100 to invalid designations that don't show up as actual charset
4101 changes in the text. */
4102 if (iso->invalid_switch_dir)
4104 /* We already inserted a direction switch literally into the
4105 text. We assume (#### this may not be right) that the
4106 next direction switch is the one going the other way,
4107 and we need to output that literally as well. */
4108 iso->output_literally = 1;
4109 iso->invalid_switch_dir = 0;
4115 /* If we are in the thrall of an invalid designation,
4116 then stick the directionality sequence literally into the
4117 output stream so it ends up in the original text again. */
4118 for (jj = 0; jj < 4; jj++)
4119 if (iso->invalid_designated[jj])
4123 iso->output_literally = 1;
4124 iso->invalid_switch_dir = 1;
4127 /* Indicate that we haven't yet seen a valid designation,
4128 so that if a switch-dir is directly followed by an
4129 invalid designation, both get inserted literally. */
4130 iso->switched_dir_and_no_valid_charset_yet = 1;
4135 /**** designation ****/
4138 if (0x28 <= c && c <= 0x2F)
4140 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4143 if (0x40 <= c && c <= 0x42)
4145 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4146 *flags & CODING_STATE_R2L ?
4147 CHARSET_RIGHT_TO_LEFT :
4148 CHARSET_LEFT_TO_RIGHT);
4158 if (c < '0' || c > '~')
4159 return 0; /* bad final byte */
4161 if (iso->esc >= ISO_ESC_2_8 &&
4162 iso->esc <= ISO_ESC_2_15)
4164 type = ((iso->esc >= ISO_ESC_2_12) ?
4165 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4166 reg = (iso->esc - ISO_ESC_2_8) & 3;
4168 else if (iso->esc >= ISO_ESC_2_4_8 &&
4169 iso->esc <= ISO_ESC_2_4_15)
4171 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4172 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4173 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4177 /* Can this ever be reached? -slb */
4181 cs = CHARSET_BY_ATTRIBUTES (type, c,
4182 *flags & CODING_STATE_R2L ?
4183 CHARSET_RIGHT_TO_LEFT :
4184 CHARSET_LEFT_TO_RIGHT);
4190 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4194 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4195 /* can't invoke something that ain't there. */
4197 iso->esc = ISO_ESC_SINGLE_SHIFT;
4198 *flags &= CODING_STATE_ISO2022_LOCK;
4200 *flags |= CODING_STATE_SS2;
4202 *flags |= CODING_STATE_SS3;
4206 if (check_invalid_charsets &&
4207 !CHARSETP (iso->charset[reg]))
4208 /* can't invoke something that ain't there. */
4211 iso->register_right = reg;
4213 iso->register_left = reg;
4214 *flags &= CODING_STATE_ISO2022_LOCK;
4215 iso->esc = ISO_ESC_LOCKING_SHIFT;
4219 if (NILP (cs) && check_invalid_charsets)
4221 iso->invalid_designated[reg] = 1;
4222 iso->charset[reg] = Vcharset_ascii;
4223 iso->esc = ISO_ESC_DESIGNATE;
4224 *flags &= CODING_STATE_ISO2022_LOCK;
4225 iso->output_literally = 1;
4226 if (iso->switched_dir_and_no_valid_charset_yet)
4228 /* We encountered a switch-direction followed by an
4229 invalid designation. Ensure that the switch-direction
4230 gets outputted; otherwise it will probably get eaten
4231 when the text is written out again. */
4232 iso->switched_dir_and_no_valid_charset_yet = 0;
4233 iso->output_direction_sequence = 1;
4234 /* And make sure that the switch-dir going the other
4235 way gets outputted, as well. */
4236 iso->invalid_switch_dir = 1;
4240 /* This function is called with CODESYS equal to nil when
4241 doing coding-system detection. */
4242 if (!NILP (codesys))
4244 charset_conversion_spec_dynarr *dyn =
4245 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4251 for (i = 0; i < Dynarr_length (dyn); i++)
4253 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4254 if (EQ (cs, spec->from_charset))
4255 cs = spec->to_charset;
4260 iso->charset[reg] = cs;
4261 iso->esc = ISO_ESC_DESIGNATE;
4262 *flags &= CODING_STATE_ISO2022_LOCK;
4263 if (iso->invalid_designated[reg])
4265 iso->invalid_designated[reg] = 0;
4266 iso->output_literally = 1;
4268 if (iso->switched_dir_and_no_valid_charset_yet)
4269 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Detection pass for the ISO2022 coding categories.
   ST carries the detection state between calls and SRC points at the
   bytes to examine.

   On the first call (ST->iso2022.initted is zero) the embedded
   decoder is reset with a nil coding system and the candidate mask
   starts with all five ISO categories: ISO_7, ISO_8_DESIGNATE,
   ISO_8_1, ISO_8_2 and ISO_LOCK_SHIFT.  Bytes then remove
   candidates:
     - the ISO_7 bit is cleared on the high-byte paths, where
       high_byte_count is also incremented;
     - an odd run of high bytes with no preceding single shift
       clears ISO_8_2;
     - a completed designation clears ISO_8_1 and ISO_8_2;
     - a locking shift leaves only ISO_LOCK_SHIFT and ends the scan;
     - a single shift clears ISO_8_DESIGNATE and is remembered in
       saw_single_shift.
   Escape sequences are recognized by feeding bytes to
   parse_iso2022_esc with CHECK_INVALID_CHARSETS == 0.  */
4274 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4279 /* #### There are serious deficiencies in the recognition mechanism
4280 here. This needs to be much smarter if it's going to cut it.
4281 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4282 it should be detected as Latin-1.
4283 All the ISO2022 stuff in this file should be synced up with the
4284 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4285 Perhaps we should wait till R2L works in FSF Emacs? */
4287 if (!st->iso2022.initted)
4289 reset_iso2022 (Qnil, &st->iso2022.iso);
4290 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4291 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4292 CODING_CATEGORY_ISO_8_1_MASK |
4293 CODING_CATEGORY_ISO_8_2_MASK |
4294 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4295 st->iso2022.flags = 0;
4296 st->iso2022.high_byte_count = 0;
4297 st->iso2022.saw_single_shift = 0;
4298 st->iso2022.initted = 1;
4301 mask = st->iso2022.mask;
4308 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4309 st->iso2022.high_byte_count++;
4313 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4315 if (st->iso2022.high_byte_count & 1)
4316 /* odd number of high bytes; assume not iso-8-2 */
4317 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4319 st->iso2022.high_byte_count = 0;
4320 st->iso2022.saw_single_shift = 0;
4322 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4324 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4325 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4326 { /* control chars */
4329 /* Allow and ignore control characters that you might
4330 reasonably see in a text file */
4335 case 8: /* backspace */
4336 case 11: /* vertical tab */
4337 case 12: /* form feed */
4338 case 26: /* MS-DOS C-z junk */
4339 case 31: /* '^_' -- for info */
4340 goto label_continue_loop;
4347 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4350 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4351 &st->iso2022.flags, 0))
4353 switch (st->iso2022.iso.esc)
4355 case ISO_ESC_DESIGNATE:
4356 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4357 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4359 case ISO_ESC_LOCKING_SHIFT:
4360 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4361 goto ran_out_of_chars;
4362 case ISO_ESC_SINGLE_SHIFT:
4363 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4364 st->iso2022.saw_single_shift = 1;
4373 goto ran_out_of_chars;
4376 label_continue_loop:;
/* Post-process a detection MASK: when the ISO_7 category is still a
   candidate, clear the ISO_8_DESIGNATE, ISO_8_1 and ISO_8_2 bits.
   Other bits of MASK are left alone.  Presumably the adjusted mask is
   returned to the caller -- confirm.  */
4385 postprocess_iso2022_mask (int mask)
4387 /* #### kind of cheesy */
4388 /* If seven-bit ISO is allowed, then assume that the encoding is
4389 entirely seven-bit and turn off the eight-bit ones. */
4390 if (mask & CODING_CATEGORY_ISO_7_MASK)
4391 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4392 CODING_CATEGORY_ISO_8_1_MASK |
4393 CODING_CATEGORY_ISO_8_2_MASK);
4397 /* If FLAGS is a null pointer or specifies right-to-left motion,
4398 output a switch-dir-to-left-to-right sequence to DST.
4399 Also update FLAGS if it is not a null pointer.
4400 If INTERNAL_P is set, we are outputting in internal format and
4401 need to handle the CSI differently. */
/* Details of the bytes written: the introducer is ESC '[' when
   CODESYS is a seven-bit coding system; otherwise it is CSI, added
   through DECODE_ADD_BINARY_CHAR when INTERNAL_P is set and as a raw
   byte when it is not.  The introducer is followed by '0' ']'.
   Afterwards CODING_STATE_R2L is cleared in *FLAGS.  Nothing is
   written when FLAGS is non-null and R2L is not set.  */
4404 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4405 unsigned_char_dynarr *dst,
4406 unsigned int *flags,
4409 if (!flags || (*flags & CODING_STATE_R2L))
4411 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4413 Dynarr_add (dst, ISO_CODE_ESC);
4414 Dynarr_add (dst, '[');
4416 else if (internal_p)
4417 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4419 Dynarr_add (dst, ISO_CODE_CSI);
4420 Dynarr_add (dst, '0');
4421 Dynarr_add (dst, ']');
4423 *flags &= ~CODING_STATE_R2L;
4427 /* If FLAGS is a null pointer or specifies a direction different from
4428 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4429 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4430 sequence to DST. Also update FLAGS if it is not a null pointer.
4431 If INTERNAL_P is set, we are outputting in internal format and
4432 need to handle the CSI differently. */
/* Details of the two cases handled:
     - DIRECTION is CHARSET_LEFT_TO_RIGHT and FLAGS is null or has
       CODING_STATE_R2L set: the work is delegated to
       restore_left_to_right_direction.
     - DIRECTION is CHARSET_RIGHT_TO_LEFT, FLAGS is null or has R2L
       clear, and CODESYS does not have the no-ISO6429 property: the
       introducer (ESC '[' for a seven-bit coding system, otherwise
       CSI -- via DECODE_ADD_BINARY_CHAR when INTERNAL_P is set) is
       written, followed by '2' ']', and CODING_STATE_R2L is set in
       *FLAGS.
   In every other combination nothing is written.  FLAGS really can
   be null: decode_coding_iso2022 passes 0 for it.  */
4435 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4436 unsigned_char_dynarr *dst, unsigned int *flags,
4439 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4440 direction == CHARSET_LEFT_TO_RIGHT)
4441 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4442 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4443 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4444 direction == CHARSET_RIGHT_TO_LEFT)
4446 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4448 Dynarr_add (dst, ISO_CODE_ESC);
4449 Dynarr_add (dst, '[');
4451 else if (internal_p)
4452 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4454 Dynarr_add (dst, ISO_CODE_CSI);
4455 Dynarr_add (dst, '2');
4456 Dynarr_add (dst, ']');
4458 *flags |= CODING_STATE_R2L;
4462 /* Convert ISO2022-format data to internal format. */
/* Arguments: DECODING is the decoding Lstream whose stream data
   (DECODING_STREAM_DATA) holds the state carried between calls --
   flags, the pending first byte CH, the EOL type and the ISO2022
   decoder state.  SRC points at N input bytes; DST receives the
   decoded text.

   Each input byte falls into one of three cases:
     1. FLAGS has CODING_STATE_ESCAPE set: the byte is handed to
        parse_iso2022_esc.  Sequences that are invalid, or that the
        parser marks output_literally, are copied to DST byte by byte
        from esc_bytes.
     2. The byte is a C0 or C1 control: any partial character and any
        pending single shift are dumped to DST first, then the byte
        is processed (EOL handling, or parse_iso2022_esc with
        charset checking enabled).
     3. Otherwise the byte is a graphic character: the register is
        chosen from the single-shift flags or from the left/right
        invocation, the designated charset is validated, and the
        character is added to DST.
   With ENABLE_COMPOSITE_CHARS, output is redirected into
   str->iso2022.composite_chars while CODING_STATE_COMPOSITE is set.
   When FLAGS has CODING_STATE_END, a partial character is flushed
   with DECODE_OUTPUT_PARTIAL_CHAR.  */
4465 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4466 unsigned_char_dynarr *dst, unsigned int n)
4468 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4469 unsigned int flags = str->flags;
4470 unsigned int ch = str->ch;
4471 eol_type_t eol_type = str->eol_type;
4472 #ifdef ENABLE_COMPOSITE_CHARS
4473 unsigned_char_dynarr *real_dst = dst;
4475 Lisp_Object coding_system;
4477 XSETCODING_SYSTEM (coding_system, str->codesys);
4479 #ifdef ENABLE_COMPOSITE_CHARS
4480 if (flags & CODING_STATE_COMPOSITE)
4481 dst = str->iso2022.composite_chars;
4482 #endif /* ENABLE_COMPOSITE_CHARS */
4486 unsigned char c = *src++;
4487 if (flags & CODING_STATE_ESCAPE)
4488 { /* Within ESC sequence */
4489 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4494 switch (str->iso2022.esc)
4496 #ifdef ENABLE_COMPOSITE_CHARS
4497 case ISO_ESC_START_COMPOSITE:
4498 if (str->iso2022.composite_chars)
4499 Dynarr_reset (str->iso2022.composite_chars);
4501 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4502 dst = str->iso2022.composite_chars;
4504 case ISO_ESC_END_COMPOSITE:
4506 Bufbyte comstr[MAX_EMCHAR_LEN];
4508 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4509 Dynarr_length (dst));
4511 len = set_charptr_emchar (comstr, emch);
4512 Dynarr_add_many (dst, comstr, len);
4515 #endif /* ENABLE_COMPOSITE_CHARS */
4517 case ISO_ESC_LITERAL:
4518 DECODE_ADD_BINARY_CHAR (c, dst);
4522 /* Everything else handled already */
4527 /* Attempted error recovery. */
4528 if (str->iso2022.output_direction_sequence)
4529 ensure_correct_direction (flags & CODING_STATE_R2L ?
4530 CHARSET_RIGHT_TO_LEFT :
4531 CHARSET_LEFT_TO_RIGHT,
4532 str->codesys, dst, 0, 1);
4533 /* More error recovery. */
4534 if (!retval || str->iso2022.output_literally)
4536 /* Output the (possibly invalid) sequence */
4538 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4539 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4540 flags &= CODING_STATE_ISO2022_LOCK;
4542 n++, src--;/* Repeat the loop with the same character. */
4545 /* No sense in reprocessing the final byte of the
4546 escape sequence; it could mess things up anyway.
4548 DECODE_ADD_BINARY_CHAR (c, dst);
4553 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4554 { /* Control characters */
4556 /***** Error-handling *****/
4558 /* If we were in the middle of a character, dump out the
4559 partial character. */
4560 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4562 /* If we just saw a single-shift character, dump it out.
4563 This may dump out the wrong sort of single-shift character,
4564 but at least it will give an indication that something went
4566 if (flags & CODING_STATE_SS2)
4568 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4569 flags &= ~CODING_STATE_SS2;
4571 if (flags & CODING_STATE_SS3)
4573 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4574 flags &= ~CODING_STATE_SS3;
4577 /***** Now handle the control characters. *****/
4580 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4582 flags &= CODING_STATE_ISO2022_LOCK;
4584 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4585 DECODE_ADD_BINARY_CHAR (c, dst);
4588 { /* Graphic characters */
4589 Lisp_Object charset;
4595 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4597 /* Now determine the charset. */
4598 reg = ((flags & CODING_STATE_SS2) ? 2
4599 : (flags & CODING_STATE_SS3) ? 3
4600 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4601 : str->iso2022.register_left);
4602 charset = str->iso2022.charset[reg];
4604 /* Error checking: */
4605 if (! CHARSETP (charset)
4606 || str->iso2022.invalid_designated[reg]
4607 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4608 && XCHARSET_CHARS (charset) == 94))
4609 /* Mrmph. We are trying to invoke a register that has no
4610 or an invalid charset in it, or trying to add a character
4611 outside the range of the charset. Insert that char literally
4612 to preserve it for the output. */
4614 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4615 DECODE_ADD_BINARY_CHAR (c, dst);
4620 /* Things are probably hunky-dory. */
4622 /* Fetch reverse charset, maybe. */
4623 if (((flags & CODING_STATE_R2L) &&
4624 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4626 (!(flags & CODING_STATE_R2L) &&
4627 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4629 Lisp_Object new_charset =
4630 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4631 if (!NILP (new_charset))
4632 charset = new_charset;
4636 if (XCHARSET_DIMENSION (charset) == 1)
4638 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4640 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4645 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
4651 lb = XCHARSET_LEADING_BYTE (charset);
4652 switch (XCHARSET_REP_BYTES (charset))
4655 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4656 Dynarr_add (dst, c & 0x7F);
4659 case 2: /* one-byte official */
4660 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4661 Dynarr_add (dst, lb);
4662 Dynarr_add (dst, c | 0x80);
4665 case 3: /* one-byte private or two-byte official */
4666 if (XCHARSET_PRIVATE_P (charset))
4668 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4669 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4670 Dynarr_add (dst, lb);
4671 Dynarr_add (dst, c | 0x80);
4677 Dynarr_add (dst, lb);
4678 Dynarr_add (dst, ch | 0x80);
4679 Dynarr_add (dst, c | 0x80);
4687 default: /* two-byte private */
4690 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4691 Dynarr_add (dst, lb);
4692 Dynarr_add (dst, ch | 0x80);
4693 Dynarr_add (dst, c | 0x80);
4703 flags &= CODING_STATE_ISO2022_LOCK;
4706 label_continue_loop:;
4709 if (flags & CODING_STATE_END)
4710 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4717 /***** ISO2022 encoder *****/
4719 /* Designate CHARSET into register REG. */
/* Arguments: CHARSET is the charset to designate (it may also be an
   initial nil or t, which is only recorded), REG is the register
   index used to pick the intermediate byte, STR is the encoding
   stream whose iso2022 state is updated, DST receives the escape
   sequence.

   CHARSET is first stored in str->iso2022.charset[reg].  The checks
   that follow compare the type and final byte of the previously
   designated charset with the new one, unless
   force_charset_on_output[reg] is set (presumably to skip a
   redundant designation -- confirm); the force flag is then cleared.
   The coding system's output_conv table may substitute another
   charset before the sequence is written.

   The sequence is ESC, then an intermediate chosen by charset type:
     94      inter94[reg]        one of ( ) * +
     96      inter96[reg]        one of , - . /
     94x94   '$' then inter94[reg]; the condition on the inter94 byte
             involves the coding system's SHORT property
     96x96   '$' then inter96[reg]
   followed by the charset's final byte.  */
4722 iso2022_designate (Lisp_Object charset, unsigned char reg,
4723 struct encoding_stream *str, unsigned_char_dynarr *dst)
4725 static CONST char inter94[] = "()*+";
4726 static CONST char inter96[] = ",-./";
4728 unsigned char final;
4729 Lisp_Object old_charset = str->iso2022.charset[reg];
4731 str->iso2022.charset[reg] = charset;
4732 if (!CHARSETP (charset))
4733 /* charset might be an initial nil or t. */
4735 type = XCHARSET_TYPE (charset);
4736 final = XCHARSET_FINAL (charset);
4737 if (!str->iso2022.force_charset_on_output[reg] &&
4738 CHARSETP (old_charset) &&
4739 XCHARSET_TYPE (old_charset) == type &&
4740 XCHARSET_FINAL (old_charset) == final)
4743 str->iso2022.force_charset_on_output[reg] = 0;
4746 charset_conversion_spec_dynarr *dyn =
4747 str->codesys->iso2022.output_conv;
4753 for (i = 0; i < Dynarr_length (dyn); i++)
4755 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4756 if (EQ (charset, spec->from_charset))
4757 charset = spec->to_charset;
4762 Dynarr_add (dst, ISO_CODE_ESC);
4765 case CHARSET_TYPE_94:
4766 Dynarr_add (dst, inter94[reg]);
4768 case CHARSET_TYPE_96:
4769 Dynarr_add (dst, inter96[reg]);
4771 case CHARSET_TYPE_94X94:
4772 Dynarr_add (dst, '$');
4774 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4777 Dynarr_add (dst, inter94[reg]);
4779 case CHARSET_TYPE_96X96:
4780 Dynarr_add (dst, '$');
4781 Dynarr_add (dst, inter96[reg]);
4784 Dynarr_add (dst, final);
/* Make sure register 0 is the one invoked on the left: if
   str->iso2022.register_left is not 0, write ISO_CODE_SI (shift-in)
   to DST and record register_left = 0.  Otherwise do nothing.  */
4788 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4790 if (str->iso2022.register_left != 0)
4792 Dynarr_add (dst, ISO_CODE_SI);
4793 str->iso2022.register_left = 0;
/* Make sure register 1 is the one invoked on the left: if
   str->iso2022.register_left is not 1, write ISO_CODE_SO (shift-out)
   to DST and record register_left = 1.  Otherwise do nothing.  */
4798 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4800 if (str->iso2022.register_left != 1)
4802 Dynarr_add (dst, ISO_CODE_SO);
4803 str->iso2022.register_left = 1;
/* Encode the single character CH for the ISO 2022 encoding stream STR,
   appending the resulting bytes (including any shift, designation or
   direction sequences that turn out to be needed) to DST.

   STR    supplies the coding system (str->codesys) and the running
          ISO 2022 state (str->iso2022); current_charset and
          current_half are read on entry and written back at the end.
   FLAGS  is handed through to restore_left_to_right_direction and
          ensure_correct_direction.

   Three kinds of input are distinguished in the visible code:
   characters handled together with the ASCII/newline logic, the range
   0x80..0x9f, and everything else, for which a register is chosen,
   designated and invoked before the code bytes are written.  */
4808 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4809 unsigned_char_dynarr *dst, unsigned int *flags)
4811 unsigned char charmask;
4812 Lisp_Coding_System* codesys = str->codesys;
4813 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Start from the charset and half remembered from the previous call.  */
4815 Lisp_Object charset = str->iso2022.current_charset;
4816 int half = str->iso2022.current_half;
4817 unsigned int byte1, byte2;
4821 restore_left_to_right_direction (codesys, dst, flags, 0);
4823 /* Make sure G0 contains ASCII */
4824 if ((ch > ' ' && ch < ISO_CODE_DEL)
4825 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4827 ensure_normal_shift (str, dst);
4828 iso2022_designate (Vcharset_ascii, 0, str, dst);
4831 /* If necessary, restore everything to the default state
4833 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4835 restore_left_to_right_direction (codesys, dst, flags, 0);
4837 ensure_normal_shift (str, dst);
4839 for (i = 0; i < 4; i++)
4841 Lisp_Object initial_charset =
4842 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4843 iso2022_designate (initial_charset, i, str, dst);
4848 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4849 Dynarr_add (dst, '\r');
4850 if (eol_type != EOL_CR)
4851 Dynarr_add (dst, ch);
4855 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4856 && fit_to_be_escape_quoted (ch))
4857 Dynarr_add (dst, ISO_CODE_ESC);
4858 Dynarr_add (dst, ch);
4861 else if ( (0x80 <= ch) && (ch <= 0x9f) )
4863 charmask = (half == 0 ? 0x00 : 0x80);
4865 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4866 && fit_to_be_escape_quoted (ch))
4867 Dynarr_add (dst, ISO_CODE_ESC);
4868 /* you asked for it ... */
4869 Dynarr_add (dst, ch);
4875 /* Now determine which register to use. */
/* Each of the four registers is tried in turn: first the charset
   currently held in str->iso2022.charset[i], then the coding system's
   initial charset for that register.  charset_code_point returning
   non-nil means CH can be expressed in that charset.  */
4877 for (i = 0; i < 4; i++)
4879 Lisp_Object code_point;
4881 if ((CHARSETP (charset = str->iso2022.charset[i])
4882 && !EQ (code_point = charset_code_point (charset, ch), Qnil))
4886 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
4887 && !EQ (code_point = charset_code_point (charset, ch), Qnil)))
4889 Lisp_Object ret = Fcar (code_point);
4894 ret = Fcar (Fcdr (code_point));
/* The global Vdefault_coded_charset_priority_list is temporarily
   shortened during the search below: each pass replaces it with the
   tail following the charset just tried.  The original value is saved
   here and put back afterwards.
   NOTE(review): presumably the loop stops at the first charset whose
   XCHARSET_FINAL is nonzero; the branch bodies are not visible here.  */
4911 Lisp_Object original_default_coded_charset_priority_list
4912 = Vdefault_coded_charset_priority_list;
4914 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
4916 BREAKUP_CHAR (ch, charset, byte1, byte2);
4917 if (XCHARSET_FINAL (charset))
4919 Vdefault_coded_charset_priority_list
4920 = Fcdr (Fmemq (XCHARSET_NAME (charset),
4921 Vdefault_coded_charset_priority_list));
4923 BREAKUP_CHAR (ch, charset, byte1, byte2);
4924 if (!XCHARSET_FINAL (charset))
4926 charset = Vcharset_ascii;
4930 Vdefault_coded_charset_priority_list
4931 = original_default_coded_charset_priority_list;
4933 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4934 codesys, dst, flags, 0);
/* Pick a register for a charset whose XCHARSET_GRAPHIC is nonzero:
   register 1 is considered only when the coding system is not
   seven-bit or uses locking shifts; registers 2 and 3 are the
   fallbacks.  The assignments to `reg' are on lines not visible here.  */
4938 if (XCHARSET_GRAPHIC (charset) != 0)
4940 if (!NILP (str->iso2022.charset[1]) &&
4941 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4942 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4944 else if (!NILP (str->iso2022.charset[2]))
4946 else if (!NILP (str->iso2022.charset[3]))
4955 iso2022_designate (charset, reg, str, dst);
4957 /* Now invoke that register. */
4961 ensure_normal_shift (str, dst);
4965 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4967 ensure_shift_out (str, dst);
/* Single shifts: a seven-bit coding system gets the two-byte form
   ESC 'N' / ESC 'O'.
   NOTE(review): the ISO_CODE_SS2 / ISO_CODE_SS3 lines presumably belong
   to the corresponding else branches, which are not visible here.  */
4974 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4976 Dynarr_add (dst, ISO_CODE_ESC);
4977 Dynarr_add (dst, 'N');
4982 Dynarr_add (dst, ISO_CODE_SS2);
4987 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4989 Dynarr_add (dst, ISO_CODE_ESC);
4990 Dynarr_add (dst, 'O');
4995 Dynarr_add (dst, ISO_CODE_SS3);
/* charmask sets bit 0x80 on every code byte when `half' is nonzero.
   One byte is written for one case of XCHARSET_DIMENSION and two for
   another (the case labels are not visible here).  */
5003 charmask = (half == 0 ? 0x00 : 0x80);
5005 switch (XCHARSET_DIMENSION (charset))
5008 Dynarr_add (dst, byte1 | charmask);
5011 Dynarr_add (dst, byte1 | charmask);
5012 Dynarr_add (dst, byte2 | charmask);
/* Remember the charset and half for the next character.  */
5018 str->iso2022.current_charset = charset;
5019 str->iso2022.current_half = half;
/* Emit to DST whatever is needed to leave the ISO 2022 encoding
   stream STR in its initial state: the left-to-right direction is
   restored, the normal shift state is ensured, and each of the four
   registers (i = 0..3) is designated to the coding system's initial
   charset for that register.  FLAGS is handed through to
   restore_left_to_right_direction.  */
5023 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5024 unsigned int *flags)
5026 Lisp_Coding_System* codesys = str->codesys;
5029 restore_left_to_right_direction (codesys, dst, flags, 0);
5030 ensure_normal_shift (str, dst);
5031 for (i = 0; i < 4; i++)
5033 Lisp_Object initial_charset
5034 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5035 iso2022_designate (initial_charset, i, str, dst);
5040 /************************************************************************/
5041 /* No-conversion methods */
5042 /************************************************************************/
5044 /* This is used when reading in "binary" files -- i.e. files that may
5045 contain all 256 possible byte values and that are not to be
5046 interpreted as being in any particular encoding. */
/* Decoder for the no-conversion coding system.

   DECODING  the decoding lstream; its stream data supplies the saved
             flags, pending character and EOL type.
   SRC, N    the input bytes and their count.
   DST       dynarr receiving the decoded bytes.

   A byte `c' is first passed through DECODE_HANDLE_EOL_TYPE and then
   added with DECODE_ADD_BINARY_CHAR; once the input is consumed,
   DECODE_HANDLE_END_OF_CONVERSION is run.
   NOTE(review): the loop over SRC and the code that stores the state
   back into the stream are on lines not visible here -- confirm.  */
5048 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5049 unsigned_char_dynarr *dst, unsigned int n)
5052 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5053 unsigned int flags = str->flags;
5054 unsigned int ch = str->ch;
5055 eol_type_t eol_type = str->eol_type;
5061 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5062 DECODE_ADD_BINARY_CHAR (c, dst);
/* Jump target; presumably used by the macros above to skip the rest
   of the current iteration.  */
5063 label_continue_loop:;
5066 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encoder for the no-conversion coding system.

   ENCODING  the encoding lstream; its stream data supplies the saved
             flags, pending character, coding system (for the EOL
             type) and iso2022.current_char_boundary.
   SRC, N    the internal-format input bytes and their count.
   DST       dynarr receiving the output bytes.

   Two alternative bodies are visible, separated by the #else marked
   `not UTF2000': the first tracks char_boundary, the second works on
   leading bytes.  In both, a newline is written according to the EOL
   type: '\r' is emitted first unless the type is EOL_LF or
   EOL_AUTODETECT, and the line feed itself unless the type is EOL_CR.  */
5073 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5074 unsigned_char_dynarr *dst, unsigned int n)
5077 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5078 unsigned int flags = str->flags;
5079 unsigned int ch = str->ch;
5080 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5082 unsigned char char_boundary = str->iso2022.current_char_boundary;
/* First variant.  NOTE(review): char_boundary is presumably the number
   of continuation bytes still expected for the current character; 0
   means `c' starts a new character and is classified by the
   thresholds 0xf8 / 0xf0 / 0xe0 / 0xc0 (branch bodies not visible).  */
5089 if (char_boundary == 0)
5095 else if ( c >= 0xf8 )
5100 else if ( c >= 0xf0 )
5105 else if ( c >= 0xe0 )
5110 else if ( c >= 0xc0 )
5120 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5121 Dynarr_add (dst, '\r');
5122 if (eol_type != EOL_CR)
5123 Dynarr_add (dst, c);
5126 Dynarr_add (dst, c);
/* Last expected byte: fold its low six bits into ch and output only
   the low eight bits of the result.  */
5129 else if (char_boundary == 1)
5131 ch = ( ch << 6 ) | ( c & 0x3f );
5132 Dynarr_add (dst, ch & 0xff);
5137 ch = ( ch << 6 ) | ( c & 0x3f );
5140 #else /* not UTF2000 */
5143 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5144 Dynarr_add (dst, '\r');
5145 if (eol_type != EOL_CR)
5146 Dynarr_add (dst, '\n');
5149 else if (BYTE_ASCII_P (c))
5152 Dynarr_add (dst, c);
/* Leading bytes: Latin-1 and Control-1 are tested for here and their
   following byte is output below (as is for Latin-1, minus 0x20 for
   Control-1).
   NOTE(review): the '~' placeholder is presumably emitted only for the
   other leading bytes; the else line is not visible here.  */
5154 else if (BUFBYTE_LEADING_BYTE_P (c))
5157 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5158 c == LEADING_BYTE_CONTROL_1)
5161 Dynarr_add (dst, '~'); /* untranslatable character */
5165 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5166 Dynarr_add (dst, c);
5167 else if (ch == LEADING_BYTE_CONTROL_1)
5170 Dynarr_add (dst, c - 0x20);
5172 /* else it should be the second or third byte of an
5173 untranslatable character, so ignore it */
5176 #endif /* not UTF2000 */
/* Save the boundary state for the next call.  */
5182 str->iso2022.current_char_boundary = char_boundary;
5187 /************************************************************************/
5188 /* Simple internal/external functions */
5189 /************************************************************************/
/* Result buffers for convert_to_external_format and
   convert_from_external_format respectively.  Each is created on
   first use and reset at the start of every call, so a pointer
   returned by one call is only good until the next call.  */
5191 static Extbyte_dynarr *conversion_out_dynarr;
5192 static Bufbyte_dynarr *conversion_in_dynarr;
5194 /* Determine coding system from coding format */
5196 /* #### not correct for all values of `fmt'! */
/* Map the external data format FMT to a coding system object.

   For FORMAT_FILENAME and FORMAT_TERMINAL, Vfile_name_coding_system
   decides: Fget_coding_system of it is returned, except when it is
   nil or `binary'.  Another visible path returns the `ctext' coding
   system.
   NOTE(review): the result for the nil/`binary' case and the labels
   selecting the `ctext' path are on lines not visible here; callers
   in this file treat a nil result as "plain binary conversion".  */
5198 external_data_format_to_coding_system (enum external_data_format fmt)
5202 case FORMAT_FILENAME:
5203 case FORMAT_TERMINAL:
5204 if (EQ (Vfile_name_coding_system, Qnil) ||
5205 EQ (Vfile_name_coding_system, Qbinary))
5208 return Fget_coding_system (Vfile_name_coding_system);
5211 return Fget_coding_system (Qctext);
/* Convert LEN bytes of internal-format text at PTR to the external
   format FMT.

   The result is built in the static conversion_out_dynarr, which is
   created on first use and reset on every call; the returned pointer
   (element 0 of that dynarr) is therefore only valid until the next
   call.  *LEN_OUT receives the number of result bytes; a terminating
   0 byte is appended after that count is taken, so it is not included.

   When external_data_format_to_coding_system yields nil, the bytes
   are converted directly, one character at a time.  Otherwise the
   input is pumped in chunks of sizeof (tempbuf) bytes from a
   fixed-buffer input stream through an encoding output stream that
   writes into the dynarr.  */
5219 convert_to_external_format (CONST Bufbyte *ptr,
5222 enum external_data_format fmt)
5224 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5226 if (!conversion_out_dynarr)
5227 conversion_out_dynarr = Dynarr_new (Extbyte);
5229 Dynarr_reset (conversion_out_dynarr);
5231 if (NILP (coding_system))
5233 CONST Bufbyte *end = ptr + len;
5239 (*ptr < 0xc0) ? *ptr :
5240 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5243 (BYTE_ASCII_P (*ptr)) ? *ptr :
5244 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5245 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5248 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5252 #ifdef ERROR_CHECK_BUFPOS
5253 assert (ptr == end);
5258 Lisp_Object instream, outstream, da_outstream;
5259 Lstream *istr, *ostr;
5260 struct gcpro gcpro1, gcpro2, gcpro3;
5261 char tempbuf[1024]; /* some random amount */
5263 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5264 da_outstream = make_dynarr_output_stream
5265 ((unsigned_char_dynarr *) conversion_out_dynarr);
5267 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5268 istr = XLSTREAM (instream);
5269 ostr = XLSTREAM (outstream);
5270 GCPRO3 (instream, outstream, da_outstream);
/* Same type as the identical read in convert_from_external_format.  */
5273 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5276 Lstream_write (ostr, tempbuf, size_in_bytes);
5278 Lstream_close (istr);
5279 Lstream_close (ostr);
5281 Lstream_delete (istr);
5282 Lstream_delete (ostr);
5283 Lstream_delete (XLSTREAM (da_outstream));
5286 *len_out = Dynarr_length (conversion_out_dynarr);
5287 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5288 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert LEN bytes of external data at PTR, in format FMT, to
   internal format.

   The result is built in the static conversion_in_dynarr, which is
   created on first use and reset on every call; the returned pointer
   (element 0 of that dynarr) is therefore only valid until the next
   call.  *LEN_OUT receives the number of result bytes; a terminating
   0 byte is appended after that count is taken, so it is not included.

   When external_data_format_to_coding_system yields nil, every input
   byte is added with DECODE_ADD_BINARY_CHAR.  Otherwise the input is
   pumped in chunks of sizeof (tempbuf) bytes from a fixed-buffer
   input stream through a decoding output stream that writes into the
   dynarr.  */
5292 convert_from_external_format (CONST Extbyte *ptr,
5295 enum external_data_format fmt)
5297 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5299 if (!conversion_in_dynarr)
5300 conversion_in_dynarr = Dynarr_new (Bufbyte);
5302 Dynarr_reset (conversion_in_dynarr);
5304 if (NILP (coding_system))
5306 CONST Extbyte *end = ptr + len;
5307 for (; ptr < end; ptr++)
5310 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5315 Lisp_Object instream, outstream, da_outstream;
5316 Lstream *istr, *ostr;
5317 struct gcpro gcpro1, gcpro2, gcpro3;
5318 char tempbuf[1024]; /* some random amount */
5320 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5321 da_outstream = make_dynarr_output_stream
5322 ((unsigned_char_dynarr *) conversion_in_dynarr);
5324 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5325 istr = XLSTREAM (instream);
5326 ostr = XLSTREAM (outstream);
/* All three stream objects are GC-protected while data is pumped.  */
5327 GCPRO3 (instream, outstream, da_outstream);
5330 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5333 Lstream_write (ostr, tempbuf, size_in_bytes);
/* Close both ends, then release all three stream objects.  */
5335 Lstream_close (istr);
5336 Lstream_close (ostr);
5338 Lstream_delete (istr);
5339 Lstream_delete (ostr);
5340 Lstream_delete (XLSTREAM (da_outstream));
5343 *len_out = Dynarr_length (conversion_in_dynarr);
5344 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5345 return Dynarr_atp (conversion_in_dynarr, 0);
5349 /************************************************************************/
5350 /* Initialization */
5351 /************************************************************************/
/* Register everything this file exports by name: the
   `coding-system-error' error (a child of Qio_error), the Lisp
   primitives (DEFSUBR), the symbols used as coding-system types and
   property names, and one symbol per coding category in
   coding_category_symbol[].  */
5354 syms_of_file_coding (void)
5356 deferror (&Qcoding_system_error, "coding-system-error",
5357 "Coding-system error", Qio_error);
5359 DEFSUBR (Fcoding_system_p);
5360 DEFSUBR (Ffind_coding_system);
5361 DEFSUBR (Fget_coding_system);
5362 DEFSUBR (Fcoding_system_list);
5363 DEFSUBR (Fcoding_system_name);
5364 DEFSUBR (Fmake_coding_system);
5365 DEFSUBR (Fcopy_coding_system);
5366 DEFSUBR (Fdefine_coding_system_alias);
5367 DEFSUBR (Fsubsidiary_coding_system);
5369 DEFSUBR (Fcoding_system_type);
5370 DEFSUBR (Fcoding_system_doc_string);
5372 DEFSUBR (Fcoding_system_charset);
5374 DEFSUBR (Fcoding_system_property);
5376 DEFSUBR (Fcoding_category_list);
5377 DEFSUBR (Fset_coding_priority_list);
5378 DEFSUBR (Fcoding_priority_list);
5379 DEFSUBR (Fset_coding_category_system);
5380 DEFSUBR (Fcoding_category_system);
5382 DEFSUBR (Fdetect_coding_region);
5383 DEFSUBR (Fdecode_coding_region);
5384 DEFSUBR (Fencode_coding_region);
5386 DEFSUBR (Fdecode_shift_jis_char);
5387 DEFSUBR (Fencode_shift_jis_char);
5388 DEFSUBR (Fdecode_big5_char);
5389 DEFSUBR (Fencode_big5_char);
/* Coding-system type names.  */
5391 defsymbol (&Qcoding_systemp, "coding-system-p");
5392 defsymbol (&Qno_conversion, "no-conversion");
5393 defsymbol (&Qraw_text, "raw-text");
5395 defsymbol (&Qbig5, "big5");
5396 defsymbol (&Qshift_jis, "shift-jis");
5397 defsymbol (&Qucs4, "ucs-4");
5398 defsymbol (&Qutf8, "utf-8");
5399 defsymbol (&Qccl, "ccl");
5400 defsymbol (&Qiso2022, "iso2022");
/* Property names; the same symbols are entered into the property
   table in complex_vars_of_file_coding.  */
5402 defsymbol (&Qmnemonic, "mnemonic");
5403 defsymbol (&Qeol_type, "eol-type");
5404 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5405 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5407 defsymbol (&Qcr, "cr");
5408 defsymbol (&Qlf, "lf");
5409 defsymbol (&Qcrlf, "crlf");
5410 defsymbol (&Qeol_cr, "eol-cr");
5411 defsymbol (&Qeol_lf, "eol-lf");
5412 defsymbol (&Qeol_crlf, "eol-crlf");
5414 defsymbol (&Qcharset_g0, "charset-g0");
5415 defsymbol (&Qcharset_g1, "charset-g1");
5416 defsymbol (&Qcharset_g2, "charset-g2");
5417 defsymbol (&Qcharset_g3, "charset-g3");
5418 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5419 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5420 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5421 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5422 defsymbol (&Qno_iso6429, "no-iso6429");
5423 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5424 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5426 defsymbol (&Qshort, "short");
5427 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5428 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5429 defsymbol (&Qseven, "seven");
5430 defsymbol (&Qlock_shift, "lock-shift");
5431 defsymbol (&Qescape_quoted, "escape-quoted");
5433 defsymbol (&Qencode, "encode");
5434 defsymbol (&Qdecode, "decode");
5437 defsymbol (&Qctext, "ctext");
/* One symbol per coding category.  The symbol-name argument of each
   call below is on a line not visible here.  */
5438 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5440 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5442 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5444 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5446 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5448 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5450 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5452 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5454 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5457 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare the methods provided by the two lstream types of this
   file.  Both `decoding' and `encoding' provide the same seven:
   reader, writer, rewinder, seekable_p, flusher, closer and marker.  */
5462 lstream_type_create_file_coding (void)
5464 LSTREAM_HAS_METHOD (decoding, reader);
5465 LSTREAM_HAS_METHOD (decoding, writer);
5466 LSTREAM_HAS_METHOD (decoding, rewinder);
5467 LSTREAM_HAS_METHOD (decoding, seekable_p);
5468 LSTREAM_HAS_METHOD (decoding, flusher);
5469 LSTREAM_HAS_METHOD (decoding, closer);
5470 LSTREAM_HAS_METHOD (decoding, marker);
5472 LSTREAM_HAS_METHOD (encoding, reader);
5473 LSTREAM_HAS_METHOD (encoding, writer);
5474 LSTREAM_HAS_METHOD (encoding, rewinder);
5475 LSTREAM_HAS_METHOD (encoding, seekable_p);
5476 LSTREAM_HAS_METHOD (encoding, flusher);
5477 LSTREAM_HAS_METHOD (encoding, closer);
5478 LSTREAM_HAS_METHOD (encoding, marker);
/* Allocate the dumped state `fcd' and register it with dumpstruct,
   give every coding category a nil coding system and the identity
   priority order, provide the `file-coding' feature, and define the
   Lisp variables of this file.  The coding-system variables all start
   out as nil; enable-multibyte-characters starts out as 1.  */
5482 vars_of_file_coding (void)
5486 fcd = xnew (struct file_coding_dump);
5487 dumpstruct (&fcd, &fcd_description);
5489 /* Initialize to something reasonable ... */
5490 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5492 fcd->coding_category_system[i] = Qnil;
5493 fcd->coding_category_by_priority[i] = i;
5496 Fprovide (intern ("file-coding"));
5498 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5499 Coding system used for TTY keyboard input.
5500 Not used under a windowing system.
5502 Vkeyboard_coding_system = Qnil;
5504 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5505 Coding system used for TTY display output.
5506 Not used under a windowing system.
5508 Vterminal_coding_system = Qnil;
5510 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5511 Overriding coding system used when reading a file or process.
5512 You should *bind* this, not set it. If this is non-nil, it specifies
5513 the coding system that will be used when a file or process is read
5514 in, and overrides `buffer-file-coding-system-for-read',
5515 `insert-file-contents-pre-hook', etc. Use those variables instead of
5516 this one for permanent changes to the environment.
5518 Vcoding_system_for_read = Qnil;
5520 DEFVAR_LISP ("coding-system-for-write",
5521 &Vcoding_system_for_write /*
5522 Overriding coding system used when writing a file or process.
5523 You should *bind* this, not set it. If this is non-nil, it specifies
5524 the coding system that will be used when a file or process is written
5525 to, and overrides `buffer-file-coding-system',
5526 `write-region-pre-hook', etc. Use those variables instead of this one
5527 for permanent changes to the environment.
5529 Vcoding_system_for_write = Qnil;
5531 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5532 Coding system used to convert pathnames when accessing files.
5534 Vfile_name_coding_system = Qnil;
5536 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5537 Non-nil means the buffer contents are regarded as multi-byte form
5538 of characters, not a binary code. This affects the display, file I/O,
5539 and behaviors of various editing commands.
5541 Setting this to nil does not do anything.
5543 enable_multibyte_characters = 1;
/* Set up the Lisp-object state of this file (visible portion; the
   end of the function may lie beyond the lines shown here):

   - Vcoding_system_hash_table is GC-protected and created as a
     non-weak EQ hash table with an initial size of 50.
   - the_codesys_prop_dynarr is created, registered with dumpstruct,
     and filled through DEFINE_CODESYS_PROP with the properties valid
     for all coding systems, for ISO 2022 ones and for CCL ones.
   - The coding systems described as "Raw text", "Binary" and "UTF8"
     are defined (the heads of those calls are not visible here;
     presumably Fmake_coding_system), and Fdefine_coding_system_alias
     is called with (Qno_conversion, Qraw_text).
   - The NO_CONVERSION and UTF8 coding categories are pointed at the
     raw-text and utf-8 coding systems.  */
5547 complex_vars_of_file_coding (void)
5549 staticpro (&Vcoding_system_hash_table);
5550 Vcoding_system_hash_table =
5551 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5553 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5554 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5556 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5558 struct codesys_prop csp; \
5560 csp.prop_type = (Prop_Type); \
5561 Dynarr_add (the_codesys_prop_dynarr, csp); \
5564 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5565 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5566 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5567 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5568 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5569 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5570 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5572 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5573 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5574 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5575 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5576 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5577 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5578 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5579 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5580 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5581 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5582 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5583 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5584 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5585 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5586 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5587 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5588 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5590 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5591 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5593 /* Need to create this here or we're really screwed. */
5595 (Qraw_text, Qno_conversion,
5596 build_string ("Raw text, which means it converts only line-break-codes."),
5597 list2 (Qmnemonic, build_string ("Raw")));
5600 (Qbinary, Qno_conversion,
5601 build_string ("Binary, which means it does not convert anything."),
5602 list4 (Qeol_type, Qlf,
5603 Qmnemonic, build_string ("Binary")));
5608 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5609 list2 (Qmnemonic, build_string ("UTF8")));
5612 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5614 /* Need this for bootstrapping */
5615 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5616 Fget_coding_system (Qraw_text);
5619 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5620 = Fget_coding_system (Qutf8);