1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
69 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), 65536 },
74 static const struct struct_description fcd_description = {
75 sizeof (struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis (struct detection_state *st,
179 const unsigned char *src,
181 static void decode_coding_sjis (Lstream *decoding,
182 const unsigned char *src,
183 unsigned_char_dynarr *dst,
185 static void encode_coding_sjis (Lstream *encoding,
186 const unsigned char *src,
187 unsigned_char_dynarr *dst,
189 static int detect_coding_big5 (struct detection_state *st,
190 const unsigned char *src,
192 static void decode_coding_big5 (Lstream *decoding,
193 const unsigned char *src,
194 unsigned_char_dynarr *dst, unsigned int n);
195 static void encode_coding_big5 (Lstream *encoding,
196 const unsigned char *src,
197 unsigned_char_dynarr *dst, unsigned int n);
198 static int detect_coding_ucs4 (struct detection_state *st,
199 const unsigned char *src,
201 static void decode_coding_ucs4 (Lstream *decoding,
202 const unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
204 static void encode_coding_ucs4 (Lstream *encoding,
205 const unsigned char *src,
206 unsigned_char_dynarr *dst, unsigned int n);
207 static int detect_coding_utf8 (struct detection_state *st,
208 const unsigned char *src,
210 static void decode_coding_utf8 (Lstream *decoding,
211 const unsigned char *src,
212 unsigned_char_dynarr *dst, unsigned int n);
213 static void encode_coding_utf8 (Lstream *encoding,
214 const unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
216 static int postprocess_iso2022_mask (int mask);
217 static void reset_iso2022 (Lisp_Object coding_system,
218 struct iso2022_decoder *iso);
219 static int detect_coding_iso2022 (struct detection_state *st,
220 const unsigned char *src,
222 static void decode_coding_iso2022 (Lstream *decoding,
223 const unsigned char *src,
224 unsigned_char_dynarr *dst, unsigned int n);
225 static void encode_coding_iso2022 (Lstream *encoding,
226 const unsigned char *src,
227 unsigned_char_dynarr *dst, unsigned int n);
229 static void decode_coding_no_conversion (Lstream *decoding,
230 const unsigned char *src,
231 unsigned_char_dynarr *dst,
233 static void encode_coding_no_conversion (Lstream *encoding,
234 const unsigned char *src,
235 unsigned_char_dynarr *dst,
237 static void mule_decode (Lstream *decoding, const unsigned char *src,
238 unsigned_char_dynarr *dst, unsigned int n);
239 static void mule_encode (Lstream *encoding, const unsigned char *src,
240 unsigned_char_dynarr *dst, unsigned int n);
242 typedef struct codesys_prop codesys_prop;
251 Dynarr_declare (codesys_prop);
252 } codesys_prop_dynarr;
254 static const struct lrecord_description codesys_prop_description_1[] = {
255 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
259 static const struct struct_description codesys_prop_description = {
260 sizeof (codesys_prop),
261 codesys_prop_description_1
264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
265 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
269 static const struct struct_description codesys_prop_dynarr_description = {
270 sizeof (codesys_prop_dynarr),
271 codesys_prop_dynarr_description_1
274 codesys_prop_dynarr *the_codesys_prop_dynarr;
276 enum codesys_prop_enum
279 CODESYS_PROP_ISO2022,
284 /************************************************************************/
285 /* Coding system functions */
286 /************************************************************************/
288 static Lisp_Object mark_coding_system (Lisp_Object);
289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
290 static void finalize_coding_system (void *header, int for_disksave);
293 static const struct lrecord_description ccs_description_1[] = {
294 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
295 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
299 static const struct struct_description ccs_description = {
300 sizeof (charset_conversion_spec),
304 static const struct lrecord_description ccsd_description_1[] = {
305 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
309 static const struct struct_description ccsd_description = {
310 sizeof (charset_conversion_spec_dynarr),
315 static const struct lrecord_description coding_system_description[] = {
316 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
317 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
318 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
319 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
320 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
321 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
325 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
326 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
327 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
335 mark_coding_system, print_coding_system,
336 finalize_coding_system,
337 0, 0, coding_system_description,
341 mark_coding_system (Lisp_Object obj)
343 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
345 mark_object (CODING_SYSTEM_NAME (codesys));
346 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
347 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
348 mark_object (CODING_SYSTEM_EOL_LF (codesys));
349 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
350 mark_object (CODING_SYSTEM_EOL_CR (codesys));
352 switch (CODING_SYSTEM_TYPE (codesys))
356 case CODESYS_ISO2022:
357 for (i = 0; i < 4; i++)
358 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
359 if (codesys->iso2022.input_conv)
361 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
363 struct charset_conversion_spec *ccs =
364 Dynarr_atp (codesys->iso2022.input_conv, i);
365 mark_object (ccs->from_charset);
366 mark_object (ccs->to_charset);
369 if (codesys->iso2022.output_conv)
371 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
373 struct charset_conversion_spec *ccs =
374 Dynarr_atp (codesys->iso2022.output_conv, i);
375 mark_object (ccs->from_charset);
376 mark_object (ccs->to_charset);
382 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
383 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
390 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
398 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
400 error ("printing unreadable object #<coding_system 0x%x>",
403 write_c_string ("#<coding_system ", printcharfun);
404 print_internal (c->name, printcharfun, 1);
405 write_c_string (">", printcharfun);
409 finalize_coding_system (void *header, int for_disksave)
411 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
412 /* Since coding systems never go away, this function is not
413 necessary. But it would be necessary if we changed things
414 so that coding systems could go away. */
415 if (!for_disksave) /* see comment in lstream.c */
417 switch (CODING_SYSTEM_TYPE (c))
420 case CODESYS_ISO2022:
421 if (c->iso2022.input_conv)
423 Dynarr_free (c->iso2022.input_conv);
424 c->iso2022.input_conv = 0;
426 if (c->iso2022.output_conv)
428 Dynarr_free (c->iso2022.output_conv);
429 c->iso2022.output_conv = 0;
440 symbol_to_eol_type (Lisp_Object symbol)
442 CHECK_SYMBOL (symbol);
443 if (NILP (symbol)) return EOL_AUTODETECT;
444 if (EQ (symbol, Qlf)) return EOL_LF;
445 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
446 if (EQ (symbol, Qcr)) return EOL_CR;
448 signal_simple_error ("Unrecognized eol type", symbol);
449 return EOL_AUTODETECT; /* not reached */
453 eol_type_to_symbol (eol_type_t type)
458 case EOL_LF: return Qlf;
459 case EOL_CRLF: return Qcrlf;
460 case EOL_CR: return Qcr;
461 case EOL_AUTODETECT: return Qnil;
466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
468 Lisp_Object codesys_obj;
469 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
470 char *codesys_name = (char *) alloca (len + 7);
472 char *codesys_mnemonic=0;
474 Lisp_Object codesys_name_sym, sub_codesys_obj;
478 XSETCODING_SYSTEM (codesys_obj, codesys);
480 memcpy (codesys_name,
481 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
483 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
485 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
486 codesys_mnemonic = (char *) alloca (mlen + 7);
487 memcpy (codesys_mnemonic,
488 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
492 strcpy (codesys_name + len, "-" op_sys); \
494 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
495 codesys_name_sym = intern (codesys_name); \
496 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
497 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
499 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
500 build_string (codesys_mnemonic); \
501 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
504 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
505 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
506 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
510 Return t if OBJECT is a coding system.
511 A coding system is an object that defines how text containing multiple
512 character sets is encoded into a stream of (typically 8-bit) bytes.
513 The coding system is used to decode the stream into a series of
514 characters (which may be from multiple charsets) when the text is read
515 from a file or process, and is used to encode the text back into the
516 same format when it is written out to a file or process.
518 For example, many ISO2022-compliant coding systems (such as Compound
519 Text, which is used for inter-client data under the X Window System)
520 use escape sequences to switch between different charsets -- Japanese
521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
523 `make-coding-system' for more information.
525 Coding systems are normally identified using a symbol, and the
526 symbol is accepted in place of the actual coding system object whenever
527 a coding system is called for. (This is similar to how faces work.)
531 return CODING_SYSTEMP (object) ? Qt : Qnil;
534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
535 Retrieve the coding system of the given name.
537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
538 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
539 If there is no such coding system, nil is returned. Otherwise the
540 associated coding system object is returned.
542 (coding_system_or_name))
544 if (NILP (coding_system_or_name))
545 coding_system_or_name = Qbinary;
546 else if (CODING_SYSTEMP (coding_system_or_name))
547 return coding_system_or_name;
549 CHECK_SYMBOL (coding_system_or_name);
553 coding_system_or_name =
554 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
556 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
557 return coding_system_or_name;
561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
562 Retrieve the coding system of the given name.
563 Same as `find-coding-system' except that if there is no such
564 coding system, an error is signaled instead of returning nil.
568 Lisp_Object coding_system = Ffind_coding_system (name);
570 if (NILP (coding_system))
571 signal_simple_error ("No such coding system", name);
572 return coding_system;
575 /* We store the coding systems in hash tables with the names as the key and the
576 actual coding system object as the value. Occasionally we need to use them
577 in a list format. These routines provide us with that. */
578 struct coding_system_list_closure
580 Lisp_Object *coding_system_list;
584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
585 void *coding_system_list_closure)
587 /* This function can GC */
588 struct coding_system_list_closure *cscl =
589 (struct coding_system_list_closure *) coding_system_list_closure;
590 Lisp_Object *coding_system_list = cscl->coding_system_list;
592 *coding_system_list = Fcons (key, *coding_system_list);
596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
597 Return a list of the names of all defined coding systems.
601 Lisp_Object coding_system_list = Qnil;
603 struct coding_system_list_closure coding_system_list_closure;
605 GCPRO1 (coding_system_list);
606 coding_system_list_closure.coding_system_list = &coding_system_list;
607 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
608 &coding_system_list_closure);
611 return coding_system_list;
614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
615 Return the name of the given coding system.
619 coding_system = Fget_coding_system (coding_system);
620 return XCODING_SYSTEM_NAME (coding_system);
623 static Lisp_Coding_System *
624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
626 Lisp_Coding_System *codesys =
627 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
629 zero_lcrecord (codesys);
630 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
631 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
632 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
633 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
634 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
635 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
636 CODING_SYSTEM_TYPE (codesys) = type;
637 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
639 if (type == CODESYS_ISO2022)
642 for (i = 0; i < 4; i++)
643 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
645 else if (type == CODESYS_CCL)
647 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
648 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
651 CODING_SYSTEM_NAME (codesys) = name;
657 /* Given a list of charset conversion specs as specified in a Lisp
658 program, parse it into STORE_HERE. */
661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
662 Lisp_Object spec_list)
666 EXTERNAL_LIST_LOOP (rest, spec_list)
668 Lisp_Object car = XCAR (rest);
669 Lisp_Object from, to;
670 struct charset_conversion_spec spec;
672 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
673 signal_simple_error ("Invalid charset conversion spec", car);
674 from = Fget_charset (XCAR (car));
675 to = Fget_charset (XCAR (XCDR (car)));
676 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
677 signal_simple_error_2
678 ("Attempted conversion between different charset types",
680 spec.from_charset = from;
681 spec.to_charset = to;
683 Dynarr_add (store_here, spec);
687 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
688 specs, return the equivalent as the Lisp programmer would see it.
690 If LOAD_HERE is 0, return Qnil. */
693 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
700 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
702 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
703 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
706 return Fnreverse (result);
711 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
712 Register symbol NAME as a coding system.
714 TYPE describes the conversion method used and should be one of
717 Automatic conversion. XEmacs attempts to detect the coding system
720 No conversion. Use this for binary files and such. On output,
721 graphic characters that are not in ASCII or Latin-1 will be
722 replaced by a ?. (For a no-conversion-encoded buffer, these
723 characters will only be present if you explicitly insert them.)
725 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
727 ISO 10646 UCS-4 encoding.
729 ISO 10646 UTF-8 encoding.
731 Any ISO2022-compliant encoding. Among other things, this includes
732 JIS (the Japanese encoding commonly used for e-mail), EUC (the
733 standard Unix encoding for Japanese and other languages), and
734 Compound Text (the encoding used in X11). You can specify more
735 specific information about the conversion with the FLAGS argument.
737 Big5 (the encoding commonly used for Taiwanese).
739 The conversion is performed using a user-written pseudo-code
740 program. CCL (Code Conversion Language) is the name of this
743 Write out or read in the raw contents of the memory representing
744 the buffer's text. This is primarily useful for debugging
745 purposes, and is only enabled when XEmacs has been compiled with
746 DEBUG_XEMACS defined (via the --debug configure option).
747 WARNING: Reading in a file using 'internal conversion can result
748 in an internal inconsistency in the memory representing a
749 buffer's text, which will produce unpredictable results and may
750 cause XEmacs to crash. Under normal circumstances you should
751 never use 'internal conversion.
753 DOC-STRING is a string describing the coding system.
755 PROPS is a property list, describing the specific nature of the
756 character set. Recognized properties are:
759 String to be displayed in the modeline when this coding system is
763 End-of-line conversion to be used. It should be one of
766 Automatically detect the end-of-line type (LF, CRLF,
767 or CR). Also generate subsidiary coding systems named
768 `NAME-unix', `NAME-dos', and `NAME-mac', that are
769 identical to this coding system but have an EOL-TYPE
770 value of 'lf, 'crlf, and 'cr, respectively.
772 The end of a line is marked externally using ASCII LF.
773 Since this is also the way that XEmacs represents an
774 end-of-line internally, specifying this option results
775 in no end-of-line conversion. This is the standard
776 format for Unix text files.
778 The end of a line is marked externally using ASCII
779 CRLF. This is the standard format for MS-DOS text
782 The end of a line is marked externally using ASCII CR.
783 This is the standard format for Macintosh text files.
785 Automatically detect the end-of-line type but do not
786 generate subsidiary coding systems. (This value is
787 converted to nil when stored internally, and
788 `coding-system-property' will return nil.)
790 'post-read-conversion
791 Function called after a file has been read in, to perform the
792 decoding. Called with two arguments, BEG and END, denoting
793 a region of the current buffer to be decoded.
795 'pre-write-conversion
796 Function called before a file is written out, to perform the
797 encoding. Called with two arguments, BEG and END, denoting
798 a region of the current buffer to be encoded.
801 The following additional properties are recognized if TYPE is 'iso2022:
807 The character set initially designated to the G0 - G3 registers.
808 The value should be one of
810 -- A charset object (designate that character set)
811 -- nil (do not ever use this register)
812 -- t (no character set is initially designated to
813 the register, but may be later on; this automatically
814 sets the corresponding `force-g*-on-output' property)
820 If non-nil, send an explicit designation sequence on output before
821 using the specified register.
824 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
825 "ESC $ B" on output in place of the full designation sequences
826 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
829 If non-nil, don't designate ASCII to G0 at each end of line on output.
830 Setting this to non-nil also suppresses other state-resetting that
831 normally happens at the end of a line.
834 If non-nil, don't designate ASCII to G0 before control chars on output.
837 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
841 If non-nil, use locking-shift (SO/SI) instead of single-shift
842 or designation by escape sequence.
845 If non-nil, don't use ISO6429's direction specification.
848 If non-nil, literal control characters that are the same as
849 the beginning of a recognized ISO2022 or ISO6429 escape sequence
850 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
851 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
852 so that they can be properly distinguished from an escape sequence.
853 (Note that doing this results in a non-portable encoding.) This
854 encoding flag is used for byte-compiled files. Note that ESC
855 is a good choice for a quoting character because there are no
856 escape sequences whose second byte is a character from the Control-0
857 or Control-1 character sets; this is explicitly disallowed by the
860 'input-charset-conversion
861 A list of conversion specifications, specifying conversion of
862 characters in one charset to another when decoding is performed.
863 Each specification is a list of two elements: the source charset,
864 and the destination charset.
866 'output-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when encoding is performed.
869 The form of each specification is the same as for
870 'input-charset-conversion.
873 The following additional properties are recognized (and required)
877 CCL program used for decoding (converting to internal format).
880 CCL program used for encoding (converting to external format).
882 (name, type, doc_string, props))
884 Lisp_Coding_System *codesys;
885 Lisp_Object rest, key, value;
886 enum coding_system_type ty;
887 int need_to_setup_eol_systems = 1;
889 /* Convert type to constant */
890 if (NILP (type) || EQ (type, Qundecided))
891 { ty = CODESYS_AUTODETECT; }
893 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
894 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
895 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
896 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
897 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
898 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
900 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
902 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
905 signal_simple_error ("Invalid coding system type", type);
909 codesys = allocate_coding_system (ty, name);
911 if (NILP (doc_string))
912 doc_string = build_string ("");
914 CHECK_STRING (doc_string);
915 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
917 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
919 if (EQ (key, Qmnemonic))
922 CHECK_STRING (value);
923 CODING_SYSTEM_MNEMONIC (codesys) = value;
926 else if (EQ (key, Qeol_type))
928 need_to_setup_eol_systems = NILP (value);
931 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
934 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
935 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
937 else if (ty == CODESYS_ISO2022)
939 #define FROB_INITIAL_CHARSET(charset_num) \
940 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
941 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
943 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
944 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
945 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
946 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
948 #define FROB_FORCE_CHARSET(charset_num) \
949 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
951 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
952 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
953 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
954 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
956 #define FROB_BOOLEAN_PROPERTY(prop) \
957 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
959 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
960 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
961 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
962 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
963 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
964 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
965 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
967 else if (EQ (key, Qinput_charset_conversion))
969 codesys->iso2022.input_conv =
970 Dynarr_new (charset_conversion_spec);
971 parse_charset_conversion_specs (codesys->iso2022.input_conv,
974 else if (EQ (key, Qoutput_charset_conversion))
976 codesys->iso2022.output_conv =
977 Dynarr_new (charset_conversion_spec);
978 parse_charset_conversion_specs (codesys->iso2022.output_conv,
982 signal_simple_error ("Unrecognized property", key);
984 else if (EQ (type, Qccl))
986 if (EQ (key, Qdecode))
988 CHECK_VECTOR (value);
989 CODING_SYSTEM_CCL_DECODE (codesys) = value;
991 else if (EQ (key, Qencode))
993 CHECK_VECTOR (value);
994 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
997 signal_simple_error ("Unrecognized property", key);
1001 signal_simple_error ("Unrecognized property", key);
1004 if (need_to_setup_eol_systems)
1005 setup_eol_coding_systems (codesys);
1008 Lisp_Object codesys_obj;
1009 XSETCODING_SYSTEM (codesys_obj, codesys);
1010 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1015 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1016 Copy OLD-CODING-SYSTEM to NEW-NAME.
1017 If NEW-NAME does not name an existing coding system, a new one will
1020 (old_coding_system, new_name))
1022 Lisp_Object new_coding_system;
1023 old_coding_system = Fget_coding_system (old_coding_system);
1024 new_coding_system = Ffind_coding_system (new_name);
1025 if (NILP (new_coding_system))
1027 XSETCODING_SYSTEM (new_coding_system,
1028 allocate_coding_system
1029 (XCODING_SYSTEM_TYPE (old_coding_system),
1031 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1035 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1036 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1037 memcpy (((char *) to ) + sizeof (to->header),
1038 ((char *) from) + sizeof (from->header),
1039 sizeof (*from) - sizeof (from->header));
1040 to->name = new_name;
1042 return new_coding_system;
1045 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1046 Return t if OBJECT names a coding system, and is not a coding system alias.
1050 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1054 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1055 Return t if OBJECT is a coding system alias.
1056 All coding system aliases are created by `define-coding-system-alias'.
1060 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1064 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1065 Return the coding-system symbol for which symbol ALIAS is an alias.
1069 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1070 if (SYMBOLP (aliasee))
1073 signal_simple_error ("Symbol is not a coding system alias", alias);
1077 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1079 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1083 /* A maphash function, for removing dangling coding system aliases. */
1085 dangling_coding_system_alias_p (Lisp_Object alias,
1086 Lisp_Object aliasee,
1087 void *dangling_aliases)
1089 if (SYMBOLP (aliasee)
1090 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1092 (*(int *) dangling_aliases)++;
1099 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1100 Define symbol ALIAS as an alias for coding system ALIASEE.
1102 You can use this function to redefine an alias that has already been defined,
1103 but you cannot redefine a name which is the canonical name for a coding system.
1104 \(a canonical name of a coding system is what is returned when you call
1105 `coding-system-name' on a coding system).
1107 ALIASEE itself can be an alias, which allows you to define nested aliases.
1109 You are forbidden, however, from creating alias loops or `dangling' aliases.
1110 These will be detected, and an error will be signaled if you attempt to do so.
1112 If ALIASEE is nil, then ALIAS will simply be undefined.
1114 See also `coding-system-alias-p', `coding-system-aliasee',
1115 and `coding-system-canonical-name-p'.
1119 Lisp_Object real_coding_system, probe;
1121 CHECK_SYMBOL (alias);
1123 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1125 ("Symbol is the canonical name of a coding system and cannot be redefined",
1130 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1131 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1132 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1134 Fremhash (alias, Vcoding_system_hash_table);
1136 /* Undefine subsidiary aliases,
1137 presumably created by a previous call to this function */
1138 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1139 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1140 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1142 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1143 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1144 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1147 /* Undefine dangling coding system aliases. */
1149 int dangling_aliases;
1152 dangling_aliases = 0;
1153 elisp_map_remhash (dangling_coding_system_alias_p,
1154 Vcoding_system_hash_table,
1156 } while (dangling_aliases > 0);
1162 if (CODING_SYSTEMP (aliasee))
1163 aliasee = XCODING_SYSTEM_NAME (aliasee);
1165 /* Checks that aliasee names a coding-system */
1166 real_coding_system = Fget_coding_system (aliasee);
1168 /* Check for coding system alias loops */
1169 if (EQ (alias, aliasee))
1170 alias_loop: signal_simple_error_2
1171 ("Attempt to create a coding system alias loop", alias, aliasee);
1173 for (probe = aliasee;
1175 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1177 if (EQ (probe, alias))
1181 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1183 /* Set up aliases for subsidiaries.
1184 #### There must be a better way to handle subsidiary coding systems. */
1186 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1188 for (i = 0; i < countof (suffixes); i++)
1190 Lisp_Object alias_subsidiary =
1191 append_suffix_to_symbol (alias, suffixes[i]);
1192 Lisp_Object aliasee_subsidiary =
1193 append_suffix_to_symbol (aliasee, suffixes[i]);
1195 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1196 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1199 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1200 but it doesn't look intentional, so I'd rather return something
1201 meaningful or nothing at all. */
1206 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1208 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1209 Lisp_Object new_coding_system;
1211 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1212 return coding_system;
1216 case EOL_AUTODETECT: return coding_system;
1217 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1218 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1219 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1223 return NILP (new_coding_system) ? coding_system : new_coding_system;
1226 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1227 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1229 (coding_system, eol_type))
1231 coding_system = Fget_coding_system (coding_system);
1233 return subsidiary_coding_system (coding_system,
1234 symbol_to_eol_type (eol_type));
1238 /************************************************************************/
1239 /* Coding system accessors */
1240 /************************************************************************/
1242 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1243 Return the doc string for CODING-SYSTEM.
1247 coding_system = Fget_coding_system (coding_system);
1248 return XCODING_SYSTEM_DOC_STRING (coding_system);
1251 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1252 Return the type of CODING-SYSTEM.
1256 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1259 case CODESYS_AUTODETECT: return Qundecided;
1261 case CODESYS_SHIFT_JIS: return Qshift_jis;
1262 case CODESYS_ISO2022: return Qiso2022;
1263 case CODESYS_BIG5: return Qbig5;
1264 case CODESYS_UCS4: return Qucs4;
1265 case CODESYS_UTF8: return Qutf8;
1266 case CODESYS_CCL: return Qccl;
1268 case CODESYS_NO_CONVERSION: return Qno_conversion;
1270 case CODESYS_INTERNAL: return Qinternal;
1277 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1280 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1282 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1285 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1286 Return initial charset of CODING-SYSTEM designated to GNUM.
1289 (coding_system, gnum))
1291 coding_system = Fget_coding_system (coding_system);
1294 return coding_system_charset (coding_system, XINT (gnum));
1298 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1299 Return the PROP property of CODING-SYSTEM.
1301 (coding_system, prop))
1304 enum coding_system_type type;
1306 coding_system = Fget_coding_system (coding_system);
1307 CHECK_SYMBOL (prop);
1308 type = XCODING_SYSTEM_TYPE (coding_system);
1310 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1311 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1314 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1316 case CODESYS_PROP_ALL_OK:
1319 case CODESYS_PROP_ISO2022:
1320 if (type != CODESYS_ISO2022)
1322 ("Property only valid in ISO2022 coding systems",
1326 case CODESYS_PROP_CCL:
1327 if (type != CODESYS_CCL)
1329 ("Property only valid in CCL coding systems",
1339 signal_simple_error ("Unrecognized property", prop);
1341 if (EQ (prop, Qname))
1342 return XCODING_SYSTEM_NAME (coding_system);
1343 else if (EQ (prop, Qtype))
1344 return Fcoding_system_type (coding_system);
1345 else if (EQ (prop, Qdoc_string))
1346 return XCODING_SYSTEM_DOC_STRING (coding_system);
1347 else if (EQ (prop, Qmnemonic))
1348 return XCODING_SYSTEM_MNEMONIC (coding_system);
1349 else if (EQ (prop, Qeol_type))
1350 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1351 else if (EQ (prop, Qeol_lf))
1352 return XCODING_SYSTEM_EOL_LF (coding_system);
1353 else if (EQ (prop, Qeol_crlf))
1354 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1355 else if (EQ (prop, Qeol_cr))
1356 return XCODING_SYSTEM_EOL_CR (coding_system);
1357 else if (EQ (prop, Qpost_read_conversion))
1358 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1359 else if (EQ (prop, Qpre_write_conversion))
1360 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1362 else if (type == CODESYS_ISO2022)
1364 if (EQ (prop, Qcharset_g0))
1365 return coding_system_charset (coding_system, 0);
1366 else if (EQ (prop, Qcharset_g1))
1367 return coding_system_charset (coding_system, 1);
1368 else if (EQ (prop, Qcharset_g2))
1369 return coding_system_charset (coding_system, 2);
1370 else if (EQ (prop, Qcharset_g3))
1371 return coding_system_charset (coding_system, 3);
1373 #define FORCE_CHARSET(charset_num) \
1374 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1375 (coding_system, charset_num) ? Qt : Qnil)
1377 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1378 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1379 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1380 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1382 #define LISP_BOOLEAN(prop) \
1383 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1385 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1386 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1387 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1388 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1389 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1390 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1391 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1393 else if (EQ (prop, Qinput_charset_conversion))
1395 unparse_charset_conversion_specs
1396 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1397 else if (EQ (prop, Qoutput_charset_conversion))
1399 unparse_charset_conversion_specs
1400 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1404 else if (type == CODESYS_CCL)
1406 if (EQ (prop, Qdecode))
1407 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1408 else if (EQ (prop, Qencode))
1409 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1417 return Qnil; /* not reached */
1421 /************************************************************************/
1422 /* Coding category functions */
1423 /************************************************************************/
1426 decode_coding_category (Lisp_Object symbol)
1430 CHECK_SYMBOL (symbol);
1431 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1432 if (EQ (coding_category_symbol[i], symbol))
1435 signal_simple_error ("Unrecognized coding category", symbol);
1436 return 0; /* not reached */
1439 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1440 Return a list of all recognized coding categories.
1445 Lisp_Object list = Qnil;
1447 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1448 list = Fcons (coding_category_symbol[i], list);
1452 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1453 Change the priority order of the coding categories.
1454 LIST should be list of coding categories, in descending order of
1455 priority. Unspecified coding categories will be lower in priority
1456 than all specified ones, in the same relative order they were in
1461 int category_to_priority[CODING_CATEGORY_LAST + 1];
1465 /* First generate a list that maps coding categories to priorities. */
1467 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1468 category_to_priority[i] = -1;
1470 /* Highest priority comes from the specified list. */
1472 EXTERNAL_LIST_LOOP (rest, list)
1474 int cat = decode_coding_category (XCAR (rest));
1476 if (category_to_priority[cat] >= 0)
1477 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1478 category_to_priority[cat] = i++;
1481 /* Now go through the existing categories by priority to retrieve
1482 the categories not yet specified and preserve their priority
1484 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1486 int cat = fcd->coding_category_by_priority[j];
1487 if (category_to_priority[cat] < 0)
1488 category_to_priority[cat] = i++;
1491 /* Now we need to construct the inverse of the mapping we just
1494 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1495 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1497 /* Phew! That was confusing. */
1501 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1502 Return a list of coding categories in descending order of priority.
1507 Lisp_Object list = Qnil;
1509 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1510 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1515 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1516 Change the coding system associated with a coding category.
1518 (coding_category, coding_system))
1520 int cat = decode_coding_category (coding_category);
1522 coding_system = Fget_coding_system (coding_system);
1523 fcd->coding_category_system[cat] = coding_system;
1527 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1528 Return the coding system associated with a coding category.
1532 int cat = decode_coding_category (coding_category);
1533 Lisp_Object sys = fcd->coding_category_system[cat];
1536 return XCODING_SYSTEM_NAME (sys);
1541 /************************************************************************/
1542 /* Detecting the encoding of data */
1543 /************************************************************************/
1545 struct detection_state
1547 eol_type_t eol_type;
1583 struct iso2022_decoder iso;
1585 int high_byte_count;
1586 unsigned int saw_single_shift:1;
1599 acceptable_control_char_p (int c)
1603 /* Allow and ignore control characters that you might
1604 reasonably see in a text file */
1609 case 8: /* backspace */
1610 case 11: /* vertical tab */
1611 case 12: /* form feed */
1612 case 26: /* MS-DOS C-z junk */
1613 case 31: /* '^_' -- for info */
1621 mask_has_at_most_one_bit_p (int mask)
1623 /* Perhaps the only thing useful you learn from intensive Microsoft
1624 technical interviews */
1625 return (mask & (mask - 1)) == 0;
1629 detect_eol_type (struct detection_state *st, const unsigned char *src,
1639 if (st->eol.just_saw_cr)
1641 else if (st->eol.seen_anything)
1644 else if (st->eol.just_saw_cr)
1647 st->eol.just_saw_cr = 1;
1649 st->eol.just_saw_cr = 0;
1650 st->eol.seen_anything = 1;
1653 return EOL_AUTODETECT;
1656 /* Attempt to determine the encoding and EOL type of the given text.
1657 Before calling this function for the first type, you must initialize
1658 st->eol_type as appropriate and initialize st->mask to ~0.
1660 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1663 st->mask holds the determined coding category mask, or ~0 if only
1664 ASCII has been seen so far.
1668 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1669 is present in st->mask
1670 1 == definitive answers are here for both st->eol_type and st->mask
1674 detect_coding_type (struct detection_state *st, const Extbyte *src,
1675 unsigned int n, int just_do_eol)
1679 if (st->eol_type == EOL_AUTODETECT)
1680 st->eol_type = detect_eol_type (st, src, n);
1683 return st->eol_type != EOL_AUTODETECT;
1685 if (!st->seen_non_ascii)
1687 for (; n; n--, src++)
1690 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1692 st->seen_non_ascii = 1;
1694 st->shift_jis.mask = ~0;
1698 st->iso2022.mask = ~0;
1708 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1709 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1710 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1711 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1712 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1713 st->big5.mask = detect_coding_big5 (st, src, n);
1714 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1715 st->utf8.mask = detect_coding_utf8 (st, src, n);
1716 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1717 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1720 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1721 | st->utf8.mask | st->ucs4.mask;
1724 int retval = mask_has_at_most_one_bit_p (st->mask);
1725 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1726 return retval && st->eol_type != EOL_AUTODETECT;
1731 coding_system_from_mask (int mask)
1735 /* If the file was entirely or basically ASCII, use the
1736 default value of `buffer-file-coding-system'. */
1737 Lisp_Object retval =
1738 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1741 retval = Ffind_coding_system (retval);
1745 (Qbad_variable, Qwarning,
1746 "Invalid `default-buffer-file-coding-system', set to nil");
1747 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1751 retval = Fget_coding_system (Qraw_text);
1759 mask = postprocess_iso2022_mask (mask);
1761 /* Look through the coding categories by priority and find
1762 the first one that is allowed. */
1763 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1765 cat = fcd->coding_category_by_priority[i];
1766 if ((mask & (1 << cat)) &&
1767 !NILP (fcd->coding_category_system[cat]))
1771 return fcd->coding_category_system[cat];
1773 return Fget_coding_system (Qraw_text);
1777 /* Given a seekable read stream and potential coding system and EOL type
1778 as specified, do any autodetection that is called for. If the
1779 coding system and/or EOL type are not `autodetect', they will be left
1780 alone; but this function will never return an autodetect coding system
1783 This function does not automatically fetch subsidiary coding systems;
1784 that should be unnecessary with the explicit eol-type argument. */
1786 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1789 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1790 eol_type_t *eol_type_in_out)
1792 struct detection_state decst;
1794 if (*eol_type_in_out == EOL_AUTODETECT)
1795 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1798 decst.eol_type = *eol_type_in_out;
1801 /* If autodetection is called for, do it now. */
1802 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1803 || *eol_type_in_out == EOL_AUTODETECT)
1806 Lisp_Object coding_system = Qnil;
1808 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1811 /* Look for initial "-*-"; mode line prefix */
1813 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1818 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1820 Extbyte *local_vars_beg = p + 3;
1821 /* Look for final "-*-"; mode line suffix */
1822 for (p = local_vars_beg,
1823 scan_end = buf + nread - LENGTH ("-*-");
1828 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1830 Extbyte *suffix = p;
1831 /* Look for "coding:" */
1832 for (p = local_vars_beg,
1833 scan_end = suffix - LENGTH ("coding:?");
1836 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1837 && (p == local_vars_beg
1838 || (*(p-1) == ' ' ||
1844 p += LENGTH ("coding:");
1845 while (*p == ' ' || *p == '\t') p++;
1847 /* Get coding system name */
1848 save = *suffix; *suffix = '\0';
1849 /* Characters valid in a MIME charset name (rfc 1521),
1850 and in a Lisp symbol name. */
1851 n = strspn ( (char *) p,
1852 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1853 "abcdefghijklmnopqrstuvwxyz"
1859 save = p[n]; p[n] = '\0';
1861 Ffind_coding_system (intern ((char *) p));
1871 if (NILP (coding_system))
1874 if (detect_coding_type (&decst, buf, nread,
1875 XCODING_SYSTEM_TYPE (*codesys_in_out)
1876 != CODESYS_AUTODETECT))
1878 nread = Lstream_read (stream, buf, sizeof (buf));
1884 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1885 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1888 if (detect_coding_type (&decst, buf, nread, 1))
1890 nread = Lstream_read (stream, buf, sizeof (buf));
1896 *eol_type_in_out = decst.eol_type;
1897 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1899 if (NILP (coding_system))
1900 *codesys_in_out = coding_system_from_mask (decst.mask);
1902 *codesys_in_out = coding_system;
1906 /* If we absolutely can't determine the EOL type, just assume LF. */
1907 if (*eol_type_in_out == EOL_AUTODETECT)
1908 *eol_type_in_out = EOL_LF;
1910 Lstream_rewind (stream);
1913 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1914 Detect coding system of the text in the region between START and END.
1915 Returned a list of possible coding systems ordered by priority.
1916 If only ASCII characters are found, it returns 'undecided or one of
1917 its subsidiary coding systems according to a detected end-of-line
1918 type. Optional arg BUFFER defaults to the current buffer.
1920 (start, end, buffer))
1922 Lisp_Object val = Qnil;
1923 struct buffer *buf = decode_buffer (buffer, 0);
1925 Lisp_Object instream, lb_instream;
1926 Lstream *istr, *lb_istr;
1927 struct detection_state decst;
1928 struct gcpro gcpro1, gcpro2;
1930 get_buffer_range_char (buf, start, end, &b, &e, 0);
1931 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1932 lb_istr = XLSTREAM (lb_instream);
1933 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1934 istr = XLSTREAM (instream);
1935 GCPRO2 (instream, lb_instream);
1937 decst.eol_type = EOL_AUTODETECT;
1941 unsigned char random_buffer[4096];
1942 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1946 if (detect_coding_type (&decst, random_buffer, nread, 0))
1950 if (decst.mask == ~0)
1951 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1959 decst.mask = postprocess_iso2022_mask (decst.mask);
1961 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1963 int sys = fcd->coding_category_by_priority[i];
1964 if (decst.mask & (1 << sys))
1966 Lisp_Object codesys = fcd->coding_category_system[sys];
1967 if (!NILP (codesys))
1968 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1969 val = Fcons (codesys, val);
1973 Lstream_close (istr);
1975 Lstream_delete (istr);
1976 Lstream_delete (lb_istr);
1981 /************************************************************************/
1982 /* Converting to internal Mule format ("decoding") */
1983 /************************************************************************/
1985 /* A decoding stream is a stream used for decoding text (i.e.
1986 converting from some external format to internal format).
1987 The decoding-stream object keeps track of the actual coding
1988 stream, the stream that is at the other end, and data that
1989 needs to be persistent across the lifetime of the stream. */
1991 /* Handle the EOL stuff related to just-read-in character C.
1992 EOL_TYPE is the EOL type of the coding stream.
1993 FLAGS is the current value of FLAGS in the coding stream, and may
1994 be modified by this macro. (The macro only looks at the
1995 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1996 bytes are to be written. You need to also define a local goto
1997 label "label_continue_loop" that is at the end of the main
1998 character-reading loop.
2000 If C is a CR character, then this macro handles it entirely and
2001 jumps to label_continue_loop. Otherwise, this macro does not add
2002 anything to DST, and continues normally. You should continue
2003 processing C normally after this macro. */
2005 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2009 if (eol_type == EOL_CR) \
2010 Dynarr_add (dst, '\n'); \
2011 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2012 Dynarr_add (dst, c); \
2014 flags |= CODING_STATE_CR; \
2015 goto label_continue_loop; \
2017 else if (flags & CODING_STATE_CR) \
2018 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2020 Dynarr_add (dst, '\r'); \
2021 flags &= ~CODING_STATE_CR; \
2025 /* C should be a binary character in the range 0 - 255; convert
2026 to internal format and add to Dynarr DST. */
2028 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2030 if (BYTE_ASCII_P (c)) \
2031 Dynarr_add (dst, c); \
2032 else if (BYTE_C1_P (c)) \
2034 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2035 Dynarr_add (dst, c + 0x20); \
2039 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2040 Dynarr_add (dst, c); \
2044 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2048 DECODE_ADD_BINARY_CHAR (ch, dst); \
2053 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2055 if (flags & CODING_STATE_END) \
2057 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2058 if (flags & CODING_STATE_CR) \
2059 Dynarr_add (dst, '\r'); \
2063 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2065 struct decoding_stream
2067 /* Coding system that governs the conversion. */
2068 Lisp_Coding_System *codesys;
2070 /* Stream that we read the encoded data from or
2071 write the decoded data to. */
2074 /* If we are reading, then we can return only a fixed amount of
2075 data, so if the conversion resulted in too much data, we store it
2076 here for retrieval the next time around. */
2077 unsigned_char_dynarr *runoff;
2079 /* FLAGS holds flags indicating the current state of the decoding.
2080 Some of these flags are dependent on the coding system. */
2083 /* CH holds a partially built-up character. Since we only deal
2084 with one- and two-byte characters at the moment, we only use
2085 this to store the first byte of a two-byte character. */
2088 /* EOL_TYPE specifies the type of end-of-line conversion that
2089 currently applies. We need to keep this separate from the
2090 EOL type stored in CODESYS because the latter might indicate
2091 automatic EOL-type detection while the former will always
2092 indicate a particular EOL type. */
2093 eol_type_t eol_type;
2095 /* Additional ISO2022 information. We define the structure above
2096 because it's also needed by the detection routines. */
2097 struct iso2022_decoder iso2022;
2099 /* Additional information (the state of the running CCL program)
2100 used by the CCL decoder. */
2101 struct ccl_program ccl;
2103 /* counter for UTF-8 or UCS-4 */
2104 unsigned char counter;
2106 struct detection_state decst;
2109 static ssize_t decoding_reader (Lstream *stream,
2110 unsigned char *data, size_t size);
2111 static ssize_t decoding_writer (Lstream *stream,
2112 const unsigned char *data, size_t size);
2113 static int decoding_rewinder (Lstream *stream);
2114 static int decoding_seekable_p (Lstream *stream);
2115 static int decoding_flusher (Lstream *stream);
2116 static int decoding_closer (Lstream *stream);
2118 static Lisp_Object decoding_marker (Lisp_Object stream);
2120 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2121 sizeof (struct decoding_stream));
2124 decoding_marker (Lisp_Object stream)
2126 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2127 Lisp_Object str_obj;
2129 /* We do not need to mark the coding systems or charsets stored
2130 within the stream because they are stored in a global list
2131 and automatically marked. */
2133 XSETLSTREAM (str_obj, str);
2134 mark_object (str_obj);
2135 if (str->imp->marker)
2136 return (str->imp->marker) (str_obj);
2141 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2142 so we read data from the other end, decode it, and store it into DATA. */
2145 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2147 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2148 unsigned char *orig_data = data;
2150 int error_occurred = 0;
2152 /* We need to interface to mule_decode(), which expects to take some
2153 amount of data and store the result into a Dynarr. We have
2154 mule_decode() store into str->runoff, and take data from there
2157 /* We loop until we have enough data, reading chunks from the other
2158 end and decoding it. */
2161 /* Take data from the runoff if we can. Make sure to take at
2162 most SIZE bytes, and delete the data from the runoff. */
2163 if (Dynarr_length (str->runoff) > 0)
2165 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2166 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2167 Dynarr_delete_many (str->runoff, 0, chunk);
2173 break; /* No more room for data */
2175 if (str->flags & CODING_STATE_END)
2176 /* This means that on the previous iteration, we hit the EOF on
2177 the other end. We loop once more so that mule_decode() can
2178 output any final stuff it may be holding, or any "go back
2179 to a sane state" escape sequences. (This latter makes sense
2180 during encoding.) */
2183 /* Exhausted the runoff, so get some more. DATA has at least
2184 SIZE bytes left of storage in it, so it's OK to read directly
2185 into it. (We'll be overwriting above, after we've decoded it
2186 into the runoff.) */
2187 read_size = Lstream_read (str->other_end, data, size);
2194 /* There might be some more end data produced in the translation.
2195 See the comment above. */
2196 str->flags |= CODING_STATE_END;
2197 mule_decode (stream, data, str->runoff, read_size);
2200 if (data - orig_data == 0)
2201 return error_occurred ? -1 : 0;
2203 return data - orig_data;
2207 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2209 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2212 /* Decode all our data into the runoff, and then attempt to write
2213 it all out to the other end. Remove whatever chunk we succeeded
2215 mule_decode (stream, data, str->runoff, size);
2216 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2217 Dynarr_length (str->runoff));
2219 Dynarr_delete_many (str->runoff, 0, retval);
2220 /* Do NOT return retval. The return value indicates how much
2221 of the incoming data was written, not how many bytes were
2227 reset_decoding_stream (struct decoding_stream *str)
2230 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2232 Lisp_Object coding_system;
2233 XSETCODING_SYSTEM (coding_system, str->codesys);
2234 reset_iso2022 (coding_system, &str->iso2022);
2236 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2238 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2242 str->flags = str->ch = 0;
2246 decoding_rewinder (Lstream *stream)
2248 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2249 reset_decoding_stream (str);
2250 Dynarr_reset (str->runoff);
2251 return Lstream_rewind (str->other_end);
2255 decoding_seekable_p (Lstream *stream)
2257 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2258 return Lstream_seekable_p (str->other_end);
2262 decoding_flusher (Lstream *stream)
2264 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2265 return Lstream_flush (str->other_end);
2269 decoding_closer (Lstream *stream)
2271 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2272 if (stream->flags & LSTREAM_FL_WRITE)
2274 str->flags |= CODING_STATE_END;
2275 decoding_writer (stream, 0, 0);
2277 Dynarr_free (str->runoff);
2279 #ifdef ENABLE_COMPOSITE_CHARS
2280 if (str->iso2022.composite_chars)
2281 Dynarr_free (str->iso2022.composite_chars);
2284 return Lstream_close (str->other_end);
2288 decoding_stream_coding_system (Lstream *stream)
2290 Lisp_Object coding_system;
2291 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2293 XSETCODING_SYSTEM (coding_system, str->codesys);
2294 return subsidiary_coding_system (coding_system, str->eol_type);
2298 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2300 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2301 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2303 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2304 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2305 reset_decoding_stream (str);
2308 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2309 stream for writing, no automatic code detection will be performed.
2310 The reason for this is that automatic code detection requires a
2311 seekable input. Things will also fail if you open a decoding
2312 stream for reading using a non-fully-specified coding system and
2313 a non-seekable input stream. */
2316 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2319 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2320 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2324 str->other_end = stream;
2325 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2326 str->eol_type = EOL_AUTODETECT;
2327 if (!strcmp (mode, "r")
2328 && Lstream_seekable_p (stream))
2329 /* We can determine the coding system now. */
2330 determine_real_coding_system (stream, &codesys, &str->eol_type);
2331 set_decoding_stream_coding_system (lstr, codesys);
2332 str->decst.eol_type = str->eol_type;
2333 str->decst.mask = ~0;
2334 XSETLSTREAM (obj, lstr);
2339 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2341 return make_decoding_stream_1 (stream, codesys, "r");
2345 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2347 return make_decoding_stream_1 (stream, codesys, "w");
2350 /* Note: the decode_coding_* functions all take the same
2351 arguments as mule_decode(), which is to say some SRC data of
2352 size N, which is to be stored into dynamic array DST.
2353 DECODING is the stream within which the decoding is
2354 taking place, but no data is actually read from or
2355 written to that stream; that is handled in decoding_reader()
2356 or decoding_writer(). This allows the same functions to
2357 be used for both reading and writing. */
2360 mule_decode (Lstream *decoding, const unsigned char *src,
2361 unsigned_char_dynarr *dst, unsigned int n)
2363 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2365 /* If necessary, do encoding-detection now. We do this when
2366 we're a writing stream or a non-seekable reading stream,
2367 meaning that we can't just process the whole input,
2368 rewind, and start over. */
2370 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2371 str->eol_type == EOL_AUTODETECT)
2373 Lisp_Object codesys;
2375 XSETCODING_SYSTEM (codesys, str->codesys);
2376 detect_coding_type (&str->decst, src, n,
2377 CODING_SYSTEM_TYPE (str->codesys) !=
2378 CODESYS_AUTODETECT);
2379 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2380 str->decst.mask != ~0)
2381 /* #### This is cheesy. What we really ought to do is
2382 buffer up a certain amount of data so as to get a
2383 less random result. */
2384 codesys = coding_system_from_mask (str->decst.mask);
2385 str->eol_type = str->decst.eol_type;
2386 if (XCODING_SYSTEM (codesys) != str->codesys)
2388 /* Preserve the CODING_STATE_END flag in case it was set.
2389 If we erase it, bad things might happen. */
2390 int was_end = str->flags & CODING_STATE_END;
2391 set_decoding_stream_coding_system (decoding, codesys);
2393 str->flags |= CODING_STATE_END;
2397 switch (CODING_SYSTEM_TYPE (str->codesys))
2400 case CODESYS_INTERNAL:
2401 Dynarr_add_many (dst, src, n);
2404 case CODESYS_AUTODETECT:
2405 /* If we got this far and still haven't decided on the coding
2406 system, then do no conversion. */
2407 case CODESYS_NO_CONVERSION:
2408 decode_coding_no_conversion (decoding, src, dst, n);
2411 case CODESYS_SHIFT_JIS:
2412 decode_coding_sjis (decoding, src, dst, n);
2415 decode_coding_big5 (decoding, src, dst, n);
2418 decode_coding_ucs4 (decoding, src, dst, n);
2421 decode_coding_utf8 (decoding, src, dst, n);
2424 str->ccl.last_block = str->flags & CODING_STATE_END;
2425 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2427 case CODESYS_ISO2022:
2428 decode_coding_iso2022 (decoding, src, dst, n);
2436 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2437 Decode the text between START and END which is encoded in CODING-SYSTEM.
2438 This is useful if you've read in encoded text from a file without decoding
2439 it (e.g. you read in a JIS-formatted file but used the `binary' or
2440 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2441 Return length of decoded text.
2442 BUFFER defaults to the current buffer if unspecified.
2444 (start, end, coding_system, buffer))
2447 struct buffer *buf = decode_buffer (buffer, 0);
2448 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2449 Lstream *istr, *ostr;
2450 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2452 get_buffer_range_char (buf, start, end, &b, &e, 0);
2454 barf_if_buffer_read_only (buf, b, e);
2456 coding_system = Fget_coding_system (coding_system);
2457 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2458 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2459 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2461 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2462 Fget_coding_system (Qbinary));
2463 istr = XLSTREAM (instream);
2464 ostr = XLSTREAM (outstream);
2465 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2467 /* The chain of streams looks like this:
2469 [BUFFER] <----- send through
2470 ------> [ENCODE AS BINARY]
2471 ------> [DECODE AS SPECIFIED]
2477 char tempbuf[1024]; /* some random amount */
2478 Bufpos newpos, even_newer_pos;
2479 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2480 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2484 newpos = lisp_buffer_stream_startpos (istr);
2485 Lstream_write (ostr, tempbuf, size_in_bytes);
2486 even_newer_pos = lisp_buffer_stream_startpos (istr);
2487 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2490 Lstream_close (istr);
2491 Lstream_close (ostr);
2493 Lstream_delete (istr);
2494 Lstream_delete (ostr);
2495 Lstream_delete (XLSTREAM (de_outstream));
2496 Lstream_delete (XLSTREAM (lb_outstream));
2501 /************************************************************************/
2502 /* Converting to an external encoding ("encoding") */
2503 /************************************************************************/
2505 /* An encoding stream is an output stream. When you create the
2506 stream, you specify the coding system that governs the encoding
2507 and another stream that the resulting encoded data is to be
2508 sent to, and then start sending data to it. */
2510 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2512 struct encoding_stream
2514 /* Coding system that governs the conversion. */
2515 Lisp_Coding_System *codesys;
2517 /* Stream that we read the encoded data from or
2518 write the decoded data to. */
2521 /* If we are reading, then we can return only a fixed amount of
2522 data, so if the conversion resulted in too much data, we store it
2523 here for retrieval the next time around. */
2524 unsigned_char_dynarr *runoff;
2526 /* FLAGS holds flags indicating the current state of the encoding.
2527 Some of these flags are dependent on the coding system. */
2530 /* CH holds a partially built-up character. Since we only deal
2531 with one- and two-byte characters at the moment, we only use
2532 this to store the first byte of a two-byte character. */
2535 /* Additional information used by the ISO2022 encoder. */
2538 /* CHARSET holds the character sets currently assigned to the G0
2539 through G3 registers. It is initialized from the array
2540 INITIAL_CHARSET in CODESYS. */
2541 Lisp_Object charset[4];
2543 /* Which registers are currently invoked into the left (GL) and
2544 right (GR) halves of the 8-bit encoding space? */
2545 int register_left, register_right;
2547 /* Whether we need to explicitly designate the charset in the
2548 G? register before using it. It is initialized from the
2549 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2550 unsigned char force_charset_on_output[4];
2552 /* Other state variables that need to be preserved across
2554 Lisp_Object current_charset;
2556 int current_char_boundary;
2559 /* Additional information (the state of the running CCL program)
2560 used by the CCL encoder. */
2561 struct ccl_program ccl;
2565 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2566 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2568 static int encoding_rewinder (Lstream *stream);
2569 static int encoding_seekable_p (Lstream *stream);
2570 static int encoding_flusher (Lstream *stream);
2571 static int encoding_closer (Lstream *stream);
2573 static Lisp_Object encoding_marker (Lisp_Object stream);
2575 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2576 sizeof (struct encoding_stream));
2579 encoding_marker (Lisp_Object stream)
2581 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2582 Lisp_Object str_obj;
2584 /* We do not need to mark the coding systems or charsets stored
2585 within the stream because they are stored in a global list
2586 and automatically marked. */
2588 XSETLSTREAM (str_obj, str);
2589 mark_object (str_obj);
2590 if (str->imp->marker)
2591 return (str->imp->marker) (str_obj);
2596 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2597 so we read data from the other end, encode it, and store it into DATA. */
2600 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2602 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2603 unsigned char *orig_data = data;
2605 int error_occurred = 0;
2607 /* We need to interface to mule_encode(), which expects to take some
2608 amount of data and store the result into a Dynarr. We have
2609 mule_encode() store into str->runoff, and take data from there
2612 /* We loop until we have enough data, reading chunks from the other
2613 end and encoding it. */
2616 /* Take data from the runoff if we can. Make sure to take at
2617 most SIZE bytes, and delete the data from the runoff. */
2618 if (Dynarr_length (str->runoff) > 0)
2620 int chunk = min ((int) size, Dynarr_length (str->runoff));
2621 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2622 Dynarr_delete_many (str->runoff, 0, chunk);
2628 break; /* No more room for data */
2630 if (str->flags & CODING_STATE_END)
2631 /* This means that on the previous iteration, we hit the EOF on
2632 the other end. We loop once more so that mule_encode() can
2633 output any final stuff it may be holding, or any "go back
2634 to a sane state" escape sequences. (This latter makes sense
2635 during encoding.) */
2638 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2639 left of storage in it, so it's OK to read directly into it.
2640 (We'll be overwriting above, after we've encoded it into the
2642 read_size = Lstream_read (str->other_end, data, size);
2649 /* There might be some more end data produced in the translation.
2650 See the comment above. */
2651 str->flags |= CODING_STATE_END;
2652 mule_encode (stream, data, str->runoff, read_size);
2655 if (data == orig_data)
2656 return error_occurred ? -1 : 0;
2658 return data - orig_data;
2662 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2664 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2667 /* Encode all our data into the runoff, and then attempt to write
2668 it all out to the other end. Remove whatever chunk we succeeded
2670 mule_encode (stream, data, str->runoff, size);
2671 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2672 Dynarr_length (str->runoff));
2674 Dynarr_delete_many (str->runoff, 0, retval);
2675 /* Do NOT return retval. The return value indicates how much
2676 of the incoming data was written, not how many bytes were
2682 reset_encoding_stream (struct encoding_stream *str)
2685 switch (CODING_SYSTEM_TYPE (str->codesys))
2687 case CODESYS_ISO2022:
2691 for (i = 0; i < 4; i++)
2693 str->iso2022.charset[i] =
2694 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2695 str->iso2022.force_charset_on_output[i] =
2696 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2698 str->iso2022.register_left = 0;
2699 str->iso2022.register_right = 1;
2700 str->iso2022.current_charset = Qnil;
2701 str->iso2022.current_half = 0;
2702 str->iso2022.current_char_boundary = 1;
2706 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2713 str->flags = str->ch = 0;
2717 encoding_rewinder (Lstream *stream)
2719 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2720 reset_encoding_stream (str);
2721 Dynarr_reset (str->runoff);
2722 return Lstream_rewind (str->other_end);
2726 encoding_seekable_p (Lstream *stream)
2728 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2729 return Lstream_seekable_p (str->other_end);
2733 encoding_flusher (Lstream *stream)
2735 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2736 return Lstream_flush (str->other_end);
2740 encoding_closer (Lstream *stream)
2742 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2743 if (stream->flags & LSTREAM_FL_WRITE)
2745 str->flags |= CODING_STATE_END;
2746 encoding_writer (stream, 0, 0);
2748 Dynarr_free (str->runoff);
2749 return Lstream_close (str->other_end);
2753 encoding_stream_coding_system (Lstream *stream)
2755 Lisp_Object coding_system;
2756 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2758 XSETCODING_SYSTEM (coding_system, str->codesys);
2759 return coding_system;
2763 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2765 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2766 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2768 reset_encoding_stream (str);
2772 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2775 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2776 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2780 str->runoff = Dynarr_new (unsigned_char);
2781 str->other_end = stream;
2782 set_encoding_stream_coding_system (lstr, codesys);
2783 XSETLSTREAM (obj, lstr);
2788 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2790 return make_encoding_stream_1 (stream, codesys, "r");
2794 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2796 return make_encoding_stream_1 (stream, codesys, "w");
2799 /* Convert N bytes of internally-formatted data stored in SRC to an
2800 external format, according to the encoding stream ENCODING.
2801 Store the encoded data into DST. */
2804 mule_encode (Lstream *encoding, const unsigned char *src,
2805 unsigned_char_dynarr *dst, unsigned int n)
2807 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2809 switch (CODING_SYSTEM_TYPE (str->codesys))
2812 case CODESYS_INTERNAL:
2813 Dynarr_add_many (dst, src, n);
2816 case CODESYS_AUTODETECT:
2817 /* If we got this far and still haven't decided on the coding
2818 system, then do no conversion. */
2819 case CODESYS_NO_CONVERSION:
2820 encode_coding_no_conversion (encoding, src, dst, n);
2823 case CODESYS_SHIFT_JIS:
2824 encode_coding_sjis (encoding, src, dst, n);
2827 encode_coding_big5 (encoding, src, dst, n);
2830 encode_coding_ucs4 (encoding, src, dst, n);
2833 encode_coding_utf8 (encoding, src, dst, n);
2836 str->ccl.last_block = str->flags & CODING_STATE_END;
2837 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2839 case CODESYS_ISO2022:
2840 encode_coding_iso2022 (encoding, src, dst, n);
2848 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2849 Encode the text between START and END using CODING-SYSTEM.
2850 This will, for example, convert Japanese characters into stuff such as
2851 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2852 text. BUFFER defaults to the current buffer if unspecified.
2854 (start, end, coding_system, buffer))
2857 struct buffer *buf = decode_buffer (buffer, 0);
2858 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2859 Lstream *istr, *ostr;
2860 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2862 get_buffer_range_char (buf, start, end, &b, &e, 0);
2864 barf_if_buffer_read_only (buf, b, e);
2866 coding_system = Fget_coding_system (coding_system);
2867 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2868 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2869 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2870 Fget_coding_system (Qbinary));
2871 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2873 istr = XLSTREAM (instream);
2874 ostr = XLSTREAM (outstream);
2875 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2876 /* The chain of streams looks like this:
2878 [BUFFER] <----- send through
2879 ------> [ENCODE AS SPECIFIED]
2880 ------> [DECODE AS BINARY]
2885 char tempbuf[1024]; /* some random amount */
2886 Bufpos newpos, even_newer_pos;
2887 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2888 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2892 newpos = lisp_buffer_stream_startpos (istr);
2893 Lstream_write (ostr, tempbuf, size_in_bytes);
2894 even_newer_pos = lisp_buffer_stream_startpos (istr);
2895 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2901 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2902 Lstream_close (istr);
2903 Lstream_close (ostr);
2905 Lstream_delete (istr);
2906 Lstream_delete (ostr);
2907 Lstream_delete (XLSTREAM (de_outstream));
2908 Lstream_delete (XLSTREAM (lb_outstream));
2909 return make_int (retlen);
2915 /************************************************************************/
2916 /* Shift-JIS methods */
2917 /************************************************************************/
2919 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2920 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2921 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2922 encoded by "position-code + 0x80". A character of JISX0208
2923 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2924 position-codes are divided and shifted so that it fit in the range
2927 --- CODE RANGE of Shift-JIS ---
2928 (character set) (range)
2930 JISX0201-Kana 0xA0 .. 0xDF
2931 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2932 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2933 -------------------------------
2937 /* Is this the first byte of a Shift-JIS two-byte char? */
2939 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2940 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2942 /* Is this the second byte of a Shift-JIS two-byte char? */
2944 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2945 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2947 #define BYTE_SJIS_KATAKANA_P(c) \
2948 ((c) >= 0xA1 && (c) <= 0xDF)
2951 detect_coding_sjis (struct detection_state *st, const unsigned char *src,
2959 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2961 if (st->shift_jis.in_second_byte)
2963 st->shift_jis.in_second_byte = 0;
2967 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2968 st->shift_jis.in_second_byte = 1;
2970 return CODING_CATEGORY_SHIFT_JIS_MASK;
2973 /* Convert Shift-JIS data to internal format. */
2976 decode_coding_sjis (Lstream *decoding, const unsigned char *src,
2977 unsigned_char_dynarr *dst, unsigned int n)
2980 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2981 unsigned int flags = str->flags;
2982 unsigned int ch = str->ch;
2983 eol_type_t eol_type = str->eol_type;
2991 /* Previous character was first byte of Shift-JIS Kanji char. */
2992 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2994 unsigned char e1, e2;
2996 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2997 DECODE_SJIS (ch, c, e1, e2);
2998 Dynarr_add (dst, e1);
2999 Dynarr_add (dst, e2);
3003 DECODE_ADD_BINARY_CHAR (ch, dst);
3004 DECODE_ADD_BINARY_CHAR (c, dst);
3010 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3011 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3013 else if (BYTE_SJIS_KATAKANA_P (c))
3015 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3016 Dynarr_add (dst, c);
3019 DECODE_ADD_BINARY_CHAR (c, dst);
3021 label_continue_loop:;
3024 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3030 /* Convert internally-formatted data to Shift-JIS. */
3033 encode_coding_sjis (Lstream *encoding, const unsigned char *src,
3034 unsigned_char_dynarr *dst, unsigned int n)
3037 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3038 unsigned int flags = str->flags;
3039 unsigned int ch = str->ch;
3040 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3047 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3048 Dynarr_add (dst, '\r');
3049 if (eol_type != EOL_CR)
3050 Dynarr_add (dst, '\n');
3053 else if (BYTE_ASCII_P (c))
3055 Dynarr_add (dst, c);
3058 else if (BUFBYTE_LEADING_BYTE_P (c))
3059 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3060 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3061 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3064 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3066 Dynarr_add (dst, c);
3069 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3070 ch == LEADING_BYTE_JAPANESE_JISX0208)
3074 unsigned char j1, j2;
3075 ENCODE_SJIS (ch, c, j1, j2);
3076 Dynarr_add (dst, j1);
3077 Dynarr_add (dst, j2);
3087 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3088 Decode a JISX0208 character of Shift-JIS coding-system.
3089 CODE is the character code in Shift-JIS as a cons of type bytes.
3090 Return the corresponding character.
3094 unsigned char c1, c2, s1, s2;
3097 CHECK_INT (XCAR (code));
3098 CHECK_INT (XCDR (code));
3099 s1 = XINT (XCAR (code));
3100 s2 = XINT (XCDR (code));
3101 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3102 BYTE_SJIS_TWO_BYTE_2_P (s2))
3104 DECODE_SJIS (s1, s2, c1, c2);
3105 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3106 c1 & 0x7F, c2 & 0x7F));
3112 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3113 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3114 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3118 Lisp_Object charset;
3121 CHECK_CHAR_COERCE_INT (ch);
3122 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3123 if (EQ (charset, Vcharset_japanese_jisx0208))
3125 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3126 return Fcons (make_int (s1), make_int (s2));
3133 /************************************************************************/
3135 /************************************************************************/
3137 /* BIG5 is a coding system encoding two character sets: ASCII and
3138 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3139 character set and is encoded in two-byte.
3141 --- CODE RANGE of BIG5 ---
3142 (character set) (range)
3144 Big5 (1st byte) 0xA1 .. 0xFE
3145 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3146 --------------------------
3148 Since the number of characters in Big5 is larger than maximum
3149 characters in Emacs' charset (96x96), it can't be handled as one
3150 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3151 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3152 contains frequently used characters and the latter contains less
3153 frequently used characters. */
3155 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3156 ((c) >= 0xA1 && (c) <= 0xFE)
3158 /* Is this the second byte of a Shift-JIS two-byte char? */
3160 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3161 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3163 /* Number of Big5 characters which have the same code in 1st byte. */
3165 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3167 /* Code conversion macros. These are macros because they are used in
3168 inner loops during code conversion.
3170 Note that temporary variables in macros introduce the classic
3171 dynamic-scoping problems with variable names. We use capital-
3172 lettered variables in the assumption that XEmacs does not use
3173 capital letters in variables except in a very formalized way
3176 /* Convert Big5 code (b1, b2) into its internal string representation
3179 /* There is a much simpler way to split the Big5 charset into two.
3180 For the moment I'm going to leave the algorithm as-is because it
3181 claims to separate out the most-used characters into a single
3182 charset, which perhaps will lead to optimizations in various
3185 The way the algorithm works is something like this:
3187 Big5 can be viewed as a 94x157 charset, where the row is
3188 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3189 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3190 the split between low and high column numbers is apparently
3191 meaningless; ascending rows produce less and less frequent chars.
3192 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3193 the first charset, and the upper half (0xC9 .. 0xFE) to the
3194 second. To do the conversion, we convert the character into
3195 a single number where 0 .. 156 is the first row, 157 .. 313
3196 is the second, etc. That way, the characters are ordered by
3197 decreasing frequency. Then we just chop the space in two
3198 and coerce the result into a 94x94 space.
3201 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3203 int B1 = b1, B2 = b2; \
3205 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3209 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3213 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3214 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3216 c1 = I / (0xFF - 0xA1) + 0xA1; \
3217 c2 = I % (0xFF - 0xA1) + 0xA1; \
3220 /* Convert the internal string representation of a Big5 character
3221 (lb, c1, c2) into Big5 code (b1, b2). */
3223 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3225 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3227 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3229 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3231 b1 = I / BIG5_SAME_ROW + 0xA1; \
3232 b2 = I % BIG5_SAME_ROW; \
3233 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3237 detect_coding_big5 (struct detection_state *st, const unsigned char *src,
3245 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3246 (c >= 0x80 && c <= 0xA0))
3248 if (st->big5.in_second_byte)
3250 st->big5.in_second_byte = 0;
3251 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3255 st->big5.in_second_byte = 1;
3257 return CODING_CATEGORY_BIG5_MASK;
3260 /* Convert Big5 data to internal format. */
3263 decode_coding_big5 (Lstream *decoding, const unsigned char *src,
3264 unsigned_char_dynarr *dst, unsigned int n)
3267 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3268 unsigned int flags = str->flags;
3269 unsigned int ch = str->ch;
3270 eol_type_t eol_type = str->eol_type;
3277 /* Previous character was first byte of Big5 char. */
3278 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3280 unsigned char b1, b2, b3;
3281 DECODE_BIG5 (ch, c, b1, b2, b3);
3282 Dynarr_add (dst, b1);
3283 Dynarr_add (dst, b2);
3284 Dynarr_add (dst, b3);
3288 DECODE_ADD_BINARY_CHAR (ch, dst);
3289 DECODE_ADD_BINARY_CHAR (c, dst);
3295 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3296 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3299 DECODE_ADD_BINARY_CHAR (c, dst);
3301 label_continue_loop:;
3304 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3310 /* Convert internally-formatted data to Big5. */
3313 encode_coding_big5 (Lstream *encoding, const unsigned char *src,
3314 unsigned_char_dynarr *dst, unsigned int n)
3317 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3318 unsigned int flags = str->flags;
3319 unsigned int ch = str->ch;
3320 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3327 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3328 Dynarr_add (dst, '\r');
3329 if (eol_type != EOL_CR)
3330 Dynarr_add (dst, '\n');
3332 else if (BYTE_ASCII_P (c))
3335 Dynarr_add (dst, c);
3337 else if (BUFBYTE_LEADING_BYTE_P (c))
3339 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3340 c == LEADING_BYTE_CHINESE_BIG5_2)
3342 /* A recognized leading byte. */
3344 continue; /* not done with this character. */
3346 /* otherwise just ignore this character. */
3348 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3349 ch == LEADING_BYTE_CHINESE_BIG5_2)
3351 /* Previous char was a recognized leading byte. */
3353 continue; /* not done with this character. */
3357 /* Encountering second byte of a Big5 character. */
3358 unsigned char b1, b2;
3360 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3361 Dynarr_add (dst, b1);
3362 Dynarr_add (dst, b2);
3373 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3374 Decode a Big5 character CODE of BIG5 coding-system.
3375 CODE is the character code in BIG5, a cons of two integers.
3376 Return the corresponding character.
3380 unsigned char c1, c2, b1, b2;
3383 CHECK_INT (XCAR (code));
3384 CHECK_INT (XCDR (code));
3385 b1 = XINT (XCAR (code));
3386 b2 = XINT (XCDR (code));
3387 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3388 BYTE_BIG5_TWO_BYTE_2_P (b2))
3391 Lisp_Object charset;
3392 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3393 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3394 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3400 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3401 Encode the Big5 character CH to BIG5 coding-system.
3402 Return the corresponding character code in Big5.
3406 Lisp_Object charset;
3409 CHECK_CHAR_COERCE_INT (ch);
3410 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3411 if (EQ (charset, Vcharset_chinese_big5_1) ||
3412 EQ (charset, Vcharset_chinese_big5_2))
3414 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3416 return Fcons (make_int (b1), make_int (b2));
3423 /************************************************************************/
3426 /* UCS-4 character codes are implemented as nonnegative integers. */
3428 /************************************************************************/
3431 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3432 Map UCS-4 code CODE to Mule character CHARACTER.
3434 Return T on success, NIL on failure.
3440 CHECK_CHAR (character);
3444 if (c < sizeof (fcd->ucs_to_mule_table))
3446 fcd->ucs_to_mule_table[c] = character;
3454 ucs_to_char (unsigned long code)
3456 if (code < sizeof (fcd->ucs_to_mule_table))
3458 return fcd->ucs_to_mule_table[code];
3460 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3465 c = code % (94 * 94);
3467 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3468 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3469 CHARSET_LEFT_TO_RIGHT),
3470 c / 94 + 33, c % 94 + 33));
3476 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3477 Return Mule character corresponding to UCS code CODE (a positive integer).
3481 CHECK_NATNUM (code);
3482 return ucs_to_char (XINT (code));
3485 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3486 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3490 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3491 Fset_char_ucs is more restrictive on index arg, but should
3492 check code arg in a char_table method. */
3493 CHECK_CHAR (character);
3494 CHECK_NATNUM (code);
3495 return Fput_char_table (character, code, mule_to_ucs_table);
3498 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3499 Return the UCS code (a positive integer) corresponding to CHARACTER.
3503 return Fget_char_table (character, mule_to_ucs_table);
3506 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3507 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3508 is not found, instead.
3509 #### do something more appropriate (use blob?)
3510 Danger, Will Robinson! Data loss. Should we signal user? */
3512 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3514 Lisp_Object chr = ucs_to_char (ch);
3518 Bufbyte work[MAX_EMCHAR_LEN];
3523 simple_set_charptr_emchar (work, ch) :
3524 non_ascii_set_charptr_emchar (work, ch);
3525 Dynarr_add_many (dst, work, len);
3529 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3530 Dynarr_add (dst, 34 + 128);
3531 Dynarr_add (dst, 46 + 128);
3535 static unsigned long
3536 mule_char_to_ucs4 (Lisp_Object charset,
3537 unsigned char h, unsigned char l)
3540 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3547 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3548 (XCHARSET_CHARS (charset) == 94) )
3550 unsigned char final = XCHARSET_FINAL (charset);
3552 if ( ('@' <= final) && (final < 0x7f) )
3554 return 0xe00000 + (final - '@') * 94 * 94
3555 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3569 encode_ucs4 (Lisp_Object charset,
3570 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3572 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3573 Dynarr_add (dst, code >> 24);
3574 Dynarr_add (dst, (code >> 16) & 255);
3575 Dynarr_add (dst, (code >> 8) & 255);
3576 Dynarr_add (dst, code & 255);
3580 detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
3586 switch (st->ucs4.in_byte)
3595 st->ucs4.in_byte = 0;
3601 return CODING_CATEGORY_UCS4_MASK;
3605 decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
3606 unsigned_char_dynarr *dst, unsigned int n)
3608 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3609 unsigned int flags = str->flags;
3610 unsigned int ch = str->ch;
3611 unsigned char counter = str->counter;
3615 unsigned char c = *src++;
3623 decode_ucs4 ( ( ch << 8 ) | c, dst);
3628 ch = ( ch << 8 ) | c;
3632 if (counter & CODING_STATE_END)
3633 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3637 str->counter = counter;
3641 encode_coding_ucs4 (Lstream *encoding, const unsigned char *src,
3642 unsigned_char_dynarr *dst, unsigned int n)
3644 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3645 unsigned int flags = str->flags;
3646 unsigned int ch = str->ch;
3647 unsigned char char_boundary = str->iso2022.current_char_boundary;
3648 Lisp_Object charset = str->iso2022.current_charset;
3650 #ifdef ENABLE_COMPOSITE_CHARS
3651 /* flags for handling composite chars. We do a little switcharoo
3652 on the source while we're outputting the composite char. */
3653 unsigned int saved_n = 0;
3654 const unsigned char *saved_src = NULL;
3655 int in_composite = 0;
3662 unsigned char c = *src++;
3664 if (BYTE_ASCII_P (c))
3665 { /* Processing ASCII character */
3667 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3670 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3671 { /* Processing Leading Byte */
3673 charset = CHARSET_BY_LEADING_BYTE (c);
3674 if (LEADING_BYTE_PREFIX_P(c))
3679 { /* Processing Non-ASCII character */
3681 if (EQ (charset, Vcharset_control_1))
3683 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3687 switch (XCHARSET_REP_BYTES (charset))
3690 encode_ucs4 (charset, c, 0, dst);
3693 if (XCHARSET_PRIVATE_P (charset))
3695 encode_ucs4 (charset, c, 0, dst);
3700 #ifdef ENABLE_COMPOSITE_CHARS
3701 if (EQ (charset, Vcharset_composite))
3705 /* #### Bother! We don't know how to
3707 Dynarr_add (dst, 0);
3708 Dynarr_add (dst, 0);
3709 Dynarr_add (dst, 0);
3710 Dynarr_add (dst, '~');
3714 Emchar emch = MAKE_CHAR (Vcharset_composite,
3715 ch & 0x7F, c & 0x7F);
3716 Lisp_Object lstr = composite_char_string (emch);
3720 src = XSTRING_DATA (lstr);
3721 n = XSTRING_LENGTH (lstr);
3725 #endif /* ENABLE_COMPOSITE_CHARS */
3727 encode_ucs4(charset, ch, c, dst);
3740 encode_ucs4 (charset, ch, c, dst);
3756 #ifdef ENABLE_COMPOSITE_CHARS
3762 goto back_to_square_n; /* Wheeeeeeeee ..... */
3764 #endif /* ENABLE_COMPOSITE_CHARS */
3768 str->iso2022.current_char_boundary = char_boundary;
3769 str->iso2022.current_charset = charset;
3771 /* Verbum caro factum est! */
3775 /************************************************************************/
3777 /************************************************************************/
3780 detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
3785 unsigned char c = *src++;
3786 switch (st->utf8.in_byte)
3789 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3792 st->utf8.in_byte = 5;
3794 st->utf8.in_byte = 4;
3796 st->utf8.in_byte = 3;
3798 st->utf8.in_byte = 2;
3800 st->utf8.in_byte = 1;
3805 if ((c & 0xc0) != 0x80)
3811 return CODING_CATEGORY_UTF8_MASK;
3815 decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
3816 unsigned_char_dynarr *dst, unsigned int n)
3818 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3819 unsigned int flags = str->flags;
3820 unsigned int ch = str->ch;
3821 eol_type_t eol_type = str->eol_type;
3822 unsigned char counter = str->counter;
3826 unsigned char c = *src++;
3835 else if ( c >= 0xf8 )
3840 else if ( c >= 0xf0 )
3845 else if ( c >= 0xe0 )
3850 else if ( c >= 0xc0 )
3857 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3858 decode_ucs4 (c, dst);
3862 ch = ( ch << 6 ) | ( c & 0x3f );
3863 decode_ucs4 (ch, dst);
3868 ch = ( ch << 6 ) | ( c & 0x3f );
3871 label_continue_loop:;
3874 if (flags & CODING_STATE_END)
3875 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3879 str->counter = counter;
3883 encode_utf8 (Lisp_Object charset,
3884 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3886 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3889 Dynarr_add (dst, code);
3891 else if ( code <= 0x7ff )
3893 Dynarr_add (dst, (code >> 6) | 0xc0);
3894 Dynarr_add (dst, (code & 0x3f) | 0x80);
3896 else if ( code <= 0xffff )
3898 Dynarr_add (dst, (code >> 12) | 0xe0);
3899 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3900 Dynarr_add (dst, (code & 0x3f) | 0x80);
3902 else if ( code <= 0x1fffff )
3904 Dynarr_add (dst, (code >> 18) | 0xf0);
3905 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3906 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3907 Dynarr_add (dst, (code & 0x3f) | 0x80);
3909 else if ( code <= 0x3ffffff )
3911 Dynarr_add (dst, (code >> 24) | 0xf8);
3912 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3913 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3914 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3915 Dynarr_add (dst, (code & 0x3f) | 0x80);
3919 Dynarr_add (dst, (code >> 30) | 0xfc);
3920 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3921 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3922 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3923 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3924 Dynarr_add (dst, (code & 0x3f) | 0x80);
3929 encode_coding_utf8 (Lstream *encoding, const unsigned char *src,
3930 unsigned_char_dynarr *dst, unsigned int n)
3932 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3933 unsigned int flags = str->flags;
3934 unsigned int ch = str->ch;
3935 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3936 unsigned char char_boundary = str->iso2022.current_char_boundary;
3937 Lisp_Object charset = str->iso2022.current_charset;
3939 #ifdef ENABLE_COMPOSITE_CHARS
3940 /* flags for handling composite chars. We do a little switcharoo
3941 on the source while we're outputting the composite char. */
3942 unsigned int saved_n = 0;
3943 const unsigned char *saved_src = NULL;
3944 int in_composite = 0;
3947 #endif /* ENABLE_COMPOSITE_CHARS */
3951 unsigned char c = *src++;
3953 if (BYTE_ASCII_P (c))
3954 { /* Processing ASCII character */
3958 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3959 Dynarr_add (dst, '\r');
3960 if (eol_type != EOL_CR)
3961 Dynarr_add (dst, c);
3964 encode_utf8 (Vcharset_ascii, c, 0, dst);
3967 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3968 { /* Processing Leading Byte */
3970 charset = CHARSET_BY_LEADING_BYTE (c);
3971 if (LEADING_BYTE_PREFIX_P(c))
3976 { /* Processing Non-ASCII character */
3978 if (EQ (charset, Vcharset_control_1))
3980 encode_utf8 (Vcharset_control_1, c, 0, dst);
3984 switch (XCHARSET_REP_BYTES (charset))
3987 encode_utf8 (charset, c, 0, dst);
3990 if (XCHARSET_PRIVATE_P (charset))
3992 encode_utf8 (charset, c, 0, dst);
3997 #ifdef ENABLE_COMPOSITE_CHARS
3998 if (EQ (charset, Vcharset_composite))
4002 /* #### Bother! We don't know how to
4004 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4008 Emchar emch = MAKE_CHAR (Vcharset_composite,
4009 ch & 0x7F, c & 0x7F);
4010 Lisp_Object lstr = composite_char_string (emch);
4014 src = XSTRING_DATA (lstr);
4015 n = XSTRING_LENGTH (lstr);
4019 #endif /* ENABLE_COMPOSITE_CHARS */
4021 encode_utf8 (charset, ch, c, dst);
4034 encode_utf8 (charset, ch, c, dst);
4050 #ifdef ENABLE_COMPOSITE_CHARS
4056 goto back_to_square_n; /* Wheeeeeeeee ..... */
4062 str->iso2022.current_char_boundary = char_boundary;
4063 str->iso2022.current_charset = charset;
4065 /* Verbum caro factum est! */
4069 /************************************************************************/
4070 /* ISO2022 methods */
4071 /************************************************************************/
4073 /* The following note describes the coding system ISO2022 briefly.
4074 Since the intention of this note is to help understand the
4075 functions in this file, some parts are NOT ACCURATE or OVERLY
4076 SIMPLIFIED. For thorough understanding, please refer to the
4077 original document of ISO2022.
4079 ISO2022 provides many mechanisms to encode several character sets
4080 in 7-bit and 8-bit environments. For 7-bit environments, all text
4081 is encoded using bytes less than 128. This may make the encoded
4082 text a little bit longer, but the text passes more easily through
4083 several gateways, some of which strip off MSB (Most Signigant Bit).
4085 There are two kinds of character sets: control character set and
4086 graphic character set. The former contains control characters such
4087 as `newline' and `escape' to provide control functions (control
4088 functions are also provided by escape sequences). The latter
4089 contains graphic characters such as 'A' and '-'. Emacs recognizes
4090 two control character sets and many graphic character sets.
4092 Graphic character sets are classified into one of the following
4093 four classes, according to the number of bytes (DIMENSION) and
4094 number of characters in one dimension (CHARS) of the set:
4095 - DIMENSION1_CHARS94
4096 - DIMENSION1_CHARS96
4097 - DIMENSION2_CHARS94
4098 - DIMENSION2_CHARS96
4100 In addition, each character set is assigned an identification tag,
4101 unique for each set, called "final character" (denoted as <F>
4102 hereafter). The <F> of each character set is decided by ECMA(*)
4103 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4104 (0x30..0x3F are for private use only).
4106 Note (*): ECMA = European Computer Manufacturers Association
4108 Here are examples of graphic character set [NAME(<F>)]:
4109 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4110 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4111 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4112 o DIMENSION2_CHARS96 -- none for the moment
4114 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4115 C0 [0x00..0x1F] -- control character plane 0
4116 GL [0x20..0x7F] -- graphic character plane 0
4117 C1 [0x80..0x9F] -- control character plane 1
4118 GR [0xA0..0xFF] -- graphic character plane 1
4120 A control character set is directly designated and invoked to C0 or
4121 C1 by an escape sequence. The most common case is that:
4122 - ISO646's control character set is designated/invoked to C0, and
4123 - ISO6429's control character set is designated/invoked to C1,
4124 and usually these designations/invocations are omitted in encoded
4125 text. In a 7-bit environment, only C0 can be used, and a control
4126 character for C1 is encoded by an appropriate escape sequence to
4127 fit into the environment. All control characters for C1 are
4128 defined to have corresponding escape sequences.
4130 A graphic character set is at first designated to one of four
4131 graphic registers (G0 through G3), then these graphic registers are
4132 invoked to GL or GR. These designations and invocations can be
4133 done independently. The most common case is that G0 is invoked to
4134 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4135 these invocations and designations are omitted in encoded text.
4136 In a 7-bit environment, only GL can be used.
4138 When a graphic character set of CHARS94 is invoked to GL, codes
4139 0x20 and 0x7F of the GL area work as control characters SPACE and
4140 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4143 There are two ways of invocation: locking-shift and single-shift.
4144 With locking-shift, the invocation lasts until the next different
4145 invocation, whereas with single-shift, the invocation affects the
4146 following character only and doesn't affect the locking-shift
4147 state. Invocations are done by the following control characters or
4150 ----------------------------------------------------------------------
4151 abbrev function cntrl escape seq description
4152 ----------------------------------------------------------------------
4153 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4154 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4155 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4156 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4157 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4158 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4159 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4160 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4161 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4162 ----------------------------------------------------------------------
4163 (*) These are not used by any known coding system.
4165 Control characters for these functions are defined by macros
4166 ISO_CODE_XXX in `coding.h'.
4168 Designations are done by the following escape sequences:
4169 ----------------------------------------------------------------------
4170 escape sequence description
4171 ----------------------------------------------------------------------
4172 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4173 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4174 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4175 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4176 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4177 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4178 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4179 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4180 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4181 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4182 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4183 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4184 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4185 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4186 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4187 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4188 ----------------------------------------------------------------------
4190 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4191 of dimension 1, chars 94, and final character <F>, etc...
4193 Note (*): Although these designations are not allowed in ISO2022,
4194 Emacs accepts them on decoding, and produces them on encoding
4195 CHARS96 character sets in a coding system which is characterized as
4196 7-bit environment, non-locking-shift, and non-single-shift.
4198 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4199 '(' can be omitted. We refer to this as "short-form" hereafter.
4201 Now you may notice that there are a lot of ways for encoding the
4202 same multilingual text in ISO2022. Actually, there exist many
4203 coding systems such as Compound Text (used in X11's inter client
4204 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4205 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4206 localized platforms), and all of these are variants of ISO2022.
4208 In addition to the above, Emacs handles two more kinds of escape
4209 sequences: ISO6429's direction specification and Emacs' private
4210 sequence for specifying character composition.
4212 ISO6429's direction specification takes the following form:
4213 o CSI ']' -- end of the current direction
4214 o CSI '0' ']' -- end of the current direction
4215 o CSI '1' ']' -- start of left-to-right text
4216 o CSI '2' ']' -- start of right-to-left text
4217 The control character CSI (0x9B: control sequence introducer) is
4218 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4220 Character composition specification takes the following form:
4221 o ESC '0' -- start character composition
4222 o ESC '1' -- end character composition
4223 Since these are not standard escape sequences of any ISO standard,
4224 their use with these meanings is restricted to Emacs only. */
4227 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4231 for (i = 0; i < 4; i++)
4233 if (!NILP (coding_system))
4235 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4237 iso->charset[i] = Qt;
4238 iso->invalid_designated[i] = 0;
4240 iso->esc = ISO_ESC_NOTHING;
4241 iso->esc_bytes_index = 0;
4242 iso->register_left = 0;
4243 iso->register_right = 1;
4244 iso->switched_dir_and_no_valid_charset_yet = 0;
4245 iso->invalid_switch_dir = 0;
4246 iso->output_direction_sequence = 0;
4247 iso->output_literally = 0;
4248 #ifdef ENABLE_COMPOSITE_CHARS
4249 if (iso->composite_chars)
4250 Dynarr_reset (iso->composite_chars);
4255 fit_to_be_escape_quoted (unsigned char c)
4272 /* Parse one byte of an ISO2022 escape sequence.
4273 If the result is an invalid escape sequence, return 0 and
4274 do not change anything in STR. Otherwise, if the result is
4275 an incomplete escape sequence, update ISO2022.ESC and
4276 ISO2022.ESC_BYTES and return -1. Otherwise, update
4277 all the state variables (but not ISO2022.ESC_BYTES) and
4280 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4281 or invocation of an invalid character set and treat that as
4282 an unrecognized escape sequence. */
4285 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4286 unsigned char c, unsigned int *flags,
4287 int check_invalid_charsets)
4289 /* (1) If we're at the end of a designation sequence, CS is the
4290 charset being designated and REG is the register to designate
4293 (2) If we're at the end of a locking-shift sequence, REG is
4294 the register to invoke and HALF (0 == left, 1 == right) is
4295 the half to invoke it into.
4297 (3) If we're at the end of a single-shift sequence, REG is
4298 the register to invoke. */
4299 Lisp_Object cs = Qnil;
4302 /* NOTE: This code does goto's all over the fucking place.
4303 The reason for this is that we're basically implementing
4304 a state machine here, and hierarchical languages like C
4305 don't really provide a clean way of doing this. */
4307 if (! (*flags & CODING_STATE_ESCAPE))
4308 /* At beginning of escape sequence; we need to reset our
4309 escape-state variables. */
4310 iso->esc = ISO_ESC_NOTHING;
4312 iso->output_literally = 0;
4313 iso->output_direction_sequence = 0;
4317 case ISO_ESC_NOTHING:
4318 iso->esc_bytes_index = 0;
4321 case ISO_CODE_ESC: /* Start escape sequence */
4322 *flags |= CODING_STATE_ESCAPE;
4326 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4327 *flags |= CODING_STATE_ESCAPE;
4328 iso->esc = ISO_ESC_5_11;
4331 case ISO_CODE_SO: /* locking shift 1 */
4334 case ISO_CODE_SI: /* locking shift 0 */
4338 case ISO_CODE_SS2: /* single shift */
4341 case ISO_CODE_SS3: /* single shift */
4345 default: /* Other control characters */
4352 /**** single shift ****/
4354 case 'N': /* single shift 2 */
4357 case 'O': /* single shift 3 */
4361 /**** locking shift ****/
4363 case '~': /* locking shift 1 right */
4366 case 'n': /* locking shift 2 */
4369 case '}': /* locking shift 2 right */
4372 case 'o': /* locking shift 3 */
4375 case '|': /* locking shift 3 right */
4379 #ifdef ENABLE_COMPOSITE_CHARS
4380 /**** composite ****/
4383 iso->esc = ISO_ESC_START_COMPOSITE;
4384 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4385 CODING_STATE_COMPOSITE;
4389 iso->esc = ISO_ESC_END_COMPOSITE;
4390 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4391 ~CODING_STATE_COMPOSITE;
4393 #endif /* ENABLE_COMPOSITE_CHARS */
4395 /**** directionality ****/
4398 iso->esc = ISO_ESC_5_11;
4401 /**** designation ****/
4403 case '$': /* multibyte charset prefix */
4404 iso->esc = ISO_ESC_2_4;
4408 if (0x28 <= c && c <= 0x2F)
4410 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4414 /* This function is called with CODESYS equal to nil when
4415 doing coding-system detection. */
4417 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4418 && fit_to_be_escape_quoted (c))
4420 iso->esc = ISO_ESC_LITERAL;
4421 *flags &= CODING_STATE_ISO2022_LOCK;
4431 /**** directionality ****/
4433 case ISO_ESC_5_11: /* ISO6429 direction control */
4436 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4437 goto directionality;
4439 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4440 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4441 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4445 case ISO_ESC_5_11_0:
4448 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4449 goto directionality;
4453 case ISO_ESC_5_11_1:
4456 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4457 goto directionality;
4461 case ISO_ESC_5_11_2:
4464 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4465 goto directionality;
4470 iso->esc = ISO_ESC_DIRECTIONALITY;
4471 /* Various junk here to attempt to preserve the direction sequences
4472 literally in the text if they would otherwise be swallowed due
4473 to invalid designations that don't show up as actual charset
4474 changes in the text. */
4475 if (iso->invalid_switch_dir)
4477 /* We already inserted a direction switch literally into the
4478 text. We assume (#### this may not be right) that the
4479 next direction switch is the one going the other way,
4480 and we need to output that literally as well. */
4481 iso->output_literally = 1;
4482 iso->invalid_switch_dir = 0;
4488 /* If we are in the thrall of an invalid designation,
4489 then stick the directionality sequence literally into the
4490 output stream so it ends up in the original text again. */
4491 for (jj = 0; jj < 4; jj++)
4492 if (iso->invalid_designated[jj])
4496 iso->output_literally = 1;
4497 iso->invalid_switch_dir = 1;
4500 /* Indicate that we haven't yet seen a valid designation,
4501 so that if a switch-dir is directly followed by an
4502 invalid designation, both get inserted literally. */
4503 iso->switched_dir_and_no_valid_charset_yet = 1;
4508 /**** designation ****/
4511 if (0x28 <= c && c <= 0x2F)
4513 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4516 if (0x40 <= c && c <= 0x42)
4518 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4519 *flags & CODING_STATE_R2L ?
4520 CHARSET_RIGHT_TO_LEFT :
4521 CHARSET_LEFT_TO_RIGHT);
4531 if (c < '0' || c > '~')
4532 return 0; /* bad final byte */
4534 if (iso->esc >= ISO_ESC_2_8 &&
4535 iso->esc <= ISO_ESC_2_15)
4537 type = ((iso->esc >= ISO_ESC_2_12) ?
4538 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4539 reg = (iso->esc - ISO_ESC_2_8) & 3;
4541 else if (iso->esc >= ISO_ESC_2_4_8 &&
4542 iso->esc <= ISO_ESC_2_4_15)
4544 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4545 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4546 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4550 /* Can this ever be reached? -slb */
4554 cs = CHARSET_BY_ATTRIBUTES (type, c,
4555 *flags & CODING_STATE_R2L ?
4556 CHARSET_RIGHT_TO_LEFT :
4557 CHARSET_LEFT_TO_RIGHT);
4563 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4567 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4568 /* can't invoke something that ain't there. */
4570 iso->esc = ISO_ESC_SINGLE_SHIFT;
4571 *flags &= CODING_STATE_ISO2022_LOCK;
4573 *flags |= CODING_STATE_SS2;
4575 *flags |= CODING_STATE_SS3;
4579 if (check_invalid_charsets &&
4580 !CHARSETP (iso->charset[reg]))
4581 /* can't invoke something that ain't there. */
4584 iso->register_right = reg;
4586 iso->register_left = reg;
4587 *flags &= CODING_STATE_ISO2022_LOCK;
4588 iso->esc = ISO_ESC_LOCKING_SHIFT;
4592 if (NILP (cs) && check_invalid_charsets)
4594 iso->invalid_designated[reg] = 1;
4595 iso->charset[reg] = Vcharset_ascii;
4596 iso->esc = ISO_ESC_DESIGNATE;
4597 *flags &= CODING_STATE_ISO2022_LOCK;
4598 iso->output_literally = 1;
4599 if (iso->switched_dir_and_no_valid_charset_yet)
4601 /* We encountered a switch-direction followed by an
4602 invalid designation. Ensure that the switch-direction
4603 gets outputted; otherwise it will probably get eaten
4604 when the text is written out again. */
4605 iso->switched_dir_and_no_valid_charset_yet = 0;
4606 iso->output_direction_sequence = 1;
4607 /* And make sure that the switch-dir going the other
4608 way gets outputted, as well. */
4609 iso->invalid_switch_dir = 1;
4613 /* This function is called with CODESYS equal to nil when
4614 doing coding-system detection. */
4615 if (!NILP (codesys))
4617 charset_conversion_spec_dynarr *dyn =
4618 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4624 for (i = 0; i < Dynarr_length (dyn); i++)
4626 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4627 if (EQ (cs, spec->from_charset))
4628 cs = spec->to_charset;
4633 iso->charset[reg] = cs;
4634 iso->esc = ISO_ESC_DESIGNATE;
4635 *flags &= CODING_STATE_ISO2022_LOCK;
4636 if (iso->invalid_designated[reg])
4638 iso->invalid_designated[reg] = 0;
4639 iso->output_literally = 1;
4641 if (iso->switched_dir_and_no_valid_charset_yet)
4642 iso->switched_dir_and_no_valid_charset_yet = 0;
4647 detect_coding_iso2022 (struct detection_state *st, const unsigned char *src,
4652 /* #### There are serious deficiencies in the recognition mechanism
4653 here. This needs to be much smarter if it's going to cut it.
4654 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4655 it should be detected as Latin-1.
4656 All the ISO2022 stuff in this file should be synced up with the
4657 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4658 Perhaps we should wait till R2L works in FSF Emacs? */
4660 if (!st->iso2022.initted)
4662 reset_iso2022 (Qnil, &st->iso2022.iso);
4663 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4664 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4665 CODING_CATEGORY_ISO_8_1_MASK |
4666 CODING_CATEGORY_ISO_8_2_MASK |
4667 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4668 st->iso2022.flags = 0;
4669 st->iso2022.high_byte_count = 0;
4670 st->iso2022.saw_single_shift = 0;
4671 st->iso2022.initted = 1;
4674 mask = st->iso2022.mask;
4681 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4682 st->iso2022.high_byte_count++;
4686 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4688 if (st->iso2022.high_byte_count & 1)
4689 /* odd number of high bytes; assume not iso-8-2 */
4690 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4692 st->iso2022.high_byte_count = 0;
4693 st->iso2022.saw_single_shift = 0;
4695 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4697 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4698 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4699 { /* control chars */
4702 /* Allow and ignore control characters that you might
4703 reasonably see in a text file */
4708 case 8: /* backspace */
4709 case 11: /* vertical tab */
4710 case 12: /* form feed */
4711 case 26: /* MS-DOS C-z junk */
4712 case 31: /* '^_' -- for info */
4713 goto label_continue_loop;
4720 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4723 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4724 &st->iso2022.flags, 0))
4726 switch (st->iso2022.iso.esc)
4728 case ISO_ESC_DESIGNATE:
4729 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4730 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4732 case ISO_ESC_LOCKING_SHIFT:
4733 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4734 goto ran_out_of_chars;
4735 case ISO_ESC_SINGLE_SHIFT:
4736 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4737 st->iso2022.saw_single_shift = 1;
4746 goto ran_out_of_chars;
4749 label_continue_loop:;
4758 postprocess_iso2022_mask (int mask)
4760 /* #### kind of cheesy */
4761 /* If seven-bit ISO is allowed, then assume that the encoding is
4762 entirely seven-bit and turn off the eight-bit ones. */
4763 if (mask & CODING_CATEGORY_ISO_7_MASK)
4764 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4765 CODING_CATEGORY_ISO_8_1_MASK |
4766 CODING_CATEGORY_ISO_8_2_MASK);
4770 /* If FLAGS is a null pointer or specifies right-to-left motion,
4771 output a switch-dir-to-left-to-right sequence to DST.
4772 Also update FLAGS if it is not a null pointer.
4773 If INTERNAL_P is set, we are outputting in internal format and
4774 need to handle the CSI differently. */
4777 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4778 unsigned_char_dynarr *dst,
4779 unsigned int *flags,
4782 if (!flags || (*flags & CODING_STATE_R2L))
4784 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4786 Dynarr_add (dst, ISO_CODE_ESC);
4787 Dynarr_add (dst, '[');
4789 else if (internal_p)
4790 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4792 Dynarr_add (dst, ISO_CODE_CSI);
4793 Dynarr_add (dst, '0');
4794 Dynarr_add (dst, ']');
4796 *flags &= ~CODING_STATE_R2L;
4800 /* If FLAGS is a null pointer or specifies a direction different from
4801 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4802 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4803 sequence to DST. Also update FLAGS if it is not a null pointer.
4804 If INTERNAL_P is set, we are outputting in internal format and
4805 need to handle the CSI differently. */
4808 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4809 unsigned_char_dynarr *dst, unsigned int *flags,
4812 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4813 direction == CHARSET_LEFT_TO_RIGHT)
4814 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4815 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4816 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4817 direction == CHARSET_RIGHT_TO_LEFT)
4819 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4821 Dynarr_add (dst, ISO_CODE_ESC);
4822 Dynarr_add (dst, '[');
4824 else if (internal_p)
4825 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4827 Dynarr_add (dst, ISO_CODE_CSI);
4828 Dynarr_add (dst, '2');
4829 Dynarr_add (dst, ']');
4831 *flags |= CODING_STATE_R2L;
4835 /* Convert ISO2022-format data to internal format. */
4838 decode_coding_iso2022 (Lstream *decoding, const unsigned char *src,
4839 unsigned_char_dynarr *dst, unsigned int n)
4841 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4842 unsigned int flags = str->flags;
4843 unsigned int ch = str->ch;
4844 eol_type_t eol_type = str->eol_type;
4845 #ifdef ENABLE_COMPOSITE_CHARS
4846 unsigned_char_dynarr *real_dst = dst;
4848 Lisp_Object coding_system;
4850 XSETCODING_SYSTEM (coding_system, str->codesys);
4852 #ifdef ENABLE_COMPOSITE_CHARS
4853 if (flags & CODING_STATE_COMPOSITE)
4854 dst = str->iso2022.composite_chars;
4855 #endif /* ENABLE_COMPOSITE_CHARS */
4859 unsigned char c = *src++;
4860 if (flags & CODING_STATE_ESCAPE)
4861 { /* Within ESC sequence */
4862 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4867 switch (str->iso2022.esc)
4869 #ifdef ENABLE_COMPOSITE_CHARS
4870 case ISO_ESC_START_COMPOSITE:
4871 if (str->iso2022.composite_chars)
4872 Dynarr_reset (str->iso2022.composite_chars);
4874 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4875 dst = str->iso2022.composite_chars;
4877 case ISO_ESC_END_COMPOSITE:
4879 Bufbyte comstr[MAX_EMCHAR_LEN];
4881 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4882 Dynarr_length (dst));
4884 len = set_charptr_emchar (comstr, emch);
4885 Dynarr_add_many (dst, comstr, len);
4888 #endif /* ENABLE_COMPOSITE_CHARS */
4890 case ISO_ESC_LITERAL:
4891 DECODE_ADD_BINARY_CHAR (c, dst);
4895 /* Everything else handled already */
4900 /* Attempted error recovery. */
4901 if (str->iso2022.output_direction_sequence)
4902 ensure_correct_direction (flags & CODING_STATE_R2L ?
4903 CHARSET_RIGHT_TO_LEFT :
4904 CHARSET_LEFT_TO_RIGHT,
4905 str->codesys, dst, 0, 1);
4906 /* More error recovery. */
4907 if (!retval || str->iso2022.output_literally)
4909 /* Output the (possibly invalid) sequence */
4911 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4912 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4913 flags &= CODING_STATE_ISO2022_LOCK;
4915 n++, src--;/* Repeat the loop with the same character. */
4918 /* No sense in reprocessing the final byte of the
4919 escape sequence; it could mess things up anyway.
4921 DECODE_ADD_BINARY_CHAR (c, dst);
4926 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4927 { /* Control characters */
4929 /***** Error-handling *****/
4931 /* If we were in the middle of a character, dump out the
4932 partial character. */
4933 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4935 /* If we just saw a single-shift character, dump it out.
4936 This may dump out the wrong sort of single-shift character,
4937 but least it will give an indication that something went
4939 if (flags & CODING_STATE_SS2)
4941 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4942 flags &= ~CODING_STATE_SS2;
4944 if (flags & CODING_STATE_SS3)
4946 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4947 flags &= ~CODING_STATE_SS3;
4950 /***** Now handle the control characters. *****/
4953 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4955 flags &= CODING_STATE_ISO2022_LOCK;
4957 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4958 DECODE_ADD_BINARY_CHAR (c, dst);
4961 { /* Graphic characters */
4962 Lisp_Object charset;
4966 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4968 /* Now determine the charset. */
4969 reg = ((flags & CODING_STATE_SS2) ? 2
4970 : (flags & CODING_STATE_SS3) ? 3
4971 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4972 : str->iso2022.register_left);
4973 charset = str->iso2022.charset[reg];
4975 /* Error checking: */
4976 if (! CHARSETP (charset)
4977 || str->iso2022.invalid_designated[reg]
4978 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4979 && XCHARSET_CHARS (charset) == 94))
4980 /* Mrmph. We are trying to invoke a register that has no
4981 or an invalid charset in it, or trying to add a character
4982 outside the range of the charset. Insert that char literally
4983 to preserve it for the output. */
4985 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4986 DECODE_ADD_BINARY_CHAR (c, dst);
4991 /* Things are probably hunky-dorey. */
4993 /* Fetch reverse charset, maybe. */
4994 if (((flags & CODING_STATE_R2L) &&
4995 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4997 (!(flags & CODING_STATE_R2L) &&
4998 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5000 Lisp_Object new_charset =
5001 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5002 if (!NILP (new_charset))
5003 charset = new_charset;
5006 lb = XCHARSET_LEADING_BYTE (charset);
5007 switch (XCHARSET_REP_BYTES (charset))
5010 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5011 Dynarr_add (dst, c & 0x7F);
5014 case 2: /* one-byte official */
5015 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5016 Dynarr_add (dst, lb);
5017 Dynarr_add (dst, c | 0x80);
5020 case 3: /* one-byte private or two-byte official */
5021 if (XCHARSET_PRIVATE_P (charset))
5023 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5024 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5025 Dynarr_add (dst, lb);
5026 Dynarr_add (dst, c | 0x80);
5032 Dynarr_add (dst, lb);
5033 Dynarr_add (dst, ch | 0x80);
5034 Dynarr_add (dst, c | 0x80);
5042 default: /* two-byte private */
5045 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5046 Dynarr_add (dst, lb);
5047 Dynarr_add (dst, ch | 0x80);
5048 Dynarr_add (dst, c | 0x80);
5057 flags &= CODING_STATE_ISO2022_LOCK;
5060 label_continue_loop:;
5063 if (flags & CODING_STATE_END)
5064 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5071 /***** ISO2022 encoder *****/
5073 /* Designate CHARSET into register REG. */
5076 iso2022_designate (Lisp_Object charset, unsigned char reg,
5077 struct encoding_stream *str, unsigned_char_dynarr *dst)
5079 static const char inter94[] = "()*+";
5080 static const char inter96[] = ",-./";
5082 unsigned char final;
5083 Lisp_Object old_charset = str->iso2022.charset[reg];
5085 str->iso2022.charset[reg] = charset;
5086 if (!CHARSETP (charset))
5087 /* charset might be an initial nil or t. */
5089 type = XCHARSET_TYPE (charset);
5090 final = XCHARSET_FINAL (charset);
5091 if (!str->iso2022.force_charset_on_output[reg] &&
5092 CHARSETP (old_charset) &&
5093 XCHARSET_TYPE (old_charset) == type &&
5094 XCHARSET_FINAL (old_charset) == final)
5097 str->iso2022.force_charset_on_output[reg] = 0;
5100 charset_conversion_spec_dynarr *dyn =
5101 str->codesys->iso2022.output_conv;
5107 for (i = 0; i < Dynarr_length (dyn); i++)
5109 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5110 if (EQ (charset, spec->from_charset))
5111 charset = spec->to_charset;
5116 Dynarr_add (dst, ISO_CODE_ESC);
5119 case CHARSET_TYPE_94:
5120 Dynarr_add (dst, inter94[reg]);
5122 case CHARSET_TYPE_96:
5123 Dynarr_add (dst, inter96[reg]);
5125 case CHARSET_TYPE_94X94:
5126 Dynarr_add (dst, '$');
5128 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5131 Dynarr_add (dst, inter94[reg]);
5133 case CHARSET_TYPE_96X96:
5134 Dynarr_add (dst, '$');
5135 Dynarr_add (dst, inter96[reg]);
5138 Dynarr_add (dst, final);
5142 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5144 if (str->iso2022.register_left != 0)
5146 Dynarr_add (dst, ISO_CODE_SI);
5147 str->iso2022.register_left = 0;
5152 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5154 if (str->iso2022.register_left != 1)
5156 Dynarr_add (dst, ISO_CODE_SO);
5157 str->iso2022.register_left = 1;
5161 /* Convert internally-formatted data to ISO2022 format. */
5164 encode_coding_iso2022 (Lstream *encoding, const unsigned char *src,
5165 unsigned_char_dynarr *dst, unsigned int n)
5167 unsigned char charmask, c;
5168 unsigned char char_boundary;
5169 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5170 unsigned int flags = str->flags;
5171 unsigned int ch = str->ch;
5172 Lisp_Coding_System *codesys = str->codesys;
5173 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5175 Lisp_Object charset;
5178 #ifdef ENABLE_COMPOSITE_CHARS
5179 /* flags for handling composite chars. We do a little switcharoo
5180 on the source while we're outputting the composite char. */
5181 unsigned int saved_n = 0;
5182 const unsigned char *saved_src = NULL;
5183 int in_composite = 0;
5184 #endif /* ENABLE_COMPOSITE_CHARS */
5186 char_boundary = str->iso2022.current_char_boundary;
5187 charset = str->iso2022.current_charset;
5188 half = str->iso2022.current_half;
5190 #ifdef ENABLE_COMPOSITE_CHARS
5197 if (BYTE_ASCII_P (c))
5198 { /* Processing ASCII character */
5201 restore_left_to_right_direction (codesys, dst, &flags, 0);
5203 /* Make sure G0 contains ASCII */
5204 if ((c > ' ' && c < ISO_CODE_DEL) ||
5205 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5207 ensure_normal_shift (str, dst);
5208 iso2022_designate (Vcharset_ascii, 0, str, dst);
5211 /* If necessary, restore everything to the default state
5214 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5216 restore_left_to_right_direction (codesys, dst, &flags, 0);
5218 ensure_normal_shift (str, dst);
5220 for (i = 0; i < 4; i++)
5222 Lisp_Object initial_charset =
5223 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5224 iso2022_designate (initial_charset, i, str, dst);
5229 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5230 Dynarr_add (dst, '\r');
5231 if (eol_type != EOL_CR)
5232 Dynarr_add (dst, c);
5236 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5237 && fit_to_be_escape_quoted (c))
5238 Dynarr_add (dst, ISO_CODE_ESC);
5239 Dynarr_add (dst, c);
5244 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5245 { /* Processing Leading Byte */
5247 charset = CHARSET_BY_LEADING_BYTE (c);
5248 if (LEADING_BYTE_PREFIX_P(c))
5250 else if (!EQ (charset, Vcharset_control_1)
5251 #ifdef ENABLE_COMPOSITE_CHARS
5252 && !EQ (charset, Vcharset_composite)
5258 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5259 codesys, dst, &flags, 0);
5261 /* Now determine which register to use. */
5263 for (i = 0; i < 4; i++)
5265 if (EQ (charset, str->iso2022.charset[i]) ||
5267 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5276 if (XCHARSET_GRAPHIC (charset) != 0)
5278 if (!NILP (str->iso2022.charset[1]) &&
5279 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5280 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5282 else if (!NILP (str->iso2022.charset[2]))
5284 else if (!NILP (str->iso2022.charset[3]))
5293 iso2022_designate (charset, reg, str, dst);
5295 /* Now invoke that register. */
5299 ensure_normal_shift (str, dst);
5304 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5306 ensure_shift_out (str, dst);
5314 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5316 Dynarr_add (dst, ISO_CODE_ESC);
5317 Dynarr_add (dst, 'N');
5322 Dynarr_add (dst, ISO_CODE_SS2);
5328 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5330 Dynarr_add (dst, ISO_CODE_ESC);
5331 Dynarr_add (dst, 'O');
5336 Dynarr_add (dst, ISO_CODE_SS3);
5348 { /* Processing Non-ASCII character */
5349 charmask = (half == 0 ? 0x7F : 0xFF);
5351 if (EQ (charset, Vcharset_control_1))
5353 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5354 && fit_to_be_escape_quoted (c))
5355 Dynarr_add (dst, ISO_CODE_ESC);
5356 /* you asked for it ... */
5357 Dynarr_add (dst, c - 0x20);
5361 switch (XCHARSET_REP_BYTES (charset))
5364 Dynarr_add (dst, c & charmask);
5367 if (XCHARSET_PRIVATE_P (charset))
5369 Dynarr_add (dst, c & charmask);
5374 #ifdef ENABLE_COMPOSITE_CHARS
5375 if (EQ (charset, Vcharset_composite))
5379 /* #### Bother! We don't know how to
5381 Dynarr_add (dst, '~');
5385 Emchar emch = MAKE_CHAR (Vcharset_composite,
5386 ch & 0x7F, c & 0x7F);
5387 Lisp_Object lstr = composite_char_string (emch);
5391 src = XSTRING_DATA (lstr);
5392 n = XSTRING_LENGTH (lstr);
5393 Dynarr_add (dst, ISO_CODE_ESC);
5394 Dynarr_add (dst, '0'); /* start composing */
5398 #endif /* ENABLE_COMPOSITE_CHARS */
5400 Dynarr_add (dst, ch & charmask);
5401 Dynarr_add (dst, c & charmask);
5414 Dynarr_add (dst, ch & charmask);
5415 Dynarr_add (dst, c & charmask);
5431 #ifdef ENABLE_COMPOSITE_CHARS
5437 Dynarr_add (dst, ISO_CODE_ESC);
5438 Dynarr_add (dst, '1'); /* end composing */
5439 goto back_to_square_n; /* Wheeeeeeeee ..... */
5441 #endif /* ENABLE_COMPOSITE_CHARS */
5443 if (char_boundary && flags & CODING_STATE_END)
5445 restore_left_to_right_direction (codesys, dst, &flags, 0);
5446 ensure_normal_shift (str, dst);
5447 for (i = 0; i < 4; i++)
5449 Lisp_Object initial_charset =
5450 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5451 iso2022_designate (initial_charset, i, str, dst);
5457 str->iso2022.current_char_boundary = char_boundary;
5458 str->iso2022.current_charset = charset;
5459 str->iso2022.current_half = half;
5461 /* Verbum caro factum est! */
5465 /************************************************************************/
5466 /* No-conversion methods */
5467 /************************************************************************/
5469 /* This is used when reading in "binary" files -- i.e. files that may
5470 contain all 256 possible byte values and that are not to be
5471 interpreted as being in any particular decoding. */
5473 decode_coding_no_conversion (Lstream *decoding, const unsigned char *src,
5474 unsigned_char_dynarr *dst, unsigned int n)
5477 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5478 unsigned int flags = str->flags;
5479 unsigned int ch = str->ch;
5480 eol_type_t eol_type = str->eol_type;
5486 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5487 DECODE_ADD_BINARY_CHAR (c, dst);
5488 label_continue_loop:;
5491 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5498 encode_coding_no_conversion (Lstream *encoding, const unsigned char *src,
5499 unsigned_char_dynarr *dst, unsigned int n)
5502 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5503 unsigned int flags = str->flags;
5504 unsigned int ch = str->ch;
5505 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5512 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5513 Dynarr_add (dst, '\r');
5514 if (eol_type != EOL_CR)
5515 Dynarr_add (dst, '\n');
5518 else if (BYTE_ASCII_P (c))
5521 Dynarr_add (dst, c);
5523 else if (BUFBYTE_LEADING_BYTE_P (c))
5526 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5527 c == LEADING_BYTE_CONTROL_1)
5530 Dynarr_add (dst, '~'); /* untranslatable character */
5534 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5535 Dynarr_add (dst, c);
5536 else if (ch == LEADING_BYTE_CONTROL_1)
5539 Dynarr_add (dst, c - 0x20);
5541 /* else it should be the second or third byte of an
5542 untranslatable character, so ignore it */
5553 /************************************************************************/
5554 /* Initialization */
5555 /************************************************************************/
5558 syms_of_file_coding (void)
5560 INIT_LRECORD_IMPLEMENTATION (coding_system);
5562 deferror (&Qcoding_system_error, "coding-system-error",
5563 "Coding-system error", Qio_error);
5565 DEFSUBR (Fcoding_system_p);
5566 DEFSUBR (Ffind_coding_system);
5567 DEFSUBR (Fget_coding_system);
5568 DEFSUBR (Fcoding_system_list);
5569 DEFSUBR (Fcoding_system_name);
5570 DEFSUBR (Fmake_coding_system);
5571 DEFSUBR (Fcopy_coding_system);
5572 DEFSUBR (Fcoding_system_canonical_name_p);
5573 DEFSUBR (Fcoding_system_alias_p);
5574 DEFSUBR (Fcoding_system_aliasee);
5575 DEFSUBR (Fdefine_coding_system_alias);
5576 DEFSUBR (Fsubsidiary_coding_system);
5578 DEFSUBR (Fcoding_system_type);
5579 DEFSUBR (Fcoding_system_doc_string);
5581 DEFSUBR (Fcoding_system_charset);
5583 DEFSUBR (Fcoding_system_property);
5585 DEFSUBR (Fcoding_category_list);
5586 DEFSUBR (Fset_coding_priority_list);
5587 DEFSUBR (Fcoding_priority_list);
5588 DEFSUBR (Fset_coding_category_system);
5589 DEFSUBR (Fcoding_category_system);
5591 DEFSUBR (Fdetect_coding_region);
5592 DEFSUBR (Fdecode_coding_region);
5593 DEFSUBR (Fencode_coding_region);
5595 DEFSUBR (Fdecode_shift_jis_char);
5596 DEFSUBR (Fencode_shift_jis_char);
5597 DEFSUBR (Fdecode_big5_char);
5598 DEFSUBR (Fencode_big5_char);
5599 DEFSUBR (Fset_ucs_char);
5600 DEFSUBR (Fucs_char);
5601 DEFSUBR (Fset_char_ucs);
5602 DEFSUBR (Fchar_ucs);
5604 defsymbol (&Qcoding_systemp, "coding-system-p");
5605 defsymbol (&Qno_conversion, "no-conversion");
5606 defsymbol (&Qraw_text, "raw-text");
5608 defsymbol (&Qbig5, "big5");
5609 defsymbol (&Qshift_jis, "shift-jis");
5610 defsymbol (&Qucs4, "ucs-4");
5611 defsymbol (&Qutf8, "utf-8");
5612 defsymbol (&Qccl, "ccl");
5613 defsymbol (&Qiso2022, "iso2022");
5615 defsymbol (&Qmnemonic, "mnemonic");
5616 defsymbol (&Qeol_type, "eol-type");
5617 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5618 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5620 defsymbol (&Qcr, "cr");
5621 defsymbol (&Qlf, "lf");
5622 defsymbol (&Qcrlf, "crlf");
5623 defsymbol (&Qeol_cr, "eol-cr");
5624 defsymbol (&Qeol_lf, "eol-lf");
5625 defsymbol (&Qeol_crlf, "eol-crlf");
5627 defsymbol (&Qcharset_g0, "charset-g0");
5628 defsymbol (&Qcharset_g1, "charset-g1");
5629 defsymbol (&Qcharset_g2, "charset-g2");
5630 defsymbol (&Qcharset_g3, "charset-g3");
5631 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5632 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5633 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5634 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5635 defsymbol (&Qno_iso6429, "no-iso6429");
5636 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5637 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5639 defsymbol (&Qshort, "short");
5640 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5641 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5642 defsymbol (&Qseven, "seven");
5643 defsymbol (&Qlock_shift, "lock-shift");
5644 defsymbol (&Qescape_quoted, "escape-quoted");
5646 defsymbol (&Qencode, "encode");
5647 defsymbol (&Qdecode, "decode");
5650 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5652 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5654 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5656 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5658 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5660 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5662 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5664 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5666 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5669 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5674 lstream_type_create_file_coding (void)
5676 LSTREAM_HAS_METHOD (decoding, reader);
5677 LSTREAM_HAS_METHOD (decoding, writer);
5678 LSTREAM_HAS_METHOD (decoding, rewinder);
5679 LSTREAM_HAS_METHOD (decoding, seekable_p);
5680 LSTREAM_HAS_METHOD (decoding, flusher);
5681 LSTREAM_HAS_METHOD (decoding, closer);
5682 LSTREAM_HAS_METHOD (decoding, marker);
5684 LSTREAM_HAS_METHOD (encoding, reader);
5685 LSTREAM_HAS_METHOD (encoding, writer);
5686 LSTREAM_HAS_METHOD (encoding, rewinder);
5687 LSTREAM_HAS_METHOD (encoding, seekable_p);
5688 LSTREAM_HAS_METHOD (encoding, flusher);
5689 LSTREAM_HAS_METHOD (encoding, closer);
5690 LSTREAM_HAS_METHOD (encoding, marker);
5694 vars_of_file_coding (void)
5698 fcd = xnew (struct file_coding_dump);
5699 dumpstruct (&fcd, &fcd_description);
5701 /* Initialize to something reasonable ... */
5702 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5704 fcd->coding_category_system[i] = Qnil;
5705 fcd->coding_category_by_priority[i] = i;
5708 Fprovide (intern ("file-coding"));
5710 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5711 Coding system used for TTY keyboard input.
5712 Not used under a windowing system.
5714 Vkeyboard_coding_system = Qnil;
5716 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5717 Coding system used for TTY display output.
5718 Not used under a windowing system.
5720 Vterminal_coding_system = Qnil;
5722 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5723 Overriding coding system used when reading from a file or process.
5724 You should bind this variable with `let', but do not set it globally.
5725 If this is non-nil, it specifies the coding system that will be used
5726 to decode input on read operations, such as from a file or process.
5727 It overrides `buffer-file-coding-system-for-read',
5728 `insert-file-contents-pre-hook', etc. Use those variables instead of
5729 this one for permanent changes to the environment. */ );
5730 Vcoding_system_for_read = Qnil;
5732 DEFVAR_LISP ("coding-system-for-write",
5733 &Vcoding_system_for_write /*
5734 Overriding coding system used when writing to a file or process.
5735 You should bind this variable with `let', but do not set it globally.
5736 If this is non-nil, it specifies the coding system that will be used
5737 to encode output for write operations, such as to a file or process.
5738 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5739 Use those variables instead of this one for permanent changes to the
5741 Vcoding_system_for_write = Qnil;
5743 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5744 Coding system used to convert pathnames when accessing files.
5746 Vfile_name_coding_system = Qnil;
5748 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5749 Non-nil means the buffer contents are regarded as multi-byte form
5750 of characters, not a binary code. This affects the display, file I/O,
5751 and behaviors of various editing commands.
5753 Setting this to nil does not do anything.
5755 enable_multibyte_characters = 1;
5759 complex_vars_of_file_coding (void)
5761 staticpro (&Vcoding_system_hash_table);
5762 Vcoding_system_hash_table =
5763 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5765 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5766 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5768 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5770 struct codesys_prop csp; \
5772 csp.prop_type = (Prop_Type); \
5773 Dynarr_add (the_codesys_prop_dynarr, csp); \
5776 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5777 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5778 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5779 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5780 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5781 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5782 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5784 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5785 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5786 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5787 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5788 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5789 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5790 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5791 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5792 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5793 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5794 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5795 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5796 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5797 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5798 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5799 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5800 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5802 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5803 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5805 /* Need to create this here or we're really screwed. */
5807 (Qraw_text, Qno_conversion,
5808 build_string ("Raw text, which means it converts only line-break-codes."),
5809 list2 (Qmnemonic, build_string ("Raw")));
5812 (Qbinary, Qno_conversion,
5813 build_string ("Binary, which means it does not convert anything."),
5814 list4 (Qeol_type, Qlf,
5815 Qmnemonic, build_string ("Binary")));
5817 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5819 Fdefine_coding_system_alias (Qfile_name, Qbinary);
5821 Fdefine_coding_system_alias (Qterminal, Qbinary);
5822 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5824 /* Need this for bootstrapping */
5825 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5826 Fget_coding_system (Qraw_text);
5832 for (i = 0; i < 65536; i++)
5833 fcd->ucs_to_mule_table[i] = Qnil;
5835 staticpro (&mule_to_ucs_table);
5836 mule_to_ucs_table = Fmake_char_table(Qgeneric);