1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
61 #if defined(MULE) && !defined(UTF2000)
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
68 #if defined(MULE) && !defined(UTF2000)
69 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), 65536 },
74 static const struct struct_description fcd_description = {
75 sizeof (struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis (struct detection_state *st,
179 const unsigned char *src,
181 static void decode_coding_sjis (Lstream *decoding,
182 const unsigned char *src,
183 unsigned_char_dynarr *dst,
185 static void encode_coding_sjis (Lstream *encoding,
186 const unsigned char *src,
187 unsigned_char_dynarr *dst,
189 static int detect_coding_big5 (struct detection_state *st,
190 const unsigned char *src,
192 static void decode_coding_big5 (Lstream *decoding,
193 const unsigned char *src,
194 unsigned_char_dynarr *dst, unsigned int n);
195 static void encode_coding_big5 (Lstream *encoding,
196 const unsigned char *src,
197 unsigned_char_dynarr *dst, unsigned int n);
198 static int detect_coding_ucs4 (struct detection_state *st,
199 const unsigned char *src,
201 static void decode_coding_ucs4 (Lstream *decoding,
202 const unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
204 static void encode_coding_ucs4 (Lstream *encoding,
205 const unsigned char *src,
206 unsigned_char_dynarr *dst, unsigned int n);
207 static int detect_coding_utf8 (struct detection_state *st,
208 const unsigned char *src,
210 static void decode_coding_utf8 (Lstream *decoding,
211 const unsigned char *src,
212 unsigned_char_dynarr *dst, unsigned int n);
213 static void encode_coding_utf8 (Lstream *encoding,
214 const unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
216 static int postprocess_iso2022_mask (int mask);
217 static void reset_iso2022 (Lisp_Object coding_system,
218 struct iso2022_decoder *iso);
219 static int detect_coding_iso2022 (struct detection_state *st,
220 const unsigned char *src,
222 static void decode_coding_iso2022 (Lstream *decoding,
223 const unsigned char *src,
224 unsigned_char_dynarr *dst, unsigned int n);
225 static void encode_coding_iso2022 (Lstream *encoding,
226 const unsigned char *src,
227 unsigned_char_dynarr *dst, unsigned int n);
229 static void decode_coding_no_conversion (Lstream *decoding,
230 const unsigned char *src,
231 unsigned_char_dynarr *dst,
233 static void encode_coding_no_conversion (Lstream *encoding,
234 const unsigned char *src,
235 unsigned_char_dynarr *dst,
237 static void mule_decode (Lstream *decoding, const unsigned char *src,
238 unsigned_char_dynarr *dst, unsigned int n);
239 static void mule_encode (Lstream *encoding, const unsigned char *src,
240 unsigned_char_dynarr *dst, unsigned int n);
242 typedef struct codesys_prop codesys_prop;
251 Dynarr_declare (codesys_prop);
252 } codesys_prop_dynarr;
254 static const struct lrecord_description codesys_prop_description_1[] = {
255 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
259 static const struct struct_description codesys_prop_description = {
260 sizeof (codesys_prop),
261 codesys_prop_description_1
264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
265 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
269 static const struct struct_description codesys_prop_dynarr_description = {
270 sizeof (codesys_prop_dynarr),
271 codesys_prop_dynarr_description_1
274 codesys_prop_dynarr *the_codesys_prop_dynarr;
276 enum codesys_prop_enum
279 CODESYS_PROP_ISO2022,
284 /************************************************************************/
285 /* Coding system functions */
286 /************************************************************************/
288 static Lisp_Object mark_coding_system (Lisp_Object);
289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
290 static void finalize_coding_system (void *header, int for_disksave);
293 static const struct lrecord_description ccs_description_1[] = {
294 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
295 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
299 static const struct struct_description ccs_description = {
300 sizeof (charset_conversion_spec),
304 static const struct lrecord_description ccsd_description_1[] = {
305 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
309 static const struct struct_description ccsd_description = {
310 sizeof (charset_conversion_spec_dynarr),
315 static const struct lrecord_description coding_system_description[] = {
316 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
317 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
318 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
319 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
320 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
321 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
325 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
326 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
327 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
335 mark_coding_system, print_coding_system,
336 finalize_coding_system,
337 0, 0, coding_system_description,
341 mark_coding_system (Lisp_Object obj)
343 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
345 mark_object (CODING_SYSTEM_NAME (codesys));
346 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
347 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
348 mark_object (CODING_SYSTEM_EOL_LF (codesys));
349 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
350 mark_object (CODING_SYSTEM_EOL_CR (codesys));
352 switch (CODING_SYSTEM_TYPE (codesys))
356 case CODESYS_ISO2022:
357 for (i = 0; i < 4; i++)
358 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
359 if (codesys->iso2022.input_conv)
361 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
363 struct charset_conversion_spec *ccs =
364 Dynarr_atp (codesys->iso2022.input_conv, i);
365 mark_object (ccs->from_charset);
366 mark_object (ccs->to_charset);
369 if (codesys->iso2022.output_conv)
371 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
373 struct charset_conversion_spec *ccs =
374 Dynarr_atp (codesys->iso2022.output_conv, i);
375 mark_object (ccs->from_charset);
376 mark_object (ccs->to_charset);
382 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
383 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
390 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
398 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
400 error ("printing unreadable object #<coding_system 0x%x>",
403 write_c_string ("#<coding_system ", printcharfun);
404 print_internal (c->name, printcharfun, 1);
405 write_c_string (">", printcharfun);
409 finalize_coding_system (void *header, int for_disksave)
411 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
412 /* Since coding systems never go away, this function is not
413 necessary. But it would be necessary if we changed things
414 so that coding systems could go away. */
415 if (!for_disksave) /* see comment in lstream.c */
417 switch (CODING_SYSTEM_TYPE (c))
420 case CODESYS_ISO2022:
421 if (c->iso2022.input_conv)
423 Dynarr_free (c->iso2022.input_conv);
424 c->iso2022.input_conv = 0;
426 if (c->iso2022.output_conv)
428 Dynarr_free (c->iso2022.output_conv);
429 c->iso2022.output_conv = 0;
440 symbol_to_eol_type (Lisp_Object symbol)
442 CHECK_SYMBOL (symbol);
443 if (NILP (symbol)) return EOL_AUTODETECT;
444 if (EQ (symbol, Qlf)) return EOL_LF;
445 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
446 if (EQ (symbol, Qcr)) return EOL_CR;
448 signal_simple_error ("Unrecognized eol type", symbol);
449 return EOL_AUTODETECT; /* not reached */
453 eol_type_to_symbol (eol_type_t type)
458 case EOL_LF: return Qlf;
459 case EOL_CRLF: return Qcrlf;
460 case EOL_CR: return Qcr;
461 case EOL_AUTODETECT: return Qnil;
466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
468 Lisp_Object codesys_obj;
469 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
470 char *codesys_name = (char *) alloca (len + 7);
472 char *codesys_mnemonic=0;
474 Lisp_Object codesys_name_sym, sub_codesys_obj;
478 XSETCODING_SYSTEM (codesys_obj, codesys);
480 memcpy (codesys_name,
481 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
483 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
485 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
486 codesys_mnemonic = (char *) alloca (mlen + 7);
487 memcpy (codesys_mnemonic,
488 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
492 strcpy (codesys_name + len, "-" op_sys); \
494 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
495 codesys_name_sym = intern (codesys_name); \
496 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
497 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
499 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
500 build_string (codesys_mnemonic); \
501 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
504 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
505 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
506 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
510 Return t if OBJECT is a coding system.
511 A coding system is an object that defines how text containing multiple
512 character sets is encoded into a stream of (typically 8-bit) bytes.
513 The coding system is used to decode the stream into a series of
514 characters (which may be from multiple charsets) when the text is read
515 from a file or process, and is used to encode the text back into the
516 same format when it is written out to a file or process.
518 For example, many ISO2022-compliant coding systems (such as Compound
519 Text, which is used for inter-client data under the X Window System)
520 use escape sequences to switch between different charsets -- Japanese
521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
523 `make-coding-system' for more information.
525 Coding systems are normally identified using a symbol, and the
526 symbol is accepted in place of the actual coding system object whenever
527 a coding system is called for. (This is similar to how faces work.)
531 return CODING_SYSTEMP (object) ? Qt : Qnil;
534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
535 Retrieve the coding system of the given name.
537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
538 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
539 If there is no such coding system, nil is returned. Otherwise the
540 associated coding system object is returned.
542 (coding_system_or_name))
544 if (NILP (coding_system_or_name))
545 coding_system_or_name = Qbinary;
546 else if (CODING_SYSTEMP (coding_system_or_name))
547 return coding_system_or_name;
549 CHECK_SYMBOL (coding_system_or_name);
553 coding_system_or_name =
554 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
556 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
557 return coding_system_or_name;
561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
562 Retrieve the coding system of the given name.
563 Same as `find-coding-system' except that if there is no such
564 coding system, an error is signaled instead of returning nil.
568 Lisp_Object coding_system = Ffind_coding_system (name);
570 if (NILP (coding_system))
571 signal_simple_error ("No such coding system", name);
572 return coding_system;
575 /* We store the coding systems in hash tables with the names as the key and the
576 actual coding system object as the value. Occasionally we need to use them
577 in a list format. These routines provide us with that. */
578 struct coding_system_list_closure
580 Lisp_Object *coding_system_list;
584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
585 void *coding_system_list_closure)
587 /* This function can GC */
588 struct coding_system_list_closure *cscl =
589 (struct coding_system_list_closure *) coding_system_list_closure;
590 Lisp_Object *coding_system_list = cscl->coding_system_list;
592 *coding_system_list = Fcons (key, *coding_system_list);
596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
597 Return a list of the names of all defined coding systems.
601 Lisp_Object coding_system_list = Qnil;
603 struct coding_system_list_closure coding_system_list_closure;
605 GCPRO1 (coding_system_list);
606 coding_system_list_closure.coding_system_list = &coding_system_list;
607 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
608 &coding_system_list_closure);
611 return coding_system_list;
614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
615 Return the name of the given coding system.
619 coding_system = Fget_coding_system (coding_system);
620 return XCODING_SYSTEM_NAME (coding_system);
623 static Lisp_Coding_System *
624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
626 Lisp_Coding_System *codesys =
627 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
629 zero_lcrecord (codesys);
630 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
631 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
632 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
633 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
634 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
635 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
636 CODING_SYSTEM_TYPE (codesys) = type;
637 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
639 if (type == CODESYS_ISO2022)
642 for (i = 0; i < 4; i++)
643 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
645 else if (type == CODESYS_CCL)
647 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
648 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
651 CODING_SYSTEM_NAME (codesys) = name;
657 /* Given a list of charset conversion specs as specified in a Lisp
658 program, parse it into STORE_HERE. */
661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
662 Lisp_Object spec_list)
666 EXTERNAL_LIST_LOOP (rest, spec_list)
668 Lisp_Object car = XCAR (rest);
669 Lisp_Object from, to;
670 struct charset_conversion_spec spec;
672 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
673 signal_simple_error ("Invalid charset conversion spec", car);
674 from = Fget_charset (XCAR (car));
675 to = Fget_charset (XCAR (XCDR (car)));
676 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
677 signal_simple_error_2
678 ("Attempted conversion between different charset types",
680 spec.from_charset = from;
681 spec.to_charset = to;
683 Dynarr_add (store_here, spec);
687 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
688 specs, return the equivalent as the Lisp programmer would see it.
690 If LOAD_HERE is 0, return Qnil. */
693 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
700 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
702 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
703 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
706 return Fnreverse (result);
711 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
712 Register symbol NAME as a coding system.
714 TYPE describes the conversion method used and should be one of
717 Automatic conversion. XEmacs attempts to detect the coding system
720 No conversion. Use this for binary files and such. On output,
721 graphic characters that are not in ASCII or Latin-1 will be
722 replaced by a ?. (For a no-conversion-encoded buffer, these
723 characters will only be present if you explicitly insert them.)
725 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
727 ISO 10646 UCS-4 encoding.
729 ISO 10646 UTF-8 encoding.
731 Any ISO2022-compliant encoding. Among other things, this includes
732 JIS (the Japanese encoding commonly used for e-mail), EUC (the
733 standard Unix encoding for Japanese and other languages), and
734 Compound Text (the encoding used in X11). You can specify more
735 specific information about the conversion with the FLAGS argument.
737 Big5 (the encoding commonly used for Taiwanese).
739 The conversion is performed using a user-written pseudo-code
740 program. CCL (Code Conversion Language) is the name of this
743 Write out or read in the raw contents of the memory representing
744 the buffer's text. This is primarily useful for debugging
745 purposes, and is only enabled when XEmacs has been compiled with
746 DEBUG_XEMACS defined (via the --debug configure option).
747 WARNING: Reading in a file using 'internal conversion can result
748 in an internal inconsistency in the memory representing a
749 buffer's text, which will produce unpredictable results and may
750 cause XEmacs to crash. Under normal circumstances you should
751 never use 'internal conversion.
753 DOC-STRING is a string describing the coding system.
755 PROPS is a property list, describing the specific nature of the
756 character set. Recognized properties are:
759 String to be displayed in the modeline when this coding system is
763 End-of-line conversion to be used. It should be one of
766 Automatically detect the end-of-line type (LF, CRLF,
767 or CR). Also generate subsidiary coding systems named
768 `NAME-unix', `NAME-dos', and `NAME-mac', that are
769 identical to this coding system but have an EOL-TYPE
770 value of 'lf, 'crlf, and 'cr, respectively.
772 The end of a line is marked externally using ASCII LF.
773 Since this is also the way that XEmacs represents an
774 end-of-line internally, specifying this option results
775 in no end-of-line conversion. This is the standard
776 format for Unix text files.
778 The end of a line is marked externally using ASCII
779 CRLF. This is the standard format for MS-DOS text
782 The end of a line is marked externally using ASCII CR.
783 This is the standard format for Macintosh text files.
785 Automatically detect the end-of-line type but do not
786 generate subsidiary coding systems. (This value is
787 converted to nil when stored internally, and
788 `coding-system-property' will return nil.)
790 'post-read-conversion
791 Function called after a file has been read in, to perform the
792 decoding. Called with two arguments, BEG and END, denoting
793 a region of the current buffer to be decoded.
795 'pre-write-conversion
796 Function called before a file is written out, to perform the
797 encoding. Called with two arguments, BEG and END, denoting
798 a region of the current buffer to be encoded.
801 The following additional properties are recognized if TYPE is 'iso2022:
807 The character set initially designated to the G0 - G3 registers.
808 The value should be one of
810 -- A charset object (designate that character set)
811 -- nil (do not ever use this register)
812 -- t (no character set is initially designated to
813 the register, but may be later on; this automatically
814 sets the corresponding `force-g*-on-output' property)
820 If non-nil, send an explicit designation sequence on output before
821 using the specified register.
824 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
825 "ESC $ B" on output in place of the full designation sequences
826 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
829 If non-nil, don't designate ASCII to G0 at each end of line on output.
830 Setting this to non-nil also suppresses other state-resetting that
831 normally happens at the end of a line.
834 If non-nil, don't designate ASCII to G0 before control chars on output.
837 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
841 If non-nil, use locking-shift (SO/SI) instead of single-shift
842 or designation by escape sequence.
845 If non-nil, don't use ISO6429's direction specification.
848 If non-nil, literal control characters that are the same as
849 the beginning of a recognized ISO2022 or ISO6429 escape sequence
850 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
851 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
852 so that they can be properly distinguished from an escape sequence.
853 (Note that doing this results in a non-portable encoding.) This
854 encoding flag is used for byte-compiled files. Note that ESC
855 is a good choice for a quoting character because there are no
856 escape sequences whose second byte is a character from the Control-0
857 or Control-1 character sets; this is explicitly disallowed by the
860 'input-charset-conversion
861 A list of conversion specifications, specifying conversion of
862 characters in one charset to another when decoding is performed.
863 Each specification is a list of two elements: the source charset,
864 and the destination charset.
866 'output-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when encoding is performed.
869 The form of each specification is the same as for
870 'input-charset-conversion.
873 The following additional properties are recognized (and required)
877 CCL program used for decoding (converting to internal format).
880 CCL program used for encoding (converting to external format).
882 (name, type, doc_string, props))
884 Lisp_Coding_System *codesys;
885 Lisp_Object rest, key, value;
886 enum coding_system_type ty;
887 int need_to_setup_eol_systems = 1;
889 /* Convert type to constant */
890 if (NILP (type) || EQ (type, Qundecided))
891 { ty = CODESYS_AUTODETECT; }
893 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
894 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
895 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
896 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
897 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
898 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
900 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
902 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
905 signal_simple_error ("Invalid coding system type", type);
909 codesys = allocate_coding_system (ty, name);
911 if (NILP (doc_string))
912 doc_string = build_string ("");
914 CHECK_STRING (doc_string);
915 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
917 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
919 if (EQ (key, Qmnemonic))
922 CHECK_STRING (value);
923 CODING_SYSTEM_MNEMONIC (codesys) = value;
926 else if (EQ (key, Qeol_type))
928 need_to_setup_eol_systems = NILP (value);
931 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
934 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
935 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
937 else if (ty == CODESYS_ISO2022)
939 #define FROB_INITIAL_CHARSET(charset_num) \
940 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
941 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
943 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
944 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
945 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
946 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
948 #define FROB_FORCE_CHARSET(charset_num) \
949 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
951 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
952 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
953 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
954 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
956 #define FROB_BOOLEAN_PROPERTY(prop) \
957 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
959 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
960 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
961 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
962 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
963 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
964 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
965 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
967 else if (EQ (key, Qinput_charset_conversion))
969 codesys->iso2022.input_conv =
970 Dynarr_new (charset_conversion_spec);
971 parse_charset_conversion_specs (codesys->iso2022.input_conv,
974 else if (EQ (key, Qoutput_charset_conversion))
976 codesys->iso2022.output_conv =
977 Dynarr_new (charset_conversion_spec);
978 parse_charset_conversion_specs (codesys->iso2022.output_conv,
982 signal_simple_error ("Unrecognized property", key);
984 else if (EQ (type, Qccl))
986 if (EQ (key, Qdecode))
988 CHECK_VECTOR (value);
989 CODING_SYSTEM_CCL_DECODE (codesys) = value;
991 else if (EQ (key, Qencode))
993 CHECK_VECTOR (value);
994 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
997 signal_simple_error ("Unrecognized property", key);
1001 signal_simple_error ("Unrecognized property", key);
1004 if (need_to_setup_eol_systems)
1005 setup_eol_coding_systems (codesys);
1008 Lisp_Object codesys_obj;
1009 XSETCODING_SYSTEM (codesys_obj, codesys);
1010 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1015 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1016 Copy OLD-CODING-SYSTEM to NEW-NAME.
1017 If NEW-NAME does not name an existing coding system, a new one will
1020 (old_coding_system, new_name))
1022 Lisp_Object new_coding_system;
1023 old_coding_system = Fget_coding_system (old_coding_system);
1024 new_coding_system = Ffind_coding_system (new_name);
1025 if (NILP (new_coding_system))
1027 XSETCODING_SYSTEM (new_coding_system,
1028 allocate_coding_system
1029 (XCODING_SYSTEM_TYPE (old_coding_system),
1031 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1035 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1036 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1037 memcpy (((char *) to ) + sizeof (to->header),
1038 ((char *) from) + sizeof (from->header),
1039 sizeof (*from) - sizeof (from->header));
1040 to->name = new_name;
1042 return new_coding_system;
1045 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1046 Return t if OBJECT names a coding system, and is not a coding system alias.
1050 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1054 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1055 Return t if OBJECT is a coding system alias.
1056 All coding system aliases are created by `define-coding-system-alias'.
1060 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1064 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1065 Return the coding-system symbol for which symbol ALIAS is an alias.
1069 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1070 if (SYMBOLP (aliasee))
1073 signal_simple_error ("Symbol is not a coding system alias", alias);
1074 return Qnil; /* To keep the compiler happy */
1078 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1080 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1084 /* A maphash function, for removing dangling coding system aliases. */
1086 dangling_coding_system_alias_p (Lisp_Object alias,
1087 Lisp_Object aliasee,
1088 void *dangling_aliases)
1090 if (SYMBOLP (aliasee)
1091 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1093 (*(int *) dangling_aliases)++;
1100 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1101 Define symbol ALIAS as an alias for coding system ALIASEE.
1103 You can use this function to redefine an alias that has already been defined,
1104 but you cannot redefine a name which is the canonical name for a coding system.
1105 \(a canonical name of a coding system is what is returned when you call
1106 `coding-system-name' on a coding system).
1108 ALIASEE itself can be an alias, which allows you to define nested aliases.
1110 You are forbidden, however, from creating alias loops or `dangling' aliases.
1111 These will be detected, and an error will be signaled if you attempt to do so.
1113 If ALIASEE is nil, then ALIAS will simply be undefined.
1115 See also `coding-system-alias-p', `coding-system-aliasee',
1116 and `coding-system-canonical-name-p'.
1120 Lisp_Object real_coding_system, probe;
1122 CHECK_SYMBOL (alias);
1124 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1126 ("Symbol is the canonical name of a coding system and cannot be redefined",
1131 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1132 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1133 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1135 Fremhash (alias, Vcoding_system_hash_table);
1137 /* Undefine subsidiary aliases,
1138 presumably created by a previous call to this function */
1139 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1140 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1141 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1143 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1144 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1145 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1148 /* Undefine dangling coding system aliases. */
1150 int dangling_aliases;
1153 dangling_aliases = 0;
1154 elisp_map_remhash (dangling_coding_system_alias_p,
1155 Vcoding_system_hash_table,
1157 } while (dangling_aliases > 0);
1163 if (CODING_SYSTEMP (aliasee))
1164 aliasee = XCODING_SYSTEM_NAME (aliasee);
1166 /* Checks that aliasee names a coding-system */
1167 real_coding_system = Fget_coding_system (aliasee);
1169 /* Check for coding system alias loops */
1170 if (EQ (alias, aliasee))
1171 alias_loop: signal_simple_error_2
1172 ("Attempt to create a coding system alias loop", alias, aliasee);
1174 for (probe = aliasee;
1176 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1178 if (EQ (probe, alias))
1182 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1184 /* Set up aliases for subsidiaries.
1185 #### There must be a better way to handle subsidiary coding systems. */
1187 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1189 for (i = 0; i < countof (suffixes); i++)
1191 Lisp_Object alias_subsidiary =
1192 append_suffix_to_symbol (alias, suffixes[i]);
1193 Lisp_Object aliasee_subsidiary =
1194 append_suffix_to_symbol (aliasee, suffixes[i]);
1196 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1197 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1200 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1201 but it doesn't look intentional, so I'd rather return something
1202 meaningful or nothing at all. */
1207 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1209 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1210 Lisp_Object new_coding_system;
1212 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1213 return coding_system;
1217 case EOL_AUTODETECT: return coding_system;
1218 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1219 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1220 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1224 return NILP (new_coding_system) ? coding_system : new_coding_system;
1227 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1228 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1230 (coding_system, eol_type))
1232 coding_system = Fget_coding_system (coding_system);
1234 return subsidiary_coding_system (coding_system,
1235 symbol_to_eol_type (eol_type));
1239 /************************************************************************/
1240 /* Coding system accessors */
1241 /************************************************************************/
1243 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1244 Return the doc string for CODING-SYSTEM.
1248 coding_system = Fget_coding_system (coding_system);
1249 return XCODING_SYSTEM_DOC_STRING (coding_system);
1252 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1253 Return the type of CODING-SYSTEM.
1257 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1260 case CODESYS_AUTODETECT: return Qundecided;
1262 case CODESYS_SHIFT_JIS: return Qshift_jis;
1263 case CODESYS_ISO2022: return Qiso2022;
1264 case CODESYS_BIG5: return Qbig5;
1265 case CODESYS_UCS4: return Qucs4;
1266 case CODESYS_UTF8: return Qutf8;
1267 case CODESYS_CCL: return Qccl;
1269 case CODESYS_NO_CONVERSION: return Qno_conversion;
1271 case CODESYS_INTERNAL: return Qinternal;
1278 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1281 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1283 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1286 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1287 Return initial charset of CODING-SYSTEM designated to GNUM.
1290 (coding_system, gnum))
1292 coding_system = Fget_coding_system (coding_system);
1295 return coding_system_charset (coding_system, XINT (gnum));
1299 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1300 Return the PROP property of CODING-SYSTEM.
1302 (coding_system, prop))
1305 enum coding_system_type type;
1307 coding_system = Fget_coding_system (coding_system);
1308 CHECK_SYMBOL (prop);
1309 type = XCODING_SYSTEM_TYPE (coding_system);
1311 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1312 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1315 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1317 case CODESYS_PROP_ALL_OK:
1320 case CODESYS_PROP_ISO2022:
1321 if (type != CODESYS_ISO2022)
1323 ("Property only valid in ISO2022 coding systems",
1327 case CODESYS_PROP_CCL:
1328 if (type != CODESYS_CCL)
1330 ("Property only valid in CCL coding systems",
1340 signal_simple_error ("Unrecognized property", prop);
1342 if (EQ (prop, Qname))
1343 return XCODING_SYSTEM_NAME (coding_system);
1344 else if (EQ (prop, Qtype))
1345 return Fcoding_system_type (coding_system);
1346 else if (EQ (prop, Qdoc_string))
1347 return XCODING_SYSTEM_DOC_STRING (coding_system);
1348 else if (EQ (prop, Qmnemonic))
1349 return XCODING_SYSTEM_MNEMONIC (coding_system);
1350 else if (EQ (prop, Qeol_type))
1351 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1352 else if (EQ (prop, Qeol_lf))
1353 return XCODING_SYSTEM_EOL_LF (coding_system);
1354 else if (EQ (prop, Qeol_crlf))
1355 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1356 else if (EQ (prop, Qeol_cr))
1357 return XCODING_SYSTEM_EOL_CR (coding_system);
1358 else if (EQ (prop, Qpost_read_conversion))
1359 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1360 else if (EQ (prop, Qpre_write_conversion))
1361 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1363 else if (type == CODESYS_ISO2022)
1365 if (EQ (prop, Qcharset_g0))
1366 return coding_system_charset (coding_system, 0);
1367 else if (EQ (prop, Qcharset_g1))
1368 return coding_system_charset (coding_system, 1);
1369 else if (EQ (prop, Qcharset_g2))
1370 return coding_system_charset (coding_system, 2);
1371 else if (EQ (prop, Qcharset_g3))
1372 return coding_system_charset (coding_system, 3);
1374 #define FORCE_CHARSET(charset_num) \
1375 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1376 (coding_system, charset_num) ? Qt : Qnil)
1378 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1379 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1380 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1381 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1383 #define LISP_BOOLEAN(prop) \
1384 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1386 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1387 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1388 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1389 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1390 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1391 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1392 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1394 else if (EQ (prop, Qinput_charset_conversion))
1396 unparse_charset_conversion_specs
1397 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1398 else if (EQ (prop, Qoutput_charset_conversion))
1400 unparse_charset_conversion_specs
1401 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1405 else if (type == CODESYS_CCL)
1407 if (EQ (prop, Qdecode))
1408 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1409 else if (EQ (prop, Qencode))
1410 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1418 return Qnil; /* not reached */
1422 /************************************************************************/
1423 /* Coding category functions */
1424 /************************************************************************/
1427 decode_coding_category (Lisp_Object symbol)
1431 CHECK_SYMBOL (symbol);
1432 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1433 if (EQ (coding_category_symbol[i], symbol))
1436 signal_simple_error ("Unrecognized coding category", symbol);
1437 return 0; /* not reached */
1440 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1441 Return a list of all recognized coding categories.
1446 Lisp_Object list = Qnil;
1448 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1449 list = Fcons (coding_category_symbol[i], list);
1453 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1454 Change the priority order of the coding categories.
1455 LIST should be list of coding categories, in descending order of
1456 priority. Unspecified coding categories will be lower in priority
1457 than all specified ones, in the same relative order they were in
1462 int category_to_priority[CODING_CATEGORY_LAST + 1];
1466 /* First generate a list that maps coding categories to priorities. */
1468 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1469 category_to_priority[i] = -1;
1471 /* Highest priority comes from the specified list. */
1473 EXTERNAL_LIST_LOOP (rest, list)
1475 int cat = decode_coding_category (XCAR (rest));
1477 if (category_to_priority[cat] >= 0)
1478 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1479 category_to_priority[cat] = i++;
1482 /* Now go through the existing categories by priority to retrieve
1483 the categories not yet specified and preserve their priority
1485 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1487 int cat = fcd->coding_category_by_priority[j];
1488 if (category_to_priority[cat] < 0)
1489 category_to_priority[cat] = i++;
1492 /* Now we need to construct the inverse of the mapping we just
1495 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1496 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1498 /* Phew! That was confusing. */
1502 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1503 Return a list of coding categories in descending order of priority.
1508 Lisp_Object list = Qnil;
1510 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1511 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1516 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1517 Change the coding system associated with a coding category.
1519 (coding_category, coding_system))
1521 int cat = decode_coding_category (coding_category);
1523 coding_system = Fget_coding_system (coding_system);
1524 fcd->coding_category_system[cat] = coding_system;
1528 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1529 Return the coding system associated with a coding category.
1533 int cat = decode_coding_category (coding_category);
1534 Lisp_Object sys = fcd->coding_category_system[cat];
1537 return XCODING_SYSTEM_NAME (sys);
1542 /************************************************************************/
1543 /* Detecting the encoding of data */
1544 /************************************************************************/
1546 struct detection_state
1548 eol_type_t eol_type;
1584 struct iso2022_decoder iso;
1586 int high_byte_count;
1587 unsigned int saw_single_shift:1;
1600 acceptable_control_char_p (int c)
1604 /* Allow and ignore control characters that you might
1605 reasonably see in a text file */
1610 case 8: /* backspace */
1611 case 11: /* vertical tab */
1612 case 12: /* form feed */
1613 case 26: /* MS-DOS C-z junk */
1614 case 31: /* '^_' -- for info */
1622 mask_has_at_most_one_bit_p (int mask)
1624 /* Perhaps the only thing useful you learn from intensive Microsoft
1625 technical interviews */
1626 return (mask & (mask - 1)) == 0;
1630 detect_eol_type (struct detection_state *st, const unsigned char *src,
1640 if (st->eol.just_saw_cr)
1642 else if (st->eol.seen_anything)
1645 else if (st->eol.just_saw_cr)
1648 st->eol.just_saw_cr = 1;
1650 st->eol.just_saw_cr = 0;
1651 st->eol.seen_anything = 1;
1654 return EOL_AUTODETECT;
1657 /* Attempt to determine the encoding and EOL type of the given text.
1658 Before calling this function for the first type, you must initialize
1659 st->eol_type as appropriate and initialize st->mask to ~0.
1661 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1664 st->mask holds the determined coding category mask, or ~0 if only
1665 ASCII has been seen so far.
1669 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1670 is present in st->mask
1671 1 == definitive answers are here for both st->eol_type and st->mask
1675 detect_coding_type (struct detection_state *st, const Extbyte *src,
1676 unsigned int n, int just_do_eol)
1680 if (st->eol_type == EOL_AUTODETECT)
1681 st->eol_type = detect_eol_type (st, src, n);
1684 return st->eol_type != EOL_AUTODETECT;
1686 if (!st->seen_non_ascii)
1688 for (; n; n--, src++)
1691 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1693 st->seen_non_ascii = 1;
1695 st->shift_jis.mask = ~0;
1699 st->iso2022.mask = ~0;
1709 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1710 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1711 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1712 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1713 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1714 st->big5.mask = detect_coding_big5 (st, src, n);
1715 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1716 st->utf8.mask = detect_coding_utf8 (st, src, n);
1717 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1718 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1721 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1722 | st->utf8.mask | st->ucs4.mask;
1725 int retval = mask_has_at_most_one_bit_p (st->mask);
1726 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1727 return retval && st->eol_type != EOL_AUTODETECT;
1732 coding_system_from_mask (int mask)
1736 /* If the file was entirely or basically ASCII, use the
1737 default value of `buffer-file-coding-system'. */
1738 Lisp_Object retval =
1739 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1742 retval = Ffind_coding_system (retval);
1746 (Qbad_variable, Qwarning,
1747 "Invalid `default-buffer-file-coding-system', set to nil");
1748 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1752 retval = Fget_coding_system (Qraw_text);
1760 mask = postprocess_iso2022_mask (mask);
1762 /* Look through the coding categories by priority and find
1763 the first one that is allowed. */
1764 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1766 cat = fcd->coding_category_by_priority[i];
1767 if ((mask & (1 << cat)) &&
1768 !NILP (fcd->coding_category_system[cat]))
1772 return fcd->coding_category_system[cat];
1774 return Fget_coding_system (Qraw_text);
1778 /* Given a seekable read stream and potential coding system and EOL type
1779 as specified, do any autodetection that is called for. If the
1780 coding system and/or EOL type are not `autodetect', they will be left
1781 alone; but this function will never return an autodetect coding system
1784 This function does not automatically fetch subsidiary coding systems;
1785 that should be unnecessary with the explicit eol-type argument. */
1787 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1790 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1791 eol_type_t *eol_type_in_out)
1793 struct detection_state decst;
1795 if (*eol_type_in_out == EOL_AUTODETECT)
1796 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1799 decst.eol_type = *eol_type_in_out;
1802 /* If autodetection is called for, do it now. */
1803 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1804 || *eol_type_in_out == EOL_AUTODETECT)
1807 Lisp_Object coding_system = Qnil;
1809 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1812 /* Look for initial "-*-"; mode line prefix */
1814 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1819 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1821 Extbyte *local_vars_beg = p + 3;
1822 /* Look for final "-*-"; mode line suffix */
1823 for (p = local_vars_beg,
1824 scan_end = buf + nread - LENGTH ("-*-");
1829 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1831 Extbyte *suffix = p;
1832 /* Look for "coding:" */
1833 for (p = local_vars_beg,
1834 scan_end = suffix - LENGTH ("coding:?");
1837 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1838 && (p == local_vars_beg
1839 || (*(p-1) == ' ' ||
1845 p += LENGTH ("coding:");
1846 while (*p == ' ' || *p == '\t') p++;
1848 /* Get coding system name */
1849 save = *suffix; *suffix = '\0';
1850 /* Characters valid in a MIME charset name (rfc 1521),
1851 and in a Lisp symbol name. */
1852 n = strspn ( (char *) p,
1853 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1854 "abcdefghijklmnopqrstuvwxyz"
1860 save = p[n]; p[n] = '\0';
1862 Ffind_coding_system (intern ((char *) p));
1872 if (NILP (coding_system))
1875 if (detect_coding_type (&decst, buf, nread,
1876 XCODING_SYSTEM_TYPE (*codesys_in_out)
1877 != CODESYS_AUTODETECT))
1879 nread = Lstream_read (stream, buf, sizeof (buf));
1885 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1886 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1889 if (detect_coding_type (&decst, buf, nread, 1))
1891 nread = Lstream_read (stream, buf, sizeof (buf));
1897 *eol_type_in_out = decst.eol_type;
1898 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1900 if (NILP (coding_system))
1901 *codesys_in_out = coding_system_from_mask (decst.mask);
1903 *codesys_in_out = coding_system;
1907 /* If we absolutely can't determine the EOL type, just assume LF. */
1908 if (*eol_type_in_out == EOL_AUTODETECT)
1909 *eol_type_in_out = EOL_LF;
1911 Lstream_rewind (stream);
1914 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1915 Detect coding system of the text in the region between START and END.
1916 Returned a list of possible coding systems ordered by priority.
1917 If only ASCII characters are found, it returns 'undecided or one of
1918 its subsidiary coding systems according to a detected end-of-line
1919 type. Optional arg BUFFER defaults to the current buffer.
1921 (start, end, buffer))
1923 Lisp_Object val = Qnil;
1924 struct buffer *buf = decode_buffer (buffer, 0);
1926 Lisp_Object instream, lb_instream;
1927 Lstream *istr, *lb_istr;
1928 struct detection_state decst;
1929 struct gcpro gcpro1, gcpro2;
1931 get_buffer_range_char (buf, start, end, &b, &e, 0);
1932 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1933 lb_istr = XLSTREAM (lb_instream);
1934 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1935 istr = XLSTREAM (instream);
1936 GCPRO2 (instream, lb_instream);
1938 decst.eol_type = EOL_AUTODETECT;
1942 unsigned char random_buffer[4096];
1943 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1947 if (detect_coding_type (&decst, random_buffer, nread, 0))
1951 if (decst.mask == ~0)
1952 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1960 decst.mask = postprocess_iso2022_mask (decst.mask);
1962 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1964 int sys = fcd->coding_category_by_priority[i];
1965 if (decst.mask & (1 << sys))
1967 Lisp_Object codesys = fcd->coding_category_system[sys];
1968 if (!NILP (codesys))
1969 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1970 val = Fcons (codesys, val);
1974 Lstream_close (istr);
1976 Lstream_delete (istr);
1977 Lstream_delete (lb_istr);
1982 /************************************************************************/
1983 /* Converting to internal Mule format ("decoding") */
1984 /************************************************************************/
1986 /* A decoding stream is a stream used for decoding text (i.e.
1987 converting from some external format to internal format).
1988 The decoding-stream object keeps track of the actual coding
1989 stream, the stream that is at the other end, and data that
1990 needs to be persistent across the lifetime of the stream. */
1992 /* Handle the EOL stuff related to just-read-in character C.
1993 EOL_TYPE is the EOL type of the coding stream.
1994 FLAGS is the current value of FLAGS in the coding stream, and may
1995 be modified by this macro. (The macro only looks at the
1996 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1997 bytes are to be written. You need to also define a local goto
1998 label "label_continue_loop" that is at the end of the main
1999 character-reading loop.
2001 If C is a CR character, then this macro handles it entirely and
2002 jumps to label_continue_loop. Otherwise, this macro does not add
2003 anything to DST, and continues normally. You should continue
2004 processing C normally after this macro. */
2006 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2010 if (eol_type == EOL_CR) \
2011 Dynarr_add (dst, '\n'); \
2012 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2013 Dynarr_add (dst, c); \
2015 flags |= CODING_STATE_CR; \
2016 goto label_continue_loop; \
2018 else if (flags & CODING_STATE_CR) \
2019 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2021 Dynarr_add (dst, '\r'); \
2022 flags &= ~CODING_STATE_CR; \
2026 /* C should be a binary character in the range 0 - 255; convert
2027 to internal format and add to Dynarr DST. */
2030 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2032 if (BYTE_ASCII_P (c)) \
2033 Dynarr_add (dst, c); \
2036 Dynarr_add (dst, (c >> 6) | 0xc0); \
2037 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2042 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2046 Dynarr_add (dst, c);
2048 else if ( c <= 0x7ff )
2050 Dynarr_add (dst, (c >> 6) | 0xc0);
2051 Dynarr_add (dst, (c & 0x3f) | 0x80);
2053 else if ( c <= 0xffff )
2055 Dynarr_add (dst, (c >> 12) | 0xe0);
2056 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2057 Dynarr_add (dst, (c & 0x3f) | 0x80);
2059 else if ( c <= 0x1fffff )
2061 Dynarr_add (dst, (c >> 18) | 0xf0);
2062 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2063 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2064 Dynarr_add (dst, (c & 0x3f) | 0x80);
2066 else if ( c <= 0x3ffffff )
2068 Dynarr_add (dst, (c >> 24) | 0xf8);
2069 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2070 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2071 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2072 Dynarr_add (dst, (c & 0x3f) | 0x80);
2076 Dynarr_add (dst, (c >> 30) | 0xfc);
2077 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2078 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2079 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2080 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2081 Dynarr_add (dst, (c & 0x3f) | 0x80);
2085 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2087 if (BYTE_ASCII_P (c)) \
2088 Dynarr_add (dst, c); \
2089 else if (BYTE_C1_P (c)) \
2091 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2092 Dynarr_add (dst, c + 0x20); \
2096 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2097 Dynarr_add (dst, c); \
2102 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2106 DECODE_ADD_BINARY_CHAR (ch, dst); \
2111 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2113 if (flags & CODING_STATE_END) \
2115 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2116 if (flags & CODING_STATE_CR) \
2117 Dynarr_add (dst, '\r'); \
2121 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2123 struct decoding_stream
2125 /* Coding system that governs the conversion. */
2126 Lisp_Coding_System *codesys;
2128 /* Stream that we read the encoded data from or
2129 write the decoded data to. */
2132 /* If we are reading, then we can return only a fixed amount of
2133 data, so if the conversion resulted in too much data, we store it
2134 here for retrieval the next time around. */
2135 unsigned_char_dynarr *runoff;
2137 /* FLAGS holds flags indicating the current state of the decoding.
2138 Some of these flags are dependent on the coding system. */
2141 /* CH holds a partially built-up character. Since we only deal
2142 with one- and two-byte characters at the moment, we only use
2143 this to store the first byte of a two-byte character. */
2146 /* EOL_TYPE specifies the type of end-of-line conversion that
2147 currently applies. We need to keep this separate from the
2148 EOL type stored in CODESYS because the latter might indicate
2149 automatic EOL-type detection while the former will always
2150 indicate a particular EOL type. */
2151 eol_type_t eol_type;
2153 /* Additional ISO2022 information. We define the structure above
2154 because it's also needed by the detection routines. */
2155 struct iso2022_decoder iso2022;
2157 /* Additional information (the state of the running CCL program)
2158 used by the CCL decoder. */
2159 struct ccl_program ccl;
2161 /* counter for UTF-8 or UCS-4 */
2162 unsigned char counter;
2164 struct detection_state decst;
2167 static ssize_t decoding_reader (Lstream *stream,
2168 unsigned char *data, size_t size);
2169 static ssize_t decoding_writer (Lstream *stream,
2170 const unsigned char *data, size_t size);
2171 static int decoding_rewinder (Lstream *stream);
2172 static int decoding_seekable_p (Lstream *stream);
2173 static int decoding_flusher (Lstream *stream);
2174 static int decoding_closer (Lstream *stream);
2176 static Lisp_Object decoding_marker (Lisp_Object stream);
2178 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2179 sizeof (struct decoding_stream));
2182 decoding_marker (Lisp_Object stream)
2184 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2185 Lisp_Object str_obj;
2187 /* We do not need to mark the coding systems or charsets stored
2188 within the stream because they are stored in a global list
2189 and automatically marked. */
2191 XSETLSTREAM (str_obj, str);
2192 mark_object (str_obj);
2193 if (str->imp->marker)
2194 return (str->imp->marker) (str_obj);
2199 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2200 so we read data from the other end, decode it, and store it into DATA. */
2203 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2205 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2206 unsigned char *orig_data = data;
2208 int error_occurred = 0;
2210 /* We need to interface to mule_decode(), which expects to take some
2211 amount of data and store the result into a Dynarr. We have
2212 mule_decode() store into str->runoff, and take data from there
2215 /* We loop until we have enough data, reading chunks from the other
2216 end and decoding it. */
2219 /* Take data from the runoff if we can. Make sure to take at
2220 most SIZE bytes, and delete the data from the runoff. */
2221 if (Dynarr_length (str->runoff) > 0)
2223 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2224 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2225 Dynarr_delete_many (str->runoff, 0, chunk);
2231 break; /* No more room for data */
2233 if (str->flags & CODING_STATE_END)
2234 /* This means that on the previous iteration, we hit the EOF on
2235 the other end. We loop once more so that mule_decode() can
2236 output any final stuff it may be holding, or any "go back
2237 to a sane state" escape sequences. (This latter makes sense
2238 during encoding.) */
2241 /* Exhausted the runoff, so get some more. DATA has at least
2242 SIZE bytes left of storage in it, so it's OK to read directly
2243 into it. (We'll be overwriting above, after we've decoded it
2244 into the runoff.) */
2245 read_size = Lstream_read (str->other_end, data, size);
2252 /* There might be some more end data produced in the translation.
2253 See the comment above. */
2254 str->flags |= CODING_STATE_END;
2255 mule_decode (stream, data, str->runoff, read_size);
2258 if (data - orig_data == 0)
2259 return error_occurred ? -1 : 0;
2261 return data - orig_data;
2265 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2267 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2270 /* Decode all our data into the runoff, and then attempt to write
2271 it all out to the other end. Remove whatever chunk we succeeded
2273 mule_decode (stream, data, str->runoff, size);
2274 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2275 Dynarr_length (str->runoff));
2277 Dynarr_delete_many (str->runoff, 0, retval);
2278 /* Do NOT return retval. The return value indicates how much
2279 of the incoming data was written, not how many bytes were
2285 reset_decoding_stream (struct decoding_stream *str)
2288 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2290 Lisp_Object coding_system;
2291 XSETCODING_SYSTEM (coding_system, str->codesys);
2292 reset_iso2022 (coding_system, &str->iso2022);
2294 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2296 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2300 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2301 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2304 str->decst.eol_type = EOL_AUTODETECT;
2305 str->decst.mask = ~0;
2307 str->flags = str->ch = 0;
2311 decoding_rewinder (Lstream *stream)
2313 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2314 reset_decoding_stream (str);
2315 Dynarr_reset (str->runoff);
2316 return Lstream_rewind (str->other_end);
2320 decoding_seekable_p (Lstream *stream)
2322 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2323 return Lstream_seekable_p (str->other_end);
2327 decoding_flusher (Lstream *stream)
2329 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2330 return Lstream_flush (str->other_end);
2334 decoding_closer (Lstream *stream)
2336 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2337 if (stream->flags & LSTREAM_FL_WRITE)
2339 str->flags |= CODING_STATE_END;
2340 decoding_writer (stream, 0, 0);
2342 Dynarr_free (str->runoff);
2344 #ifdef ENABLE_COMPOSITE_CHARS
2345 if (str->iso2022.composite_chars)
2346 Dynarr_free (str->iso2022.composite_chars);
2349 return Lstream_close (str->other_end);
2353 decoding_stream_coding_system (Lstream *stream)
2355 Lisp_Object coding_system;
2356 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2358 XSETCODING_SYSTEM (coding_system, str->codesys);
2359 return subsidiary_coding_system (coding_system, str->eol_type);
2363 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2365 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2366 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2368 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2369 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2370 reset_decoding_stream (str);
2373 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2374 stream for writing, no automatic code detection will be performed.
2375 The reason for this is that automatic code detection requires a
2376 seekable input. Things will also fail if you open a decoding
2377 stream for reading using a non-fully-specified coding system and
2378 a non-seekable input stream. */
2381 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2384 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2385 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2389 str->other_end = stream;
2390 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2391 str->eol_type = EOL_AUTODETECT;
2392 if (!strcmp (mode, "r")
2393 && Lstream_seekable_p (stream))
2394 /* We can determine the coding system now. */
2395 determine_real_coding_system (stream, &codesys, &str->eol_type);
2396 set_decoding_stream_coding_system (lstr, codesys);
2397 str->decst.eol_type = str->eol_type;
2398 str->decst.mask = ~0;
2399 XSETLSTREAM (obj, lstr);
2404 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2406 return make_decoding_stream_1 (stream, codesys, "r");
2410 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2412 return make_decoding_stream_1 (stream, codesys, "w");
2415 /* Note: the decode_coding_* functions all take the same
2416 arguments as mule_decode(), which is to say some SRC data of
2417 size N, which is to be stored into dynamic array DST.
2418 DECODING is the stream within which the decoding is
2419 taking place, but no data is actually read from or
2420 written to that stream; that is handled in decoding_reader()
2421 or decoding_writer(). This allows the same functions to
2422 be used for both reading and writing. */
2425 mule_decode (Lstream *decoding, const unsigned char *src,
2426 unsigned_char_dynarr *dst, unsigned int n)
2428 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2430 /* If necessary, do encoding-detection now. We do this when
2431 we're a writing stream or a non-seekable reading stream,
2432 meaning that we can't just process the whole input,
2433 rewind, and start over. */
2435 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2436 str->eol_type == EOL_AUTODETECT)
2438 Lisp_Object codesys;
2440 XSETCODING_SYSTEM (codesys, str->codesys);
2441 detect_coding_type (&str->decst, src, n,
2442 CODING_SYSTEM_TYPE (str->codesys) !=
2443 CODESYS_AUTODETECT);
2444 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2445 str->decst.mask != ~0)
2446 /* #### This is cheesy. What we really ought to do is
2447 buffer up a certain amount of data so as to get a
2448 less random result. */
2449 codesys = coding_system_from_mask (str->decst.mask);
2450 str->eol_type = str->decst.eol_type;
2451 if (XCODING_SYSTEM (codesys) != str->codesys)
2453 /* Preserve the CODING_STATE_END flag in case it was set.
2454 If we erase it, bad things might happen. */
2455 int was_end = str->flags & CODING_STATE_END;
2456 set_decoding_stream_coding_system (decoding, codesys);
2458 str->flags |= CODING_STATE_END;
2462 switch (CODING_SYSTEM_TYPE (str->codesys))
2465 case CODESYS_INTERNAL:
2466 Dynarr_add_many (dst, src, n);
2469 case CODESYS_AUTODETECT:
2470 /* If we got this far and still haven't decided on the coding
2471 system, then do no conversion. */
2472 case CODESYS_NO_CONVERSION:
2473 decode_coding_no_conversion (decoding, src, dst, n);
2476 case CODESYS_SHIFT_JIS:
2477 decode_coding_sjis (decoding, src, dst, n);
2480 decode_coding_big5 (decoding, src, dst, n);
2483 decode_coding_ucs4 (decoding, src, dst, n);
2486 decode_coding_utf8 (decoding, src, dst, n);
2489 str->ccl.last_block = str->flags & CODING_STATE_END;
2490 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2492 case CODESYS_ISO2022:
2493 decode_coding_iso2022 (decoding, src, dst, n);
2501 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2502 Decode the text between START and END which is encoded in CODING-SYSTEM.
2503 This is useful if you've read in encoded text from a file without decoding
2504 it (e.g. you read in a JIS-formatted file but used the `binary' or
2505 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2506 Return length of decoded text.
2507 BUFFER defaults to the current buffer if unspecified.
2509 (start, end, coding_system, buffer))
2512 struct buffer *buf = decode_buffer (buffer, 0);
2513 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2514 Lstream *istr, *ostr;
2515 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2517 get_buffer_range_char (buf, start, end, &b, &e, 0);
2519 barf_if_buffer_read_only (buf, b, e);
2521 coding_system = Fget_coding_system (coding_system);
2522 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2523 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2524 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2526 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2527 Fget_coding_system (Qbinary));
2528 istr = XLSTREAM (instream);
2529 ostr = XLSTREAM (outstream);
2530 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2532 /* The chain of streams looks like this:
2534 [BUFFER] <----- send through
2535 ------> [ENCODE AS BINARY]
2536 ------> [DECODE AS SPECIFIED]
2542 char tempbuf[1024]; /* some random amount */
2543 Bufpos newpos, even_newer_pos;
2544 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2545 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2549 newpos = lisp_buffer_stream_startpos (istr);
2550 Lstream_write (ostr, tempbuf, size_in_bytes);
2551 even_newer_pos = lisp_buffer_stream_startpos (istr);
2552 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2555 Lstream_close (istr);
2556 Lstream_close (ostr);
2558 Lstream_delete (istr);
2559 Lstream_delete (ostr);
2560 Lstream_delete (XLSTREAM (de_outstream));
2561 Lstream_delete (XLSTREAM (lb_outstream));
2566 /************************************************************************/
2567 /* Converting to an external encoding ("encoding") */
2568 /************************************************************************/
2570 /* An encoding stream is an output stream. When you create the
2571 stream, you specify the coding system that governs the encoding
2572 and another stream that the resulting encoded data is to be
2573 sent to, and then start sending data to it. */
2575 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2577 struct encoding_stream
2579 /* Coding system that governs the conversion. */
2580 Lisp_Coding_System *codesys;
2582 /* Stream that we read the encoded data from or
2583 write the decoded data to. */
2586 /* If we are reading, then we can return only a fixed amount of
2587 data, so if the conversion resulted in too much data, we store it
2588 here for retrieval the next time around. */
2589 unsigned_char_dynarr *runoff;
2591 /* FLAGS holds flags indicating the current state of the encoding.
2592 Some of these flags are dependent on the coding system. */
2595 /* CH holds a partially built-up character. Since we only deal
2596 with one- and two-byte characters at the moment, we only use
2597 this to store the first byte of a two-byte character. */
2600 /* Additional information used by the ISO2022 encoder. */
2603 /* CHARSET holds the character sets currently assigned to the G0
2604 through G3 registers. It is initialized from the array
2605 INITIAL_CHARSET in CODESYS. */
2606 Lisp_Object charset[4];
2608 /* Which registers are currently invoked into the left (GL) and
2609 right (GR) halves of the 8-bit encoding space? */
2610 int register_left, register_right;
2612 /* Whether we need to explicitly designate the charset in the
2613 G? register before using it. It is initialized from the
2614 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2615 unsigned char force_charset_on_output[4];
2617 /* Other state variables that need to be preserved across
2619 Lisp_Object current_charset;
2621 int current_char_boundary;
2624 /* Additional information (the state of the running CCL program)
2625 used by the CCL encoder. */
2626 struct ccl_program ccl;
2630 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2631 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2633 static int encoding_rewinder (Lstream *stream);
2634 static int encoding_seekable_p (Lstream *stream);
2635 static int encoding_flusher (Lstream *stream);
2636 static int encoding_closer (Lstream *stream);
2638 static Lisp_Object encoding_marker (Lisp_Object stream);
2640 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2641 sizeof (struct encoding_stream));
2644 encoding_marker (Lisp_Object stream)
2646 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2647 Lisp_Object str_obj;
2649 /* We do not need to mark the coding systems or charsets stored
2650 within the stream because they are stored in a global list
2651 and automatically marked. */
2653 XSETLSTREAM (str_obj, str);
2654 mark_object (str_obj);
2655 if (str->imp->marker)
2656 return (str->imp->marker) (str_obj);
2661 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2662 so we read data from the other end, encode it, and store it into DATA. */
2665 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2667 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2668 unsigned char *orig_data = data;
2670 int error_occurred = 0;
2672 /* We need to interface to mule_encode(), which expects to take some
2673 amount of data and store the result into a Dynarr. We have
2674 mule_encode() store into str->runoff, and take data from there
2677 /* We loop until we have enough data, reading chunks from the other
2678 end and encoding it. */
2681 /* Take data from the runoff if we can. Make sure to take at
2682 most SIZE bytes, and delete the data from the runoff. */
2683 if (Dynarr_length (str->runoff) > 0)
2685 int chunk = min ((int) size, Dynarr_length (str->runoff));
2686 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2687 Dynarr_delete_many (str->runoff, 0, chunk);
2693 break; /* No more room for data */
2695 if (str->flags & CODING_STATE_END)
2696 /* This means that on the previous iteration, we hit the EOF on
2697 the other end. We loop once more so that mule_encode() can
2698 output any final stuff it may be holding, or any "go back
2699 to a sane state" escape sequences. (This latter makes sense
2700 during encoding.) */
2703 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2704 left of storage in it, so it's OK to read directly into it.
2705 (We'll be overwriting above, after we've encoded it into the
2707 read_size = Lstream_read (str->other_end, data, size);
2714 /* There might be some more end data produced in the translation.
2715 See the comment above. */
2716 str->flags |= CODING_STATE_END;
2717 mule_encode (stream, data, str->runoff, read_size);
2720 if (data == orig_data)
2721 return error_occurred ? -1 : 0;
2723 return data - orig_data;
2727 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2729 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2732 /* Encode all our data into the runoff, and then attempt to write
2733 it all out to the other end. Remove whatever chunk we succeeded
2735 mule_encode (stream, data, str->runoff, size);
2736 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2737 Dynarr_length (str->runoff));
2739 Dynarr_delete_many (str->runoff, 0, retval);
2740 /* Do NOT return retval. The return value indicates how much
2741 of the incoming data was written, not how many bytes were
2747 reset_encoding_stream (struct encoding_stream *str)
2750 switch (CODING_SYSTEM_TYPE (str->codesys))
2752 case CODESYS_ISO2022:
2756 for (i = 0; i < 4; i++)
2758 str->iso2022.charset[i] =
2759 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2760 str->iso2022.force_charset_on_output[i] =
2761 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2763 str->iso2022.register_left = 0;
2764 str->iso2022.register_right = 1;
2765 str->iso2022.current_charset = Qnil;
2766 str->iso2022.current_half = 0;
2768 str->iso2022.current_char_boundary = 0;
2770 str->iso2022.current_char_boundary = 1;
2775 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2782 str->flags = str->ch = 0;
2786 encoding_rewinder (Lstream *stream)
2788 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2789 reset_encoding_stream (str);
2790 Dynarr_reset (str->runoff);
2791 return Lstream_rewind (str->other_end);
2795 encoding_seekable_p (Lstream *stream)
2797 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2798 return Lstream_seekable_p (str->other_end);
2802 encoding_flusher (Lstream *stream)
2804 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2805 return Lstream_flush (str->other_end);
2809 encoding_closer (Lstream *stream)
2811 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2812 if (stream->flags & LSTREAM_FL_WRITE)
2814 str->flags |= CODING_STATE_END;
2815 encoding_writer (stream, 0, 0);
2817 Dynarr_free (str->runoff);
2818 return Lstream_close (str->other_end);
2822 encoding_stream_coding_system (Lstream *stream)
2824 Lisp_Object coding_system;
2825 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2827 XSETCODING_SYSTEM (coding_system, str->codesys);
2828 return coding_system;
2832 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2834 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2835 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2837 reset_encoding_stream (str);
2841 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2844 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2845 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2849 str->runoff = Dynarr_new (unsigned_char);
2850 str->other_end = stream;
2851 set_encoding_stream_coding_system (lstr, codesys);
2852 XSETLSTREAM (obj, lstr);
2857 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2859 return make_encoding_stream_1 (stream, codesys, "r");
2863 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2865 return make_encoding_stream_1 (stream, codesys, "w");
2868 /* Convert N bytes of internally-formatted data stored in SRC to an
2869 external format, according to the encoding stream ENCODING.
2870 Store the encoded data into DST. */
2873 mule_encode (Lstream *encoding, const unsigned char *src,
2874 unsigned_char_dynarr *dst, unsigned int n)
2876 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2878 switch (CODING_SYSTEM_TYPE (str->codesys))
2881 case CODESYS_INTERNAL:
2882 Dynarr_add_many (dst, src, n);
2885 case CODESYS_AUTODETECT:
2886 /* If we got this far and still haven't decided on the coding
2887 system, then do no conversion. */
2888 case CODESYS_NO_CONVERSION:
2889 encode_coding_no_conversion (encoding, src, dst, n);
2892 case CODESYS_SHIFT_JIS:
2893 encode_coding_sjis (encoding, src, dst, n);
2896 encode_coding_big5 (encoding, src, dst, n);
2899 encode_coding_ucs4 (encoding, src, dst, n);
2902 encode_coding_utf8 (encoding, src, dst, n);
2905 str->ccl.last_block = str->flags & CODING_STATE_END;
2906 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2908 case CODESYS_ISO2022:
2909 encode_coding_iso2022 (encoding, src, dst, n);
2917 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2918 Encode the text between START and END using CODING-SYSTEM.
2919 This will, for example, convert Japanese characters into stuff such as
2920 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2921 text. BUFFER defaults to the current buffer if unspecified.
2923 (start, end, coding_system, buffer))
2926 struct buffer *buf = decode_buffer (buffer, 0);
2927 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2928 Lstream *istr, *ostr;
2929 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2931 get_buffer_range_char (buf, start, end, &b, &e, 0);
2933 barf_if_buffer_read_only (buf, b, e);
2935 coding_system = Fget_coding_system (coding_system);
2936 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2937 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2938 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2939 Fget_coding_system (Qbinary));
2940 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2942 istr = XLSTREAM (instream);
2943 ostr = XLSTREAM (outstream);
2944 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2945 /* The chain of streams looks like this:
2947 [BUFFER] <----- send through
2948 ------> [ENCODE AS SPECIFIED]
2949 ------> [DECODE AS BINARY]
2954 char tempbuf[1024]; /* some random amount */
2955 Bufpos newpos, even_newer_pos;
2956 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2957 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2961 newpos = lisp_buffer_stream_startpos (istr);
2962 Lstream_write (ostr, tempbuf, size_in_bytes);
2963 even_newer_pos = lisp_buffer_stream_startpos (istr);
2964 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2970 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2971 Lstream_close (istr);
2972 Lstream_close (ostr);
2974 Lstream_delete (istr);
2975 Lstream_delete (ostr);
2976 Lstream_delete (XLSTREAM (de_outstream));
2977 Lstream_delete (XLSTREAM (lb_outstream));
2978 return make_int (retlen);
2984 /************************************************************************/
2985 /* Shift-JIS methods */
2986 /************************************************************************/
2988 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2989 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2990 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2991 encoded by "position-code + 0x80". A character of JISX0208
2992 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2993 position-codes are divided and shifted so that it fit in the range
2996 --- CODE RANGE of Shift-JIS ---
2997 (character set) (range)
2999 JISX0201-Kana 0xA0 .. 0xDF
3000 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3001 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3002 -------------------------------
3006 /* Is this the first byte of a Shift-JIS two-byte char? */
3008 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3009 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3011 /* Is this the second byte of a Shift-JIS two-byte char? */
3013 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3014 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3016 #define BYTE_SJIS_KATAKANA_P(c) \
3017 ((c) >= 0xA1 && (c) <= 0xDF)
3020 detect_coding_sjis (struct detection_state *st, const unsigned char *src,
3028 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3030 if (st->shift_jis.in_second_byte)
3032 st->shift_jis.in_second_byte = 0;
3036 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3037 st->shift_jis.in_second_byte = 1;
3039 return CODING_CATEGORY_SHIFT_JIS_MASK;
3042 /* Convert Shift-JIS data to internal format. */
3045 decode_coding_sjis (Lstream *decoding, const unsigned char *src,
3046 unsigned_char_dynarr *dst, unsigned int n)
3049 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3050 unsigned int flags = str->flags;
3051 unsigned int ch = str->ch;
3052 eol_type_t eol_type = str->eol_type;
3060 /* Previous character was first byte of Shift-JIS Kanji char. */
3061 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3063 unsigned char e1, e2;
3065 DECODE_SJIS (ch, c, e1, e2);
3067 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3071 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3072 Dynarr_add (dst, e1);
3073 Dynarr_add (dst, e2);
3078 DECODE_ADD_BINARY_CHAR (ch, dst);
3079 DECODE_ADD_BINARY_CHAR (c, dst);
3085 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3086 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3088 else if (BYTE_SJIS_KATAKANA_P (c))
3091 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3094 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3095 Dynarr_add (dst, c);
3099 DECODE_ADD_BINARY_CHAR (c, dst);
3101 label_continue_loop:;
3104 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3110 /* Convert internally-formatted data to Shift-JIS. */
3113 encode_coding_sjis (Lstream *encoding, const unsigned char *src,
3114 unsigned_char_dynarr *dst, unsigned int n)
3117 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3118 unsigned int flags = str->flags;
3119 unsigned int ch = str->ch;
3120 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3122 unsigned char char_boundary = str->iso2022.current_char_boundary;
3129 switch (char_boundary)
3137 else if ( c >= 0xf8 )
3142 else if ( c >= 0xf0 )
3147 else if ( c >= 0xe0 )
3152 else if ( c >= 0xc0 )
3162 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3163 Dynarr_add (dst, '\r');
3164 if (eol_type != EOL_CR)
3165 Dynarr_add (dst, c);
3168 Dynarr_add (dst, c);
3173 ch = ( ch << 6 ) | ( c & 0x3f );
3175 Lisp_Object charset;
3176 unsigned int c1, c2, s1, s2;
3178 BREAKUP_CHAR (ch, charset, c1, c2);
3179 if (EQ(charset, Vcharset_katakana_jisx0201))
3181 Dynarr_add (dst, c1 | 0x80);
3183 else if (EQ(charset, Vcharset_japanese_jisx0208))
3185 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3186 Dynarr_add (dst, s1);
3187 Dynarr_add (dst, s2);
3193 ch = ( ch << 6 ) | ( c & 0x3f );
3199 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3200 Dynarr_add (dst, '\r');
3201 if (eol_type != EOL_CR)
3202 Dynarr_add (dst, '\n');
3205 else if (BYTE_ASCII_P (c))
3207 Dynarr_add (dst, c);
3210 else if (BUFBYTE_LEADING_BYTE_P (c))
3211 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3212 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3213 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3216 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3218 Dynarr_add (dst, c);
3221 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3222 ch == LEADING_BYTE_JAPANESE_JISX0208)
3226 unsigned char j1, j2;
3227 ENCODE_SJIS (ch, c, j1, j2);
3228 Dynarr_add (dst, j1);
3229 Dynarr_add (dst, j2);
3239 str->iso2022.current_char_boundary = char_boundary;
3243 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3244 Decode a JISX0208 character of Shift-JIS coding-system.
3245 CODE is the character code in Shift-JIS as a cons of type bytes.
3246 Return the corresponding character.
3250 unsigned char c1, c2, s1, s2;
3253 CHECK_INT (XCAR (code));
3254 CHECK_INT (XCDR (code));
3255 s1 = XINT (XCAR (code));
3256 s2 = XINT (XCDR (code));
3257 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3258 BYTE_SJIS_TWO_BYTE_2_P (s2))
3260 DECODE_SJIS (s1, s2, c1, c2);
3261 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3262 c1 & 0x7F, c2 & 0x7F));
3268 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3269 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3270 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3274 Lisp_Object charset;
3277 CHECK_CHAR_COERCE_INT (ch);
3278 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3279 if (EQ (charset, Vcharset_japanese_jisx0208))
3281 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3282 return Fcons (make_int (s1), make_int (s2));
3289 /************************************************************************/
3291 /************************************************************************/
3293 /* BIG5 is a coding system encoding two character sets: ASCII and
3294 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3295 character set and is encoded in two-byte.
3297 --- CODE RANGE of BIG5 ---
3298 (character set) (range)
3300 Big5 (1st byte) 0xA1 .. 0xFE
3301 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3302 --------------------------
3304 Since the number of characters in Big5 is larger than maximum
3305 characters in Emacs' charset (96x96), it can't be handled as one
3306 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3307 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3308 contains frequently used characters and the latter contains less
3309 frequently used characters. */
3311 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3312 ((c) >= 0xA1 && (c) <= 0xFE)
3314 /* Is this the second byte of a Shift-JIS two-byte char? */
3316 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3317 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3319 /* Number of Big5 characters which have the same code in 1st byte. */
3321 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3323 /* Code conversion macros. These are macros because they are used in
3324 inner loops during code conversion.
3326 Note that temporary variables in macros introduce the classic
3327 dynamic-scoping problems with variable names. We use capital-
3328 lettered variables in the assumption that XEmacs does not use
3329 capital letters in variables except in a very formalized way
3332 /* Convert Big5 code (b1, b2) into its internal string representation
3335 /* There is a much simpler way to split the Big5 charset into two.
3336 For the moment I'm going to leave the algorithm as-is because it
3337 claims to separate out the most-used characters into a single
3338 charset, which perhaps will lead to optimizations in various
3341 The way the algorithm works is something like this:
3343 Big5 can be viewed as a 94x157 charset, where the row is
3344 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3345 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3346 the split between low and high column numbers is apparently
3347 meaningless; ascending rows produce less and less frequent chars.
3348 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3349 the first charset, and the upper half (0xC9 .. 0xFE) to the
3350 second. To do the conversion, we convert the character into
3351 a single number where 0 .. 156 is the first row, 157 .. 313
3352 is the second, etc. That way, the characters are ordered by
3353 decreasing frequency. Then we just chop the space in two
3354 and coerce the result into a 94x94 space.
3357 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3359 int B1 = b1, B2 = b2; \
3361 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3365 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3369 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3370 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3372 c1 = I / (0xFF - 0xA1) + 0xA1; \
3373 c2 = I % (0xFF - 0xA1) + 0xA1; \
3376 /* Convert the internal string representation of a Big5 character
3377 (lb, c1, c2) into Big5 code (b1, b2). */
3379 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3381 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3383 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3385 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3387 b1 = I / BIG5_SAME_ROW + 0xA1; \
3388 b2 = I % BIG5_SAME_ROW; \
3389 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3393 detect_coding_big5 (struct detection_state *st, const unsigned char *src,
3401 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3402 (c >= 0x80 && c <= 0xA0))
3404 if (st->big5.in_second_byte)
3406 st->big5.in_second_byte = 0;
3407 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3411 st->big5.in_second_byte = 1;
3413 return CODING_CATEGORY_BIG5_MASK;
3416 /* Convert Big5 data to internal format. */
3419 decode_coding_big5 (Lstream *decoding, const unsigned char *src,
3420 unsigned_char_dynarr *dst, unsigned int n)
3423 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3424 unsigned int flags = str->flags;
3425 unsigned int ch = str->ch;
3426 eol_type_t eol_type = str->eol_type;
3433 /* Previous character was first byte of Big5 char. */
3434 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3436 unsigned char b1, b2, b3;
3437 DECODE_BIG5 (ch, c, b1, b2, b3);
3438 Dynarr_add (dst, b1);
3439 Dynarr_add (dst, b2);
3440 Dynarr_add (dst, b3);
3444 DECODE_ADD_BINARY_CHAR (ch, dst);
3445 DECODE_ADD_BINARY_CHAR (c, dst);
3451 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3452 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3455 DECODE_ADD_BINARY_CHAR (c, dst);
3457 label_continue_loop:;
3460 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3466 /* Convert internally-formatted data to Big5. */
3469 encode_coding_big5 (Lstream *encoding, const unsigned char *src,
3470 unsigned_char_dynarr *dst, unsigned int n)
3474 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3475 unsigned int flags = str->flags;
3476 unsigned int ch = str->ch;
3477 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3484 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3485 Dynarr_add (dst, '\r');
3486 if (eol_type != EOL_CR)
3487 Dynarr_add (dst, '\n');
3489 else if (BYTE_ASCII_P (c))
3492 Dynarr_add (dst, c);
3494 else if (BUFBYTE_LEADING_BYTE_P (c))
3496 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3497 c == LEADING_BYTE_CHINESE_BIG5_2)
3499 /* A recognized leading byte. */
3501 continue; /* not done with this character. */
3503 /* otherwise just ignore this character. */
3505 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3506 ch == LEADING_BYTE_CHINESE_BIG5_2)
3508 /* Previous char was a recognized leading byte. */
3510 continue; /* not done with this character. */
3514 /* Encountering second byte of a Big5 character. */
3515 unsigned char b1, b2;
3517 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3518 Dynarr_add (dst, b1);
3519 Dynarr_add (dst, b2);
3531 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3532 Decode a Big5 character CODE of BIG5 coding-system.
3533 CODE is the character code in BIG5, a cons of two integers.
3534 Return the corresponding character.
3538 unsigned char c1, c2, b1, b2;
3541 CHECK_INT (XCAR (code));
3542 CHECK_INT (XCDR (code));
3543 b1 = XINT (XCAR (code));
3544 b2 = XINT (XCDR (code));
3545 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3546 BYTE_BIG5_TWO_BYTE_2_P (b2))
3548 Charset_ID leading_byte;
3549 Lisp_Object charset;
3550 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3551 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3552 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3558 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3559 Encode the Big5 character CH to BIG5 coding-system.
3560 Return the corresponding character code in Big5.
3564 Lisp_Object charset;
3567 CHECK_CHAR_COERCE_INT (ch);
3568 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3569 if (EQ (charset, Vcharset_chinese_big5_1) ||
3570 EQ (charset, Vcharset_chinese_big5_2))
3572 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3574 return Fcons (make_int (b1), make_int (b2));
3581 /************************************************************************/
3584 /* UCS-4 character codes are implemented as nonnegative integers. */
3586 /************************************************************************/
3589 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3590 Map UCS-4 code CODE to Mule character CHARACTER.
3592 Return T on success, NIL on failure.
3598 CHECK_CHAR (character);
3602 if (c < sizeof (fcd->ucs_to_mule_table))
3604 fcd->ucs_to_mule_table[c] = character;
3612 ucs_to_char (unsigned long code)
3614 if (code < sizeof (fcd->ucs_to_mule_table))
3616 return fcd->ucs_to_mule_table[code];
3618 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3623 c = code % (94 * 94);
3625 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3626 (94, 2, code / (94 * 94) + '@',
3627 CHARSET_LEFT_TO_RIGHT),
3628 c / 94 + 33, c % 94 + 33));
3634 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3635 Return Mule character corresponding to UCS code CODE (a positive integer).
3639 CHECK_NATNUM (code);
3640 return ucs_to_char (XINT (code));
3643 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3644 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3648 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3649 Fset_char_ucs is more restrictive on index arg, but should
3650 check code arg in a char_table method. */
3651 CHECK_CHAR (character);
3652 CHECK_NATNUM (code);
3653 return Fput_char_table (character, code, mule_to_ucs_table);
3656 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3657 Return the UCS code (a positive integer) corresponding to CHARACTER.
3661 return Fget_char_table (character, mule_to_ucs_table);
3666 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3668 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3669 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3670 is not found, instead.
3671 #### do something more appropriate (use blob?)
3672 Danger, Will Robinson! Data loss. Should we signal user? */
3674 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3676 Lisp_Object chr = ucs_to_char (ch);
3680 Bufbyte work[MAX_EMCHAR_LEN];
3685 simple_set_charptr_emchar (work, ch) :
3686 non_ascii_set_charptr_emchar (work, ch);
3687 Dynarr_add_many (dst, work, len);
3691 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3692 Dynarr_add (dst, 34 + 128);
3693 Dynarr_add (dst, 46 + 128);
3699 static unsigned long
3700 mule_char_to_ucs4 (Lisp_Object charset,
3701 unsigned char h, unsigned char l)
3704 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3711 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3712 (XCHARSET_CHARS (charset) == 94) )
3714 unsigned char final = XCHARSET_FINAL (charset);
3716 if ( ('@' <= final) && (final < 0x7f) )
3718 return 0xe00000 + (final - '@') * 94 * 94
3719 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3733 encode_ucs4 (Lisp_Object charset,
3734 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3736 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3737 Dynarr_add (dst, code >> 24);
3738 Dynarr_add (dst, (code >> 16) & 255);
3739 Dynarr_add (dst, (code >> 8) & 255);
3740 Dynarr_add (dst, code & 255);
3745 detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
3751 switch (st->ucs4.in_byte)
3760 st->ucs4.in_byte = 0;
3766 return CODING_CATEGORY_UCS4_MASK;
3770 decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
3771 unsigned_char_dynarr *dst, unsigned int n)
3773 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3774 unsigned int flags = str->flags;
3775 unsigned int ch = str->ch;
3776 unsigned char counter = str->counter;
3780 unsigned char c = *src++;
3788 decode_ucs4 ( ( ch << 8 ) | c, dst);
3793 ch = ( ch << 8 ) | c;
3797 if (counter & CODING_STATE_END)
3798 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3802 str->counter = counter;
3806 encode_coding_ucs4 (Lstream *encoding, const unsigned char *src,
3807 unsigned_char_dynarr *dst, unsigned int n)
3810 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3811 unsigned int flags = str->flags;
3812 unsigned int ch = str->ch;
3813 unsigned char char_boundary = str->iso2022.current_char_boundary;
3814 Lisp_Object charset = str->iso2022.current_charset;
3816 #ifdef ENABLE_COMPOSITE_CHARS
3817 /* flags for handling composite chars. We do a little switcharoo
3818 on the source while we're outputting the composite char. */
3819 unsigned int saved_n = 0;
3820 const unsigned char *saved_src = NULL;
3821 int in_composite = 0;
3828 unsigned char c = *src++;
3830 if (BYTE_ASCII_P (c))
3831 { /* Processing ASCII character */
3833 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3836 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3837 { /* Processing Leading Byte */
3839 charset = CHARSET_BY_LEADING_BYTE (c);
3840 if (LEADING_BYTE_PREFIX_P(c))
3845 { /* Processing Non-ASCII character */
3847 if (EQ (charset, Vcharset_control_1))
3849 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3853 switch (XCHARSET_REP_BYTES (charset))
3856 encode_ucs4 (charset, c, 0, dst);
3859 if (XCHARSET_PRIVATE_P (charset))
3861 encode_ucs4 (charset, c, 0, dst);
3866 #ifdef ENABLE_COMPOSITE_CHARS
3867 if (EQ (charset, Vcharset_composite))
3871 /* #### Bother! We don't know how to
3873 Dynarr_add (dst, 0);
3874 Dynarr_add (dst, 0);
3875 Dynarr_add (dst, 0);
3876 Dynarr_add (dst, '~');
3880 Emchar emch = MAKE_CHAR (Vcharset_composite,
3881 ch & 0x7F, c & 0x7F);
3882 Lisp_Object lstr = composite_char_string (emch);
3886 src = XSTRING_DATA (lstr);
3887 n = XSTRING_LENGTH (lstr);
3891 #endif /* ENABLE_COMPOSITE_CHARS */
3893 encode_ucs4(charset, ch, c, dst);
3906 encode_ucs4 (charset, ch, c, dst);
3922 #ifdef ENABLE_COMPOSITE_CHARS
3928 goto back_to_square_n; /* Wheeeeeeeee ..... */
3930 #endif /* ENABLE_COMPOSITE_CHARS */
3934 str->iso2022.current_char_boundary = char_boundary;
3935 str->iso2022.current_charset = charset;
3937 /* Verbum caro factum est! */
3942 /************************************************************************/
3944 /************************************************************************/
3947 detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
3952 unsigned char c = *src++;
3953 switch (st->utf8.in_byte)
3956 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3959 st->utf8.in_byte = 5;
3961 st->utf8.in_byte = 4;
3963 st->utf8.in_byte = 3;
3965 st->utf8.in_byte = 2;
3967 st->utf8.in_byte = 1;
3972 if ((c & 0xc0) != 0x80)
3978 return CODING_CATEGORY_UTF8_MASK;
3982 decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
3983 unsigned_char_dynarr *dst, unsigned int n)
3985 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3986 unsigned int flags = str->flags;
3987 unsigned int ch = str->ch;
3988 eol_type_t eol_type = str->eol_type;
3989 unsigned char counter = str->counter;
3993 unsigned char c = *src++;
4002 else if ( c >= 0xf8 )
4007 else if ( c >= 0xf0 )
4012 else if ( c >= 0xe0 )
4017 else if ( c >= 0xc0 )
4024 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4025 decode_ucs4 (c, dst);
4029 ch = ( ch << 6 ) | ( c & 0x3f );
4030 decode_ucs4 (ch, dst);
4035 ch = ( ch << 6 ) | ( c & 0x3f );
4038 label_continue_loop:;
4041 if (flags & CODING_STATE_END)
4042 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4046 str->counter = counter;
4051 encode_utf8 (Lisp_Object charset,
4052 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
4054 unsigned long code = mule_char_to_ucs4 (charset, h, l);
4057 Dynarr_add (dst, code);
4059 else if ( code <= 0x7ff )
4061 Dynarr_add (dst, (code >> 6) | 0xc0);
4062 Dynarr_add (dst, (code & 0x3f) | 0x80);
4064 else if ( code <= 0xffff )
4066 Dynarr_add (dst, (code >> 12) | 0xe0);
4067 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4068 Dynarr_add (dst, (code & 0x3f) | 0x80);
4070 else if ( code <= 0x1fffff )
4072 Dynarr_add (dst, (code >> 18) | 0xf0);
4073 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4074 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4075 Dynarr_add (dst, (code & 0x3f) | 0x80);
4077 else if ( code <= 0x3ffffff )
4079 Dynarr_add (dst, (code >> 24) | 0xf8);
4080 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4081 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4082 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4083 Dynarr_add (dst, (code & 0x3f) | 0x80);
4087 Dynarr_add (dst, (code >> 30) | 0xfc);
4088 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
4089 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4090 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4091 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4092 Dynarr_add (dst, (code & 0x3f) | 0x80);
4098 encode_coding_utf8 (Lstream *encoding, const unsigned char *src,
4099 unsigned_char_dynarr *dst, unsigned int n)
4101 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4102 unsigned int flags = str->flags;
4103 unsigned int ch = str->ch;
4104 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4105 unsigned char char_boundary = str->iso2022.current_char_boundary;
4110 unsigned char c = *src++;
4111 switch (char_boundary)
4116 Dynarr_add (dst, c);
4119 else if ( c >= 0xf8 )
4121 Dynarr_add (dst, c);
4124 else if ( c >= 0xf0 )
4126 Dynarr_add (dst, c);
4129 else if ( c >= 0xe0 )
4131 Dynarr_add (dst, c);
4134 else if ( c >= 0xc0 )
4136 Dynarr_add (dst, c);
4143 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4144 Dynarr_add (dst, '\r');
4145 if (eol_type != EOL_CR)
4146 Dynarr_add (dst, c);
4149 Dynarr_add (dst, c);
4154 Dynarr_add (dst, c);
4158 Dynarr_add (dst, c);
4162 #else /* not UTF2000 */
4163 Lisp_Object charset = str->iso2022.current_charset;
4165 #ifdef ENABLE_COMPOSITE_CHARS
4166 /* flags for handling composite chars. We do a little switcharoo
4167 on the source while we're outputting the composite char. */
4168 unsigned int saved_n = 0;
4169 const unsigned char *saved_src = NULL;
4170 int in_composite = 0;
4173 #endif /* ENABLE_COMPOSITE_CHARS */
4177 unsigned char c = *src++;
4179 if (BYTE_ASCII_P (c))
4180 { /* Processing ASCII character */
4184 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4185 Dynarr_add (dst, '\r');
4186 if (eol_type != EOL_CR)
4187 Dynarr_add (dst, c);
4190 encode_utf8 (Vcharset_ascii, c, 0, dst);
4193 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4194 { /* Processing Leading Byte */
4196 charset = CHARSET_BY_LEADING_BYTE (c);
4197 if (LEADING_BYTE_PREFIX_P(c))
4202 { /* Processing Non-ASCII character */
4204 if (EQ (charset, Vcharset_control_1))
4206 encode_utf8 (Vcharset_control_1, c, 0, dst);
4210 switch (XCHARSET_REP_BYTES (charset))
4213 encode_utf8 (charset, c, 0, dst);
4216 if (XCHARSET_PRIVATE_P (charset))
4218 encode_utf8 (charset, c, 0, dst);
4223 #ifdef ENABLE_COMPOSITE_CHARS
4224 if (EQ (charset, Vcharset_composite))
4228 /* #### Bother! We don't know how to
4230 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4234 Emchar emch = MAKE_CHAR (Vcharset_composite,
4235 ch & 0x7F, c & 0x7F);
4236 Lisp_Object lstr = composite_char_string (emch);
4240 src = XSTRING_DATA (lstr);
4241 n = XSTRING_LENGTH (lstr);
4245 #endif /* ENABLE_COMPOSITE_CHARS */
4247 encode_utf8 (charset, ch, c, dst);
4260 encode_utf8 (charset, ch, c, dst);
4276 #ifdef ENABLE_COMPOSITE_CHARS
4282 goto back_to_square_n; /* Wheeeeeeeee ..... */
4286 #endif /* not UTF2000 */
4289 str->iso2022.current_char_boundary = char_boundary;
4291 str->iso2022.current_charset = charset;
4294 /* Verbum caro factum est! */
4298 /************************************************************************/
4299 /* ISO2022 methods */
4300 /************************************************************************/
4302 /* The following note describes the coding system ISO2022 briefly.
4303 Since the intention of this note is to help understand the
4304 functions in this file, some parts are NOT ACCURATE or OVERLY
4305 SIMPLIFIED. For thorough understanding, please refer to the
4306 original document of ISO2022.
4308 ISO2022 provides many mechanisms to encode several character sets
4309 in 7-bit and 8-bit environments. For 7-bit environments, all text
4310 is encoded using bytes less than 128. This may make the encoded
4311 text a little bit longer, but the text passes more easily through
4312 several gateways, some of which strip off MSB (Most Signigant Bit).
4314 There are two kinds of character sets: control character set and
4315 graphic character set. The former contains control characters such
4316 as `newline' and `escape' to provide control functions (control
4317 functions are also provided by escape sequences). The latter
4318 contains graphic characters such as 'A' and '-'. Emacs recognizes
4319 two control character sets and many graphic character sets.
4321 Graphic character sets are classified into one of the following
4322 four classes, according to the number of bytes (DIMENSION) and
4323 number of characters in one dimension (CHARS) of the set:
4324 - DIMENSION1_CHARS94
4325 - DIMENSION1_CHARS96
4326 - DIMENSION2_CHARS94
4327 - DIMENSION2_CHARS96
4329 In addition, each character set is assigned an identification tag,
4330 unique for each set, called "final character" (denoted as <F>
4331 hereafter). The <F> of each character set is decided by ECMA(*)
4332 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4333 (0x30..0x3F are for private use only).
4335 Note (*): ECMA = European Computer Manufacturers Association
4337 Here are examples of graphic character set [NAME(<F>)]:
4338 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4339 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4340 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4341 o DIMENSION2_CHARS96 -- none for the moment
4343 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4344 C0 [0x00..0x1F] -- control character plane 0
4345 GL [0x20..0x7F] -- graphic character plane 0
4346 C1 [0x80..0x9F] -- control character plane 1
4347 GR [0xA0..0xFF] -- graphic character plane 1
4349 A control character set is directly designated and invoked to C0 or
4350 C1 by an escape sequence. The most common case is that:
4351 - ISO646's control character set is designated/invoked to C0, and
4352 - ISO6429's control character set is designated/invoked to C1,
4353 and usually these designations/invocations are omitted in encoded
4354 text. In a 7-bit environment, only C0 can be used, and a control
4355 character for C1 is encoded by an appropriate escape sequence to
4356 fit into the environment. All control characters for C1 are
4357 defined to have corresponding escape sequences.
4359 A graphic character set is at first designated to one of four
4360 graphic registers (G0 through G3), then these graphic registers are
4361 invoked to GL or GR. These designations and invocations can be
4362 done independently. The most common case is that G0 is invoked to
4363 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4364 these invocations and designations are omitted in encoded text.
4365 In a 7-bit environment, only GL can be used.
4367 When a graphic character set of CHARS94 is invoked to GL, codes
4368 0x20 and 0x7F of the GL area work as control characters SPACE and
4369 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4372 There are two ways of invocation: locking-shift and single-shift.
4373 With locking-shift, the invocation lasts until the next different
4374 invocation, whereas with single-shift, the invocation affects the
4375 following character only and doesn't affect the locking-shift
4376 state. Invocations are done by the following control characters or
4379 ----------------------------------------------------------------------
4380 abbrev function cntrl escape seq description
4381 ----------------------------------------------------------------------
4382 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4383 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4384 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4385 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4386 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4387 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4388 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4389 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4390 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4391 ----------------------------------------------------------------------
4392 (*) These are not used by any known coding system.
4394 Control characters for these functions are defined by macros
4395 ISO_CODE_XXX in `coding.h'.
4397 Designations are done by the following escape sequences:
4398 ----------------------------------------------------------------------
4399 escape sequence description
4400 ----------------------------------------------------------------------
4401 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4402 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4403 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4404 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4405 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4406 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4407 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4408 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4409 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4410 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4411 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4412 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4413 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4414 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4415 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4416 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4417 ----------------------------------------------------------------------
4419 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4420 of dimension 1, chars 94, and final character <F>, etc...
4422 Note (*): Although these designations are not allowed in ISO2022,
4423 Emacs accepts them on decoding, and produces them on encoding
4424 CHARS96 character sets in a coding system which is characterized as
4425 7-bit environment, non-locking-shift, and non-single-shift.
4427 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4428 '(' can be omitted. We refer to this as "short-form" hereafter.
4430 Now you may notice that there are a lot of ways for encoding the
4431 same multilingual text in ISO2022. Actually, there exist many
4432 coding systems such as Compound Text (used in X11's inter client
4433 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4434 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4435 localized platforms), and all of these are variants of ISO2022.
4437 In addition to the above, Emacs handles two more kinds of escape
4438 sequences: ISO6429's direction specification and Emacs' private
4439 sequence for specifying character composition.
4441 ISO6429's direction specification takes the following form:
4442 o CSI ']' -- end of the current direction
4443 o CSI '0' ']' -- end of the current direction
4444 o CSI '1' ']' -- start of left-to-right text
4445 o CSI '2' ']' -- start of right-to-left text
4446 The control character CSI (0x9B: control sequence introducer) is
4447 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4449 Character composition specification takes the following form:
4450 o ESC '0' -- start character composition
4451 o ESC '1' -- end character composition
4452 Since these are not standard escape sequences of any ISO standard,
4453 their use with these meanings is restricted to Emacs only. */
4456 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4460 for (i = 0; i < 4; i++)
4462 if (!NILP (coding_system))
4464 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4466 iso->charset[i] = Qt;
4467 iso->invalid_designated[i] = 0;
4469 iso->esc = ISO_ESC_NOTHING;
4470 iso->esc_bytes_index = 0;
4471 iso->register_left = 0;
4472 iso->register_right = 1;
4473 iso->switched_dir_and_no_valid_charset_yet = 0;
4474 iso->invalid_switch_dir = 0;
4475 iso->output_direction_sequence = 0;
4476 iso->output_literally = 0;
4477 #ifdef ENABLE_COMPOSITE_CHARS
4478 if (iso->composite_chars)
4479 Dynarr_reset (iso->composite_chars);
4484 fit_to_be_escape_quoted (unsigned char c)
4501 /* Parse one byte of an ISO2022 escape sequence.
4502 If the result is an invalid escape sequence, return 0 and
4503 do not change anything in STR. Otherwise, if the result is
4504 an incomplete escape sequence, update ISO2022.ESC and
4505 ISO2022.ESC_BYTES and return -1. Otherwise, update
4506 all the state variables (but not ISO2022.ESC_BYTES) and
4509 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4510 or invocation of an invalid character set and treat that as
4511 an unrecognized escape sequence. */
4514 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4515 unsigned char c, unsigned int *flags,
4516 int check_invalid_charsets)
4518 /* (1) If we're at the end of a designation sequence, CS is the
4519 charset being designated and REG is the register to designate
4522 (2) If we're at the end of a locking-shift sequence, REG is
4523 the register to invoke and HALF (0 == left, 1 == right) is
4524 the half to invoke it into.
4526 (3) If we're at the end of a single-shift sequence, REG is
4527 the register to invoke. */
4528 Lisp_Object cs = Qnil;
4531 /* NOTE: This code does goto's all over the fucking place.
4532 The reason for this is that we're basically implementing
4533 a state machine here, and hierarchical languages like C
4534 don't really provide a clean way of doing this. */
4536 if (! (*flags & CODING_STATE_ESCAPE))
4537 /* At beginning of escape sequence; we need to reset our
4538 escape-state variables. */
4539 iso->esc = ISO_ESC_NOTHING;
4541 iso->output_literally = 0;
4542 iso->output_direction_sequence = 0;
4546 case ISO_ESC_NOTHING:
4547 iso->esc_bytes_index = 0;
4550 case ISO_CODE_ESC: /* Start escape sequence */
4551 *flags |= CODING_STATE_ESCAPE;
4555 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4556 *flags |= CODING_STATE_ESCAPE;
4557 iso->esc = ISO_ESC_5_11;
4560 case ISO_CODE_SO: /* locking shift 1 */
4563 case ISO_CODE_SI: /* locking shift 0 */
4567 case ISO_CODE_SS2: /* single shift */
4570 case ISO_CODE_SS3: /* single shift */
4574 default: /* Other control characters */
4581 /**** single shift ****/
4583 case 'N': /* single shift 2 */
4586 case 'O': /* single shift 3 */
4590 /**** locking shift ****/
4592 case '~': /* locking shift 1 right */
4595 case 'n': /* locking shift 2 */
4598 case '}': /* locking shift 2 right */
4601 case 'o': /* locking shift 3 */
4604 case '|': /* locking shift 3 right */
4608 #ifdef ENABLE_COMPOSITE_CHARS
4609 /**** composite ****/
4612 iso->esc = ISO_ESC_START_COMPOSITE;
4613 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4614 CODING_STATE_COMPOSITE;
4618 iso->esc = ISO_ESC_END_COMPOSITE;
4619 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4620 ~CODING_STATE_COMPOSITE;
4622 #endif /* ENABLE_COMPOSITE_CHARS */
4624 /**** directionality ****/
4627 iso->esc = ISO_ESC_5_11;
4630 /**** designation ****/
4632 case '$': /* multibyte charset prefix */
4633 iso->esc = ISO_ESC_2_4;
4637 if (0x28 <= c && c <= 0x2F)
4639 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4643 /* This function is called with CODESYS equal to nil when
4644 doing coding-system detection. */
4646 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4647 && fit_to_be_escape_quoted (c))
4649 iso->esc = ISO_ESC_LITERAL;
4650 *flags &= CODING_STATE_ISO2022_LOCK;
4660 /**** directionality ****/
4662 case ISO_ESC_5_11: /* ISO6429 direction control */
4665 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4666 goto directionality;
4668 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4669 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4670 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4674 case ISO_ESC_5_11_0:
4677 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4678 goto directionality;
4682 case ISO_ESC_5_11_1:
4685 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4686 goto directionality;
4690 case ISO_ESC_5_11_2:
4693 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4694 goto directionality;
4699 iso->esc = ISO_ESC_DIRECTIONALITY;
4700 /* Various junk here to attempt to preserve the direction sequences
4701 literally in the text if they would otherwise be swallowed due
4702 to invalid designations that don't show up as actual charset
4703 changes in the text. */
4704 if (iso->invalid_switch_dir)
4706 /* We already inserted a direction switch literally into the
4707 text. We assume (#### this may not be right) that the
4708 next direction switch is the one going the other way,
4709 and we need to output that literally as well. */
4710 iso->output_literally = 1;
4711 iso->invalid_switch_dir = 0;
4717 /* If we are in the thrall of an invalid designation,
4718 then stick the directionality sequence literally into the
4719 output stream so it ends up in the original text again. */
4720 for (jj = 0; jj < 4; jj++)
4721 if (iso->invalid_designated[jj])
4725 iso->output_literally = 1;
4726 iso->invalid_switch_dir = 1;
4729 /* Indicate that we haven't yet seen a valid designation,
4730 so that if a switch-dir is directly followed by an
4731 invalid designation, both get inserted literally. */
4732 iso->switched_dir_and_no_valid_charset_yet = 1;
4737 /**** designation ****/
4740 if (0x28 <= c && c <= 0x2F)
4742 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4745 if (0x40 <= c && c <= 0x42)
4747 cs = CHARSET_BY_ATTRIBUTES (94, 2, c,
4748 *flags & CODING_STATE_R2L ?
4749 CHARSET_RIGHT_TO_LEFT :
4750 CHARSET_LEFT_TO_RIGHT);
4762 if (c < '0' || c > '~')
4763 return 0; /* bad final byte */
4765 if (iso->esc >= ISO_ESC_2_8 &&
4766 iso->esc <= ISO_ESC_2_15)
4768 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4769 single = 1; /* single-byte */
4770 reg = (iso->esc - ISO_ESC_2_8) & 3;
4772 else if (iso->esc >= ISO_ESC_2_4_8 &&
4773 iso->esc <= ISO_ESC_2_4_15)
4775 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4776 single = -1; /* multi-byte */
4777 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4781 /* Can this ever be reached? -slb */
4785 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4786 *flags & CODING_STATE_R2L ?
4787 CHARSET_RIGHT_TO_LEFT :
4788 CHARSET_LEFT_TO_RIGHT);
4794 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4798 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4799 /* can't invoke something that ain't there. */
4801 iso->esc = ISO_ESC_SINGLE_SHIFT;
4802 *flags &= CODING_STATE_ISO2022_LOCK;
4804 *flags |= CODING_STATE_SS2;
4806 *flags |= CODING_STATE_SS3;
4810 if (check_invalid_charsets &&
4811 !CHARSETP (iso->charset[reg]))
4812 /* can't invoke something that ain't there. */
4815 iso->register_right = reg;
4817 iso->register_left = reg;
4818 *flags &= CODING_STATE_ISO2022_LOCK;
4819 iso->esc = ISO_ESC_LOCKING_SHIFT;
4823 if (NILP (cs) && check_invalid_charsets)
4825 iso->invalid_designated[reg] = 1;
4826 iso->charset[reg] = Vcharset_ascii;
4827 iso->esc = ISO_ESC_DESIGNATE;
4828 *flags &= CODING_STATE_ISO2022_LOCK;
4829 iso->output_literally = 1;
4830 if (iso->switched_dir_and_no_valid_charset_yet)
4832 /* We encountered a switch-direction followed by an
4833 invalid designation. Ensure that the switch-direction
4834 gets outputted; otherwise it will probably get eaten
4835 when the text is written out again. */
4836 iso->switched_dir_and_no_valid_charset_yet = 0;
4837 iso->output_direction_sequence = 1;
4838 /* And make sure that the switch-dir going the other
4839 way gets outputted, as well. */
4840 iso->invalid_switch_dir = 1;
4844 /* This function is called with CODESYS equal to nil when
4845 doing coding-system detection. */
4846 if (!NILP (codesys))
4848 charset_conversion_spec_dynarr *dyn =
4849 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4855 for (i = 0; i < Dynarr_length (dyn); i++)
4857 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4858 if (EQ (cs, spec->from_charset))
4859 cs = spec->to_charset;
4864 iso->charset[reg] = cs;
4865 iso->esc = ISO_ESC_DESIGNATE;
4866 *flags &= CODING_STATE_ISO2022_LOCK;
4867 if (iso->invalid_designated[reg])
4869 iso->invalid_designated[reg] = 0;
4870 iso->output_literally = 1;
4872 if (iso->switched_dir_and_no_valid_charset_yet)
4873 iso->switched_dir_and_no_valid_charset_yet = 0;
4878 detect_coding_iso2022 (struct detection_state *st, const unsigned char *src,
4883 /* #### There are serious deficiencies in the recognition mechanism
4884 here. This needs to be much smarter if it's going to cut it.
4885 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4886 it should be detected as Latin-1.
4887 All the ISO2022 stuff in this file should be synced up with the
4888 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4889 Perhaps we should wait till R2L works in FSF Emacs? */
4891 if (!st->iso2022.initted)
4893 reset_iso2022 (Qnil, &st->iso2022.iso);
4894 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4895 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4896 CODING_CATEGORY_ISO_8_1_MASK |
4897 CODING_CATEGORY_ISO_8_2_MASK |
4898 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4899 st->iso2022.flags = 0;
4900 st->iso2022.high_byte_count = 0;
4901 st->iso2022.saw_single_shift = 0;
4902 st->iso2022.initted = 1;
4905 mask = st->iso2022.mask;
4912 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4913 st->iso2022.high_byte_count++;
4917 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4919 if (st->iso2022.high_byte_count & 1)
4920 /* odd number of high bytes; assume not iso-8-2 */
4921 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4923 st->iso2022.high_byte_count = 0;
4924 st->iso2022.saw_single_shift = 0;
4926 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4928 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4929 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4930 { /* control chars */
4933 /* Allow and ignore control characters that you might
4934 reasonably see in a text file */
4939 case 8: /* backspace */
4940 case 11: /* vertical tab */
4941 case 12: /* form feed */
4942 case 26: /* MS-DOS C-z junk */
4943 case 31: /* '^_' -- for info */
4944 goto label_continue_loop;
4951 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4954 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4955 &st->iso2022.flags, 0))
4957 switch (st->iso2022.iso.esc)
4959 case ISO_ESC_DESIGNATE:
4960 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4961 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4963 case ISO_ESC_LOCKING_SHIFT:
4964 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4965 goto ran_out_of_chars;
4966 case ISO_ESC_SINGLE_SHIFT:
4967 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4968 st->iso2022.saw_single_shift = 1;
4977 goto ran_out_of_chars;
4980 label_continue_loop:;
4989 postprocess_iso2022_mask (int mask)
4991 /* #### kind of cheesy */
4992 /* If seven-bit ISO is allowed, then assume that the encoding is
4993 entirely seven-bit and turn off the eight-bit ones. */
4994 if (mask & CODING_CATEGORY_ISO_7_MASK)
4995 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4996 CODING_CATEGORY_ISO_8_1_MASK |
4997 CODING_CATEGORY_ISO_8_2_MASK);
5001 /* If FLAGS is a null pointer or specifies right-to-left motion,
5002 output a switch-dir-to-left-to-right sequence to DST.
5003 Also update FLAGS if it is not a null pointer.
5004 If INTERNAL_P is set, we are outputting in internal format and
5005 need to handle the CSI differently. */
5008 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5009 unsigned_char_dynarr *dst,
5010 unsigned int *flags,
5013 if (!flags || (*flags & CODING_STATE_R2L))
5015 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5017 Dynarr_add (dst, ISO_CODE_ESC);
5018 Dynarr_add (dst, '[');
5020 else if (internal_p)
5021 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5023 Dynarr_add (dst, ISO_CODE_CSI);
5024 Dynarr_add (dst, '0');
5025 Dynarr_add (dst, ']');
5027 *flags &= ~CODING_STATE_R2L;
5031 /* If FLAGS is a null pointer or specifies a direction different from
5032 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5033 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5034 sequence to DST. Also update FLAGS if it is not a null pointer.
5035 If INTERNAL_P is set, we are outputting in internal format and
5036 need to handle the CSI differently. */
5039 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5040 unsigned_char_dynarr *dst, unsigned int *flags,
5043 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5044 direction == CHARSET_LEFT_TO_RIGHT)
5045 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5046 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5047 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5048 direction == CHARSET_RIGHT_TO_LEFT)
5050 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5052 Dynarr_add (dst, ISO_CODE_ESC);
5053 Dynarr_add (dst, '[');
5055 else if (internal_p)
5056 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5058 Dynarr_add (dst, ISO_CODE_CSI);
5059 Dynarr_add (dst, '2');
5060 Dynarr_add (dst, ']');
5062 *flags |= CODING_STATE_R2L;
5066 /* Convert ISO2022-format data to internal format. */
5069 decode_coding_iso2022 (Lstream *decoding, const unsigned char *src,
5070 unsigned_char_dynarr *dst, unsigned int n)
5072 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5073 unsigned int flags = str->flags;
5074 unsigned int ch = str->ch;
5075 eol_type_t eol_type = str->eol_type;
5076 #ifdef ENABLE_COMPOSITE_CHARS
5077 unsigned_char_dynarr *real_dst = dst;
5079 Lisp_Object coding_system;
5081 XSETCODING_SYSTEM (coding_system, str->codesys);
5083 #ifdef ENABLE_COMPOSITE_CHARS
5084 if (flags & CODING_STATE_COMPOSITE)
5085 dst = str->iso2022.composite_chars;
5086 #endif /* ENABLE_COMPOSITE_CHARS */
5090 unsigned char c = *src++;
5091 if (flags & CODING_STATE_ESCAPE)
5092 { /* Within ESC sequence */
5093 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5098 switch (str->iso2022.esc)
5100 #ifdef ENABLE_COMPOSITE_CHARS
5101 case ISO_ESC_START_COMPOSITE:
5102 if (str->iso2022.composite_chars)
5103 Dynarr_reset (str->iso2022.composite_chars);
5105 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5106 dst = str->iso2022.composite_chars;
5108 case ISO_ESC_END_COMPOSITE:
5110 Bufbyte comstr[MAX_EMCHAR_LEN];
5112 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5113 Dynarr_length (dst));
5115 len = set_charptr_emchar (comstr, emch);
5116 Dynarr_add_many (dst, comstr, len);
5119 #endif /* ENABLE_COMPOSITE_CHARS */
5121 case ISO_ESC_LITERAL:
5122 DECODE_ADD_BINARY_CHAR (c, dst);
5126 /* Everything else handled already */
5131 /* Attempted error recovery. */
5132 if (str->iso2022.output_direction_sequence)
5133 ensure_correct_direction (flags & CODING_STATE_R2L ?
5134 CHARSET_RIGHT_TO_LEFT :
5135 CHARSET_LEFT_TO_RIGHT,
5136 str->codesys, dst, 0, 1);
5137 /* More error recovery. */
5138 if (!retval || str->iso2022.output_literally)
5140 /* Output the (possibly invalid) sequence */
5142 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5143 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5144 flags &= CODING_STATE_ISO2022_LOCK;
5146 n++, src--;/* Repeat the loop with the same character. */
5149 /* No sense in reprocessing the final byte of the
5150 escape sequence; it could mess things up anyway.
5152 DECODE_ADD_BINARY_CHAR (c, dst);
5157 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5158 { /* Control characters */
5160 /***** Error-handling *****/
5162 /* If we were in the middle of a character, dump out the
5163 partial character. */
5164 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5166 /* If we just saw a single-shift character, dump it out.
5167 This may dump out the wrong sort of single-shift character,
5168 but least it will give an indication that something went
5170 if (flags & CODING_STATE_SS2)
5172 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5173 flags &= ~CODING_STATE_SS2;
5175 if (flags & CODING_STATE_SS3)
5177 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5178 flags &= ~CODING_STATE_SS3;
5181 /***** Now handle the control characters. *****/
5184 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5186 flags &= CODING_STATE_ISO2022_LOCK;
5188 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5189 DECODE_ADD_BINARY_CHAR (c, dst);
5192 { /* Graphic characters */
5193 Lisp_Object charset;
5199 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5201 /* Now determine the charset. */
5202 reg = ((flags & CODING_STATE_SS2) ? 2
5203 : (flags & CODING_STATE_SS3) ? 3
5204 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5205 : str->iso2022.register_left);
5206 charset = str->iso2022.charset[reg];
5208 /* Error checking: */
5209 if (! CHARSETP (charset)
5210 || str->iso2022.invalid_designated[reg]
5211 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5212 && XCHARSET_CHARS (charset) == 94))
5213 /* Mrmph. We are trying to invoke a register that has no
5214 or an invalid charset in it, or trying to add a character
5215 outside the range of the charset. Insert that char literally
5216 to preserve it for the output. */
5218 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5219 DECODE_ADD_BINARY_CHAR (c, dst);
5224 /* Things are probably hunky-dorey. */
5226 /* Fetch reverse charset, maybe. */
5227 if (((flags & CODING_STATE_R2L) &&
5228 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5230 (!(flags & CODING_STATE_R2L) &&
5231 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5233 Lisp_Object new_charset =
5234 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5235 if (!NILP (new_charset))
5236 charset = new_charset;
5240 if (XCHARSET_DIMENSION (charset) == 1)
5242 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5244 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5249 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5255 lb = XCHARSET_LEADING_BYTE (charset);
5256 switch (XCHARSET_REP_BYTES (charset))
5259 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5260 Dynarr_add (dst, c & 0x7F);
5263 case 2: /* one-byte official */
5264 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5265 Dynarr_add (dst, lb);
5266 Dynarr_add (dst, c | 0x80);
5269 case 3: /* one-byte private or two-byte official */
5270 if (XCHARSET_PRIVATE_P (charset))
5272 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5273 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5274 Dynarr_add (dst, lb);
5275 Dynarr_add (dst, c | 0x80);
5281 Dynarr_add (dst, lb);
5282 Dynarr_add (dst, ch | 0x80);
5283 Dynarr_add (dst, c | 0x80);
5291 default: /* two-byte private */
5294 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5295 Dynarr_add (dst, lb);
5296 Dynarr_add (dst, ch | 0x80);
5297 Dynarr_add (dst, c | 0x80);
5307 flags &= CODING_STATE_ISO2022_LOCK;
5310 label_continue_loop:;
5313 if (flags & CODING_STATE_END)
5314 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5321 /***** ISO2022 encoder *****/
5323 /* Designate CHARSET into register REG. */
5326 iso2022_designate (Lisp_Object charset, unsigned char reg,
5327 struct encoding_stream *str, unsigned_char_dynarr *dst)
5329 static const char inter94[] = "()*+";
5330 static const char inter96[] = ",-./";
5332 unsigned char final;
5333 Lisp_Object old_charset = str->iso2022.charset[reg];
5335 str->iso2022.charset[reg] = charset;
5336 if (!CHARSETP (charset))
5337 /* charset might be an initial nil or t. */
5339 type = XCHARSET_TYPE (charset);
5340 final = XCHARSET_FINAL (charset);
5341 if (!str->iso2022.force_charset_on_output[reg] &&
5342 CHARSETP (old_charset) &&
5343 XCHARSET_TYPE (old_charset) == type &&
5344 XCHARSET_FINAL (old_charset) == final)
5347 str->iso2022.force_charset_on_output[reg] = 0;
5350 charset_conversion_spec_dynarr *dyn =
5351 str->codesys->iso2022.output_conv;
5357 for (i = 0; i < Dynarr_length (dyn); i++)
5359 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5360 if (EQ (charset, spec->from_charset))
5361 charset = spec->to_charset;
5366 Dynarr_add (dst, ISO_CODE_ESC);
5369 case CHARSET_TYPE_94:
5370 Dynarr_add (dst, inter94[reg]);
5372 case CHARSET_TYPE_96:
5373 Dynarr_add (dst, inter96[reg]);
5375 case CHARSET_TYPE_94X94:
5376 Dynarr_add (dst, '$');
5378 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5381 Dynarr_add (dst, inter94[reg]);
5383 case CHARSET_TYPE_96X96:
5384 Dynarr_add (dst, '$');
5385 Dynarr_add (dst, inter96[reg]);
5388 Dynarr_add (dst, final);
5392 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5394 if (str->iso2022.register_left != 0)
5396 Dynarr_add (dst, ISO_CODE_SI);
5397 str->iso2022.register_left = 0;
5402 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5404 if (str->iso2022.register_left != 1)
5406 Dynarr_add (dst, ISO_CODE_SO);
5407 str->iso2022.register_left = 1;
5411 /* Convert internally-formatted data to ISO2022 format. */
5414 encode_coding_iso2022 (Lstream *encoding, const unsigned char *src,
5415 unsigned_char_dynarr *dst, unsigned int n)
5417 unsigned char charmask, c;
5418 unsigned char char_boundary;
5419 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5420 unsigned int flags = str->flags;
5421 Emchar ch = str->ch;
5422 Lisp_Coding_System *codesys = str->codesys;
5423 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5425 Lisp_Object charset;
5428 unsigned int byte1, byte2;
5431 #ifdef ENABLE_COMPOSITE_CHARS
5432 /* flags for handling composite chars. We do a little switcharoo
5433 on the source while we're outputting the composite char. */
5434 unsigned int saved_n = 0;
5435 const unsigned char *saved_src = NULL;
5436 int in_composite = 0;
5437 #endif /* ENABLE_COMPOSITE_CHARS */
5439 char_boundary = str->iso2022.current_char_boundary;
5440 charset = str->iso2022.current_charset;
5441 half = str->iso2022.current_half;
5443 #ifdef ENABLE_COMPOSITE_CHARS
5451 switch (char_boundary)
5459 else if ( c >= 0xf8 )
5464 else if ( c >= 0xf0 )
5469 else if ( c >= 0xe0 )
5474 else if ( c >= 0xc0 )
5483 restore_left_to_right_direction (codesys, dst, &flags, 0);
5485 /* Make sure G0 contains ASCII */
5486 if ((c > ' ' && c < ISO_CODE_DEL) ||
5487 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5489 ensure_normal_shift (str, dst);
5490 iso2022_designate (Vcharset_ascii, 0, str, dst);
5493 /* If necessary, restore everything to the default state
5496 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5498 restore_left_to_right_direction (codesys, dst, &flags, 0);
5500 ensure_normal_shift (str, dst);
5502 for (i = 0; i < 4; i++)
5504 Lisp_Object initial_charset =
5505 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5506 iso2022_designate (initial_charset, i, str, dst);
5511 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5512 Dynarr_add (dst, '\r');
5513 if (eol_type != EOL_CR)
5514 Dynarr_add (dst, c);
5518 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5519 && fit_to_be_escape_quoted (c))
5520 Dynarr_add (dst, ISO_CODE_ESC);
5521 Dynarr_add (dst, c);
5527 ch = ( ch << 6 ) | ( c & 0x3f );
5530 if ( (0x80 <= ch) && (ch <= 0x9f) )
5532 charmask = (half == 0 ? 0x00 : 0x80);
5534 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5535 && fit_to_be_escape_quoted (ch))
5536 Dynarr_add (dst, ISO_CODE_ESC);
5537 /* you asked for it ... */
5538 Dynarr_add (dst, ch);
5544 BREAKUP_CHAR (ch, charset, byte1, byte2);
5545 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5546 codesys, dst, &flags, 0);
5548 /* Now determine which register to use. */
5550 for (i = 0; i < 4; i++)
5552 if (EQ (charset, str->iso2022.charset[i]) ||
5554 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5563 if (XCHARSET_GRAPHIC (charset) != 0)
5565 if (!NILP (str->iso2022.charset[1]) &&
5566 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5567 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5569 else if (!NILP (str->iso2022.charset[2]))
5571 else if (!NILP (str->iso2022.charset[3]))
5580 iso2022_designate (charset, reg, str, dst);
5582 /* Now invoke that register. */
5586 ensure_normal_shift (str, dst);
5591 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5593 ensure_shift_out (str, dst);
5601 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5603 Dynarr_add (dst, ISO_CODE_ESC);
5604 Dynarr_add (dst, 'N');
5609 Dynarr_add (dst, ISO_CODE_SS2);
5615 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5617 Dynarr_add (dst, ISO_CODE_ESC);
5618 Dynarr_add (dst, 'O');
5623 Dynarr_add (dst, ISO_CODE_SS3);
5632 charmask = (half == 0 ? 0x00 : 0x80);
5634 switch (XCHARSET_DIMENSION (charset))
5637 Dynarr_add (dst, byte1 | charmask);
5640 Dynarr_add (dst, byte1 | charmask);
5641 Dynarr_add (dst, byte2 | charmask);
5650 ch = ( ch << 6 ) | ( c & 0x3f );
5654 #else /* not UTF2000 */
5660 if (BYTE_ASCII_P (c))
5661 { /* Processing ASCII character */
5664 restore_left_to_right_direction (codesys, dst, &flags, 0);
5666 /* Make sure G0 contains ASCII */
5667 if ((c > ' ' && c < ISO_CODE_DEL) ||
5668 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5670 ensure_normal_shift (str, dst);
5671 iso2022_designate (Vcharset_ascii, 0, str, dst);
5674 /* If necessary, restore everything to the default state
5677 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5679 restore_left_to_right_direction (codesys, dst, &flags, 0);
5681 ensure_normal_shift (str, dst);
5683 for (i = 0; i < 4; i++)
5685 Lisp_Object initial_charset =
5686 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5687 iso2022_designate (initial_charset, i, str, dst);
5692 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5693 Dynarr_add (dst, '\r');
5694 if (eol_type != EOL_CR)
5695 Dynarr_add (dst, c);
5699 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5700 && fit_to_be_escape_quoted (c))
5701 Dynarr_add (dst, ISO_CODE_ESC);
5702 Dynarr_add (dst, c);
5707 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5708 { /* Processing Leading Byte */
5710 charset = CHARSET_BY_LEADING_BYTE (c);
5711 if (LEADING_BYTE_PREFIX_P(c))
5713 else if (!EQ (charset, Vcharset_control_1)
5714 #ifdef ENABLE_COMPOSITE_CHARS
5715 && !EQ (charset, Vcharset_composite)
5721 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5722 codesys, dst, &flags, 0);
5724 /* Now determine which register to use. */
5726 for (i = 0; i < 4; i++)
5728 if (EQ (charset, str->iso2022.charset[i]) ||
5730 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5739 if (XCHARSET_GRAPHIC (charset) != 0)
5741 if (!NILP (str->iso2022.charset[1]) &&
5742 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5743 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5745 else if (!NILP (str->iso2022.charset[2]))
5747 else if (!NILP (str->iso2022.charset[3]))
5756 iso2022_designate (charset, reg, str, dst);
5758 /* Now invoke that register. */
5762 ensure_normal_shift (str, dst);
5767 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5769 ensure_shift_out (str, dst);
5777 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5779 Dynarr_add (dst, ISO_CODE_ESC);
5780 Dynarr_add (dst, 'N');
5785 Dynarr_add (dst, ISO_CODE_SS2);
5791 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5793 Dynarr_add (dst, ISO_CODE_ESC);
5794 Dynarr_add (dst, 'O');
5799 Dynarr_add (dst, ISO_CODE_SS3);
5811 { /* Processing Non-ASCII character */
5812 charmask = (half == 0 ? 0x7F : 0xFF);
5814 if (EQ (charset, Vcharset_control_1))
5816 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5817 && fit_to_be_escape_quoted (c))
5818 Dynarr_add (dst, ISO_CODE_ESC);
5819 /* you asked for it ... */
5820 Dynarr_add (dst, c - 0x20);
5824 switch (XCHARSET_REP_BYTES (charset))
5827 Dynarr_add (dst, c & charmask);
5830 if (XCHARSET_PRIVATE_P (charset))
5832 Dynarr_add (dst, c & charmask);
5837 #ifdef ENABLE_COMPOSITE_CHARS
5838 if (EQ (charset, Vcharset_composite))
5842 /* #### Bother! We don't know how to
5844 Dynarr_add (dst, '~');
5848 Emchar emch = MAKE_CHAR (Vcharset_composite,
5849 ch & 0x7F, c & 0x7F);
5850 Lisp_Object lstr = composite_char_string (emch);
5854 src = XSTRING_DATA (lstr);
5855 n = XSTRING_LENGTH (lstr);
5856 Dynarr_add (dst, ISO_CODE_ESC);
5857 Dynarr_add (dst, '0'); /* start composing */
5861 #endif /* ENABLE_COMPOSITE_CHARS */
5863 Dynarr_add (dst, ch & charmask);
5864 Dynarr_add (dst, c & charmask);
5877 Dynarr_add (dst, ch & charmask);
5878 Dynarr_add (dst, c & charmask);
5893 #endif /* not UTF2000 */
5895 #ifdef ENABLE_COMPOSITE_CHARS
5901 Dynarr_add (dst, ISO_CODE_ESC);
5902 Dynarr_add (dst, '1'); /* end composing */
5903 goto back_to_square_n; /* Wheeeeeeeee ..... */
5905 #endif /* ENABLE_COMPOSITE_CHARS */
5908 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5910 if (char_boundary && flags & CODING_STATE_END)
5913 restore_left_to_right_direction (codesys, dst, &flags, 0);
5914 ensure_normal_shift (str, dst);
5915 for (i = 0; i < 4; i++)
5917 Lisp_Object initial_charset =
5918 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5919 iso2022_designate (initial_charset, i, str, dst);
5925 str->iso2022.current_char_boundary = char_boundary;
5926 str->iso2022.current_charset = charset;
5927 str->iso2022.current_half = half;
5929 /* Verbum caro factum est! */
5933 /************************************************************************/
5934 /* No-conversion methods */
5935 /************************************************************************/
5937 /* This is used when reading in "binary" files -- i.e. files that may
5938 contain all 256 possible byte values and that are not to be
5939 interpreted as being in any particular decoding. */
5941 decode_coding_no_conversion (Lstream *decoding, const unsigned char *src,
5942 unsigned_char_dynarr *dst, unsigned int n)
5945 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5946 unsigned int flags = str->flags;
5947 unsigned int ch = str->ch;
5948 eol_type_t eol_type = str->eol_type;
5954 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5955 DECODE_ADD_BINARY_CHAR (c, dst);
5956 label_continue_loop:;
5959 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5966 encode_coding_no_conversion (Lstream *encoding, const unsigned char *src,
5967 unsigned_char_dynarr *dst, unsigned int n)
5970 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5971 unsigned int flags = str->flags;
5972 unsigned int ch = str->ch;
5973 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5975 unsigned char char_boundary = str->iso2022.current_char_boundary;
5982 switch (char_boundary)
5990 else if ( c >= 0xf8 )
5995 else if ( c >= 0xf0 )
6000 else if ( c >= 0xe0 )
6005 else if ( c >= 0xc0 )
6016 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6017 Dynarr_add (dst, '\r');
6018 if (eol_type != EOL_CR)
6019 Dynarr_add (dst, c);
6022 Dynarr_add (dst, c);
6027 ch = ( ch << 6 ) | ( c & 0x3f );
6028 Dynarr_add (dst, ch & 0xff);
6032 ch = ( ch << 6 ) | ( c & 0x3f );
6035 #else /* not UTF2000 */
6038 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6039 Dynarr_add (dst, '\r');
6040 if (eol_type != EOL_CR)
6041 Dynarr_add (dst, '\n');
6044 else if (BYTE_ASCII_P (c))
6047 Dynarr_add (dst, c);
6049 else if (BUFBYTE_LEADING_BYTE_P (c))
6052 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6053 c == LEADING_BYTE_CONTROL_1)
6056 Dynarr_add (dst, '~'); /* untranslatable character */
6060 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6061 Dynarr_add (dst, c);
6062 else if (ch == LEADING_BYTE_CONTROL_1)
6065 Dynarr_add (dst, c - 0x20);
6067 /* else it should be the second or third byte of an
6068 untranslatable character, so ignore it */
6071 #endif /* not UTF2000 */
6077 str->iso2022.current_char_boundary = char_boundary;
6083 /************************************************************************/
6084 /* Initialization */
6085 /************************************************************************/
6088 syms_of_file_coding (void)
6090 INIT_LRECORD_IMPLEMENTATION (coding_system);
6092 deferror (&Qcoding_system_error, "coding-system-error",
6093 "Coding-system error", Qio_error);
6095 DEFSUBR (Fcoding_system_p);
6096 DEFSUBR (Ffind_coding_system);
6097 DEFSUBR (Fget_coding_system);
6098 DEFSUBR (Fcoding_system_list);
6099 DEFSUBR (Fcoding_system_name);
6100 DEFSUBR (Fmake_coding_system);
6101 DEFSUBR (Fcopy_coding_system);
6102 DEFSUBR (Fcoding_system_canonical_name_p);
6103 DEFSUBR (Fcoding_system_alias_p);
6104 DEFSUBR (Fcoding_system_aliasee);
6105 DEFSUBR (Fdefine_coding_system_alias);
6106 DEFSUBR (Fsubsidiary_coding_system);
6108 DEFSUBR (Fcoding_system_type);
6109 DEFSUBR (Fcoding_system_doc_string);
6111 DEFSUBR (Fcoding_system_charset);
6113 DEFSUBR (Fcoding_system_property);
6115 DEFSUBR (Fcoding_category_list);
6116 DEFSUBR (Fset_coding_priority_list);
6117 DEFSUBR (Fcoding_priority_list);
6118 DEFSUBR (Fset_coding_category_system);
6119 DEFSUBR (Fcoding_category_system);
6121 DEFSUBR (Fdetect_coding_region);
6122 DEFSUBR (Fdecode_coding_region);
6123 DEFSUBR (Fencode_coding_region);
6125 DEFSUBR (Fdecode_shift_jis_char);
6126 DEFSUBR (Fencode_shift_jis_char);
6127 DEFSUBR (Fdecode_big5_char);
6128 DEFSUBR (Fencode_big5_char);
6130 DEFSUBR (Fset_ucs_char);
6131 DEFSUBR (Fucs_char);
6132 DEFSUBR (Fset_char_ucs);
6133 DEFSUBR (Fchar_ucs);
6134 #endif /* not UTF2000 */
6136 defsymbol (&Qcoding_systemp, "coding-system-p");
6137 defsymbol (&Qno_conversion, "no-conversion");
6138 defsymbol (&Qraw_text, "raw-text");
6140 defsymbol (&Qbig5, "big5");
6141 defsymbol (&Qshift_jis, "shift-jis");
6142 defsymbol (&Qucs4, "ucs-4");
6143 defsymbol (&Qutf8, "utf-8");
6144 defsymbol (&Qccl, "ccl");
6145 defsymbol (&Qiso2022, "iso2022");
6147 defsymbol (&Qmnemonic, "mnemonic");
6148 defsymbol (&Qeol_type, "eol-type");
6149 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6150 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6152 defsymbol (&Qcr, "cr");
6153 defsymbol (&Qlf, "lf");
6154 defsymbol (&Qcrlf, "crlf");
6155 defsymbol (&Qeol_cr, "eol-cr");
6156 defsymbol (&Qeol_lf, "eol-lf");
6157 defsymbol (&Qeol_crlf, "eol-crlf");
6159 defsymbol (&Qcharset_g0, "charset-g0");
6160 defsymbol (&Qcharset_g1, "charset-g1");
6161 defsymbol (&Qcharset_g2, "charset-g2");
6162 defsymbol (&Qcharset_g3, "charset-g3");
6163 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6164 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6165 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6166 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6167 defsymbol (&Qno_iso6429, "no-iso6429");
6168 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6169 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6171 defsymbol (&Qshort, "short");
6172 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6173 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6174 defsymbol (&Qseven, "seven");
6175 defsymbol (&Qlock_shift, "lock-shift");
6176 defsymbol (&Qescape_quoted, "escape-quoted");
6178 defsymbol (&Qencode, "encode");
6179 defsymbol (&Qdecode, "decode");
6182 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6184 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6186 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6188 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6190 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6192 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6194 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6196 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6198 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6201 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6206 lstream_type_create_file_coding (void)
6208 LSTREAM_HAS_METHOD (decoding, reader);
6209 LSTREAM_HAS_METHOD (decoding, writer);
6210 LSTREAM_HAS_METHOD (decoding, rewinder);
6211 LSTREAM_HAS_METHOD (decoding, seekable_p);
6212 LSTREAM_HAS_METHOD (decoding, flusher);
6213 LSTREAM_HAS_METHOD (decoding, closer);
6214 LSTREAM_HAS_METHOD (decoding, marker);
6216 LSTREAM_HAS_METHOD (encoding, reader);
6217 LSTREAM_HAS_METHOD (encoding, writer);
6218 LSTREAM_HAS_METHOD (encoding, rewinder);
6219 LSTREAM_HAS_METHOD (encoding, seekable_p);
6220 LSTREAM_HAS_METHOD (encoding, flusher);
6221 LSTREAM_HAS_METHOD (encoding, closer);
6222 LSTREAM_HAS_METHOD (encoding, marker);
6226 vars_of_file_coding (void)
6230 fcd = xnew (struct file_coding_dump);
6231 dumpstruct (&fcd, &fcd_description);
6233 /* Initialize to something reasonable ... */
6234 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6236 fcd->coding_category_system[i] = Qnil;
6237 fcd->coding_category_by_priority[i] = i;
6240 Fprovide (intern ("file-coding"));
6242 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6243 Coding system used for TTY keyboard input.
6244 Not used under a windowing system.
6246 Vkeyboard_coding_system = Qnil;
6248 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6249 Coding system used for TTY display output.
6250 Not used under a windowing system.
6252 Vterminal_coding_system = Qnil;
6254 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6255 Overriding coding system used when reading from a file or process.
6256 You should bind this variable with `let', but do not set it globally.
6257 If this is non-nil, it specifies the coding system that will be used
6258 to decode input on read operations, such as from a file or process.
6259 It overrides `buffer-file-coding-system-for-read',
6260 `insert-file-contents-pre-hook', etc. Use those variables instead of
6261 this one for permanent changes to the environment. */ );
6262 Vcoding_system_for_read = Qnil;
6264 DEFVAR_LISP ("coding-system-for-write",
6265 &Vcoding_system_for_write /*
6266 Overriding coding system used when writing to a file or process.
6267 You should bind this variable with `let', but do not set it globally.
6268 If this is non-nil, it specifies the coding system that will be used
6269 to encode output for write operations, such as to a file or process.
6270 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6271 Use those variables instead of this one for permanent changes to the
6273 Vcoding_system_for_write = Qnil;
6275 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6276 Coding system used to convert pathnames when accessing files.
6278 Vfile_name_coding_system = Qnil;
6280 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6281 Non-nil means the buffer contents are regarded as multi-byte form
6282 of characters, not a binary code. This affects the display, file I/O,
6283 and behaviors of various editing commands.
6285 Setting this to nil does not do anything.
6287 enable_multibyte_characters = 1;
6291 complex_vars_of_file_coding (void)
6293 staticpro (&Vcoding_system_hash_table);
6294 Vcoding_system_hash_table =
6295 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6297 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6298 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6300 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6302 struct codesys_prop csp; \
6304 csp.prop_type = (Prop_Type); \
6305 Dynarr_add (the_codesys_prop_dynarr, csp); \
6308 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6309 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6310 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6311 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6312 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6313 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6314 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6316 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6317 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6318 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6319 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6320 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6321 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6322 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6323 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6324 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6325 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6326 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6327 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6328 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6329 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6330 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6331 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6332 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6334 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6335 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6337 /* Need to create this here or we're really screwed. */
6339 (Qraw_text, Qno_conversion,
6340 build_string ("Raw text, which means it converts only line-break-codes."),
6341 list2 (Qmnemonic, build_string ("Raw")));
6344 (Qbinary, Qno_conversion,
6345 build_string ("Binary, which means it does not convert anything."),
6346 list4 (Qeol_type, Qlf,
6347 Qmnemonic, build_string ("Binary")));
6352 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6353 list2 (Qmnemonic, build_string ("UTF8")));
6356 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6358 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6360 Fdefine_coding_system_alias (Qterminal, Qbinary);
6361 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6363 /* Need this for bootstrapping */
6364 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6365 Fget_coding_system (Qraw_text);
6368 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6369 = Fget_coding_system (Qutf8);
6372 #if defined(MULE) && !defined(UTF2000)
6376 for (i = 0; i < 65536; i++)
6377 fcd->ucs_to_mule_table[i] = Qnil;
6379 staticpro (&mule_to_ucs_table);
6380 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6381 #endif /* defined(MULE) && !defined(UTF2000) */