1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
61 #if defined(MULE) && !defined(UTF2000)
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
68 #if defined(MULE) && !defined(UTF2000)
69 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
74 static const struct struct_description fcd_description = {
75 sizeof (struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis (struct detection_state *st,
179 const unsigned char *src,
181 static void decode_coding_sjis (Lstream *decoding,
182 const unsigned char *src,
183 unsigned_char_dynarr *dst,
185 static void encode_coding_sjis (Lstream *encoding,
186 const unsigned char *src,
187 unsigned_char_dynarr *dst,
189 static int detect_coding_big5 (struct detection_state *st,
190 const unsigned char *src,
192 static void decode_coding_big5 (Lstream *decoding,
193 const unsigned char *src,
194 unsigned_char_dynarr *dst, unsigned int n);
195 static void encode_coding_big5 (Lstream *encoding,
196 const unsigned char *src,
197 unsigned_char_dynarr *dst, unsigned int n);
198 static int detect_coding_ucs4 (struct detection_state *st,
199 const unsigned char *src,
201 static void decode_coding_ucs4 (Lstream *decoding,
202 const unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
204 static void encode_coding_ucs4 (Lstream *encoding,
205 const unsigned char *src,
206 unsigned_char_dynarr *dst, unsigned int n);
207 static int detect_coding_utf8 (struct detection_state *st,
208 const unsigned char *src,
210 static void decode_coding_utf8 (Lstream *decoding,
211 const unsigned char *src,
212 unsigned_char_dynarr *dst, unsigned int n);
213 static void encode_coding_utf8 (Lstream *encoding,
214 const unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
216 static int postprocess_iso2022_mask (int mask);
217 static void reset_iso2022 (Lisp_Object coding_system,
218 struct iso2022_decoder *iso);
219 static int detect_coding_iso2022 (struct detection_state *st,
220 const unsigned char *src,
222 static void decode_coding_iso2022 (Lstream *decoding,
223 const unsigned char *src,
224 unsigned_char_dynarr *dst, unsigned int n);
225 static void encode_coding_iso2022 (Lstream *encoding,
226 const unsigned char *src,
227 unsigned_char_dynarr *dst, unsigned int n);
229 static void decode_coding_no_conversion (Lstream *decoding,
230 const unsigned char *src,
231 unsigned_char_dynarr *dst,
233 static void encode_coding_no_conversion (Lstream *encoding,
234 const unsigned char *src,
235 unsigned_char_dynarr *dst,
237 static void mule_decode (Lstream *decoding, const unsigned char *src,
238 unsigned_char_dynarr *dst, unsigned int n);
239 static void mule_encode (Lstream *encoding, const unsigned char *src,
240 unsigned_char_dynarr *dst, unsigned int n);
242 typedef struct codesys_prop codesys_prop;
251 Dynarr_declare (codesys_prop);
252 } codesys_prop_dynarr;
254 static const struct lrecord_description codesys_prop_description_1[] = {
255 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
259 static const struct struct_description codesys_prop_description = {
260 sizeof (codesys_prop),
261 codesys_prop_description_1
264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
265 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
269 static const struct struct_description codesys_prop_dynarr_description = {
270 sizeof (codesys_prop_dynarr),
271 codesys_prop_dynarr_description_1
274 codesys_prop_dynarr *the_codesys_prop_dynarr;
276 enum codesys_prop_enum
279 CODESYS_PROP_ISO2022,
284 /************************************************************************/
285 /* Coding system functions */
286 /************************************************************************/
288 static Lisp_Object mark_coding_system (Lisp_Object);
289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
290 static void finalize_coding_system (void *header, int for_disksave);
293 static const struct lrecord_description ccs_description_1[] = {
294 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
295 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
299 static const struct struct_description ccs_description = {
300 sizeof (charset_conversion_spec),
304 static const struct lrecord_description ccsd_description_1[] = {
305 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
309 static const struct struct_description ccsd_description = {
310 sizeof (charset_conversion_spec_dynarr),
315 static const struct lrecord_description coding_system_description[] = {
316 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
317 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
318 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
319 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
320 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
321 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
325 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
326 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
327 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
335 mark_coding_system, print_coding_system,
336 finalize_coding_system,
337 0, 0, coding_system_description,
341 mark_coding_system (Lisp_Object obj)
343 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
345 mark_object (CODING_SYSTEM_NAME (codesys));
346 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
347 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
348 mark_object (CODING_SYSTEM_EOL_LF (codesys));
349 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
350 mark_object (CODING_SYSTEM_EOL_CR (codesys));
352 switch (CODING_SYSTEM_TYPE (codesys))
356 case CODESYS_ISO2022:
357 for (i = 0; i < 4; i++)
358 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
359 if (codesys->iso2022.input_conv)
361 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
363 struct charset_conversion_spec *ccs =
364 Dynarr_atp (codesys->iso2022.input_conv, i);
365 mark_object (ccs->from_charset);
366 mark_object (ccs->to_charset);
369 if (codesys->iso2022.output_conv)
371 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
373 struct charset_conversion_spec *ccs =
374 Dynarr_atp (codesys->iso2022.output_conv, i);
375 mark_object (ccs->from_charset);
376 mark_object (ccs->to_charset);
382 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
383 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
390 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
398 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
400 error ("printing unreadable object #<coding_system 0x%x>",
403 write_c_string ("#<coding_system ", printcharfun);
404 print_internal (c->name, printcharfun, 1);
405 write_c_string (">", printcharfun);
409 finalize_coding_system (void *header, int for_disksave)
411 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
412 /* Since coding systems never go away, this function is not
413 necessary. But it would be necessary if we changed things
414 so that coding systems could go away. */
415 if (!for_disksave) /* see comment in lstream.c */
417 switch (CODING_SYSTEM_TYPE (c))
420 case CODESYS_ISO2022:
421 if (c->iso2022.input_conv)
423 Dynarr_free (c->iso2022.input_conv);
424 c->iso2022.input_conv = 0;
426 if (c->iso2022.output_conv)
428 Dynarr_free (c->iso2022.output_conv);
429 c->iso2022.output_conv = 0;
440 symbol_to_eol_type (Lisp_Object symbol)
442 CHECK_SYMBOL (symbol);
443 if (NILP (symbol)) return EOL_AUTODETECT;
444 if (EQ (symbol, Qlf)) return EOL_LF;
445 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
446 if (EQ (symbol, Qcr)) return EOL_CR;
448 signal_simple_error ("Unrecognized eol type", symbol);
449 return EOL_AUTODETECT; /* not reached */
453 eol_type_to_symbol (eol_type_t type)
458 case EOL_LF: return Qlf;
459 case EOL_CRLF: return Qcrlf;
460 case EOL_CR: return Qcr;
461 case EOL_AUTODETECT: return Qnil;
466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
468 Lisp_Object codesys_obj;
469 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
470 char *codesys_name = (char *) alloca (len + 7);
472 char *codesys_mnemonic=0;
474 Lisp_Object codesys_name_sym, sub_codesys_obj;
478 XSETCODING_SYSTEM (codesys_obj, codesys);
480 memcpy (codesys_name,
481 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
483 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
485 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
486 codesys_mnemonic = (char *) alloca (mlen + 7);
487 memcpy (codesys_mnemonic,
488 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
492 strcpy (codesys_name + len, "-" op_sys); \
494 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
495 codesys_name_sym = intern (codesys_name); \
496 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
497 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
499 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
500 build_string (codesys_mnemonic); \
501 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
504 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
505 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
506 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
510 Return t if OBJECT is a coding system.
511 A coding system is an object that defines how text containing multiple
512 character sets is encoded into a stream of (typically 8-bit) bytes.
513 The coding system is used to decode the stream into a series of
514 characters (which may be from multiple charsets) when the text is read
515 from a file or process, and is used to encode the text back into the
516 same format when it is written out to a file or process.
518 For example, many ISO2022-compliant coding systems (such as Compound
519 Text, which is used for inter-client data under the X Window System)
520 use escape sequences to switch between different charsets -- Japanese
521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
523 `make-coding-system' for more information.
525 Coding systems are normally identified using a symbol, and the
526 symbol is accepted in place of the actual coding system object whenever
527 a coding system is called for. (This is similar to how faces work.)
531 return CODING_SYSTEMP (object) ? Qt : Qnil;
534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
535 Retrieve the coding system of the given name.
537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
538 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
539 If there is no such coding system, nil is returned. Otherwise the
540 associated coding system object is returned.
542 (coding_system_or_name))
544 if (NILP (coding_system_or_name))
545 coding_system_or_name = Qbinary;
546 else if (CODING_SYSTEMP (coding_system_or_name))
547 return coding_system_or_name;
549 CHECK_SYMBOL (coding_system_or_name);
553 coding_system_or_name =
554 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
556 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
557 return coding_system_or_name;
561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
562 Retrieve the coding system of the given name.
563 Same as `find-coding-system' except that if there is no such
564 coding system, an error is signaled instead of returning nil.
568 Lisp_Object coding_system = Ffind_coding_system (name);
570 if (NILP (coding_system))
571 signal_simple_error ("No such coding system", name);
572 return coding_system;
575 /* We store the coding systems in hash tables with the names as the key and the
576 actual coding system object as the value. Occasionally we need to use them
577 in a list format. These routines provide us with that. */
578 struct coding_system_list_closure
580 Lisp_Object *coding_system_list;
584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
585 void *coding_system_list_closure)
587 /* This function can GC */
588 struct coding_system_list_closure *cscl =
589 (struct coding_system_list_closure *) coding_system_list_closure;
590 Lisp_Object *coding_system_list = cscl->coding_system_list;
592 *coding_system_list = Fcons (key, *coding_system_list);
596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
597 Return a list of the names of all defined coding systems.
601 Lisp_Object coding_system_list = Qnil;
603 struct coding_system_list_closure coding_system_list_closure;
605 GCPRO1 (coding_system_list);
606 coding_system_list_closure.coding_system_list = &coding_system_list;
607 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
608 &coding_system_list_closure);
611 return coding_system_list;
614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
615 Return the name of the given coding system.
619 coding_system = Fget_coding_system (coding_system);
620 return XCODING_SYSTEM_NAME (coding_system);
623 static Lisp_Coding_System *
624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
626 Lisp_Coding_System *codesys =
627 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
629 zero_lcrecord (codesys);
630 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
631 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
632 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
633 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
634 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
635 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
636 CODING_SYSTEM_TYPE (codesys) = type;
637 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
639 if (type == CODESYS_ISO2022)
642 for (i = 0; i < 4; i++)
643 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
645 else if (type == CODESYS_CCL)
647 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
648 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
651 CODING_SYSTEM_NAME (codesys) = name;
657 /* Given a list of charset conversion specs as specified in a Lisp
658 program, parse it into STORE_HERE. */
661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
662 Lisp_Object spec_list)
666 EXTERNAL_LIST_LOOP (rest, spec_list)
668 Lisp_Object car = XCAR (rest);
669 Lisp_Object from, to;
670 struct charset_conversion_spec spec;
672 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
673 signal_simple_error ("Invalid charset conversion spec", car);
674 from = Fget_charset (XCAR (car));
675 to = Fget_charset (XCAR (XCDR (car)));
676 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
677 signal_simple_error_2
678 ("Attempted conversion between different charset types",
680 spec.from_charset = from;
681 spec.to_charset = to;
683 Dynarr_add (store_here, spec);
687 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
688 specs, return the equivalent as the Lisp programmer would see it.
690 If LOAD_HERE is 0, return Qnil. */
693 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
700 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
702 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
703 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
706 return Fnreverse (result);
711 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
712 Register symbol NAME as a coding system.
714 TYPE describes the conversion method used and should be one of
717 Automatic conversion. XEmacs attempts to detect the coding system
720 No conversion. Use this for binary files and such. On output,
721 graphic characters that are not in ASCII or Latin-1 will be
722 replaced by a ?. (For a no-conversion-encoded buffer, these
723 characters will only be present if you explicitly insert them.)
725 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
727 ISO 10646 UCS-4 encoding.
729 ISO 10646 UTF-8 encoding.
731 Any ISO2022-compliant encoding. Among other things, this includes
732 JIS (the Japanese encoding commonly used for e-mail), EUC (the
733 standard Unix encoding for Japanese and other languages), and
734 Compound Text (the encoding used in X11). You can specify more
735 specific information about the conversion with the PROPS argument.
737 Big5 (the encoding commonly used for Taiwanese).
739 The conversion is performed using a user-written pseudo-code
740 program. CCL (Code Conversion Language) is the name of this
743 Write out or read in the raw contents of the memory representing
744 the buffer's text. This is primarily useful for debugging
745 purposes, and is only enabled when XEmacs has been compiled with
746 DEBUG_XEMACS defined (via the --debug configure option).
747 WARNING: Reading in a file using 'internal conversion can result
748 in an internal inconsistency in the memory representing a
749 buffer's text, which will produce unpredictable results and may
750 cause XEmacs to crash. Under normal circumstances you should
751 never use 'internal conversion.
753 DOC-STRING is a string describing the coding system.
755 PROPS is a property list, describing the specific nature of the
756 character set. Recognized properties are:
759 String to be displayed in the modeline when this coding system is
763 End-of-line conversion to be used. It should be one of
766 Automatically detect the end-of-line type (LF, CRLF,
767 or CR). Also generate subsidiary coding systems named
768 `NAME-unix', `NAME-dos', and `NAME-mac', that are
769 identical to this coding system but have an EOL-TYPE
770 value of 'lf, 'crlf, and 'cr, respectively.
772 The end of a line is marked externally using ASCII LF.
773 Since this is also the way that XEmacs represents an
774 end-of-line internally, specifying this option results
775 in no end-of-line conversion. This is the standard
776 format for Unix text files.
778 The end of a line is marked externally using ASCII
779 CRLF. This is the standard format for MS-DOS text
782 The end of a line is marked externally using ASCII CR.
783 This is the standard format for Macintosh text files.
785 Automatically detect the end-of-line type but do not
786 generate subsidiary coding systems. (This value is
787 converted to nil when stored internally, and
788 `coding-system-property' will return nil.)
790 'post-read-conversion
791 Function called after a file has been read in, to perform the
792 decoding. Called with two arguments, BEG and END, denoting
793 a region of the current buffer to be decoded.
795 'pre-write-conversion
796 Function called before a file is written out, to perform the
797 encoding. Called with two arguments, BEG and END, denoting
798 a region of the current buffer to be encoded.
801 The following additional properties are recognized if TYPE is 'iso2022:
807 The character set initially designated to the G0 - G3 registers.
808 The value should be one of
810 -- A charset object (designate that character set)
811 -- nil (do not ever use this register)
812 -- t (no character set is initially designated to
813 the register, but may be later on; this automatically
814 sets the corresponding `force-g*-on-output' property)
820 If non-nil, send an explicit designation sequence on output before
821 using the specified register.
824 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
825 "ESC $ B" on output in place of the full designation sequences
826 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
829 If non-nil, don't designate ASCII to G0 at each end of line on output.
830 Setting this to non-nil also suppresses other state-resetting that
831 normally happens at the end of a line.
834 If non-nil, don't designate ASCII to G0 before control chars on output.
837 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
841 If non-nil, use locking-shift (SO/SI) instead of single-shift
842 or designation by escape sequence.
845 If non-nil, don't use ISO6429's direction specification.
848 If non-nil, literal control characters that are the same as
849 the beginning of a recognized ISO2022 or ISO6429 escape sequence
850 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
851 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
852 so that they can be properly distinguished from an escape sequence.
853 (Note that doing this results in a non-portable encoding.) This
854 encoding flag is used for byte-compiled files. Note that ESC
855 is a good choice for a quoting character because there are no
856 escape sequences whose second byte is a character from the Control-0
857 or Control-1 character sets; this is explicitly disallowed by the
860 'input-charset-conversion
861 A list of conversion specifications, specifying conversion of
862 characters in one charset to another when decoding is performed.
863 Each specification is a list of two elements: the source charset,
864 and the destination charset.
866 'output-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when encoding is performed.
869 The form of each specification is the same as for
870 'input-charset-conversion.
873 The following additional properties are recognized (and required)
877 CCL program used for decoding (converting to internal format).
880 CCL program used for encoding (converting to external format).
882 (name, type, doc_string, props))
884 Lisp_Coding_System *codesys;
885 enum coding_system_type ty;
886 int need_to_setup_eol_systems = 1;
888 /* Convert type to constant */
889 if (NILP (type) || EQ (type, Qundecided))
890 { ty = CODESYS_AUTODETECT; }
892 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
893 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
894 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
895 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
896 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
897 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
899 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
901 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
904 signal_simple_error ("Invalid coding system type", type);
908 codesys = allocate_coding_system (ty, name);
910 if (NILP (doc_string))
911 doc_string = build_string ("");
913 CHECK_STRING (doc_string);
914 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
917 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
919 if (EQ (key, Qmnemonic))
922 CHECK_STRING (value);
923 CODING_SYSTEM_MNEMONIC (codesys) = value;
926 else if (EQ (key, Qeol_type))
928 need_to_setup_eol_systems = NILP (value);
931 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
934 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
935 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
937 else if (ty == CODESYS_ISO2022)
939 #define FROB_INITIAL_CHARSET(charset_num) \
940 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
941 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
943 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
944 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
945 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
946 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
948 #define FROB_FORCE_CHARSET(charset_num) \
949 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
951 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
952 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
953 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
954 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
956 #define FROB_BOOLEAN_PROPERTY(prop) \
957 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
959 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
960 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
961 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
962 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
963 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
964 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
965 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
967 else if (EQ (key, Qinput_charset_conversion))
969 codesys->iso2022.input_conv =
970 Dynarr_new (charset_conversion_spec);
971 parse_charset_conversion_specs (codesys->iso2022.input_conv,
974 else if (EQ (key, Qoutput_charset_conversion))
976 codesys->iso2022.output_conv =
977 Dynarr_new (charset_conversion_spec);
978 parse_charset_conversion_specs (codesys->iso2022.output_conv,
982 signal_simple_error ("Unrecognized property", key);
984 else if (EQ (type, Qccl))
986 if (EQ (key, Qdecode))
988 CHECK_VECTOR (value);
989 CODING_SYSTEM_CCL_DECODE (codesys) = value;
991 else if (EQ (key, Qencode))
993 CHECK_VECTOR (value);
994 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
997 signal_simple_error ("Unrecognized property", key);
1001 signal_simple_error ("Unrecognized property", key);
1005 if (need_to_setup_eol_systems)
1006 setup_eol_coding_systems (codesys);
1009 Lisp_Object codesys_obj;
1010 XSETCODING_SYSTEM (codesys_obj, codesys);
1011 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1016 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1017 Copy OLD-CODING-SYSTEM to NEW-NAME.
1018 If NEW-NAME does not name an existing coding system, a new one will
1021 (old_coding_system, new_name))
1023 Lisp_Object new_coding_system;
1024 old_coding_system = Fget_coding_system (old_coding_system);
1025 new_coding_system = Ffind_coding_system (new_name);
1026 if (NILP (new_coding_system))
1028 XSETCODING_SYSTEM (new_coding_system,
1029 allocate_coding_system
1030 (XCODING_SYSTEM_TYPE (old_coding_system),
1032 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1036 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1037 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1038 memcpy (((char *) to ) + sizeof (to->header),
1039 ((char *) from) + sizeof (from->header),
1040 sizeof (*from) - sizeof (from->header));
1041 to->name = new_name;
1043 return new_coding_system;
1046 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1047 Return t if OBJECT names a coding system, and is not a coding system alias.
1051 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1055 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1056 Return t if OBJECT is a coding system alias.
1057 All coding system aliases are created by `define-coding-system-alias'.
1061 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1065 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1066 Return the coding-system symbol for which symbol ALIAS is an alias.
1070 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1071 if (SYMBOLP (aliasee))
1074 signal_simple_error ("Symbol is not a coding system alias", alias);
1075 return Qnil; /* To keep the compiler happy */
1079 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1081 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1085 /* A maphash function, for removing dangling coding system aliases. */
1087 dangling_coding_system_alias_p (Lisp_Object alias,
1088 Lisp_Object aliasee,
1089 void *dangling_aliases)
1091 if (SYMBOLP (aliasee)
1092 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1094 (*(int *) dangling_aliases)++;
1101 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1102 Define symbol ALIAS as an alias for coding system ALIASEE.
1104 You can use this function to redefine an alias that has already been defined,
1105 but you cannot redefine a name which is the canonical name for a coding system.
1106 \(a canonical name of a coding system is what is returned when you call
1107 `coding-system-name' on a coding system).
1109 ALIASEE itself can be an alias, which allows you to define nested aliases.
1111 You are forbidden, however, from creating alias loops or `dangling' aliases.
1112 These will be detected, and an error will be signaled if you attempt to do so.
1114 If ALIASEE is nil, then ALIAS will simply be undefined.
1116 See also `coding-system-alias-p', `coding-system-aliasee',
1117 and `coding-system-canonical-name-p'.
1121 Lisp_Object real_coding_system, probe;
1123 CHECK_SYMBOL (alias);
1125 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1127 ("Symbol is the canonical name of a coding system and cannot be redefined",
1132 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1133 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1134 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1136 Fremhash (alias, Vcoding_system_hash_table);
1138 /* Undefine subsidiary aliases,
1139 presumably created by a previous call to this function */
1140 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1141 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1142 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1144 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1145 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1146 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1149 /* Undefine dangling coding system aliases. */
1151 int dangling_aliases;
1154 dangling_aliases = 0;
1155 elisp_map_remhash (dangling_coding_system_alias_p,
1156 Vcoding_system_hash_table,
1158 } while (dangling_aliases > 0);
1164 if (CODING_SYSTEMP (aliasee))
1165 aliasee = XCODING_SYSTEM_NAME (aliasee);
1167 /* Checks that aliasee names a coding-system */
1168 real_coding_system = Fget_coding_system (aliasee);
1170 /* Check for coding system alias loops */
1171 if (EQ (alias, aliasee))
1172 alias_loop: signal_simple_error_2
1173 ("Attempt to create a coding system alias loop", alias, aliasee);
1175 for (probe = aliasee;
1177 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1179 if (EQ (probe, alias))
1183 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1185 /* Set up aliases for subsidiaries.
1186 #### There must be a better way to handle subsidiary coding systems. */
1188 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1190 for (i = 0; i < countof (suffixes); i++)
1192 Lisp_Object alias_subsidiary =
1193 append_suffix_to_symbol (alias, suffixes[i]);
1194 Lisp_Object aliasee_subsidiary =
1195 append_suffix_to_symbol (aliasee, suffixes[i]);
1197 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1198 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1201 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1202 but it doesn't look intentional, so I'd rather return something
1203 meaningful or nothing at all. */
1208 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1210 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1211 Lisp_Object new_coding_system;
1213 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1214 return coding_system;
1218 case EOL_AUTODETECT: return coding_system;
1219 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1220 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1221 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1222 default: abort (); return Qnil;
1225 return NILP (new_coding_system) ? coding_system : new_coding_system;
1228 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1229 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1231 (coding_system, eol_type))
1233 coding_system = Fget_coding_system (coding_system);
1235 return subsidiary_coding_system (coding_system,
1236 symbol_to_eol_type (eol_type));
1240 /************************************************************************/
1241 /* Coding system accessors */
1242 /************************************************************************/
1244 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1245 Return the doc string for CODING-SYSTEM.
1249 coding_system = Fget_coding_system (coding_system);
1250 return XCODING_SYSTEM_DOC_STRING (coding_system);
1253 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1254 Return the type of CODING-SYSTEM.
1258 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1261 case CODESYS_AUTODETECT: return Qundecided;
1263 case CODESYS_SHIFT_JIS: return Qshift_jis;
1264 case CODESYS_ISO2022: return Qiso2022;
1265 case CODESYS_BIG5: return Qbig5;
1266 case CODESYS_UCS4: return Qucs4;
1267 case CODESYS_UTF8: return Qutf8;
1268 case CODESYS_CCL: return Qccl;
1270 case CODESYS_NO_CONVERSION: return Qno_conversion;
1272 case CODESYS_INTERNAL: return Qinternal;
1279 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1282 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1284 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1287 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1288 Return initial charset of CODING-SYSTEM designated to GNUM.
1291 (coding_system, gnum))
1293 coding_system = Fget_coding_system (coding_system);
1296 return coding_system_charset (coding_system, XINT (gnum));
1300 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1301 Return the PROP property of CODING-SYSTEM.
1303 (coding_system, prop))
1306 enum coding_system_type type;
1308 coding_system = Fget_coding_system (coding_system);
1309 CHECK_SYMBOL (prop);
1310 type = XCODING_SYSTEM_TYPE (coding_system);
1312 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1313 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1316 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1318 case CODESYS_PROP_ALL_OK:
1321 case CODESYS_PROP_ISO2022:
1322 if (type != CODESYS_ISO2022)
1324 ("Property only valid in ISO2022 coding systems",
1328 case CODESYS_PROP_CCL:
1329 if (type != CODESYS_CCL)
1331 ("Property only valid in CCL coding systems",
1341 signal_simple_error ("Unrecognized property", prop);
1343 if (EQ (prop, Qname))
1344 return XCODING_SYSTEM_NAME (coding_system);
1345 else if (EQ (prop, Qtype))
1346 return Fcoding_system_type (coding_system);
1347 else if (EQ (prop, Qdoc_string))
1348 return XCODING_SYSTEM_DOC_STRING (coding_system);
1349 else if (EQ (prop, Qmnemonic))
1350 return XCODING_SYSTEM_MNEMONIC (coding_system);
1351 else if (EQ (prop, Qeol_type))
1352 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1353 else if (EQ (prop, Qeol_lf))
1354 return XCODING_SYSTEM_EOL_LF (coding_system);
1355 else if (EQ (prop, Qeol_crlf))
1356 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1357 else if (EQ (prop, Qeol_cr))
1358 return XCODING_SYSTEM_EOL_CR (coding_system);
1359 else if (EQ (prop, Qpost_read_conversion))
1360 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1361 else if (EQ (prop, Qpre_write_conversion))
1362 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1364 else if (type == CODESYS_ISO2022)
1366 if (EQ (prop, Qcharset_g0))
1367 return coding_system_charset (coding_system, 0);
1368 else if (EQ (prop, Qcharset_g1))
1369 return coding_system_charset (coding_system, 1);
1370 else if (EQ (prop, Qcharset_g2))
1371 return coding_system_charset (coding_system, 2);
1372 else if (EQ (prop, Qcharset_g3))
1373 return coding_system_charset (coding_system, 3);
1375 #define FORCE_CHARSET(charset_num) \
1376 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1377 (coding_system, charset_num) ? Qt : Qnil)
1379 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1380 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1381 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1382 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1384 #define LISP_BOOLEAN(prop) \
1385 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1387 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1388 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1389 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1390 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1391 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1392 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1393 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1395 else if (EQ (prop, Qinput_charset_conversion))
1397 unparse_charset_conversion_specs
1398 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1399 else if (EQ (prop, Qoutput_charset_conversion))
1401 unparse_charset_conversion_specs
1402 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1406 else if (type == CODESYS_CCL)
1408 if (EQ (prop, Qdecode))
1409 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1410 else if (EQ (prop, Qencode))
1411 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1419 return Qnil; /* not reached */
1423 /************************************************************************/
1424 /* Coding category functions */
1425 /************************************************************************/
1428 decode_coding_category (Lisp_Object symbol)
1432 CHECK_SYMBOL (symbol);
1433 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1434 if (EQ (coding_category_symbol[i], symbol))
1437 signal_simple_error ("Unrecognized coding category", symbol);
1438 return 0; /* not reached */
1441 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1442 Return a list of all recognized coding categories.
1447 Lisp_Object list = Qnil;
1449 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1450 list = Fcons (coding_category_symbol[i], list);
1454 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1455 Change the priority order of the coding categories.
1456 LIST should be list of coding categories, in descending order of
1457 priority. Unspecified coding categories will be lower in priority
1458 than all specified ones, in the same relative order they were in
1463 int category_to_priority[CODING_CATEGORY_LAST + 1];
1467 /* First generate a list that maps coding categories to priorities. */
1469 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1470 category_to_priority[i] = -1;
1472 /* Highest priority comes from the specified list. */
1474 EXTERNAL_LIST_LOOP (rest, list)
1476 int cat = decode_coding_category (XCAR (rest));
1478 if (category_to_priority[cat] >= 0)
1479 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1480 category_to_priority[cat] = i++;
1483 /* Now go through the existing categories by priority to retrieve
1484 the categories not yet specified and preserve their priority
1486 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1488 int cat = fcd->coding_category_by_priority[j];
1489 if (category_to_priority[cat] < 0)
1490 category_to_priority[cat] = i++;
1493 /* Now we need to construct the inverse of the mapping we just
1496 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1497 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1499 /* Phew! That was confusing. */
1503 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1504 Return a list of coding categories in descending order of priority.
1509 Lisp_Object list = Qnil;
1511 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1512 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1517 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1518 Change the coding system associated with a coding category.
1520 (coding_category, coding_system))
1522 int cat = decode_coding_category (coding_category);
1524 coding_system = Fget_coding_system (coding_system);
1525 fcd->coding_category_system[cat] = coding_system;
1529 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1530 Return the coding system associated with a coding category.
1534 int cat = decode_coding_category (coding_category);
1535 Lisp_Object sys = fcd->coding_category_system[cat];
1538 return XCODING_SYSTEM_NAME (sys);
1543 /************************************************************************/
1544 /* Detecting the encoding of data */
1545 /************************************************************************/
1547 struct detection_state
1549 eol_type_t eol_type;
1585 struct iso2022_decoder iso;
1587 int high_byte_count;
1588 unsigned int saw_single_shift:1;
1601 acceptable_control_char_p (int c)
1605 /* Allow and ignore control characters that you might
1606 reasonably see in a text file */
1611 case 8: /* backspace */
1612 case 11: /* vertical tab */
1613 case 12: /* form feed */
1614 case 26: /* MS-DOS C-z junk */
1615 case 31: /* '^_' -- for info */
1623 mask_has_at_most_one_bit_p (int mask)
1625 /* Perhaps the only thing useful you learn from intensive Microsoft
1626 technical interviews */
1627 return (mask & (mask - 1)) == 0;
1631 detect_eol_type (struct detection_state *st, const unsigned char *src,
1641 if (st->eol.just_saw_cr)
1643 else if (st->eol.seen_anything)
1646 else if (st->eol.just_saw_cr)
1649 st->eol.just_saw_cr = 1;
1651 st->eol.just_saw_cr = 0;
1652 st->eol.seen_anything = 1;
1655 return EOL_AUTODETECT;
1658 /* Attempt to determine the encoding and EOL type of the given text.
1659 Before calling this function for the first type, you must initialize
1660 st->eol_type as appropriate and initialize st->mask to ~0.
1662 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1665 st->mask holds the determined coding category mask, or ~0 if only
1666 ASCII has been seen so far.
1670 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1671 is present in st->mask
1672 1 == definitive answers are here for both st->eol_type and st->mask
1676 detect_coding_type (struct detection_state *st, const Extbyte *src,
1677 unsigned int n, int just_do_eol)
1681 if (st->eol_type == EOL_AUTODETECT)
1682 st->eol_type = detect_eol_type (st, src, n);
1685 return st->eol_type != EOL_AUTODETECT;
1687 if (!st->seen_non_ascii)
1689 for (; n; n--, src++)
1692 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1694 st->seen_non_ascii = 1;
1696 st->shift_jis.mask = ~0;
1700 st->iso2022.mask = ~0;
1710 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1711 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1712 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1713 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1714 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1715 st->big5.mask = detect_coding_big5 (st, src, n);
1716 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1717 st->utf8.mask = detect_coding_utf8 (st, src, n);
1718 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1719 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1722 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1723 | st->utf8.mask | st->ucs4.mask;
1726 int retval = mask_has_at_most_one_bit_p (st->mask);
1727 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1728 return retval && st->eol_type != EOL_AUTODETECT;
1733 coding_system_from_mask (int mask)
1737 /* If the file was entirely or basically ASCII, use the
1738 default value of `buffer-file-coding-system'. */
1739 Lisp_Object retval =
1740 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1743 retval = Ffind_coding_system (retval);
1747 (Qbad_variable, Qwarning,
1748 "Invalid `default-buffer-file-coding-system', set to nil");
1749 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1753 retval = Fget_coding_system (Qraw_text);
1761 mask = postprocess_iso2022_mask (mask);
1763 /* Look through the coding categories by priority and find
1764 the first one that is allowed. */
1765 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1767 cat = fcd->coding_category_by_priority[i];
1768 if ((mask & (1 << cat)) &&
1769 !NILP (fcd->coding_category_system[cat]))
1773 return fcd->coding_category_system[cat];
1775 return Fget_coding_system (Qraw_text);
1779 /* Given a seekable read stream and potential coding system and EOL type
1780 as specified, do any autodetection that is called for. If the
1781 coding system and/or EOL type are not `autodetect', they will be left
1782 alone; but this function will never return an autodetect coding system
1785 This function does not automatically fetch subsidiary coding systems;
1786 that should be unnecessary with the explicit eol-type argument. */
1788 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1791 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1792 eol_type_t *eol_type_in_out)
1794 struct detection_state decst;
1796 if (*eol_type_in_out == EOL_AUTODETECT)
1797 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1800 decst.eol_type = *eol_type_in_out;
1803 /* If autodetection is called for, do it now. */
1804 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1805 || *eol_type_in_out == EOL_AUTODETECT)
1808 Lisp_Object coding_system = Qnil;
1810 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1813 /* Look for initial "-*-"; mode line prefix */
1815 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1820 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1822 Extbyte *local_vars_beg = p + 3;
1823 /* Look for final "-*-"; mode line suffix */
1824 for (p = local_vars_beg,
1825 scan_end = buf + nread - LENGTH ("-*-");
1830 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1832 Extbyte *suffix = p;
1833 /* Look for "coding:" */
1834 for (p = local_vars_beg,
1835 scan_end = suffix - LENGTH ("coding:?");
1838 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1839 && (p == local_vars_beg
1840 || (*(p-1) == ' ' ||
1846 p += LENGTH ("coding:");
1847 while (*p == ' ' || *p == '\t') p++;
1849 /* Get coding system name */
1850 save = *suffix; *suffix = '\0';
1851 /* Characters valid in a MIME charset name (rfc 1521),
1852 and in a Lisp symbol name. */
1853 n = strspn ( (char *) p,
1854 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1855 "abcdefghijklmnopqrstuvwxyz"
1861 save = p[n]; p[n] = '\0';
1863 Ffind_coding_system (intern ((char *) p));
1873 if (NILP (coding_system))
1876 if (detect_coding_type (&decst, buf, nread,
1877 XCODING_SYSTEM_TYPE (*codesys_in_out)
1878 != CODESYS_AUTODETECT))
1880 nread = Lstream_read (stream, buf, sizeof (buf));
1886 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1887 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1890 if (detect_coding_type (&decst, buf, nread, 1))
1892 nread = Lstream_read (stream, buf, sizeof (buf));
1898 *eol_type_in_out = decst.eol_type;
1899 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1901 if (NILP (coding_system))
1902 *codesys_in_out = coding_system_from_mask (decst.mask);
1904 *codesys_in_out = coding_system;
1908 /* If we absolutely can't determine the EOL type, just assume LF. */
1909 if (*eol_type_in_out == EOL_AUTODETECT)
1910 *eol_type_in_out = EOL_LF;
1912 Lstream_rewind (stream);
1915 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1916 Detect coding system of the text in the region between START and END.
1917 Returned a list of possible coding systems ordered by priority.
1918 If only ASCII characters are found, it returns 'undecided or one of
1919 its subsidiary coding systems according to a detected end-of-line
1920 type. Optional arg BUFFER defaults to the current buffer.
1922 (start, end, buffer))
1924 Lisp_Object val = Qnil;
1925 struct buffer *buf = decode_buffer (buffer, 0);
1927 Lisp_Object instream, lb_instream;
1928 Lstream *istr, *lb_istr;
1929 struct detection_state decst;
1930 struct gcpro gcpro1, gcpro2;
1932 get_buffer_range_char (buf, start, end, &b, &e, 0);
1933 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1934 lb_istr = XLSTREAM (lb_instream);
1935 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1936 istr = XLSTREAM (instream);
1937 GCPRO2 (instream, lb_instream);
1939 decst.eol_type = EOL_AUTODETECT;
1943 unsigned char random_buffer[4096];
1944 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1948 if (detect_coding_type (&decst, random_buffer, nread, 0))
1952 if (decst.mask == ~0)
1953 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1961 decst.mask = postprocess_iso2022_mask (decst.mask);
1963 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1965 int sys = fcd->coding_category_by_priority[i];
1966 if (decst.mask & (1 << sys))
1968 Lisp_Object codesys = fcd->coding_category_system[sys];
1969 if (!NILP (codesys))
1970 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1971 val = Fcons (codesys, val);
1975 Lstream_close (istr);
1977 Lstream_delete (istr);
1978 Lstream_delete (lb_istr);
1983 /************************************************************************/
1984 /* Converting to internal Mule format ("decoding") */
1985 /************************************************************************/
1987 /* A decoding stream is a stream used for decoding text (i.e.
1988 converting from some external format to internal format).
1989 The decoding-stream object keeps track of the actual coding
1990 stream, the stream that is at the other end, and data that
1991 needs to be persistent across the lifetime of the stream. */
1993 /* Handle the EOL stuff related to just-read-in character C.
1994 EOL_TYPE is the EOL type of the coding stream.
1995 FLAGS is the current value of FLAGS in the coding stream, and may
1996 be modified by this macro. (The macro only looks at the
1997 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1998 bytes are to be written. You need to also define a local goto
1999 label "label_continue_loop" that is at the end of the main
2000 character-reading loop.
2002 If C is a CR character, then this macro handles it entirely and
2003 jumps to label_continue_loop. Otherwise, this macro does not add
2004 anything to DST, and continues normally. You should continue
2005 processing C normally after this macro. */
2007 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2011 if (eol_type == EOL_CR) \
2012 Dynarr_add (dst, '\n'); \
2013 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2014 Dynarr_add (dst, c); \
2016 flags |= CODING_STATE_CR; \
2017 goto label_continue_loop; \
2019 else if (flags & CODING_STATE_CR) \
2020 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2022 Dynarr_add (dst, '\r'); \
2023 flags &= ~CODING_STATE_CR; \
2027 /* C should be a binary character in the range 0 - 255; convert
2028 to internal format and add to Dynarr DST. */
2031 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2033 if (BYTE_ASCII_P (c)) \
2034 Dynarr_add (dst, c); \
2037 Dynarr_add (dst, (c >> 6) | 0xc0); \
2038 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2043 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2047 Dynarr_add (dst, c);
2049 else if ( c <= 0x7ff )
2051 Dynarr_add (dst, (c >> 6) | 0xc0);
2052 Dynarr_add (dst, (c & 0x3f) | 0x80);
2054 else if ( c <= 0xffff )
2056 Dynarr_add (dst, (c >> 12) | 0xe0);
2057 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2058 Dynarr_add (dst, (c & 0x3f) | 0x80);
2060 else if ( c <= 0x1fffff )
2062 Dynarr_add (dst, (c >> 18) | 0xf0);
2063 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2064 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2065 Dynarr_add (dst, (c & 0x3f) | 0x80);
2067 else if ( c <= 0x3ffffff )
2069 Dynarr_add (dst, (c >> 24) | 0xf8);
2070 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2071 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2072 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2073 Dynarr_add (dst, (c & 0x3f) | 0x80);
2077 Dynarr_add (dst, (c >> 30) | 0xfc);
2078 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2079 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2080 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2081 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2082 Dynarr_add (dst, (c & 0x3f) | 0x80);
2086 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2088 if (BYTE_ASCII_P (c)) \
2089 Dynarr_add (dst, c); \
2090 else if (BYTE_C1_P (c)) \
2092 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2093 Dynarr_add (dst, c + 0x20); \
2097 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2098 Dynarr_add (dst, c); \
2103 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2107 DECODE_ADD_BINARY_CHAR (ch, dst); \
2112 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2114 if (flags & CODING_STATE_END) \
2116 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2117 if (flags & CODING_STATE_CR) \
2118 Dynarr_add (dst, '\r'); \
2122 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2124 struct decoding_stream
2126 /* Coding system that governs the conversion. */
2127 Lisp_Coding_System *codesys;
2129 /* Stream that we read the encoded data from or
2130 write the decoded data to. */
2133 /* If we are reading, then we can return only a fixed amount of
2134 data, so if the conversion resulted in too much data, we store it
2135 here for retrieval the next time around. */
2136 unsigned_char_dynarr *runoff;
2138 /* FLAGS holds flags indicating the current state of the decoding.
2139 Some of these flags are dependent on the coding system. */
2142 /* CH holds a partially built-up character. Since we only deal
2143 with one- and two-byte characters at the moment, we only use
2144 this to store the first byte of a two-byte character. */
2147 /* EOL_TYPE specifies the type of end-of-line conversion that
2148 currently applies. We need to keep this separate from the
2149 EOL type stored in CODESYS because the latter might indicate
2150 automatic EOL-type detection while the former will always
2151 indicate a particular EOL type. */
2152 eol_type_t eol_type;
2154 /* Additional ISO2022 information. We define the structure above
2155 because it's also needed by the detection routines. */
2156 struct iso2022_decoder iso2022;
2158 /* Additional information (the state of the running CCL program)
2159 used by the CCL decoder. */
2160 struct ccl_program ccl;
2162 /* counter for UTF-8 or UCS-4 */
2163 unsigned char counter;
2165 struct detection_state decst;
2168 static ssize_t decoding_reader (Lstream *stream,
2169 unsigned char *data, size_t size);
2170 static ssize_t decoding_writer (Lstream *stream,
2171 const unsigned char *data, size_t size);
2172 static int decoding_rewinder (Lstream *stream);
2173 static int decoding_seekable_p (Lstream *stream);
2174 static int decoding_flusher (Lstream *stream);
2175 static int decoding_closer (Lstream *stream);
2177 static Lisp_Object decoding_marker (Lisp_Object stream);
2179 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2180 sizeof (struct decoding_stream));
2183 decoding_marker (Lisp_Object stream)
2185 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2186 Lisp_Object str_obj;
2188 /* We do not need to mark the coding systems or charsets stored
2189 within the stream because they are stored in a global list
2190 and automatically marked. */
2192 XSETLSTREAM (str_obj, str);
2193 mark_object (str_obj);
2194 if (str->imp->marker)
2195 return (str->imp->marker) (str_obj);
2200 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2201 so we read data from the other end, decode it, and store it into DATA. */
2204 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2206 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2207 unsigned char *orig_data = data;
2209 int error_occurred = 0;
2211 /* We need to interface to mule_decode(), which expects to take some
2212 amount of data and store the result into a Dynarr. We have
2213 mule_decode() store into str->runoff, and take data from there
2216 /* We loop until we have enough data, reading chunks from the other
2217 end and decoding it. */
2220 /* Take data from the runoff if we can. Make sure to take at
2221 most SIZE bytes, and delete the data from the runoff. */
2222 if (Dynarr_length (str->runoff) > 0)
2224 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2225 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2226 Dynarr_delete_many (str->runoff, 0, chunk);
2232 break; /* No more room for data */
2234 if (str->flags & CODING_STATE_END)
2235 /* This means that on the previous iteration, we hit the EOF on
2236 the other end. We loop once more so that mule_decode() can
2237 output any final stuff it may be holding, or any "go back
2238 to a sane state" escape sequences. (This latter makes sense
2239 during encoding.) */
2242 /* Exhausted the runoff, so get some more. DATA has at least
2243 SIZE bytes left of storage in it, so it's OK to read directly
2244 into it. (We'll be overwriting above, after we've decoded it
2245 into the runoff.) */
2246 read_size = Lstream_read (str->other_end, data, size);
2253 /* There might be some more end data produced in the translation.
2254 See the comment above. */
2255 str->flags |= CODING_STATE_END;
2256 mule_decode (stream, data, str->runoff, read_size);
2259 if (data - orig_data == 0)
2260 return error_occurred ? -1 : 0;
2262 return data - orig_data;
2266 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2268 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2271 /* Decode all our data into the runoff, and then attempt to write
2272 it all out to the other end. Remove whatever chunk we succeeded
2274 mule_decode (stream, data, str->runoff, size);
2275 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2276 Dynarr_length (str->runoff));
2278 Dynarr_delete_many (str->runoff, 0, retval);
2279 /* Do NOT return retval. The return value indicates how much
2280 of the incoming data was written, not how many bytes were
2286 reset_decoding_stream (struct decoding_stream *str)
2289 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2291 Lisp_Object coding_system;
2292 XSETCODING_SYSTEM (coding_system, str->codesys);
2293 reset_iso2022 (coding_system, &str->iso2022);
2295 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2297 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2301 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2302 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2305 str->decst.eol_type = EOL_AUTODETECT;
2306 str->decst.mask = ~0;
2308 str->flags = str->ch = 0;
2312 decoding_rewinder (Lstream *stream)
2314 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2315 reset_decoding_stream (str);
2316 Dynarr_reset (str->runoff);
2317 return Lstream_rewind (str->other_end);
2321 decoding_seekable_p (Lstream *stream)
2323 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2324 return Lstream_seekable_p (str->other_end);
2328 decoding_flusher (Lstream *stream)
2330 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2331 return Lstream_flush (str->other_end);
2335 decoding_closer (Lstream *stream)
2337 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2338 if (stream->flags & LSTREAM_FL_WRITE)
2340 str->flags |= CODING_STATE_END;
2341 decoding_writer (stream, 0, 0);
2343 Dynarr_free (str->runoff);
2345 #ifdef ENABLE_COMPOSITE_CHARS
2346 if (str->iso2022.composite_chars)
2347 Dynarr_free (str->iso2022.composite_chars);
2350 return Lstream_close (str->other_end);
2354 decoding_stream_coding_system (Lstream *stream)
2356 Lisp_Object coding_system;
2357 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2359 XSETCODING_SYSTEM (coding_system, str->codesys);
2360 return subsidiary_coding_system (coding_system, str->eol_type);
2364 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2366 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2367 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2369 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2370 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2371 reset_decoding_stream (str);
2374 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2375 stream for writing, no automatic code detection will be performed.
2376 The reason for this is that automatic code detection requires a
2377 seekable input. Things will also fail if you open a decoding
2378 stream for reading using a non-fully-specified coding system and
2379 a non-seekable input stream. */
2382 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2385 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2386 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2390 str->other_end = stream;
2391 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2392 str->eol_type = EOL_AUTODETECT;
2393 if (!strcmp (mode, "r")
2394 && Lstream_seekable_p (stream))
2395 /* We can determine the coding system now. */
2396 determine_real_coding_system (stream, &codesys, &str->eol_type);
2397 set_decoding_stream_coding_system (lstr, codesys);
2398 str->decst.eol_type = str->eol_type;
2399 str->decst.mask = ~0;
2400 XSETLSTREAM (obj, lstr);
2405 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2407 return make_decoding_stream_1 (stream, codesys, "r");
2411 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2413 return make_decoding_stream_1 (stream, codesys, "w");
2416 /* Note: the decode_coding_* functions all take the same
2417 arguments as mule_decode(), which is to say some SRC data of
2418 size N, which is to be stored into dynamic array DST.
2419 DECODING is the stream within which the decoding is
2420 taking place, but no data is actually read from or
2421 written to that stream; that is handled in decoding_reader()
2422 or decoding_writer(). This allows the same functions to
2423 be used for both reading and writing. */
2426 mule_decode (Lstream *decoding, const unsigned char *src,
2427 unsigned_char_dynarr *dst, unsigned int n)
2429 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2431 /* If necessary, do encoding-detection now. We do this when
2432 we're a writing stream or a non-seekable reading stream,
2433 meaning that we can't just process the whole input,
2434 rewind, and start over. */
2436 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2437 str->eol_type == EOL_AUTODETECT)
2439 Lisp_Object codesys;
2441 XSETCODING_SYSTEM (codesys, str->codesys);
2442 detect_coding_type (&str->decst, src, n,
2443 CODING_SYSTEM_TYPE (str->codesys) !=
2444 CODESYS_AUTODETECT);
2445 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2446 str->decst.mask != ~0)
2447 /* #### This is cheesy. What we really ought to do is
2448 buffer up a certain amount of data so as to get a
2449 less random result. */
2450 codesys = coding_system_from_mask (str->decst.mask);
2451 str->eol_type = str->decst.eol_type;
2452 if (XCODING_SYSTEM (codesys) != str->codesys)
2454 /* Preserve the CODING_STATE_END flag in case it was set.
2455 If we erase it, bad things might happen. */
2456 int was_end = str->flags & CODING_STATE_END;
2457 set_decoding_stream_coding_system (decoding, codesys);
2459 str->flags |= CODING_STATE_END;
2463 switch (CODING_SYSTEM_TYPE (str->codesys))
2466 case CODESYS_INTERNAL:
2467 Dynarr_add_many (dst, src, n);
2470 case CODESYS_AUTODETECT:
2471 /* If we got this far and still haven't decided on the coding
2472 system, then do no conversion. */
2473 case CODESYS_NO_CONVERSION:
2474 decode_coding_no_conversion (decoding, src, dst, n);
2477 case CODESYS_SHIFT_JIS:
2478 decode_coding_sjis (decoding, src, dst, n);
2481 decode_coding_big5 (decoding, src, dst, n);
2484 decode_coding_ucs4 (decoding, src, dst, n);
2487 decode_coding_utf8 (decoding, src, dst, n);
2490 str->ccl.last_block = str->flags & CODING_STATE_END;
2491 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2493 case CODESYS_ISO2022:
2494 decode_coding_iso2022 (decoding, src, dst, n);
2502 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2503 Decode the text between START and END which is encoded in CODING-SYSTEM.
2504 This is useful if you've read in encoded text from a file without decoding
2505 it (e.g. you read in a JIS-formatted file but used the `binary' or
2506 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2507 Return length of decoded text.
2508 BUFFER defaults to the current buffer if unspecified.
2510 (start, end, coding_system, buffer))
2513 struct buffer *buf = decode_buffer (buffer, 0);
2514 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2515 Lstream *istr, *ostr;
2516 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2518 get_buffer_range_char (buf, start, end, &b, &e, 0);
2520 barf_if_buffer_read_only (buf, b, e);
2522 coding_system = Fget_coding_system (coding_system);
2523 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2524 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2525 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2527 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2528 Fget_coding_system (Qbinary));
2529 istr = XLSTREAM (instream);
2530 ostr = XLSTREAM (outstream);
2531 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2533 /* The chain of streams looks like this:
2535 [BUFFER] <----- send through
2536 ------> [ENCODE AS BINARY]
2537 ------> [DECODE AS SPECIFIED]
2543 char tempbuf[1024]; /* some random amount */
2544 Bufpos newpos, even_newer_pos;
2545 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2546 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2550 newpos = lisp_buffer_stream_startpos (istr);
2551 Lstream_write (ostr, tempbuf, size_in_bytes);
2552 even_newer_pos = lisp_buffer_stream_startpos (istr);
2553 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2556 Lstream_close (istr);
2557 Lstream_close (ostr);
2559 Lstream_delete (istr);
2560 Lstream_delete (ostr);
2561 Lstream_delete (XLSTREAM (de_outstream));
2562 Lstream_delete (XLSTREAM (lb_outstream));
2567 /************************************************************************/
2568 /* Converting to an external encoding ("encoding") */
2569 /************************************************************************/
2571 /* An encoding stream is an output stream. When you create the
2572 stream, you specify the coding system that governs the encoding
2573 and another stream that the resulting encoded data is to be
2574 sent to, and then start sending data to it. */
2576 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2578 struct encoding_stream
2580 /* Coding system that governs the conversion. */
2581 Lisp_Coding_System *codesys;
2583 /* Stream that we read the encoded data from or
2584 write the decoded data to. */
2587 /* If we are reading, then we can return only a fixed amount of
2588 data, so if the conversion resulted in too much data, we store it
2589 here for retrieval the next time around. */
2590 unsigned_char_dynarr *runoff;
2592 /* FLAGS holds flags indicating the current state of the encoding.
2593 Some of these flags are dependent on the coding system. */
2596 /* CH holds a partially built-up character. Since we only deal
2597 with one- and two-byte characters at the moment, we only use
2598 this to store the first byte of a two-byte character. */
2601 /* Additional information used by the ISO2022 encoder. */
2604 /* CHARSET holds the character sets currently assigned to the G0
2605 through G3 registers. It is initialized from the array
2606 INITIAL_CHARSET in CODESYS. */
2607 Lisp_Object charset[4];
2609 /* Which registers are currently invoked into the left (GL) and
2610 right (GR) halves of the 8-bit encoding space? */
2611 int register_left, register_right;
2613 /* Whether we need to explicitly designate the charset in the
2614 G? register before using it. It is initialized from the
2615 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2616 unsigned char force_charset_on_output[4];
2618 /* Other state variables that need to be preserved across
2620 Lisp_Object current_charset;
2622 int current_char_boundary;
2625 /* Additional information (the state of the running CCL program)
2626 used by the CCL encoder. */
2627 struct ccl_program ccl;
2631 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2632 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2634 static int encoding_rewinder (Lstream *stream);
2635 static int encoding_seekable_p (Lstream *stream);
2636 static int encoding_flusher (Lstream *stream);
2637 static int encoding_closer (Lstream *stream);
2639 static Lisp_Object encoding_marker (Lisp_Object stream);
2641 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2642 sizeof (struct encoding_stream));
2645 encoding_marker (Lisp_Object stream)
2647 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2648 Lisp_Object str_obj;
2650 /* We do not need to mark the coding systems or charsets stored
2651 within the stream because they are stored in a global list
2652 and automatically marked. */
2654 XSETLSTREAM (str_obj, str);
2655 mark_object (str_obj);
2656 if (str->imp->marker)
2657 return (str->imp->marker) (str_obj);
2662 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2663 so we read data from the other end, encode it, and store it into DATA. */
2666 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2668 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2669 unsigned char *orig_data = data;
2671 int error_occurred = 0;
2673 /* We need to interface to mule_encode(), which expects to take some
2674 amount of data and store the result into a Dynarr. We have
2675 mule_encode() store into str->runoff, and take data from there
2678 /* We loop until we have enough data, reading chunks from the other
2679 end and encoding it. */
2682 /* Take data from the runoff if we can. Make sure to take at
2683 most SIZE bytes, and delete the data from the runoff. */
2684 if (Dynarr_length (str->runoff) > 0)
2686 int chunk = min ((int) size, Dynarr_length (str->runoff));
2687 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2688 Dynarr_delete_many (str->runoff, 0, chunk);
2694 break; /* No more room for data */
2696 if (str->flags & CODING_STATE_END)
2697 /* This means that on the previous iteration, we hit the EOF on
2698 the other end. We loop once more so that mule_encode() can
2699 output any final stuff it may be holding, or any "go back
2700 to a sane state" escape sequences. (This latter makes sense
2701 during encoding.) */
2704 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2705 left of storage in it, so it's OK to read directly into it.
2706 (We'll be overwriting above, after we've encoded it into the
2708 read_size = Lstream_read (str->other_end, data, size);
2715 /* There might be some more end data produced in the translation.
2716 See the comment above. */
2717 str->flags |= CODING_STATE_END;
2718 mule_encode (stream, data, str->runoff, read_size);
2721 if (data == orig_data)
2722 return error_occurred ? -1 : 0;
2724 return data - orig_data;
2728 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2730 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2733 /* Encode all our data into the runoff, and then attempt to write
2734 it all out to the other end. Remove whatever chunk we succeeded
2736 mule_encode (stream, data, str->runoff, size);
2737 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2738 Dynarr_length (str->runoff));
2740 Dynarr_delete_many (str->runoff, 0, retval);
2741 /* Do NOT return retval. The return value indicates how much
2742 of the incoming data was written, not how many bytes were
2748 reset_encoding_stream (struct encoding_stream *str)
2751 switch (CODING_SYSTEM_TYPE (str->codesys))
2753 case CODESYS_ISO2022:
2757 for (i = 0; i < 4; i++)
2759 str->iso2022.charset[i] =
2760 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2761 str->iso2022.force_charset_on_output[i] =
2762 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2764 str->iso2022.register_left = 0;
2765 str->iso2022.register_right = 1;
2766 str->iso2022.current_charset = Qnil;
2767 str->iso2022.current_half = 0;
2769 str->iso2022.current_char_boundary = 0;
2771 str->iso2022.current_char_boundary = 1;
2776 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2783 str->flags = str->ch = 0;
2787 encoding_rewinder (Lstream *stream)
2789 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2790 reset_encoding_stream (str);
2791 Dynarr_reset (str->runoff);
2792 return Lstream_rewind (str->other_end);
2796 encoding_seekable_p (Lstream *stream)
2798 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2799 return Lstream_seekable_p (str->other_end);
2803 encoding_flusher (Lstream *stream)
2805 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2806 return Lstream_flush (str->other_end);
2810 encoding_closer (Lstream *stream)
2812 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2813 if (stream->flags & LSTREAM_FL_WRITE)
2815 str->flags |= CODING_STATE_END;
2816 encoding_writer (stream, 0, 0);
2818 Dynarr_free (str->runoff);
2819 return Lstream_close (str->other_end);
2823 encoding_stream_coding_system (Lstream *stream)
2825 Lisp_Object coding_system;
2826 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2828 XSETCODING_SYSTEM (coding_system, str->codesys);
2829 return coding_system;
2833 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2835 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2836 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2838 reset_encoding_stream (str);
2842 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2845 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2846 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2850 str->runoff = Dynarr_new (unsigned_char);
2851 str->other_end = stream;
2852 set_encoding_stream_coding_system (lstr, codesys);
2853 XSETLSTREAM (obj, lstr);
2858 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2860 return make_encoding_stream_1 (stream, codesys, "r");
2864 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2866 return make_encoding_stream_1 (stream, codesys, "w");
2869 /* Convert N bytes of internally-formatted data stored in SRC to an
2870 external format, according to the encoding stream ENCODING.
2871 Store the encoded data into DST. */
2874 mule_encode (Lstream *encoding, const unsigned char *src,
2875 unsigned_char_dynarr *dst, unsigned int n)
2877 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2879 switch (CODING_SYSTEM_TYPE (str->codesys))
2882 case CODESYS_INTERNAL:
2883 Dynarr_add_many (dst, src, n);
2886 case CODESYS_AUTODETECT:
2887 /* If we got this far and still haven't decided on the coding
2888 system, then do no conversion. */
2889 case CODESYS_NO_CONVERSION:
2890 encode_coding_no_conversion (encoding, src, dst, n);
2893 case CODESYS_SHIFT_JIS:
2894 encode_coding_sjis (encoding, src, dst, n);
2897 encode_coding_big5 (encoding, src, dst, n);
2900 encode_coding_ucs4 (encoding, src, dst, n);
2903 encode_coding_utf8 (encoding, src, dst, n);
2906 str->ccl.last_block = str->flags & CODING_STATE_END;
2907 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2909 case CODESYS_ISO2022:
2910 encode_coding_iso2022 (encoding, src, dst, n);
2918 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2919 Encode the text between START and END using CODING-SYSTEM.
2920 This will, for example, convert Japanese characters into stuff such as
2921 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2922 text. BUFFER defaults to the current buffer if unspecified.
2924 (start, end, coding_system, buffer))
2927 struct buffer *buf = decode_buffer (buffer, 0);
2928 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2929 Lstream *istr, *ostr;
2930 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2932 get_buffer_range_char (buf, start, end, &b, &e, 0);
2934 barf_if_buffer_read_only (buf, b, e);
2936 coding_system = Fget_coding_system (coding_system);
2937 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2938 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2939 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2940 Fget_coding_system (Qbinary));
2941 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2943 istr = XLSTREAM (instream);
2944 ostr = XLSTREAM (outstream);
2945 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2946 /* The chain of streams looks like this:
2948 [BUFFER] <----- send through
2949 ------> [ENCODE AS SPECIFIED]
2950 ------> [DECODE AS BINARY]
2955 char tempbuf[1024]; /* some random amount */
2956 Bufpos newpos, even_newer_pos;
2957 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2958 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2962 newpos = lisp_buffer_stream_startpos (istr);
2963 Lstream_write (ostr, tempbuf, size_in_bytes);
2964 even_newer_pos = lisp_buffer_stream_startpos (istr);
2965 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2971 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2972 Lstream_close (istr);
2973 Lstream_close (ostr);
2975 Lstream_delete (istr);
2976 Lstream_delete (ostr);
2977 Lstream_delete (XLSTREAM (de_outstream));
2978 Lstream_delete (XLSTREAM (lb_outstream));
2979 return make_int (retlen);
2985 /************************************************************************/
2986 /* Shift-JIS methods */
2987 /************************************************************************/
2989 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2990 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2991 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2992 encoded by "position-code + 0x80". A character of JISX0208
2993 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2994 position-codes are divided and shifted so that it fit in the range
2997 --- CODE RANGE of Shift-JIS ---
2998 (character set) (range)
3000 JISX0201-Kana 0xA0 .. 0xDF
3001 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3002 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3003 -------------------------------
3007 /* Is this the first byte of a Shift-JIS two-byte char? */
3009 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3010 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3012 /* Is this the second byte of a Shift-JIS two-byte char? */
3014 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3015 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3017 #define BYTE_SJIS_KATAKANA_P(c) \
3018 ((c) >= 0xA1 && (c) <= 0xDF)
3021 detect_coding_sjis (struct detection_state *st, const unsigned char *src,
3029 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3031 if (st->shift_jis.in_second_byte)
3033 st->shift_jis.in_second_byte = 0;
3037 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3038 st->shift_jis.in_second_byte = 1;
3040 return CODING_CATEGORY_SHIFT_JIS_MASK;
3043 /* Convert Shift-JIS data to internal format. */
3046 decode_coding_sjis (Lstream *decoding, const unsigned char *src,
3047 unsigned_char_dynarr *dst, unsigned int n)
3050 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3051 unsigned int flags = str->flags;
3052 unsigned int ch = str->ch;
3053 eol_type_t eol_type = str->eol_type;
3061 /* Previous character was first byte of Shift-JIS Kanji char. */
3062 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3064 unsigned char e1, e2;
3066 DECODE_SJIS (ch, c, e1, e2);
3068 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3072 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3073 Dynarr_add (dst, e1);
3074 Dynarr_add (dst, e2);
3079 DECODE_ADD_BINARY_CHAR (ch, dst);
3080 DECODE_ADD_BINARY_CHAR (c, dst);
3086 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3087 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3089 else if (BYTE_SJIS_KATAKANA_P (c))
3092 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3095 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3096 Dynarr_add (dst, c);
3100 DECODE_ADD_BINARY_CHAR (c, dst);
3102 label_continue_loop:;
3105 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3111 /* Convert internally-formatted data to Shift-JIS. */
3114 encode_coding_sjis (Lstream *encoding, const unsigned char *src,
3115 unsigned_char_dynarr *dst, unsigned int n)
3118 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3119 unsigned int flags = str->flags;
3120 unsigned int ch = str->ch;
3121 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3123 unsigned char char_boundary = str->iso2022.current_char_boundary;
3130 switch (char_boundary)
3138 else if ( c >= 0xf8 )
3143 else if ( c >= 0xf0 )
3148 else if ( c >= 0xe0 )
3153 else if ( c >= 0xc0 )
3163 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3164 Dynarr_add (dst, '\r');
3165 if (eol_type != EOL_CR)
3166 Dynarr_add (dst, c);
3169 Dynarr_add (dst, c);
3174 ch = ( ch << 6 ) | ( c & 0x3f );
3176 Lisp_Object charset;
3177 unsigned int c1, c2, s1, s2;
3179 BREAKUP_CHAR (ch, charset, c1, c2);
3180 if (EQ(charset, Vcharset_katakana_jisx0201))
3182 Dynarr_add (dst, c1 | 0x80);
3184 else if (EQ(charset, Vcharset_japanese_jisx0208))
3186 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3187 Dynarr_add (dst, s1);
3188 Dynarr_add (dst, s2);
3194 ch = ( ch << 6 ) | ( c & 0x3f );
3200 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3201 Dynarr_add (dst, '\r');
3202 if (eol_type != EOL_CR)
3203 Dynarr_add (dst, '\n');
3206 else if (BYTE_ASCII_P (c))
3208 Dynarr_add (dst, c);
3211 else if (BUFBYTE_LEADING_BYTE_P (c))
3212 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3213 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3214 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3217 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3219 Dynarr_add (dst, c);
3222 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3223 ch == LEADING_BYTE_JAPANESE_JISX0208)
3227 unsigned char j1, j2;
3228 ENCODE_SJIS (ch, c, j1, j2);
3229 Dynarr_add (dst, j1);
3230 Dynarr_add (dst, j2);
3240 str->iso2022.current_char_boundary = char_boundary;
3244 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3245 Decode a JISX0208 character of Shift-JIS coding-system.
3246 CODE is the character code in Shift-JIS as a cons of type bytes.
3247 Return the corresponding character.
3251 unsigned char c1, c2, s1, s2;
3254 CHECK_INT (XCAR (code));
3255 CHECK_INT (XCDR (code));
3256 s1 = XINT (XCAR (code));
3257 s2 = XINT (XCDR (code));
3258 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3259 BYTE_SJIS_TWO_BYTE_2_P (s2))
3261 DECODE_SJIS (s1, s2, c1, c2);
3262 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3263 c1 & 0x7F, c2 & 0x7F));
3269 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3270 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3271 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3275 Lisp_Object charset;
3278 CHECK_CHAR_COERCE_INT (ch);
3279 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3280 if (EQ (charset, Vcharset_japanese_jisx0208))
3282 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3283 return Fcons (make_int (s1), make_int (s2));
3290 /************************************************************************/
3292 /************************************************************************/
3294 /* BIG5 is a coding system encoding two character sets: ASCII and
3295 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3296 character set and is encoded in two-byte.
3298 --- CODE RANGE of BIG5 ---
3299 (character set) (range)
3301 Big5 (1st byte) 0xA1 .. 0xFE
3302 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3303 --------------------------
3305 Since the number of characters in Big5 is larger than maximum
3306 characters in Emacs' charset (96x96), it can't be handled as one
3307 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3308 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3309 contains frequently used characters and the latter contains less
3310 frequently used characters. */
3312 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3313 ((c) >= 0xA1 && (c) <= 0xFE)
3315 /* Is this the second byte of a Shift-JIS two-byte char? */
3317 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3318 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3320 /* Number of Big5 characters which have the same code in 1st byte. */
3322 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3324 /* Code conversion macros. These are macros because they are used in
3325 inner loops during code conversion.
3327 Note that temporary variables in macros introduce the classic
3328 dynamic-scoping problems with variable names. We use capital-
3329 lettered variables in the assumption that XEmacs does not use
3330 capital letters in variables except in a very formalized way
3333 /* Convert Big5 code (b1, b2) into its internal string representation
3336 /* There is a much simpler way to split the Big5 charset into two.
3337 For the moment I'm going to leave the algorithm as-is because it
3338 claims to separate out the most-used characters into a single
3339 charset, which perhaps will lead to optimizations in various
3342 The way the algorithm works is something like this:
3344 Big5 can be viewed as a 94x157 charset, where the row is
3345 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3346 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3347 the split between low and high column numbers is apparently
3348 meaningless; ascending rows produce less and less frequent chars.
3349 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3350 the first charset, and the upper half (0xC9 .. 0xFE) to the
3351 second. To do the conversion, we convert the character into
3352 a single number where 0 .. 156 is the first row, 157 .. 313
3353 is the second, etc. That way, the characters are ordered by
3354 decreasing frequency. Then we just chop the space in two
3355 and coerce the result into a 94x94 space.
3358 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3360 int B1 = b1, B2 = b2; \
3362 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3366 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3370 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3371 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3373 c1 = I / (0xFF - 0xA1) + 0xA1; \
3374 c2 = I % (0xFF - 0xA1) + 0xA1; \
3377 /* Convert the internal string representation of a Big5 character
3378 (lb, c1, c2) into Big5 code (b1, b2). */
3380 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3382 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3384 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3386 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3388 b1 = I / BIG5_SAME_ROW + 0xA1; \
3389 b2 = I % BIG5_SAME_ROW; \
3390 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3394 detect_coding_big5 (struct detection_state *st, const unsigned char *src,
3402 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3403 (c >= 0x80 && c <= 0xA0))
3405 if (st->big5.in_second_byte)
3407 st->big5.in_second_byte = 0;
3408 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3412 st->big5.in_second_byte = 1;
3414 return CODING_CATEGORY_BIG5_MASK;
3417 /* Convert Big5 data to internal format. */
3420 decode_coding_big5 (Lstream *decoding, const unsigned char *src,
3421 unsigned_char_dynarr *dst, unsigned int n)
3424 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3425 unsigned int flags = str->flags;
3426 unsigned int ch = str->ch;
3427 eol_type_t eol_type = str->eol_type;
3434 /* Previous character was first byte of Big5 char. */
3435 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3437 unsigned char b1, b2, b3;
3438 DECODE_BIG5 (ch, c, b1, b2, b3);
3439 Dynarr_add (dst, b1);
3440 Dynarr_add (dst, b2);
3441 Dynarr_add (dst, b3);
3445 DECODE_ADD_BINARY_CHAR (ch, dst);
3446 DECODE_ADD_BINARY_CHAR (c, dst);
3452 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3453 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3456 DECODE_ADD_BINARY_CHAR (c, dst);
3458 label_continue_loop:;
3461 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3467 /* Convert internally-formatted data to Big5. */
3470 encode_coding_big5 (Lstream *encoding, const unsigned char *src,
3471 unsigned_char_dynarr *dst, unsigned int n)
3475 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3476 unsigned int flags = str->flags;
3477 unsigned int ch = str->ch;
3478 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3485 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3486 Dynarr_add (dst, '\r');
3487 if (eol_type != EOL_CR)
3488 Dynarr_add (dst, '\n');
3490 else if (BYTE_ASCII_P (c))
3493 Dynarr_add (dst, c);
3495 else if (BUFBYTE_LEADING_BYTE_P (c))
3497 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3498 c == LEADING_BYTE_CHINESE_BIG5_2)
3500 /* A recognized leading byte. */
3502 continue; /* not done with this character. */
3504 /* otherwise just ignore this character. */
3506 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3507 ch == LEADING_BYTE_CHINESE_BIG5_2)
3509 /* Previous char was a recognized leading byte. */
3511 continue; /* not done with this character. */
3515 /* Encountering second byte of a Big5 character. */
3516 unsigned char b1, b2;
3518 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3519 Dynarr_add (dst, b1);
3520 Dynarr_add (dst, b2);
3532 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3533 Decode a Big5 character CODE of BIG5 coding-system.
3534 CODE is the character code in BIG5, a cons of two integers.
3535 Return the corresponding character.
3539 unsigned char c1, c2, b1, b2;
3542 CHECK_INT (XCAR (code));
3543 CHECK_INT (XCDR (code));
3544 b1 = XINT (XCAR (code));
3545 b2 = XINT (XCDR (code));
3546 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3547 BYTE_BIG5_TWO_BYTE_2_P (b2))
3549 Charset_ID leading_byte;
3550 Lisp_Object charset;
3551 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3552 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3553 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3559 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3560 Encode the Big5 character CH to BIG5 coding-system.
3561 Return the corresponding character code in Big5.
3565 Lisp_Object charset;
3568 CHECK_CHAR_COERCE_INT (ch);
3569 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3570 if (EQ (charset, Vcharset_chinese_big5_1) ||
3571 EQ (charset, Vcharset_chinese_big5_2))
3573 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3575 return Fcons (make_int (b1), make_int (b2));
3582 /************************************************************************/
3585 /* UCS-4 character codes are implemented as nonnegative integers. */
3587 /************************************************************************/
3590 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3591 Map UCS-4 code CODE to Mule character CHARACTER.
3593 Return T on success, NIL on failure.
3599 CHECK_CHAR (character);
3600 CHECK_NATNUM (code);
3603 if (c < countof (fcd->ucs_to_mule_table))
3605 fcd->ucs_to_mule_table[c] = character;
3613 ucs_to_char (unsigned long code)
3615 if (code < countof (fcd->ucs_to_mule_table))
3617 return fcd->ucs_to_mule_table[code];
3619 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3624 c = code % (94 * 94);
3626 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3627 (94, 2, code / (94 * 94) + '@',
3628 CHARSET_LEFT_TO_RIGHT),
3629 c / 94 + 33, c % 94 + 33));
3635 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3636 Return Mule character corresponding to UCS code CODE (a positive integer).
3640 CHECK_NATNUM (code);
3641 return ucs_to_char (XINT (code));
3644 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3645 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3649 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3650 Fset_char_ucs is more restrictive on index arg, but should
3651 check code arg in a char_table method. */
3652 CHECK_CHAR (character);
3653 CHECK_NATNUM (code);
3654 return Fput_char_table (character, code, mule_to_ucs_table);
3657 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3658 Return the UCS code (a positive integer) corresponding to CHARACTER.
3662 return Fget_char_table (character, mule_to_ucs_table);
3667 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3669 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3670 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3671 is not found, instead.
3672 #### do something more appropriate (use blob?)
3673 Danger, Will Robinson! Data loss. Should we signal user? */
3675 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3677 Lisp_Object chr = ucs_to_char (ch);
3681 Bufbyte work[MAX_EMCHAR_LEN];
3686 simple_set_charptr_emchar (work, ch) :
3687 non_ascii_set_charptr_emchar (work, ch);
3688 Dynarr_add_many (dst, work, len);
3692 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3693 Dynarr_add (dst, 34 + 128);
3694 Dynarr_add (dst, 46 + 128);
3700 static unsigned long
3701 mule_char_to_ucs4 (Lisp_Object charset,
3702 unsigned char h, unsigned char l)
3705 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3712 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3713 (XCHARSET_CHARS (charset) == 94) )
3715 unsigned char final = XCHARSET_FINAL (charset);
3717 if ( ('@' <= final) && (final < 0x7f) )
3719 return 0xe00000 + (final - '@') * 94 * 94
3720 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3734 encode_ucs4 (Lisp_Object charset,
3735 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3737 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3738 Dynarr_add (dst, code >> 24);
3739 Dynarr_add (dst, (code >> 16) & 255);
3740 Dynarr_add (dst, (code >> 8) & 255);
3741 Dynarr_add (dst, code & 255);
3746 detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
3752 switch (st->ucs4.in_byte)
3761 st->ucs4.in_byte = 0;
3767 return CODING_CATEGORY_UCS4_MASK;
3771 decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
3772 unsigned_char_dynarr *dst, unsigned int n)
3774 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3775 unsigned int flags = str->flags;
3776 unsigned int ch = str->ch;
3777 unsigned char counter = str->counter;
3781 unsigned char c = *src++;
3789 decode_ucs4 ( ( ch << 8 ) | c, dst);
3794 ch = ( ch << 8 ) | c;
3798 if (counter & CODING_STATE_END)
3799 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3803 str->counter = counter;
3807 encode_coding_ucs4 (Lstream *encoding, const unsigned char *src,
3808 unsigned_char_dynarr *dst, unsigned int n)
3811 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3812 unsigned int flags = str->flags;
3813 unsigned int ch = str->ch;
3814 unsigned char char_boundary = str->iso2022.current_char_boundary;
3815 Lisp_Object charset = str->iso2022.current_charset;
3817 #ifdef ENABLE_COMPOSITE_CHARS
3818 /* flags for handling composite chars. We do a little switcharoo
3819 on the source while we're outputting the composite char. */
3820 unsigned int saved_n = 0;
3821 const unsigned char *saved_src = NULL;
3822 int in_composite = 0;
3829 unsigned char c = *src++;
3831 if (BYTE_ASCII_P (c))
3832 { /* Processing ASCII character */
3834 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3837 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3838 { /* Processing Leading Byte */
3840 charset = CHARSET_BY_LEADING_BYTE (c);
3841 if (LEADING_BYTE_PREFIX_P(c))
3846 { /* Processing Non-ASCII character */
3848 if (EQ (charset, Vcharset_control_1))
3850 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3854 switch (XCHARSET_REP_BYTES (charset))
3857 encode_ucs4 (charset, c, 0, dst);
3860 if (XCHARSET_PRIVATE_P (charset))
3862 encode_ucs4 (charset, c, 0, dst);
3867 #ifdef ENABLE_COMPOSITE_CHARS
3868 if (EQ (charset, Vcharset_composite))
3872 /* #### Bother! We don't know how to
3874 Dynarr_add (dst, 0);
3875 Dynarr_add (dst, 0);
3876 Dynarr_add (dst, 0);
3877 Dynarr_add (dst, '~');
3881 Emchar emch = MAKE_CHAR (Vcharset_composite,
3882 ch & 0x7F, c & 0x7F);
3883 Lisp_Object lstr = composite_char_string (emch);
3887 src = XSTRING_DATA (lstr);
3888 n = XSTRING_LENGTH (lstr);
3892 #endif /* ENABLE_COMPOSITE_CHARS */
3894 encode_ucs4(charset, ch, c, dst);
3907 encode_ucs4 (charset, ch, c, dst);
3923 #ifdef ENABLE_COMPOSITE_CHARS
3929 goto back_to_square_n; /* Wheeeeeeeee ..... */
3931 #endif /* ENABLE_COMPOSITE_CHARS */
3935 str->iso2022.current_char_boundary = char_boundary;
3936 str->iso2022.current_charset = charset;
3938 /* Verbum caro factum est! */
3943 /************************************************************************/
3945 /************************************************************************/
3948 detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
3953 unsigned char c = *src++;
3954 switch (st->utf8.in_byte)
3957 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3960 st->utf8.in_byte = 5;
3962 st->utf8.in_byte = 4;
3964 st->utf8.in_byte = 3;
3966 st->utf8.in_byte = 2;
3968 st->utf8.in_byte = 1;
3973 if ((c & 0xc0) != 0x80)
3979 return CODING_CATEGORY_UTF8_MASK;
3983 decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
3984 unsigned_char_dynarr *dst, unsigned int n)
3986 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3987 unsigned int flags = str->flags;
3988 unsigned int ch = str->ch;
3989 eol_type_t eol_type = str->eol_type;
3990 unsigned char counter = str->counter;
3994 unsigned char c = *src++;
4003 else if ( c >= 0xf8 )
4008 else if ( c >= 0xf0 )
4013 else if ( c >= 0xe0 )
4018 else if ( c >= 0xc0 )
4025 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4026 decode_ucs4 (c, dst);
4030 ch = ( ch << 6 ) | ( c & 0x3f );
4031 decode_ucs4 (ch, dst);
4036 ch = ( ch << 6 ) | ( c & 0x3f );
4039 label_continue_loop:;
4042 if (flags & CODING_STATE_END)
4043 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4047 str->counter = counter;
4052 encode_utf8 (Lisp_Object charset,
4053 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
4055 unsigned long code = mule_char_to_ucs4 (charset, h, l);
4058 Dynarr_add (dst, code);
4060 else if ( code <= 0x7ff )
4062 Dynarr_add (dst, (code >> 6) | 0xc0);
4063 Dynarr_add (dst, (code & 0x3f) | 0x80);
4065 else if ( code <= 0xffff )
4067 Dynarr_add (dst, (code >> 12) | 0xe0);
4068 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4069 Dynarr_add (dst, (code & 0x3f) | 0x80);
4071 else if ( code <= 0x1fffff )
4073 Dynarr_add (dst, (code >> 18) | 0xf0);
4074 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4075 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4076 Dynarr_add (dst, (code & 0x3f) | 0x80);
4078 else if ( code <= 0x3ffffff )
4080 Dynarr_add (dst, (code >> 24) | 0xf8);
4081 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4082 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4083 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4084 Dynarr_add (dst, (code & 0x3f) | 0x80);
4088 Dynarr_add (dst, (code >> 30) | 0xfc);
4089 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
4090 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4091 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4092 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4093 Dynarr_add (dst, (code & 0x3f) | 0x80);
4099 encode_coding_utf8 (Lstream *encoding, const unsigned char *src,
4100 unsigned_char_dynarr *dst, unsigned int n)
4102 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4103 unsigned int flags = str->flags;
4104 unsigned int ch = str->ch;
4105 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4106 unsigned char char_boundary = str->iso2022.current_char_boundary;
4111 unsigned char c = *src++;
4112 switch (char_boundary)
4117 Dynarr_add (dst, c);
4120 else if ( c >= 0xf8 )
4122 Dynarr_add (dst, c);
4125 else if ( c >= 0xf0 )
4127 Dynarr_add (dst, c);
4130 else if ( c >= 0xe0 )
4132 Dynarr_add (dst, c);
4135 else if ( c >= 0xc0 )
4137 Dynarr_add (dst, c);
4144 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4145 Dynarr_add (dst, '\r');
4146 if (eol_type != EOL_CR)
4147 Dynarr_add (dst, c);
4150 Dynarr_add (dst, c);
4155 Dynarr_add (dst, c);
4159 Dynarr_add (dst, c);
4163 #else /* not UTF2000 */
4164 Lisp_Object charset = str->iso2022.current_charset;
4166 #ifdef ENABLE_COMPOSITE_CHARS
4167 /* flags for handling composite chars. We do a little switcharoo
4168 on the source while we're outputting the composite char. */
4169 unsigned int saved_n = 0;
4170 const unsigned char *saved_src = NULL;
4171 int in_composite = 0;
4174 #endif /* ENABLE_COMPOSITE_CHARS */
4178 unsigned char c = *src++;
4180 if (BYTE_ASCII_P (c))
4181 { /* Processing ASCII character */
4185 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4186 Dynarr_add (dst, '\r');
4187 if (eol_type != EOL_CR)
4188 Dynarr_add (dst, c);
4191 encode_utf8 (Vcharset_ascii, c, 0, dst);
4194 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4195 { /* Processing Leading Byte */
4197 charset = CHARSET_BY_LEADING_BYTE (c);
4198 if (LEADING_BYTE_PREFIX_P(c))
4203 { /* Processing Non-ASCII character */
4205 if (EQ (charset, Vcharset_control_1))
4207 encode_utf8 (Vcharset_control_1, c, 0, dst);
4211 switch (XCHARSET_REP_BYTES (charset))
4214 encode_utf8 (charset, c, 0, dst);
4217 if (XCHARSET_PRIVATE_P (charset))
4219 encode_utf8 (charset, c, 0, dst);
4224 #ifdef ENABLE_COMPOSITE_CHARS
4225 if (EQ (charset, Vcharset_composite))
4229 /* #### Bother! We don't know how to
4231 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4235 Emchar emch = MAKE_CHAR (Vcharset_composite,
4236 ch & 0x7F, c & 0x7F);
4237 Lisp_Object lstr = composite_char_string (emch);
4241 src = XSTRING_DATA (lstr);
4242 n = XSTRING_LENGTH (lstr);
4246 #endif /* ENABLE_COMPOSITE_CHARS */
4248 encode_utf8 (charset, ch, c, dst);
4261 encode_utf8 (charset, ch, c, dst);
4277 #ifdef ENABLE_COMPOSITE_CHARS
4283 goto back_to_square_n; /* Wheeeeeeeee ..... */
4287 #endif /* not UTF2000 */
4290 str->iso2022.current_char_boundary = char_boundary;
4292 str->iso2022.current_charset = charset;
4295 /* Verbum caro factum est! */
4299 /************************************************************************/
4300 /* ISO2022 methods */
4301 /************************************************************************/
4303 /* The following note describes the coding system ISO2022 briefly.
4304 Since the intention of this note is to help understand the
4305 functions in this file, some parts are NOT ACCURATE or OVERLY
4306 SIMPLIFIED. For thorough understanding, please refer to the
4307 original document of ISO2022.
4309 ISO2022 provides many mechanisms to encode several character sets
4310 in 7-bit and 8-bit environments. For 7-bit environments, all text
4311 is encoded using bytes less than 128. This may make the encoded
4312 text a little bit longer, but the text passes more easily through
4313 several gateways, some of which strip off MSB (Most Signigant Bit).
4315 There are two kinds of character sets: control character set and
4316 graphic character set. The former contains control characters such
4317 as `newline' and `escape' to provide control functions (control
4318 functions are also provided by escape sequences). The latter
4319 contains graphic characters such as 'A' and '-'. Emacs recognizes
4320 two control character sets and many graphic character sets.
4322 Graphic character sets are classified into one of the following
4323 four classes, according to the number of bytes (DIMENSION) and
4324 number of characters in one dimension (CHARS) of the set:
4325 - DIMENSION1_CHARS94
4326 - DIMENSION1_CHARS96
4327 - DIMENSION2_CHARS94
4328 - DIMENSION2_CHARS96
4330 In addition, each character set is assigned an identification tag,
4331 unique for each set, called "final character" (denoted as <F>
4332 hereafter). The <F> of each character set is decided by ECMA(*)
4333 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4334 (0x30..0x3F are for private use only).
4336 Note (*): ECMA = European Computer Manufacturers Association
4338 Here are examples of graphic character set [NAME(<F>)]:
4339 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4340 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4341 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4342 o DIMENSION2_CHARS96 -- none for the moment
4344 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4345 C0 [0x00..0x1F] -- control character plane 0
4346 GL [0x20..0x7F] -- graphic character plane 0
4347 C1 [0x80..0x9F] -- control character plane 1
4348 GR [0xA0..0xFF] -- graphic character plane 1
4350 A control character set is directly designated and invoked to C0 or
4351 C1 by an escape sequence. The most common case is that:
4352 - ISO646's control character set is designated/invoked to C0, and
4353 - ISO6429's control character set is designated/invoked to C1,
4354 and usually these designations/invocations are omitted in encoded
4355 text. In a 7-bit environment, only C0 can be used, and a control
4356 character for C1 is encoded by an appropriate escape sequence to
4357 fit into the environment. All control characters for C1 are
4358 defined to have corresponding escape sequences.
4360 A graphic character set is at first designated to one of four
4361 graphic registers (G0 through G3), then these graphic registers are
4362 invoked to GL or GR. These designations and invocations can be
4363 done independently. The most common case is that G0 is invoked to
4364 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4365 these invocations and designations are omitted in encoded text.
4366 In a 7-bit environment, only GL can be used.
4368 When a graphic character set of CHARS94 is invoked to GL, codes
4369 0x20 and 0x7F of the GL area work as control characters SPACE and
4370 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4373 There are two ways of invocation: locking-shift and single-shift.
4374 With locking-shift, the invocation lasts until the next different
4375 invocation, whereas with single-shift, the invocation affects the
4376 following character only and doesn't affect the locking-shift
4377 state. Invocations are done by the following control characters or
4380 ----------------------------------------------------------------------
4381 abbrev function cntrl escape seq description
4382 ----------------------------------------------------------------------
4383 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4384 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4385 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4386 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4387 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4388 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4389 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4390 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4391 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4392 ----------------------------------------------------------------------
4393 (*) These are not used by any known coding system.
4395 Control characters for these functions are defined by macros
4396 ISO_CODE_XXX in `coding.h'.
4398 Designations are done by the following escape sequences:
4399 ----------------------------------------------------------------------
4400 escape sequence description
4401 ----------------------------------------------------------------------
4402 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4403 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4404 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4405 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4406 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4407 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4408 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4409 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4410 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4411 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4412 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4413 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4414 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4415 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4416 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4417 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4418 ----------------------------------------------------------------------
4420 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4421 of dimension 1, chars 94, and final character <F>, etc...
4423 Note (*): Although these designations are not allowed in ISO2022,
4424 Emacs accepts them on decoding, and produces them on encoding
4425 CHARS96 character sets in a coding system which is characterized as
4426 7-bit environment, non-locking-shift, and non-single-shift.
4428 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4429 '(' can be omitted. We refer to this as "short-form" hereafter.
4431 Now you may notice that there are a lot of ways for encoding the
4432 same multilingual text in ISO2022. Actually, there exist many
4433 coding systems such as Compound Text (used in X11's inter client
4434 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4435 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4436 localized platforms), and all of these are variants of ISO2022.
4438 In addition to the above, Emacs handles two more kinds of escape
4439 sequences: ISO6429's direction specification and Emacs' private
4440 sequence for specifying character composition.
4442 ISO6429's direction specification takes the following form:
4443 o CSI ']' -- end of the current direction
4444 o CSI '0' ']' -- end of the current direction
4445 o CSI '1' ']' -- start of left-to-right text
4446 o CSI '2' ']' -- start of right-to-left text
4447 The control character CSI (0x9B: control sequence introducer) is
4448 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4450 Character composition specification takes the following form:
4451 o ESC '0' -- start character composition
4452 o ESC '1' -- end character composition
4453 Since these are not standard escape sequences of any ISO standard,
4454 their use with these meanings is restricted to Emacs only. */
4457 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4461 for (i = 0; i < 4; i++)
4463 if (!NILP (coding_system))
4465 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4467 iso->charset[i] = Qt;
4468 iso->invalid_designated[i] = 0;
4470 iso->esc = ISO_ESC_NOTHING;
4471 iso->esc_bytes_index = 0;
4472 iso->register_left = 0;
4473 iso->register_right = 1;
4474 iso->switched_dir_and_no_valid_charset_yet = 0;
4475 iso->invalid_switch_dir = 0;
4476 iso->output_direction_sequence = 0;
4477 iso->output_literally = 0;
4478 #ifdef ENABLE_COMPOSITE_CHARS
4479 if (iso->composite_chars)
4480 Dynarr_reset (iso->composite_chars);
4485 fit_to_be_escape_quoted (unsigned char c)
4502 /* Parse one byte of an ISO2022 escape sequence.
4503 If the result is an invalid escape sequence, return 0 and
4504 do not change anything in STR. Otherwise, if the result is
4505 an incomplete escape sequence, update ISO2022.ESC and
4506 ISO2022.ESC_BYTES and return -1. Otherwise, update
4507 all the state variables (but not ISO2022.ESC_BYTES) and
4510 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4511 or invocation of an invalid character set and treat that as
4512 an unrecognized escape sequence. */
4515 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4516 unsigned char c, unsigned int *flags,
4517 int check_invalid_charsets)
4519 /* (1) If we're at the end of a designation sequence, CS is the
4520 charset being designated and REG is the register to designate
4523 (2) If we're at the end of a locking-shift sequence, REG is
4524 the register to invoke and HALF (0 == left, 1 == right) is
4525 the half to invoke it into.
4527 (3) If we're at the end of a single-shift sequence, REG is
4528 the register to invoke. */
4529 Lisp_Object cs = Qnil;
4532 /* NOTE: This code does goto's all over the fucking place.
4533 The reason for this is that we're basically implementing
4534 a state machine here, and hierarchical languages like C
4535 don't really provide a clean way of doing this. */
4537 if (! (*flags & CODING_STATE_ESCAPE))
4538 /* At beginning of escape sequence; we need to reset our
4539 escape-state variables. */
4540 iso->esc = ISO_ESC_NOTHING;
4542 iso->output_literally = 0;
4543 iso->output_direction_sequence = 0;
4547 case ISO_ESC_NOTHING:
4548 iso->esc_bytes_index = 0;
4551 case ISO_CODE_ESC: /* Start escape sequence */
4552 *flags |= CODING_STATE_ESCAPE;
4556 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4557 *flags |= CODING_STATE_ESCAPE;
4558 iso->esc = ISO_ESC_5_11;
4561 case ISO_CODE_SO: /* locking shift 1 */
4564 case ISO_CODE_SI: /* locking shift 0 */
4568 case ISO_CODE_SS2: /* single shift */
4571 case ISO_CODE_SS3: /* single shift */
4575 default: /* Other control characters */
4582 /**** single shift ****/
4584 case 'N': /* single shift 2 */
4587 case 'O': /* single shift 3 */
4591 /**** locking shift ****/
4593 case '~': /* locking shift 1 right */
4596 case 'n': /* locking shift 2 */
4599 case '}': /* locking shift 2 right */
4602 case 'o': /* locking shift 3 */
4605 case '|': /* locking shift 3 right */
4609 #ifdef ENABLE_COMPOSITE_CHARS
4610 /**** composite ****/
4613 iso->esc = ISO_ESC_START_COMPOSITE;
4614 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4615 CODING_STATE_COMPOSITE;
4619 iso->esc = ISO_ESC_END_COMPOSITE;
4620 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4621 ~CODING_STATE_COMPOSITE;
4623 #endif /* ENABLE_COMPOSITE_CHARS */
4625 /**** directionality ****/
4628 iso->esc = ISO_ESC_5_11;
4631 /**** designation ****/
4633 case '$': /* multibyte charset prefix */
4634 iso->esc = ISO_ESC_2_4;
4638 if (0x28 <= c && c <= 0x2F)
4640 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4644 /* This function is called with CODESYS equal to nil when
4645 doing coding-system detection. */
4647 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4648 && fit_to_be_escape_quoted (c))
4650 iso->esc = ISO_ESC_LITERAL;
4651 *flags &= CODING_STATE_ISO2022_LOCK;
4661 /**** directionality ****/
4663 case ISO_ESC_5_11: /* ISO6429 direction control */
4666 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4667 goto directionality;
4669 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4670 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4671 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4675 case ISO_ESC_5_11_0:
4678 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4679 goto directionality;
4683 case ISO_ESC_5_11_1:
4686 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4687 goto directionality;
4691 case ISO_ESC_5_11_2:
4694 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4695 goto directionality;
4700 iso->esc = ISO_ESC_DIRECTIONALITY;
4701 /* Various junk here to attempt to preserve the direction sequences
4702 literally in the text if they would otherwise be swallowed due
4703 to invalid designations that don't show up as actual charset
4704 changes in the text. */
4705 if (iso->invalid_switch_dir)
4707 /* We already inserted a direction switch literally into the
4708 text. We assume (#### this may not be right) that the
4709 next direction switch is the one going the other way,
4710 and we need to output that literally as well. */
4711 iso->output_literally = 1;
4712 iso->invalid_switch_dir = 0;
4718 /* If we are in the thrall of an invalid designation,
4719 then stick the directionality sequence literally into the
4720 output stream so it ends up in the original text again. */
4721 for (jj = 0; jj < 4; jj++)
4722 if (iso->invalid_designated[jj])
4726 iso->output_literally = 1;
4727 iso->invalid_switch_dir = 1;
4730 /* Indicate that we haven't yet seen a valid designation,
4731 so that if a switch-dir is directly followed by an
4732 invalid designation, both get inserted literally. */
4733 iso->switched_dir_and_no_valid_charset_yet = 1;
4738 /**** designation ****/
4741 if (0x28 <= c && c <= 0x2F)
4743 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4746 if (0x40 <= c && c <= 0x42)
4748 cs = CHARSET_BY_ATTRIBUTES (94, 2, c,
4749 *flags & CODING_STATE_R2L ?
4750 CHARSET_RIGHT_TO_LEFT :
4751 CHARSET_LEFT_TO_RIGHT);
4763 if (c < '0' || c > '~')
4764 return 0; /* bad final byte */
4766 if (iso->esc >= ISO_ESC_2_8 &&
4767 iso->esc <= ISO_ESC_2_15)
4769 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4770 single = 1; /* single-byte */
4771 reg = (iso->esc - ISO_ESC_2_8) & 3;
4773 else if (iso->esc >= ISO_ESC_2_4_8 &&
4774 iso->esc <= ISO_ESC_2_4_15)
4776 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4777 single = -1; /* multi-byte */
4778 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4782 /* Can this ever be reached? -slb */
4787 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4788 *flags & CODING_STATE_R2L ?
4789 CHARSET_RIGHT_TO_LEFT :
4790 CHARSET_LEFT_TO_RIGHT);
4796 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4800 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4801 /* can't invoke something that ain't there. */
4803 iso->esc = ISO_ESC_SINGLE_SHIFT;
4804 *flags &= CODING_STATE_ISO2022_LOCK;
4806 *flags |= CODING_STATE_SS2;
4808 *flags |= CODING_STATE_SS3;
4812 if (check_invalid_charsets &&
4813 !CHARSETP (iso->charset[reg]))
4814 /* can't invoke something that ain't there. */
4817 iso->register_right = reg;
4819 iso->register_left = reg;
4820 *flags &= CODING_STATE_ISO2022_LOCK;
4821 iso->esc = ISO_ESC_LOCKING_SHIFT;
4825 if (NILP (cs) && check_invalid_charsets)
4827 iso->invalid_designated[reg] = 1;
4828 iso->charset[reg] = Vcharset_ascii;
4829 iso->esc = ISO_ESC_DESIGNATE;
4830 *flags &= CODING_STATE_ISO2022_LOCK;
4831 iso->output_literally = 1;
4832 if (iso->switched_dir_and_no_valid_charset_yet)
4834 /* We encountered a switch-direction followed by an
4835 invalid designation. Ensure that the switch-direction
4836 gets outputted; otherwise it will probably get eaten
4837 when the text is written out again. */
4838 iso->switched_dir_and_no_valid_charset_yet = 0;
4839 iso->output_direction_sequence = 1;
4840 /* And make sure that the switch-dir going the other
4841 way gets outputted, as well. */
4842 iso->invalid_switch_dir = 1;
4846 /* This function is called with CODESYS equal to nil when
4847 doing coding-system detection. */
4848 if (!NILP (codesys))
4850 charset_conversion_spec_dynarr *dyn =
4851 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4857 for (i = 0; i < Dynarr_length (dyn); i++)
4859 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4860 if (EQ (cs, spec->from_charset))
4861 cs = spec->to_charset;
4866 iso->charset[reg] = cs;
4867 iso->esc = ISO_ESC_DESIGNATE;
4868 *flags &= CODING_STATE_ISO2022_LOCK;
4869 if (iso->invalid_designated[reg])
4871 iso->invalid_designated[reg] = 0;
4872 iso->output_literally = 1;
4874 if (iso->switched_dir_and_no_valid_charset_yet)
4875 iso->switched_dir_and_no_valid_charset_yet = 0;
4880 detect_coding_iso2022 (struct detection_state *st, const unsigned char *src,
4885 /* #### There are serious deficiencies in the recognition mechanism
4886 here. This needs to be much smarter if it's going to cut it.
4887 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4888 it should be detected as Latin-1.
4889 All the ISO2022 stuff in this file should be synced up with the
4890 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4891 Perhaps we should wait till R2L works in FSF Emacs? */
4893 if (!st->iso2022.initted)
4895 reset_iso2022 (Qnil, &st->iso2022.iso);
4896 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4897 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4898 CODING_CATEGORY_ISO_8_1_MASK |
4899 CODING_CATEGORY_ISO_8_2_MASK |
4900 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4901 st->iso2022.flags = 0;
4902 st->iso2022.high_byte_count = 0;
4903 st->iso2022.saw_single_shift = 0;
4904 st->iso2022.initted = 1;
4907 mask = st->iso2022.mask;
4914 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4915 st->iso2022.high_byte_count++;
4919 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4921 if (st->iso2022.high_byte_count & 1)
4922 /* odd number of high bytes; assume not iso-8-2 */
4923 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4925 st->iso2022.high_byte_count = 0;
4926 st->iso2022.saw_single_shift = 0;
4928 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4930 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4931 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4932 { /* control chars */
4935 /* Allow and ignore control characters that you might
4936 reasonably see in a text file */
4941 case 8: /* backspace */
4942 case 11: /* vertical tab */
4943 case 12: /* form feed */
4944 case 26: /* MS-DOS C-z junk */
4945 case 31: /* '^_' -- for info */
4946 goto label_continue_loop;
4953 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4956 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4957 &st->iso2022.flags, 0))
4959 switch (st->iso2022.iso.esc)
4961 case ISO_ESC_DESIGNATE:
4962 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4963 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4965 case ISO_ESC_LOCKING_SHIFT:
4966 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4967 goto ran_out_of_chars;
4968 case ISO_ESC_SINGLE_SHIFT:
4969 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4970 st->iso2022.saw_single_shift = 1;
4979 goto ran_out_of_chars;
4982 label_continue_loop:;
4991 postprocess_iso2022_mask (int mask)
4993 /* #### kind of cheesy */
4994 /* If seven-bit ISO is allowed, then assume that the encoding is
4995 entirely seven-bit and turn off the eight-bit ones. */
4996 if (mask & CODING_CATEGORY_ISO_7_MASK)
4997 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4998 CODING_CATEGORY_ISO_8_1_MASK |
4999 CODING_CATEGORY_ISO_8_2_MASK);
5003 /* If FLAGS is a null pointer or specifies right-to-left motion,
5004 output a switch-dir-to-left-to-right sequence to DST.
5005 Also update FLAGS if it is not a null pointer.
5006 If INTERNAL_P is set, we are outputting in internal format and
5007 need to handle the CSI differently. */
5010 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5011 unsigned_char_dynarr *dst,
5012 unsigned int *flags,
5015 if (!flags || (*flags & CODING_STATE_R2L))
5017 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5019 Dynarr_add (dst, ISO_CODE_ESC);
5020 Dynarr_add (dst, '[');
5022 else if (internal_p)
5023 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5025 Dynarr_add (dst, ISO_CODE_CSI);
5026 Dynarr_add (dst, '0');
5027 Dynarr_add (dst, ']');
5029 *flags &= ~CODING_STATE_R2L;
5033 /* If FLAGS is a null pointer or specifies a direction different from
5034 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5035 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5036 sequence to DST. Also update FLAGS if it is not a null pointer.
5037 If INTERNAL_P is set, we are outputting in internal format and
5038 need to handle the CSI differently. */
5041 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5042 unsigned_char_dynarr *dst, unsigned int *flags,
5045 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5046 direction == CHARSET_LEFT_TO_RIGHT)
5047 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5048 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5049 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5050 direction == CHARSET_RIGHT_TO_LEFT)
5052 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5054 Dynarr_add (dst, ISO_CODE_ESC);
5055 Dynarr_add (dst, '[');
5057 else if (internal_p)
5058 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5060 Dynarr_add (dst, ISO_CODE_CSI);
5061 Dynarr_add (dst, '2');
5062 Dynarr_add (dst, ']');
5064 *flags |= CODING_STATE_R2L;
5068 /* Convert ISO2022-format data to internal format. */
5071 decode_coding_iso2022 (Lstream *decoding, const unsigned char *src,
5072 unsigned_char_dynarr *dst, unsigned int n)
5074 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5075 unsigned int flags = str->flags;
5076 unsigned int ch = str->ch;
5077 eol_type_t eol_type = str->eol_type;
5078 #ifdef ENABLE_COMPOSITE_CHARS
5079 unsigned_char_dynarr *real_dst = dst;
5081 Lisp_Object coding_system;
5083 XSETCODING_SYSTEM (coding_system, str->codesys);
5085 #ifdef ENABLE_COMPOSITE_CHARS
5086 if (flags & CODING_STATE_COMPOSITE)
5087 dst = str->iso2022.composite_chars;
5088 #endif /* ENABLE_COMPOSITE_CHARS */
5092 unsigned char c = *src++;
5093 if (flags & CODING_STATE_ESCAPE)
5094 { /* Within ESC sequence */
5095 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5100 switch (str->iso2022.esc)
5102 #ifdef ENABLE_COMPOSITE_CHARS
5103 case ISO_ESC_START_COMPOSITE:
5104 if (str->iso2022.composite_chars)
5105 Dynarr_reset (str->iso2022.composite_chars);
5107 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5108 dst = str->iso2022.composite_chars;
5110 case ISO_ESC_END_COMPOSITE:
5112 Bufbyte comstr[MAX_EMCHAR_LEN];
5114 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5115 Dynarr_length (dst));
5117 len = set_charptr_emchar (comstr, emch);
5118 Dynarr_add_many (dst, comstr, len);
5121 #endif /* ENABLE_COMPOSITE_CHARS */
5123 case ISO_ESC_LITERAL:
5124 DECODE_ADD_BINARY_CHAR (c, dst);
5128 /* Everything else handled already */
5133 /* Attempted error recovery. */
5134 if (str->iso2022.output_direction_sequence)
5135 ensure_correct_direction (flags & CODING_STATE_R2L ?
5136 CHARSET_RIGHT_TO_LEFT :
5137 CHARSET_LEFT_TO_RIGHT,
5138 str->codesys, dst, 0, 1);
5139 /* More error recovery. */
5140 if (!retval || str->iso2022.output_literally)
5142 /* Output the (possibly invalid) sequence */
5144 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5145 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5146 flags &= CODING_STATE_ISO2022_LOCK;
5148 n++, src--;/* Repeat the loop with the same character. */
5151 /* No sense in reprocessing the final byte of the
5152 escape sequence; it could mess things up anyway.
5154 DECODE_ADD_BINARY_CHAR (c, dst);
5159 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5160 { /* Control characters */
5162 /***** Error-handling *****/
5164 /* If we were in the middle of a character, dump out the
5165 partial character. */
5166 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5168 /* If we just saw a single-shift character, dump it out.
5169 This may dump out the wrong sort of single-shift character,
5170 but least it will give an indication that something went
5172 if (flags & CODING_STATE_SS2)
5174 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5175 flags &= ~CODING_STATE_SS2;
5177 if (flags & CODING_STATE_SS3)
5179 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5180 flags &= ~CODING_STATE_SS3;
5183 /***** Now handle the control characters. *****/
5186 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5188 flags &= CODING_STATE_ISO2022_LOCK;
5190 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5191 DECODE_ADD_BINARY_CHAR (c, dst);
5194 { /* Graphic characters */
5195 Lisp_Object charset;
5201 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5203 /* Now determine the charset. */
5204 reg = ((flags & CODING_STATE_SS2) ? 2
5205 : (flags & CODING_STATE_SS3) ? 3
5206 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5207 : str->iso2022.register_left);
5208 charset = str->iso2022.charset[reg];
5210 /* Error checking: */
5211 if (! CHARSETP (charset)
5212 || str->iso2022.invalid_designated[reg]
5213 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5214 && XCHARSET_CHARS (charset) == 94))
5215 /* Mrmph. We are trying to invoke a register that has no
5216 or an invalid charset in it, or trying to add a character
5217 outside the range of the charset. Insert that char literally
5218 to preserve it for the output. */
5220 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5221 DECODE_ADD_BINARY_CHAR (c, dst);
5226 /* Things are probably hunky-dorey. */
5228 /* Fetch reverse charset, maybe. */
5229 if (((flags & CODING_STATE_R2L) &&
5230 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5232 (!(flags & CODING_STATE_R2L) &&
5233 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5235 Lisp_Object new_charset =
5236 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5237 if (!NILP (new_charset))
5238 charset = new_charset;
5242 if (XCHARSET_DIMENSION (charset) == 1)
5244 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5246 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5251 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5257 lb = XCHARSET_LEADING_BYTE (charset);
5258 switch (XCHARSET_REP_BYTES (charset))
5261 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5262 Dynarr_add (dst, c & 0x7F);
5265 case 2: /* one-byte official */
5266 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5267 Dynarr_add (dst, lb);
5268 Dynarr_add (dst, c | 0x80);
5271 case 3: /* one-byte private or two-byte official */
5272 if (XCHARSET_PRIVATE_P (charset))
5274 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5275 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5276 Dynarr_add (dst, lb);
5277 Dynarr_add (dst, c | 0x80);
5283 Dynarr_add (dst, lb);
5284 Dynarr_add (dst, ch | 0x80);
5285 Dynarr_add (dst, c | 0x80);
5293 default: /* two-byte private */
5296 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5297 Dynarr_add (dst, lb);
5298 Dynarr_add (dst, ch | 0x80);
5299 Dynarr_add (dst, c | 0x80);
5309 flags &= CODING_STATE_ISO2022_LOCK;
5312 label_continue_loop:;
5315 if (flags & CODING_STATE_END)
5316 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5323 /***** ISO2022 encoder *****/
5325 /* Designate CHARSET into register REG. */
5328 iso2022_designate (Lisp_Object charset, unsigned char reg,
5329 struct encoding_stream *str, unsigned_char_dynarr *dst)
5331 static const char inter94[] = "()*+";
5332 static const char inter96[] = ",-./";
5334 unsigned char final;
5335 Lisp_Object old_charset = str->iso2022.charset[reg];
5337 str->iso2022.charset[reg] = charset;
5338 if (!CHARSETP (charset))
5339 /* charset might be an initial nil or t. */
5341 type = XCHARSET_TYPE (charset);
5342 final = XCHARSET_FINAL (charset);
5343 if (!str->iso2022.force_charset_on_output[reg] &&
5344 CHARSETP (old_charset) &&
5345 XCHARSET_TYPE (old_charset) == type &&
5346 XCHARSET_FINAL (old_charset) == final)
5349 str->iso2022.force_charset_on_output[reg] = 0;
5352 charset_conversion_spec_dynarr *dyn =
5353 str->codesys->iso2022.output_conv;
5359 for (i = 0; i < Dynarr_length (dyn); i++)
5361 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5362 if (EQ (charset, spec->from_charset))
5363 charset = spec->to_charset;
5368 Dynarr_add (dst, ISO_CODE_ESC);
5371 case CHARSET_TYPE_94:
5372 Dynarr_add (dst, inter94[reg]);
5374 case CHARSET_TYPE_96:
5375 Dynarr_add (dst, inter96[reg]);
5377 case CHARSET_TYPE_94X94:
5378 Dynarr_add (dst, '$');
5380 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5383 Dynarr_add (dst, inter94[reg]);
5385 case CHARSET_TYPE_96X96:
5386 Dynarr_add (dst, '$');
5387 Dynarr_add (dst, inter96[reg]);
5390 Dynarr_add (dst, final);
5394 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5396 if (str->iso2022.register_left != 0)
5398 Dynarr_add (dst, ISO_CODE_SI);
5399 str->iso2022.register_left = 0;
5404 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5406 if (str->iso2022.register_left != 1)
5408 Dynarr_add (dst, ISO_CODE_SO);
5409 str->iso2022.register_left = 1;
5413 /* Convert internally-formatted data to ISO2022 format. */
5416 encode_coding_iso2022 (Lstream *encoding, const unsigned char *src,
5417 unsigned_char_dynarr *dst, unsigned int n)
5419 unsigned char charmask, c;
5420 unsigned char char_boundary;
5421 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5422 unsigned int flags = str->flags;
5423 Emchar ch = str->ch;
5424 Lisp_Coding_System *codesys = str->codesys;
5425 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5427 Lisp_Object charset;
5430 unsigned int byte1, byte2;
5433 #ifdef ENABLE_COMPOSITE_CHARS
5434 /* flags for handling composite chars. We do a little switcharoo
5435 on the source while we're outputting the composite char. */
5436 unsigned int saved_n = 0;
5437 const unsigned char *saved_src = NULL;
5438 int in_composite = 0;
5439 #endif /* ENABLE_COMPOSITE_CHARS */
5441 char_boundary = str->iso2022.current_char_boundary;
5442 charset = str->iso2022.current_charset;
5443 half = str->iso2022.current_half;
5445 #ifdef ENABLE_COMPOSITE_CHARS
5453 switch (char_boundary)
5461 else if ( c >= 0xf8 )
5466 else if ( c >= 0xf0 )
5471 else if ( c >= 0xe0 )
5476 else if ( c >= 0xc0 )
5485 restore_left_to_right_direction (codesys, dst, &flags, 0);
5487 /* Make sure G0 contains ASCII */
5488 if ((c > ' ' && c < ISO_CODE_DEL) ||
5489 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5491 ensure_normal_shift (str, dst);
5492 iso2022_designate (Vcharset_ascii, 0, str, dst);
5495 /* If necessary, restore everything to the default state
5498 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5500 restore_left_to_right_direction (codesys, dst, &flags, 0);
5502 ensure_normal_shift (str, dst);
5504 for (i = 0; i < 4; i++)
5506 Lisp_Object initial_charset =
5507 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5508 iso2022_designate (initial_charset, i, str, dst);
5513 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5514 Dynarr_add (dst, '\r');
5515 if (eol_type != EOL_CR)
5516 Dynarr_add (dst, c);
5520 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5521 && fit_to_be_escape_quoted (c))
5522 Dynarr_add (dst, ISO_CODE_ESC);
5523 Dynarr_add (dst, c);
5529 ch = ( ch << 6 ) | ( c & 0x3f );
5532 if ( (0x80 <= ch) && (ch <= 0x9f) )
5534 charmask = (half == 0 ? 0x00 : 0x80);
5536 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5537 && fit_to_be_escape_quoted (ch))
5538 Dynarr_add (dst, ISO_CODE_ESC);
5539 /* you asked for it ... */
5540 Dynarr_add (dst, ch);
5546 BREAKUP_CHAR (ch, charset, byte1, byte2);
5547 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5548 codesys, dst, &flags, 0);
5550 /* Now determine which register to use. */
5552 for (i = 0; i < 4; i++)
5554 if (EQ (charset, str->iso2022.charset[i]) ||
5556 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5565 if (XCHARSET_GRAPHIC (charset) != 0)
5567 if (!NILP (str->iso2022.charset[1]) &&
5568 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5569 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5571 else if (!NILP (str->iso2022.charset[2]))
5573 else if (!NILP (str->iso2022.charset[3]))
5582 iso2022_designate (charset, reg, str, dst);
5584 /* Now invoke that register. */
5588 ensure_normal_shift (str, dst);
5593 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5595 ensure_shift_out (str, dst);
5603 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5605 Dynarr_add (dst, ISO_CODE_ESC);
5606 Dynarr_add (dst, 'N');
5611 Dynarr_add (dst, ISO_CODE_SS2);
5617 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5619 Dynarr_add (dst, ISO_CODE_ESC);
5620 Dynarr_add (dst, 'O');
5625 Dynarr_add (dst, ISO_CODE_SS3);
5634 charmask = (half == 0 ? 0x00 : 0x80);
5636 switch (XCHARSET_DIMENSION (charset))
5639 Dynarr_add (dst, byte1 | charmask);
5642 Dynarr_add (dst, byte1 | charmask);
5643 Dynarr_add (dst, byte2 | charmask);
5652 ch = ( ch << 6 ) | ( c & 0x3f );
5656 #else /* not UTF2000 */
5662 if (BYTE_ASCII_P (c))
5663 { /* Processing ASCII character */
5666 restore_left_to_right_direction (codesys, dst, &flags, 0);
5668 /* Make sure G0 contains ASCII */
5669 if ((c > ' ' && c < ISO_CODE_DEL) ||
5670 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5672 ensure_normal_shift (str, dst);
5673 iso2022_designate (Vcharset_ascii, 0, str, dst);
5676 /* If necessary, restore everything to the default state
5679 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5681 restore_left_to_right_direction (codesys, dst, &flags, 0);
5683 ensure_normal_shift (str, dst);
5685 for (i = 0; i < 4; i++)
5687 Lisp_Object initial_charset =
5688 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5689 iso2022_designate (initial_charset, i, str, dst);
5694 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5695 Dynarr_add (dst, '\r');
5696 if (eol_type != EOL_CR)
5697 Dynarr_add (dst, c);
5701 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5702 && fit_to_be_escape_quoted (c))
5703 Dynarr_add (dst, ISO_CODE_ESC);
5704 Dynarr_add (dst, c);
5709 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5710 { /* Processing Leading Byte */
5712 charset = CHARSET_BY_LEADING_BYTE (c);
5713 if (LEADING_BYTE_PREFIX_P(c))
5715 else if (!EQ (charset, Vcharset_control_1)
5716 #ifdef ENABLE_COMPOSITE_CHARS
5717 && !EQ (charset, Vcharset_composite)
5723 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5724 codesys, dst, &flags, 0);
5726 /* Now determine which register to use. */
5728 for (i = 0; i < 4; i++)
5730 if (EQ (charset, str->iso2022.charset[i]) ||
5732 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5741 if (XCHARSET_GRAPHIC (charset) != 0)
5743 if (!NILP (str->iso2022.charset[1]) &&
5744 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5745 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5747 else if (!NILP (str->iso2022.charset[2]))
5749 else if (!NILP (str->iso2022.charset[3]))
5758 iso2022_designate (charset, reg, str, dst);
5760 /* Now invoke that register. */
5764 ensure_normal_shift (str, dst);
5769 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5771 ensure_shift_out (str, dst);
5779 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5781 Dynarr_add (dst, ISO_CODE_ESC);
5782 Dynarr_add (dst, 'N');
5787 Dynarr_add (dst, ISO_CODE_SS2);
5793 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5795 Dynarr_add (dst, ISO_CODE_ESC);
5796 Dynarr_add (dst, 'O');
5801 Dynarr_add (dst, ISO_CODE_SS3);
5813 { /* Processing Non-ASCII character */
5814 charmask = (half == 0 ? 0x7F : 0xFF);
5816 if (EQ (charset, Vcharset_control_1))
5818 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5819 && fit_to_be_escape_quoted (c))
5820 Dynarr_add (dst, ISO_CODE_ESC);
5821 /* you asked for it ... */
5822 Dynarr_add (dst, c - 0x20);
5826 switch (XCHARSET_REP_BYTES (charset))
5829 Dynarr_add (dst, c & charmask);
5832 if (XCHARSET_PRIVATE_P (charset))
5834 Dynarr_add (dst, c & charmask);
5839 #ifdef ENABLE_COMPOSITE_CHARS
5840 if (EQ (charset, Vcharset_composite))
5844 /* #### Bother! We don't know how to
5846 Dynarr_add (dst, '~');
5850 Emchar emch = MAKE_CHAR (Vcharset_composite,
5851 ch & 0x7F, c & 0x7F);
5852 Lisp_Object lstr = composite_char_string (emch);
5856 src = XSTRING_DATA (lstr);
5857 n = XSTRING_LENGTH (lstr);
5858 Dynarr_add (dst, ISO_CODE_ESC);
5859 Dynarr_add (dst, '0'); /* start composing */
5863 #endif /* ENABLE_COMPOSITE_CHARS */
5865 Dynarr_add (dst, ch & charmask);
5866 Dynarr_add (dst, c & charmask);
5879 Dynarr_add (dst, ch & charmask);
5880 Dynarr_add (dst, c & charmask);
5895 #endif /* not UTF2000 */
5897 #ifdef ENABLE_COMPOSITE_CHARS
5903 Dynarr_add (dst, ISO_CODE_ESC);
5904 Dynarr_add (dst, '1'); /* end composing */
5905 goto back_to_square_n; /* Wheeeeeeeee ..... */
5907 #endif /* ENABLE_COMPOSITE_CHARS */
5910 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5912 if (char_boundary && flags & CODING_STATE_END)
5915 restore_left_to_right_direction (codesys, dst, &flags, 0);
5916 ensure_normal_shift (str, dst);
5917 for (i = 0; i < 4; i++)
5919 Lisp_Object initial_charset =
5920 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5921 iso2022_designate (initial_charset, i, str, dst);
5927 str->iso2022.current_char_boundary = char_boundary;
5928 str->iso2022.current_charset = charset;
5929 str->iso2022.current_half = half;
5931 /* Verbum caro factum est! */
5935 /************************************************************************/
5936 /* No-conversion methods */
5937 /************************************************************************/
5939 /* This is used when reading in "binary" files -- i.e. files that may
5940 contain all 256 possible byte values and that are not to be
5941 interpreted as being in any particular decoding. */
5943 decode_coding_no_conversion (Lstream *decoding, const unsigned char *src,
5944 unsigned_char_dynarr *dst, unsigned int n)
5947 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5948 unsigned int flags = str->flags;
5949 unsigned int ch = str->ch;
5950 eol_type_t eol_type = str->eol_type;
5956 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5957 DECODE_ADD_BINARY_CHAR (c, dst);
5958 label_continue_loop:;
5961 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5968 encode_coding_no_conversion (Lstream *encoding, const unsigned char *src,
5969 unsigned_char_dynarr *dst, unsigned int n)
5972 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5973 unsigned int flags = str->flags;
5974 unsigned int ch = str->ch;
5975 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5977 unsigned char char_boundary = str->iso2022.current_char_boundary;
5984 switch (char_boundary)
5992 else if ( c >= 0xf8 )
5997 else if ( c >= 0xf0 )
6002 else if ( c >= 0xe0 )
6007 else if ( c >= 0xc0 )
6018 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6019 Dynarr_add (dst, '\r');
6020 if (eol_type != EOL_CR)
6021 Dynarr_add (dst, c);
6024 Dynarr_add (dst, c);
6029 ch = ( ch << 6 ) | ( c & 0x3f );
6030 Dynarr_add (dst, ch & 0xff);
6034 ch = ( ch << 6 ) | ( c & 0x3f );
6037 #else /* not UTF2000 */
6040 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6041 Dynarr_add (dst, '\r');
6042 if (eol_type != EOL_CR)
6043 Dynarr_add (dst, '\n');
6046 else if (BYTE_ASCII_P (c))
6049 Dynarr_add (dst, c);
6051 else if (BUFBYTE_LEADING_BYTE_P (c))
6054 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6055 c == LEADING_BYTE_CONTROL_1)
6058 Dynarr_add (dst, '~'); /* untranslatable character */
6062 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6063 Dynarr_add (dst, c);
6064 else if (ch == LEADING_BYTE_CONTROL_1)
6067 Dynarr_add (dst, c - 0x20);
6069 /* else it should be the second or third byte of an
6070 untranslatable character, so ignore it */
6073 #endif /* not UTF2000 */
6079 str->iso2022.current_char_boundary = char_boundary;
6085 /************************************************************************/
6086 /* Initialization */
6087 /************************************************************************/
6090 syms_of_file_coding (void)
6092 INIT_LRECORD_IMPLEMENTATION (coding_system);
6094 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6096 DEFSUBR (Fcoding_system_p);
6097 DEFSUBR (Ffind_coding_system);
6098 DEFSUBR (Fget_coding_system);
6099 DEFSUBR (Fcoding_system_list);
6100 DEFSUBR (Fcoding_system_name);
6101 DEFSUBR (Fmake_coding_system);
6102 DEFSUBR (Fcopy_coding_system);
6103 DEFSUBR (Fcoding_system_canonical_name_p);
6104 DEFSUBR (Fcoding_system_alias_p);
6105 DEFSUBR (Fcoding_system_aliasee);
6106 DEFSUBR (Fdefine_coding_system_alias);
6107 DEFSUBR (Fsubsidiary_coding_system);
6109 DEFSUBR (Fcoding_system_type);
6110 DEFSUBR (Fcoding_system_doc_string);
6112 DEFSUBR (Fcoding_system_charset);
6114 DEFSUBR (Fcoding_system_property);
6116 DEFSUBR (Fcoding_category_list);
6117 DEFSUBR (Fset_coding_priority_list);
6118 DEFSUBR (Fcoding_priority_list);
6119 DEFSUBR (Fset_coding_category_system);
6120 DEFSUBR (Fcoding_category_system);
6122 DEFSUBR (Fdetect_coding_region);
6123 DEFSUBR (Fdecode_coding_region);
6124 DEFSUBR (Fencode_coding_region);
6126 DEFSUBR (Fdecode_shift_jis_char);
6127 DEFSUBR (Fencode_shift_jis_char);
6128 DEFSUBR (Fdecode_big5_char);
6129 DEFSUBR (Fencode_big5_char);
6131 DEFSUBR (Fset_ucs_char);
6132 DEFSUBR (Fucs_char);
6133 DEFSUBR (Fset_char_ucs);
6134 DEFSUBR (Fchar_ucs);
6135 #endif /* not UTF2000 */
6137 defsymbol (&Qcoding_systemp, "coding-system-p");
6138 defsymbol (&Qno_conversion, "no-conversion");
6139 defsymbol (&Qraw_text, "raw-text");
6141 defsymbol (&Qbig5, "big5");
6142 defsymbol (&Qshift_jis, "shift-jis");
6143 defsymbol (&Qucs4, "ucs-4");
6144 defsymbol (&Qutf8, "utf-8");
6145 defsymbol (&Qccl, "ccl");
6146 defsymbol (&Qiso2022, "iso2022");
6148 defsymbol (&Qmnemonic, "mnemonic");
6149 defsymbol (&Qeol_type, "eol-type");
6150 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6151 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6153 defsymbol (&Qcr, "cr");
6154 defsymbol (&Qlf, "lf");
6155 defsymbol (&Qcrlf, "crlf");
6156 defsymbol (&Qeol_cr, "eol-cr");
6157 defsymbol (&Qeol_lf, "eol-lf");
6158 defsymbol (&Qeol_crlf, "eol-crlf");
6160 defsymbol (&Qcharset_g0, "charset-g0");
6161 defsymbol (&Qcharset_g1, "charset-g1");
6162 defsymbol (&Qcharset_g2, "charset-g2");
6163 defsymbol (&Qcharset_g3, "charset-g3");
6164 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6165 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6166 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6167 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6168 defsymbol (&Qno_iso6429, "no-iso6429");
6169 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6170 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6172 defsymbol (&Qshort, "short");
6173 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6174 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6175 defsymbol (&Qseven, "seven");
6176 defsymbol (&Qlock_shift, "lock-shift");
6177 defsymbol (&Qescape_quoted, "escape-quoted");
6179 defsymbol (&Qencode, "encode");
6180 defsymbol (&Qdecode, "decode");
6183 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6185 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6187 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6189 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6191 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6193 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6195 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6197 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6199 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6202 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6207 lstream_type_create_file_coding (void)
6209 LSTREAM_HAS_METHOD (decoding, reader);
6210 LSTREAM_HAS_METHOD (decoding, writer);
6211 LSTREAM_HAS_METHOD (decoding, rewinder);
6212 LSTREAM_HAS_METHOD (decoding, seekable_p);
6213 LSTREAM_HAS_METHOD (decoding, flusher);
6214 LSTREAM_HAS_METHOD (decoding, closer);
6215 LSTREAM_HAS_METHOD (decoding, marker);
6217 LSTREAM_HAS_METHOD (encoding, reader);
6218 LSTREAM_HAS_METHOD (encoding, writer);
6219 LSTREAM_HAS_METHOD (encoding, rewinder);
6220 LSTREAM_HAS_METHOD (encoding, seekable_p);
6221 LSTREAM_HAS_METHOD (encoding, flusher);
6222 LSTREAM_HAS_METHOD (encoding, closer);
6223 LSTREAM_HAS_METHOD (encoding, marker);
6227 vars_of_file_coding (void)
6231 fcd = xnew (struct file_coding_dump);
6232 dumpstruct (&fcd, &fcd_description);
6234 /* Initialize to something reasonable ... */
6235 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6237 fcd->coding_category_system[i] = Qnil;
6238 fcd->coding_category_by_priority[i] = i;
6241 Fprovide (intern ("file-coding"));
6243 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6244 Coding system used for TTY keyboard input.
6245 Not used under a windowing system.
6247 Vkeyboard_coding_system = Qnil;
6249 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6250 Coding system used for TTY display output.
6251 Not used under a windowing system.
6253 Vterminal_coding_system = Qnil;
6255 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6256 Overriding coding system used when reading from a file or process.
6257 You should bind this variable with `let', but do not set it globally.
6258 If this is non-nil, it specifies the coding system that will be used
6259 to decode input on read operations, such as from a file or process.
6260 It overrides `buffer-file-coding-system-for-read',
6261 `insert-file-contents-pre-hook', etc. Use those variables instead of
6262 this one for permanent changes to the environment. */ );
6263 Vcoding_system_for_read = Qnil;
6265 DEFVAR_LISP ("coding-system-for-write",
6266 &Vcoding_system_for_write /*
6267 Overriding coding system used when writing to a file or process.
6268 You should bind this variable with `let', but do not set it globally.
6269 If this is non-nil, it specifies the coding system that will be used
6270 to encode output for write operations, such as to a file or process.
6271 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6272 Use those variables instead of this one for permanent changes to the
6274 Vcoding_system_for_write = Qnil;
6276 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6277 Coding system used to convert pathnames when accessing files.
6279 Vfile_name_coding_system = Qnil;
6281 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6282 Non-nil means the buffer contents are regarded as multi-byte form
6283 of characters, not a binary code. This affects the display, file I/O,
6284 and behaviors of various editing commands.
6286 Setting this to nil does not do anything.
6288 enable_multibyte_characters = 1;
6292 complex_vars_of_file_coding (void)
6294 staticpro (&Vcoding_system_hash_table);
6295 Vcoding_system_hash_table =
6296 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6298 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6299 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6301 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6303 struct codesys_prop csp; \
6305 csp.prop_type = (Prop_Type); \
6306 Dynarr_add (the_codesys_prop_dynarr, csp); \
6309 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6310 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6311 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6312 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6313 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6314 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6315 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6317 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6318 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6319 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6320 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6321 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6322 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6323 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6324 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6325 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6326 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6327 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6328 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6329 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6330 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6331 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6332 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6333 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6335 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6336 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6338 /* Need to create this here or we're really screwed. */
6340 (Qraw_text, Qno_conversion,
6341 build_string ("Raw text, which means it converts only line-break-codes."),
6342 list2 (Qmnemonic, build_string ("Raw")));
6345 (Qbinary, Qno_conversion,
6346 build_string ("Binary, which means it does not convert anything."),
6347 list4 (Qeol_type, Qlf,
6348 Qmnemonic, build_string ("Binary")));
6353 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6354 list2 (Qmnemonic, build_string ("UTF8")));
6357 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6359 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6361 Fdefine_coding_system_alias (Qterminal, Qbinary);
6362 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6364 /* Need this for bootstrapping */
6365 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6366 Fget_coding_system (Qraw_text);
6369 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6370 = Fget_coding_system (Qutf8);
6373 #if defined(MULE) && !defined(UTF2000)
6377 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6378 fcd->ucs_to_mule_table[i] = Qnil;
6380 staticpro (&mule_to_ucs_table);
6381 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6382 #endif /* defined(MULE) && !defined(UTF2000) */