1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
39 #include "file-coding.h"
41 Lisp_Object Qcoding_system_error;
43 Lisp_Object Vkeyboard_coding_system;
44 Lisp_Object Vterminal_coding_system;
45 Lisp_Object Vcoding_system_for_read;
46 Lisp_Object Vcoding_system_for_write;
47 Lisp_Object Vfile_name_coding_system;
49 /* Table of symbols identifying each coding category. */
50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
54 struct file_coding_dump {
55 /* Coding system currently associated with each coding category. */
56 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
58 /* Table of all coding categories in decreasing order of priority.
59 This describes a permutation of the possible coding categories. */
60 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
62 #if defined(MULE) && !defined(UTF2000)
63 Lisp_Object ucs_to_mule_table[65536];
67 static const struct lrecord_description fcd_description_1[] = {
68 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
69 #if defined(MULE) && !defined(UTF2000)
70 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
75 static const struct struct_description fcd_description = {
76 sizeof (struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
105 Lisp_Object Qdisable_composition;
107 Lisp_Object Qencode, Qdecode;
109 Lisp_Object Vcoding_system_hash_table;
111 int enable_multibyte_characters;
114 /* Additional information used by the ISO2022 decoder and detector. */
115 struct iso2022_decoder
117 /* CHARSET holds the character sets currently assigned to the G0
118 through G3 variables. It is initialized from the array
119 INITIAL_CHARSET in CODESYS. */
120 Lisp_Object charset[4];
122 /* Which registers are currently invoked into the left (GL) and
123 right (GR) halves of the 8-bit encoding space? */
124 int register_left, register_right;
126 /* ISO_ESC holds a value indicating part of an escape sequence
127 that has already been seen. */
128 enum iso_esc_flag esc;
130 /* This records the bytes we've seen so far in an escape sequence,
131 in case the sequence is invalid (we spit out the bytes unchanged). */
132 unsigned char esc_bytes[8];
134 /* Index for next byte to store in ISO escape sequence. */
137 #ifdef ENABLE_COMPOSITE_CHARS
138 /* Stuff seen so far when composing a string. */
139 unsigned_char_dynarr *composite_chars;
142 /* If we saw an invalid designation sequence for a particular
143 register, we flag it here and switch to ASCII. The next time we
144 see a valid designation for this register, we turn off the flag
145 and do the designation normally, but pretend the sequence was
146 invalid. The effect of all this is that (most of the time) the
147 escape sequences for both the switch to the unknown charset, and
148 the switch back to the known charset, get inserted literally into
149 the buffer and saved out as such. The hope is that we can
150 preserve the escape sequences so that the resulting written out
151 file makes sense. If we don't do any of this, the designation
152 to the invalid charset will be preserved but that switch back
153 to the known charset will probably get eaten because it was
154 the same charset that was already present in the register. */
155 unsigned char invalid_designated[4];
157 /* We try to do similar things as above for direction-switching
158 sequences. If we encountered a direction switch while an
159 invalid designation was present, or an invalid designation
160 just after a direction switch (i.e. no valid designation
161 encountered yet), we insert the direction-switch escape
162 sequence literally into the output stream, and later on
163 insert the corresponding direction-restoring escape sequence
165 unsigned int switched_dir_and_no_valid_charset_yet :1;
166 unsigned int invalid_switch_dir :1;
168 /* Tells the decoder to output the escape sequence literally
169 even though it was valid. Used in the games we play to
170 avoid lossage when we encounter invalid designations. */
171 unsigned int output_literally :1;
172 /* We encountered a direction switch followed by an invalid
173 designation. We didn't output the direction switch
174 literally because we didn't know about the invalid designation;
175 but we have to do so now. */
176 unsigned int output_direction_sequence :1;
179 EXFUN (Fcopy_coding_system, 2);
181 struct detection_state;
184 text_encode_generic (Lstream *encoding, const Bufbyte *src,
185 unsigned_char_dynarr *dst, size_t n);
187 static int detect_coding_sjis (struct detection_state *st,
188 const Extbyte *src, size_t n);
189 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
190 unsigned_char_dynarr *dst, size_t n);
191 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
192 unsigned_char_dynarr *dst, unsigned int *flags);
193 void char_finish_shift_jis (struct encoding_stream *str,
194 unsigned_char_dynarr *dst, unsigned int *flags);
196 static int detect_coding_big5 (struct detection_state *st,
197 const Extbyte *src, size_t n);
198 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
199 unsigned_char_dynarr *dst, size_t n);
200 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
201 unsigned_char_dynarr *dst, size_t n);
203 static int detect_coding_ucs4 (struct detection_state *st,
204 const Extbyte *src, size_t n);
205 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
206 unsigned_char_dynarr *dst, size_t n);
207 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
208 unsigned_char_dynarr *dst, unsigned int *flags);
209 void char_finish_ucs4 (struct encoding_stream *str,
210 unsigned_char_dynarr *dst, unsigned int *flags);
212 static int detect_coding_utf8 (struct detection_state *st,
213 const Extbyte *src, size_t n);
214 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
215 unsigned_char_dynarr *dst, size_t n);
216 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
217 unsigned_char_dynarr *dst, unsigned int *flags);
218 void char_finish_utf8 (struct encoding_stream *str,
219 unsigned_char_dynarr *dst, unsigned int *flags);
221 static int postprocess_iso2022_mask (int mask);
222 static void reset_iso2022 (Lisp_Object coding_system,
223 struct iso2022_decoder *iso);
224 static int detect_coding_iso2022 (struct detection_state *st,
225 const Extbyte *src, size_t n);
226 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
227 unsigned_char_dynarr *dst, size_t n);
228 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
229 unsigned_char_dynarr *dst, unsigned int *flags);
230 void char_finish_iso2022 (struct encoding_stream *str,
231 unsigned_char_dynarr *dst, unsigned int *flags);
233 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
234 unsigned_char_dynarr *dst, size_t n);
235 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
236 unsigned_char_dynarr *dst, size_t n);
237 static void mule_decode (Lstream *decoding, const Extbyte *src,
238 unsigned_char_dynarr *dst, size_t n);
239 static void mule_encode (Lstream *encoding, const Bufbyte *src,
240 unsigned_char_dynarr *dst, size_t n);
242 typedef struct codesys_prop codesys_prop;
251 Dynarr_declare (codesys_prop);
252 } codesys_prop_dynarr;
254 static const struct lrecord_description codesys_prop_description_1[] = {
255 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
259 static const struct struct_description codesys_prop_description = {
260 sizeof (codesys_prop),
261 codesys_prop_description_1
264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
265 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
269 static const struct struct_description codesys_prop_dynarr_description = {
270 sizeof (codesys_prop_dynarr),
271 codesys_prop_dynarr_description_1
274 codesys_prop_dynarr *the_codesys_prop_dynarr;
276 enum codesys_prop_enum
279 CODESYS_PROP_ISO2022,
284 /************************************************************************/
285 /* Coding system functions */
286 /************************************************************************/
288 static Lisp_Object mark_coding_system (Lisp_Object);
289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
290 static void finalize_coding_system (void *header, int for_disksave);
293 static const struct lrecord_description ccs_description_1[] = {
294 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
295 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
299 static const struct struct_description ccs_description = {
300 sizeof (charset_conversion_spec),
304 static const struct lrecord_description ccsd_description_1[] = {
305 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
309 static const struct struct_description ccsd_description = {
310 sizeof (charset_conversion_spec_dynarr),
315 static const struct lrecord_description coding_system_description[] = {
316 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
317 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
318 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
319 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
320 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
321 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
325 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
326 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
327 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
335 mark_coding_system, print_coding_system,
336 finalize_coding_system,
337 0, 0, coding_system_description,
341 mark_coding_system (Lisp_Object obj)
343 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
345 mark_object (CODING_SYSTEM_NAME (codesys));
346 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
347 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
348 mark_object (CODING_SYSTEM_EOL_LF (codesys));
349 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
350 mark_object (CODING_SYSTEM_EOL_CR (codesys));
352 switch (CODING_SYSTEM_TYPE (codesys))
356 case CODESYS_ISO2022:
357 for (i = 0; i < 4; i++)
358 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
359 if (codesys->iso2022.input_conv)
361 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
363 struct charset_conversion_spec *ccs =
364 Dynarr_atp (codesys->iso2022.input_conv, i);
365 mark_object (ccs->from_charset);
366 mark_object (ccs->to_charset);
369 if (codesys->iso2022.output_conv)
371 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
373 struct charset_conversion_spec *ccs =
374 Dynarr_atp (codesys->iso2022.output_conv, i);
375 mark_object (ccs->from_charset);
376 mark_object (ccs->to_charset);
382 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
383 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
390 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
398 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
400 error ("printing unreadable object #<coding_system 0x%x>",
403 write_c_string ("#<coding_system ", printcharfun);
404 print_internal (c->name, printcharfun, 1);
405 write_c_string (">", printcharfun);
409 finalize_coding_system (void *header, int for_disksave)
411 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
412 /* Since coding systems never go away, this function is not
413 necessary. But it would be necessary if we changed things
414 so that coding systems could go away. */
415 if (!for_disksave) /* see comment in lstream.c */
417 switch (CODING_SYSTEM_TYPE (c))
420 case CODESYS_ISO2022:
421 if (c->iso2022.input_conv)
423 Dynarr_free (c->iso2022.input_conv);
424 c->iso2022.input_conv = 0;
426 if (c->iso2022.output_conv)
428 Dynarr_free (c->iso2022.output_conv);
429 c->iso2022.output_conv = 0;
440 symbol_to_eol_type (Lisp_Object symbol)
442 CHECK_SYMBOL (symbol);
443 if (NILP (symbol)) return EOL_AUTODETECT;
444 if (EQ (symbol, Qlf)) return EOL_LF;
445 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
446 if (EQ (symbol, Qcr)) return EOL_CR;
448 signal_simple_error ("Unrecognized eol type", symbol);
449 return EOL_AUTODETECT; /* not reached */
453 eol_type_to_symbol (eol_type_t type)
458 case EOL_LF: return Qlf;
459 case EOL_CRLF: return Qcrlf;
460 case EOL_CR: return Qcr;
461 case EOL_AUTODETECT: return Qnil;
466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
468 Lisp_Object codesys_obj;
469 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
470 char *codesys_name = (char *) alloca (len + 7);
472 char *codesys_mnemonic=0;
474 Lisp_Object codesys_name_sym, sub_codesys_obj;
478 XSETCODING_SYSTEM (codesys_obj, codesys);
480 memcpy (codesys_name,
481 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
483 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
485 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
486 codesys_mnemonic = (char *) alloca (mlen + 7);
487 memcpy (codesys_mnemonic,
488 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
492 strcpy (codesys_name + len, "-" op_sys); \
494 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
495 codesys_name_sym = intern (codesys_name); \
496 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
497 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
499 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
500 build_string (codesys_mnemonic); \
501 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
504 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
505 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
506 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
510 Return t if OBJECT is a coding system.
511 A coding system is an object that defines how text containing multiple
512 character sets is encoded into a stream of (typically 8-bit) bytes.
513 The coding system is used to decode the stream into a series of
514 characters (which may be from multiple charsets) when the text is read
515 from a file or process, and is used to encode the text back into the
516 same format when it is written out to a file or process.
518 For example, many ISO2022-compliant coding systems (such as Compound
519 Text, which is used for inter-client data under the X Window System)
520 use escape sequences to switch between different charsets -- Japanese
521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
523 `make-coding-system' for more information.
525 Coding systems are normally identified using a symbol, and the
526 symbol is accepted in place of the actual coding system object whenever
527 a coding system is called for. (This is similar to how faces work.)
531 return CODING_SYSTEMP (object) ? Qt : Qnil;
534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
535 Retrieve the coding system of the given name.
537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
538 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
539 If there is no such coding system, nil is returned. Otherwise the
540 associated coding system object is returned.
542 (coding_system_or_name))
544 if (NILP (coding_system_or_name))
545 coding_system_or_name = Qbinary;
546 else if (CODING_SYSTEMP (coding_system_or_name))
547 return coding_system_or_name;
549 CHECK_SYMBOL (coding_system_or_name);
553 coding_system_or_name =
554 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
556 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
557 return coding_system_or_name;
561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
562 Retrieve the coding system of the given name.
563 Same as `find-coding-system' except that if there is no such
564 coding system, an error is signaled instead of returning nil.
568 Lisp_Object coding_system = Ffind_coding_system (name);
570 if (NILP (coding_system))
571 signal_simple_error ("No such coding system", name);
572 return coding_system;
575 /* We store the coding systems in hash tables with the names as the key and the
576 actual coding system object as the value. Occasionally we need to use them
577 in a list format. These routines provide us with that. */
578 struct coding_system_list_closure
580 Lisp_Object *coding_system_list;
584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
585 void *coding_system_list_closure)
587 /* This function can GC */
588 struct coding_system_list_closure *cscl =
589 (struct coding_system_list_closure *) coding_system_list_closure;
590 Lisp_Object *coding_system_list = cscl->coding_system_list;
592 *coding_system_list = Fcons (key, *coding_system_list);
596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
597 Return a list of the names of all defined coding systems.
601 Lisp_Object coding_system_list = Qnil;
603 struct coding_system_list_closure coding_system_list_closure;
605 GCPRO1 (coding_system_list);
606 coding_system_list_closure.coding_system_list = &coding_system_list;
607 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
608 &coding_system_list_closure);
611 return coding_system_list;
614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
615 Return the name of the given coding system.
619 coding_system = Fget_coding_system (coding_system);
620 return XCODING_SYSTEM_NAME (coding_system);
623 static Lisp_Coding_System *
624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
626 Lisp_Coding_System *codesys =
627 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
629 zero_lcrecord (codesys);
630 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
631 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
632 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
633 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
634 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
635 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
636 CODING_SYSTEM_TYPE (codesys) = type;
637 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
639 if (type == CODESYS_ISO2022)
642 for (i = 0; i < 4; i++)
643 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
645 else if (type == CODESYS_CCL)
647 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
648 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
651 CODING_SYSTEM_NAME (codesys) = name;
657 /* Given a list of charset conversion specs as specified in a Lisp
658 program, parse it into STORE_HERE. */
661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
662 Lisp_Object spec_list)
666 EXTERNAL_LIST_LOOP (rest, spec_list)
668 Lisp_Object car = XCAR (rest);
669 Lisp_Object from, to;
670 struct charset_conversion_spec spec;
672 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
673 signal_simple_error ("Invalid charset conversion spec", car);
674 from = Fget_charset (XCAR (car));
675 to = Fget_charset (XCAR (XCDR (car)));
676 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
677 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
678 signal_simple_error_2
679 ("Attempted conversion between different charset types",
681 spec.from_charset = from;
682 spec.to_charset = to;
684 Dynarr_add (store_here, spec);
688 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
689 specs, return the equivalent as the Lisp programmer would see it.
691 If LOAD_HERE is 0, return Qnil. */
694 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
701 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
703 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
704 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
707 return Fnreverse (result);
712 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
713 Register symbol NAME as a coding system.
715 TYPE describes the conversion method used and should be one of
718 Automatic conversion. XEmacs attempts to detect the coding system
721 No conversion. Use this for binary files and such. On output,
722 graphic characters that are not in ASCII or Latin-1 will be
723 replaced by a ?. (For a no-conversion-encoded buffer, these
724 characters will only be present if you explicitly insert them.)
726 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
728 ISO 10646 UCS-4 encoding.
730 ISO 10646 UTF-8 encoding.
732 Any ISO2022-compliant encoding. Among other things, this includes
733 JIS (the Japanese encoding commonly used for e-mail), EUC (the
734 standard Unix encoding for Japanese and other languages), and
735 Compound Text (the encoding used in X11). You can specify more
736 specific information about the conversion with the PROPS argument.
738 Big5 (the encoding commonly used for Taiwanese).
740 The conversion is performed using a user-written pseudo-code
741 program. CCL (Code Conversion Language) is the name of this
744 Write out or read in the raw contents of the memory representing
745 the buffer's text. This is primarily useful for debugging
746 purposes, and is only enabled when XEmacs has been compiled with
747 DEBUG_XEMACS defined (via the --debug configure option).
748 WARNING: Reading in a file using 'internal conversion can result
749 in an internal inconsistency in the memory representing a
750 buffer's text, which will produce unpredictable results and may
751 cause XEmacs to crash. Under normal circumstances you should
752 never use 'internal conversion.
754 DOC-STRING is a string describing the coding system.
756 PROPS is a property list, describing the specific nature of the
757 character set. Recognized properties are:
760 String to be displayed in the modeline when this coding system is
764 End-of-line conversion to be used. It should be one of
767 Automatically detect the end-of-line type (LF, CRLF,
768 or CR). Also generate subsidiary coding systems named
769 `NAME-unix', `NAME-dos', and `NAME-mac', that are
770 identical to this coding system but have an EOL-TYPE
771 value of 'lf, 'crlf, and 'cr, respectively.
773 The end of a line is marked externally using ASCII LF.
774 Since this is also the way that XEmacs represents an
775 end-of-line internally, specifying this option results
776 in no end-of-line conversion. This is the standard
777 format for Unix text files.
779 The end of a line is marked externally using ASCII
780 CRLF. This is the standard format for MS-DOS text
783 The end of a line is marked externally using ASCII CR.
784 This is the standard format for Macintosh text files.
786 Automatically detect the end-of-line type but do not
787 generate subsidiary coding systems. (This value is
788 converted to nil when stored internally, and
789 `coding-system-property' will return nil.)
792 If non-nil, composition/decomposition for combining characters
795 'post-read-conversion
796 Function called after a file has been read in, to perform the
797 decoding. Called with two arguments, START and END, denoting
798 a region of the current buffer to be decoded.
800 'pre-write-conversion
801 Function called before a file is written out, to perform the
802 encoding. Called with two arguments, START and END, denoting
803 a region of the current buffer to be encoded.
806 The following additional properties are recognized if TYPE is 'iso2022:
812 The character set initially designated to the G0 - G3 registers.
813 The value should be one of
815 -- A charset object (designate that character set)
816 -- nil (do not ever use this register)
817 -- t (no character set is initially designated to
818 the register, but may be later on; this automatically
819 sets the corresponding `force-g*-on-output' property)
825 If non-nil, send an explicit designation sequence on output before
826 using the specified register.
829 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
830 "ESC $ B" on output in place of the full designation sequences
831 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
834 If non-nil, don't designate ASCII to G0 at each end of line on output.
835 Setting this to non-nil also suppresses other state-resetting that
836 normally happens at the end of a line.
839 If non-nil, don't designate ASCII to G0 before control chars on output.
842 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
846 If non-nil, use locking-shift (SO/SI) instead of single-shift
847 or designation by escape sequence.
850 If non-nil, don't use ISO6429's direction specification.
853 If non-nil, literal control characters that are the same as
854 the beginning of a recognized ISO2022 or ISO6429 escape sequence
855 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
856 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
857 so that they can be properly distinguished from an escape sequence.
858 (Note that doing this results in a non-portable encoding.) This
859 encoding flag is used for byte-compiled files. Note that ESC
860 is a good choice for a quoting character because there are no
861 escape sequences whose second byte is a character from the Control-0
862 or Control-1 character sets; this is explicitly disallowed by the
865 'input-charset-conversion
866 A list of conversion specifications, specifying conversion of
867 characters in one charset to another when decoding is performed.
868 Each specification is a list of two elements: the source charset,
869 and the destination charset.
871 'output-charset-conversion
872 A list of conversion specifications, specifying conversion of
873 characters in one charset to another when encoding is performed.
874 The form of each specification is the same as for
875 'input-charset-conversion.
878 The following additional properties are recognized (and required)
882 CCL program used for decoding (converting to internal format).
885 CCL program used for encoding (converting to external format).
887 (name, type, doc_string, props))
889 Lisp_Coding_System *codesys;
890 Lisp_Object rest, key, value;
891 enum coding_system_type ty;
892 int need_to_setup_eol_systems = 1;
894 /* Convert type to constant */
895 if (NILP (type) || EQ (type, Qundecided))
896 { ty = CODESYS_AUTODETECT; }
898 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
899 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
900 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
901 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
902 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
903 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
905 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
907 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
910 signal_simple_error ("Invalid coding system type", type);
914 codesys = allocate_coding_system (ty, name);
916 if (NILP (doc_string))
917 doc_string = build_string ("");
919 CHECK_STRING (doc_string);
920 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
922 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
924 if (EQ (key, Qmnemonic))
927 CHECK_STRING (value);
928 CODING_SYSTEM_MNEMONIC (codesys) = value;
931 else if (EQ (key, Qeol_type))
933 need_to_setup_eol_systems = NILP (value);
936 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
939 else if (EQ (key, Qpost_read_conversion))
940 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
941 else if (EQ (key, Qpre_write_conversion))
942 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
944 else if (EQ (key, Qdisable_composition))
945 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
948 else if (ty == CODESYS_ISO2022)
950 #define FROB_INITIAL_CHARSET(charset_num) \
951 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
952 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
954 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
955 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
956 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
957 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
959 #define FROB_FORCE_CHARSET(charset_num) \
960 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
962 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
963 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
964 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
965 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
967 #define FROB_BOOLEAN_PROPERTY(prop) \
968 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
970 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
971 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
972 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
973 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
974 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
975 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
976 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
978 else if (EQ (key, Qinput_charset_conversion))
980 codesys->iso2022.input_conv =
981 Dynarr_new (charset_conversion_spec);
982 parse_charset_conversion_specs (codesys->iso2022.input_conv,
985 else if (EQ (key, Qoutput_charset_conversion))
987 codesys->iso2022.output_conv =
988 Dynarr_new (charset_conversion_spec);
989 parse_charset_conversion_specs (codesys->iso2022.output_conv,
993 signal_simple_error ("Unrecognized property", key);
995 else if (EQ (type, Qccl))
998 struct ccl_program test_ccl;
1001 /* Check key first. */
1002 if (EQ (key, Qdecode))
1003 suffix = "-ccl-decode";
1004 else if (EQ (key, Qencode))
1005 suffix = "-ccl-encode";
1007 signal_simple_error ("Unrecognized property", key);
1009 /* If value is vector, register it as a ccl program
1010 associated with an newly created symbol for
1011 backward compatibility. */
1012 if (VECTORP (value))
1014 sym = Fintern (concat2 (Fsymbol_name (name),
1015 build_string (suffix)),
1017 Fregister_ccl_program (sym, value);
1021 CHECK_SYMBOL (value);
1024 /* check if the given ccl programs are valid. */
1025 if (setup_ccl_program (&test_ccl, sym) < 0)
1026 signal_simple_error ("Invalid CCL program", value);
1028 if (EQ (key, Qdecode))
1029 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1030 else if (EQ (key, Qencode))
1031 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1036 signal_simple_error ("Unrecognized property", key);
1039 if (need_to_setup_eol_systems)
1040 setup_eol_coding_systems (codesys);
1043 Lisp_Object codesys_obj;
1044 XSETCODING_SYSTEM (codesys_obj, codesys);
1045 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1050 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1051 Copy OLD-CODING-SYSTEM to NEW-NAME.
1052 If NEW-NAME does not name an existing coding system, a new one will
1055 (old_coding_system, new_name))
1057 Lisp_Object new_coding_system;
1058 old_coding_system = Fget_coding_system (old_coding_system);
1059 new_coding_system = Ffind_coding_system (new_name);
1060 if (NILP (new_coding_system))
1062 XSETCODING_SYSTEM (new_coding_system,
1063 allocate_coding_system
1064 (XCODING_SYSTEM_TYPE (old_coding_system),
1066 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1070 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1071 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1072 memcpy (((char *) to ) + sizeof (to->header),
1073 ((char *) from) + sizeof (from->header),
1074 sizeof (*from) - sizeof (from->header));
1075 to->name = new_name;
1077 return new_coding_system;
1080 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1081 Return t if OBJECT names a coding system, and is not a coding system alias.
1085 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1089 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1090 Return t if OBJECT is a coding system alias.
1091 All coding system aliases are created by `define-coding-system-alias'.
1095 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1099 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1100 Return the coding-system symbol for which symbol ALIAS is an alias.
1104 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1105 if (SYMBOLP (aliasee))
1108 signal_simple_error ("Symbol is not a coding system alias", alias);
1109 return Qnil; /* To keep the compiler happy */
1113 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1115 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1119 /* A maphash function, for removing dangling coding system aliases. */
1121 dangling_coding_system_alias_p (Lisp_Object alias,
1122 Lisp_Object aliasee,
1123 void *dangling_aliases)
1125 if (SYMBOLP (aliasee)
1126 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1128 (*(int *) dangling_aliases)++;
1135 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1136 Define symbol ALIAS as an alias for coding system ALIASEE.
1138 You can use this function to redefine an alias that has already been defined,
1139 but you cannot redefine a name which is the canonical name for a coding system.
1140 \(a canonical name of a coding system is what is returned when you call
1141 `coding-system-name' on a coding system).
1143 ALIASEE itself can be an alias, which allows you to define nested aliases.
1145 You are forbidden, however, from creating alias loops or `dangling' aliases.
1146 These will be detected, and an error will be signaled if you attempt to do so.
1148 If ALIASEE is nil, then ALIAS will simply be undefined.
1150 See also `coding-system-alias-p', `coding-system-aliasee',
1151 and `coding-system-canonical-name-p'.
1155 Lisp_Object real_coding_system, probe;
1157 CHECK_SYMBOL (alias);
1159 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1161 ("Symbol is the canonical name of a coding system and cannot be redefined",
1166 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1167 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1168 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1170 Fremhash (alias, Vcoding_system_hash_table);
1172 /* Undefine subsidiary aliases,
1173 presumably created by a previous call to this function */
1174 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1175 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1176 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1178 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1179 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1180 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1183 /* Undefine dangling coding system aliases. */
1185 int dangling_aliases;
1188 dangling_aliases = 0;
1189 elisp_map_remhash (dangling_coding_system_alias_p,
1190 Vcoding_system_hash_table,
1192 } while (dangling_aliases > 0);
1198 if (CODING_SYSTEMP (aliasee))
1199 aliasee = XCODING_SYSTEM_NAME (aliasee);
1201 /* Checks that aliasee names a coding-system */
1202 real_coding_system = Fget_coding_system (aliasee);
1204 /* Check for coding system alias loops */
1205 if (EQ (alias, aliasee))
1206 alias_loop: signal_simple_error_2
1207 ("Attempt to create a coding system alias loop", alias, aliasee);
1209 for (probe = aliasee;
1211 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1213 if (EQ (probe, alias))
1217 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1219 /* Set up aliases for subsidiaries.
1220 #### There must be a better way to handle subsidiary coding systems. */
1222 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1224 for (i = 0; i < countof (suffixes); i++)
1226 Lisp_Object alias_subsidiary =
1227 append_suffix_to_symbol (alias, suffixes[i]);
1228 Lisp_Object aliasee_subsidiary =
1229 append_suffix_to_symbol (aliasee, suffixes[i]);
1231 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1232 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1235 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1236 but it doesn't look intentional, so I'd rather return something
1237 meaningful or nothing at all. */
1242 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1244 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1245 Lisp_Object new_coding_system;
1247 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1248 return coding_system;
1252 case EOL_AUTODETECT: return coding_system;
1253 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1254 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1255 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1256 default: abort (); return Qnil;
1259 return NILP (new_coding_system) ? coding_system : new_coding_system;
1262 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1263 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1265 (coding_system, eol_type))
1267 coding_system = Fget_coding_system (coding_system);
1269 return subsidiary_coding_system (coding_system,
1270 symbol_to_eol_type (eol_type));
1274 /************************************************************************/
1275 /* Coding system accessors */
1276 /************************************************************************/
1278 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1279 Return the doc string for CODING-SYSTEM.
1283 coding_system = Fget_coding_system (coding_system);
1284 return XCODING_SYSTEM_DOC_STRING (coding_system);
1287 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1288 Return the type of CODING-SYSTEM.
1292 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1295 case CODESYS_AUTODETECT: return Qundecided;
1297 case CODESYS_SHIFT_JIS: return Qshift_jis;
1298 case CODESYS_ISO2022: return Qiso2022;
1299 case CODESYS_BIG5: return Qbig5;
1300 case CODESYS_UCS4: return Qucs4;
1301 case CODESYS_UTF8: return Qutf8;
1302 case CODESYS_CCL: return Qccl;
1304 case CODESYS_NO_CONVERSION: return Qno_conversion;
1306 case CODESYS_INTERNAL: return Qinternal;
1313 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1316 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1318 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1321 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1322 Return initial charset of CODING-SYSTEM designated to GNUM.
1325 (coding_system, gnum))
1327 coding_system = Fget_coding_system (coding_system);
1330 return coding_system_charset (coding_system, XINT (gnum));
1334 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1335 Return the PROP property of CODING-SYSTEM.
1337 (coding_system, prop))
1340 enum coding_system_type type;
1342 coding_system = Fget_coding_system (coding_system);
1343 CHECK_SYMBOL (prop);
1344 type = XCODING_SYSTEM_TYPE (coding_system);
1346 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1347 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1350 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1352 case CODESYS_PROP_ALL_OK:
1355 case CODESYS_PROP_ISO2022:
1356 if (type != CODESYS_ISO2022)
1358 ("Property only valid in ISO2022 coding systems",
1362 case CODESYS_PROP_CCL:
1363 if (type != CODESYS_CCL)
1365 ("Property only valid in CCL coding systems",
1375 signal_simple_error ("Unrecognized property", prop);
1377 if (EQ (prop, Qname))
1378 return XCODING_SYSTEM_NAME (coding_system);
1379 else if (EQ (prop, Qtype))
1380 return Fcoding_system_type (coding_system);
1381 else if (EQ (prop, Qdoc_string))
1382 return XCODING_SYSTEM_DOC_STRING (coding_system);
1383 else if (EQ (prop, Qmnemonic))
1384 return XCODING_SYSTEM_MNEMONIC (coding_system);
1385 else if (EQ (prop, Qeol_type))
1386 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1387 else if (EQ (prop, Qeol_lf))
1388 return XCODING_SYSTEM_EOL_LF (coding_system);
1389 else if (EQ (prop, Qeol_crlf))
1390 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1391 else if (EQ (prop, Qeol_cr))
1392 return XCODING_SYSTEM_EOL_CR (coding_system);
1393 else if (EQ (prop, Qpost_read_conversion))
1394 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1395 else if (EQ (prop, Qpre_write_conversion))
1396 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1398 else if (type == CODESYS_ISO2022)
1400 if (EQ (prop, Qcharset_g0))
1401 return coding_system_charset (coding_system, 0);
1402 else if (EQ (prop, Qcharset_g1))
1403 return coding_system_charset (coding_system, 1);
1404 else if (EQ (prop, Qcharset_g2))
1405 return coding_system_charset (coding_system, 2);
1406 else if (EQ (prop, Qcharset_g3))
1407 return coding_system_charset (coding_system, 3);
1409 #define FORCE_CHARSET(charset_num) \
1410 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1411 (coding_system, charset_num) ? Qt : Qnil)
1413 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1414 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1415 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1416 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1418 #define LISP_BOOLEAN(prop) \
1419 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1421 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1422 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1423 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1424 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1425 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1426 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1427 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1429 else if (EQ (prop, Qinput_charset_conversion))
1431 unparse_charset_conversion_specs
1432 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1433 else if (EQ (prop, Qoutput_charset_conversion))
1435 unparse_charset_conversion_specs
1436 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1440 else if (type == CODESYS_CCL)
1442 if (EQ (prop, Qdecode))
1443 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1444 else if (EQ (prop, Qencode))
1445 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1453 return Qnil; /* not reached */
1457 /************************************************************************/
1458 /* Coding category functions */
1459 /************************************************************************/
1462 decode_coding_category (Lisp_Object symbol)
1466 CHECK_SYMBOL (symbol);
1467 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1468 if (EQ (coding_category_symbol[i], symbol))
1471 signal_simple_error ("Unrecognized coding category", symbol);
1472 return 0; /* not reached */
1475 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1476 Return a list of all recognized coding categories.
1481 Lisp_Object list = Qnil;
1483 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1484 list = Fcons (coding_category_symbol[i], list);
1488 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1489 Change the priority order of the coding categories.
1490 LIST should be list of coding categories, in descending order of
1491 priority. Unspecified coding categories will be lower in priority
1492 than all specified ones, in the same relative order they were in
1497 int category_to_priority[CODING_CATEGORY_LAST + 1];
1501 /* First generate a list that maps coding categories to priorities. */
1503 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1504 category_to_priority[i] = -1;
1506 /* Highest priority comes from the specified list. */
1508 EXTERNAL_LIST_LOOP (rest, list)
1510 int cat = decode_coding_category (XCAR (rest));
1512 if (category_to_priority[cat] >= 0)
1513 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1514 category_to_priority[cat] = i++;
1517 /* Now go through the existing categories by priority to retrieve
1518 the categories not yet specified and preserve their priority
1520 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1522 int cat = fcd->coding_category_by_priority[j];
1523 if (category_to_priority[cat] < 0)
1524 category_to_priority[cat] = i++;
1527 /* Now we need to construct the inverse of the mapping we just
1530 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1531 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1533 /* Phew! That was confusing. */
1537 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1538 Return a list of coding categories in descending order of priority.
1543 Lisp_Object list = Qnil;
1545 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1546 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1551 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1552 Change the coding system associated with a coding category.
1554 (coding_category, coding_system))
1556 int cat = decode_coding_category (coding_category);
1558 coding_system = Fget_coding_system (coding_system);
1559 fcd->coding_category_system[cat] = coding_system;
1563 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1564 Return the coding system associated with a coding category.
1568 int cat = decode_coding_category (coding_category);
1569 Lisp_Object sys = fcd->coding_category_system[cat];
1572 return XCODING_SYSTEM_NAME (sys);
1577 /************************************************************************/
1578 /* Detecting the encoding of data */
1579 /************************************************************************/
1581 struct detection_state
1583 eol_type_t eol_type;
1619 struct iso2022_decoder iso;
1621 int high_byte_count;
1622 unsigned int saw_single_shift:1;
1635 acceptable_control_char_p (int c)
1639 /* Allow and ignore control characters that you might
1640 reasonably see in a text file */
1645 case 8: /* backspace */
1646 case 11: /* vertical tab */
1647 case 12: /* form feed */
1648 case 26: /* MS-DOS C-z junk */
1649 case 31: /* '^_' -- for info */
1657 mask_has_at_most_one_bit_p (int mask)
1659 /* Perhaps the only thing useful you learn from intensive Microsoft
1660 technical interviews */
1661 return (mask & (mask - 1)) == 0;
1665 detect_eol_type (struct detection_state *st, const Extbyte *src,
1670 unsigned char c = *(unsigned char *)src++;
1673 if (st->eol.just_saw_cr)
1675 else if (st->eol.seen_anything)
1678 else if (st->eol.just_saw_cr)
1681 st->eol.just_saw_cr = 1;
1683 st->eol.just_saw_cr = 0;
1684 st->eol.seen_anything = 1;
1687 return EOL_AUTODETECT;
1690 /* Attempt to determine the encoding and EOL type of the given text.
1691 Before calling this function for the first type, you must initialize
1692 st->eol_type as appropriate and initialize st->mask to ~0.
1694 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1697 st->mask holds the determined coding category mask, or ~0 if only
1698 ASCII has been seen so far.
1702 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1703 is present in st->mask
1704 1 == definitive answers are here for both st->eol_type and st->mask
1708 detect_coding_type (struct detection_state *st, const Extbyte *src,
1709 size_t n, int just_do_eol)
1711 if (st->eol_type == EOL_AUTODETECT)
1712 st->eol_type = detect_eol_type (st, src, n);
1715 return st->eol_type != EOL_AUTODETECT;
1717 if (!st->seen_non_ascii)
1719 for (; n; n--, src++)
1721 unsigned char c = *(unsigned char *) src;
1722 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1724 st->seen_non_ascii = 1;
1726 st->shift_jis.mask = ~0;
1730 st->iso2022.mask = ~0;
1740 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1741 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1742 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1743 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1744 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1745 st->big5.mask = detect_coding_big5 (st, src, n);
1746 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1747 st->utf8.mask = detect_coding_utf8 (st, src, n);
1748 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1749 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1752 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1753 | st->utf8.mask | st->ucs4.mask;
1756 int retval = mask_has_at_most_one_bit_p (st->mask);
1757 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1758 return retval && st->eol_type != EOL_AUTODETECT;
1763 coding_system_from_mask (int mask)
1767 /* If the file was entirely or basically ASCII, use the
1768 default value of `buffer-file-coding-system'. */
1769 Lisp_Object retval =
1770 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1773 retval = Ffind_coding_system (retval);
1777 (Qbad_variable, Qwarning,
1778 "Invalid `default-buffer-file-coding-system', set to nil");
1779 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1783 retval = Fget_coding_system (Qraw_text);
1791 mask = postprocess_iso2022_mask (mask);
1793 /* Look through the coding categories by priority and find
1794 the first one that is allowed. */
1795 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1797 cat = fcd->coding_category_by_priority[i];
1798 if ((mask & (1 << cat)) &&
1799 !NILP (fcd->coding_category_system[cat]))
1803 return fcd->coding_category_system[cat];
1805 return Fget_coding_system (Qraw_text);
1809 /* Given a seekable read stream and potential coding system and EOL type
1810 as specified, do any autodetection that is called for. If the
1811 coding system and/or EOL type are not `autodetect', they will be left
1812 alone; but this function will never return an autodetect coding system
1815 This function does not automatically fetch subsidiary coding systems;
1816 that should be unnecessary with the explicit eol-type argument. */
1818 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1821 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1822 eol_type_t *eol_type_in_out)
1824 struct detection_state decst;
1826 if (*eol_type_in_out == EOL_AUTODETECT)
1827 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1830 decst.eol_type = *eol_type_in_out;
1833 /* If autodetection is called for, do it now. */
1834 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1835 || *eol_type_in_out == EOL_AUTODETECT)
1838 Lisp_Object coding_system = Qnil;
1840 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1843 /* Look for initial "-*-"; mode line prefix */
1845 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1850 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1852 Extbyte *local_vars_beg = p + 3;
1853 /* Look for final "-*-"; mode line suffix */
1854 for (p = local_vars_beg,
1855 scan_end = buf + nread - LENGTH ("-*-");
1860 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1862 Extbyte *suffix = p;
1863 /* Look for "coding:" */
1864 for (p = local_vars_beg,
1865 scan_end = suffix - LENGTH ("coding:?");
1868 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1869 && (p == local_vars_beg
1870 || (*(p-1) == ' ' ||
1876 p += LENGTH ("coding:");
1877 while (*p == ' ' || *p == '\t') p++;
1879 /* Get coding system name */
1880 save = *suffix; *suffix = '\0';
1881 /* Characters valid in a MIME charset name (rfc 1521),
1882 and in a Lisp symbol name. */
1883 n = strspn ( (char *) p,
1884 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1885 "abcdefghijklmnopqrstuvwxyz"
1891 save = p[n]; p[n] = '\0';
1893 Ffind_coding_system (intern ((char *) p));
1903 if (NILP (coding_system))
1906 if (detect_coding_type (&decst, buf, nread,
1907 XCODING_SYSTEM_TYPE (*codesys_in_out)
1908 != CODESYS_AUTODETECT))
1910 nread = Lstream_read (stream, buf, sizeof (buf));
1916 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1917 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1920 if (detect_coding_type (&decst, buf, nread, 1))
1922 nread = Lstream_read (stream, buf, sizeof (buf));
1928 *eol_type_in_out = decst.eol_type;
1929 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1931 if (NILP (coding_system))
1932 *codesys_in_out = coding_system_from_mask (decst.mask);
1934 *codesys_in_out = coding_system;
1938 /* If we absolutely can't determine the EOL type, just assume LF. */
1939 if (*eol_type_in_out == EOL_AUTODETECT)
1940 *eol_type_in_out = EOL_LF;
1942 Lstream_rewind (stream);
1945 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1946 Detect coding system of the text in the region between START and END.
1947 Return a list of possible coding systems ordered by priority.
1948 If only ASCII characters are found, return 'undecided or one of
1949 its subsidiary coding systems according to a detected end-of-line
1950 type. Optional arg BUFFER defaults to the current buffer.
1952 (start, end, buffer))
1954 Lisp_Object val = Qnil;
1955 struct buffer *buf = decode_buffer (buffer, 0);
1957 Lisp_Object instream, lb_instream;
1958 Lstream *istr, *lb_istr;
1959 struct detection_state decst;
1960 struct gcpro gcpro1, gcpro2;
1962 get_buffer_range_char (buf, start, end, &b, &e, 0);
1963 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1964 lb_istr = XLSTREAM (lb_instream);
1965 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1966 istr = XLSTREAM (instream);
1967 GCPRO2 (instream, lb_instream);
1969 decst.eol_type = EOL_AUTODETECT;
1973 Extbyte random_buffer[4096];
1974 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1978 if (detect_coding_type (&decst, random_buffer, nread, 0))
1982 if (decst.mask == ~0)
1983 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1991 decst.mask = postprocess_iso2022_mask (decst.mask);
1993 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1995 int sys = fcd->coding_category_by_priority[i];
1996 if (decst.mask & (1 << sys))
1998 Lisp_Object codesys = fcd->coding_category_system[sys];
1999 if (!NILP (codesys))
2000 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2001 val = Fcons (codesys, val);
2005 Lstream_close (istr);
2007 Lstream_delete (istr);
2008 Lstream_delete (lb_istr);
2013 /************************************************************************/
2014 /* Converting to internal Mule format ("decoding") */
2015 /************************************************************************/
2017 /* A decoding stream is a stream used for decoding text (i.e.
2018 converting from some external format to internal format).
2019 The decoding-stream object keeps track of the actual coding
2020 stream, the stream that is at the other end, and data that
2021 needs to be persistent across the lifetime of the stream. */
2023 /* Handle the EOL stuff related to just-read-in character C.
2024 EOL_TYPE is the EOL type of the coding stream.
2025 FLAGS is the current value of FLAGS in the coding stream, and may
2026 be modified by this macro. (The macro only looks at the
2027 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2028 bytes are to be written. You need to also define a local goto
2029 label "label_continue_loop" that is at the end of the main
2030 character-reading loop.
2032 If C is a CR character, then this macro handles it entirely and
2033 jumps to label_continue_loop. Otherwise, this macro does not add
2034 anything to DST, and continues normally. You should continue
2035 processing C normally after this macro. */
2037 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2041 if (eol_type == EOL_CR) \
2042 Dynarr_add (dst, '\n'); \
2043 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2044 Dynarr_add (dst, c); \
2046 flags |= CODING_STATE_CR; \
2047 goto label_continue_loop; \
2049 else if (flags & CODING_STATE_CR) \
2050 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2052 Dynarr_add (dst, '\r'); \
2053 flags &= ~CODING_STATE_CR; \
2057 /* C should be a binary character in the range 0 - 255; convert
2058 to internal format and add to Dynarr DST. */
2061 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2063 if (BYTE_ASCII_P (c)) \
2064 Dynarr_add (dst, c); \
2067 Dynarr_add (dst, (c >> 6) | 0xc0); \
2068 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2072 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2074 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2078 Dynarr_add (dst, c);
2080 else if ( c <= 0x7ff )
2082 Dynarr_add (dst, (c >> 6) | 0xc0);
2083 Dynarr_add (dst, (c & 0x3f) | 0x80);
2085 else if ( c <= 0xffff )
2087 Dynarr_add (dst, (c >> 12) | 0xe0);
2088 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2089 Dynarr_add (dst, (c & 0x3f) | 0x80);
2091 else if ( c <= 0x1fffff )
2093 Dynarr_add (dst, (c >> 18) | 0xf0);
2094 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2095 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2096 Dynarr_add (dst, (c & 0x3f) | 0x80);
2098 else if ( c <= 0x3ffffff )
2100 Dynarr_add (dst, (c >> 24) | 0xf8);
2101 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2102 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2103 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2104 Dynarr_add (dst, (c & 0x3f) | 0x80);
2108 Dynarr_add (dst, (c >> 30) | 0xfc);
2109 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2110 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2111 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2112 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2113 Dynarr_add (dst, (c & 0x3f) | 0x80);
2117 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2119 if (BYTE_ASCII_P (c)) \
2120 Dynarr_add (dst, c); \
2121 else if (BYTE_C1_P (c)) \
2123 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2124 Dynarr_add (dst, c + 0x20); \
2128 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2129 Dynarr_add (dst, c); \
2134 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2138 DECODE_ADD_BINARY_CHAR (ch, dst); \
2143 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2145 if (flags & CODING_STATE_END) \
2147 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2148 if (flags & CODING_STATE_CR) \
2149 Dynarr_add (dst, '\r'); \
2153 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2155 struct decoding_stream
2157 /* Coding system that governs the conversion. */
2158 Lisp_Coding_System *codesys;
2160 /* Stream that we read the encoded data from or
2161 write the decoded data to. */
2164 /* If we are reading, then we can return only a fixed amount of
2165 data, so if the conversion resulted in too much data, we store it
2166 here for retrieval the next time around. */
2167 unsigned_char_dynarr *runoff;
2169 /* FLAGS holds flags indicating the current state of the decoding.
2170 Some of these flags are dependent on the coding system. */
2173 /* CPOS holds a partially built-up code-point of character. */
2176 /* EOL_TYPE specifies the type of end-of-line conversion that
2177 currently applies. We need to keep this separate from the
2178 EOL type stored in CODESYS because the latter might indicate
2179 automatic EOL-type detection while the former will always
2180 indicate a particular EOL type. */
2181 eol_type_t eol_type;
2183 /* Additional ISO2022 information. We define the structure above
2184 because it's also needed by the detection routines. */
2185 struct iso2022_decoder iso2022;
2187 /* Additional information (the state of the running CCL program)
2188 used by the CCL decoder. */
2189 struct ccl_program ccl;
2191 /* counter for UTF-8 or UCS-4 */
2192 unsigned char counter;
2195 unsigned combined_char_count;
2196 Emchar combined_chars[16];
2197 Lisp_Object combining_table;
2199 struct detection_state decst;
2203 extern Lisp_Object Vcharacter_composition_table;
2206 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2208 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2212 for (i = 0; i < str->combined_char_count; i++)
2213 DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2214 str->combined_char_count = 0;
2215 str->combining_table = Qnil;
2218 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2219 unsigned_char_dynarr* dst);
2221 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2222 Emchar character, unsigned_char_dynarr* dst)
2224 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2225 DECODE_ADD_UCS_CHAR (character, dst);
2226 else if (!CHAR_ID_TABLE_P (str->combining_table))
2229 = get_char_id_table (character, Vcharacter_composition_table);
2232 DECODE_ADD_UCS_CHAR (character, dst);
2235 str->combined_chars[0] = character;
2236 str->combined_char_count = 1;
2237 str->combining_table = ret;
2243 = get_char_id_table (character, str->combining_table);
2247 Emchar char2 = XCHARVAL (ret);
2248 ret = get_char_id_table (char2, Vcharacter_composition_table);
2251 DECODE_ADD_UCS_CHAR (char2, dst);
2252 str->combined_char_count = 0;
2253 str->combining_table = Qnil;
2257 str->combined_chars[0] = char2;
2258 str->combined_char_count = 1;
2259 str->combining_table = ret;
2262 else if (CHAR_ID_TABLE_P (ret))
2264 str->combined_chars[str->combined_char_count++] = character;
2265 str->combining_table = ret;
2269 COMPOSE_FLUSH_CHARS (str, dst);
2270 DECODE_ADD_UCS_CHAR (character, dst);
2274 #else /* not UTF2000 */
2275 #define COMPOSE_FLUSH_CHARS(str, dst)
2276 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2277 #endif /* UTF2000 */
2279 static ssize_t decoding_reader (Lstream *stream,
2280 unsigned char *data, size_t size);
2281 static ssize_t decoding_writer (Lstream *stream,
2282 const unsigned char *data, size_t size);
2283 static int decoding_rewinder (Lstream *stream);
2284 static int decoding_seekable_p (Lstream *stream);
2285 static int decoding_flusher (Lstream *stream);
2286 static int decoding_closer (Lstream *stream);
2288 static Lisp_Object decoding_marker (Lisp_Object stream);
2290 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2291 sizeof (struct decoding_stream));
2294 decoding_marker (Lisp_Object stream)
2296 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2297 Lisp_Object str_obj;
2299 /* We do not need to mark the coding systems or charsets stored
2300 within the stream because they are stored in a global list
2301 and automatically marked. */
2303 XSETLSTREAM (str_obj, str);
2304 mark_object (str_obj);
2305 if (str->imp->marker)
2306 return (str->imp->marker) (str_obj);
2311 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2312 so we read data from the other end, decode it, and store it into DATA. */
2315 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2317 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2318 unsigned char *orig_data = data;
2320 int error_occurred = 0;
2322 /* We need to interface to mule_decode(), which expects to take some
2323 amount of data and store the result into a Dynarr. We have
2324 mule_decode() store into str->runoff, and take data from there
2327 /* We loop until we have enough data, reading chunks from the other
2328 end and decoding it. */
2331 /* Take data from the runoff if we can. Make sure to take at
2332 most SIZE bytes, and delete the data from the runoff. */
2333 if (Dynarr_length (str->runoff) > 0)
2335 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2336 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2337 Dynarr_delete_many (str->runoff, 0, chunk);
2343 break; /* No more room for data */
2345 if (str->flags & CODING_STATE_END)
2346 /* This means that on the previous iteration, we hit the EOF on
2347 the other end. We loop once more so that mule_decode() can
2348 output any final stuff it may be holding, or any "go back
2349 to a sane state" escape sequences. (This latter makes sense
2350 during encoding.) */
2353 /* Exhausted the runoff, so get some more. DATA has at least
2354 SIZE bytes left of storage in it, so it's OK to read directly
2355 into it. (We'll be overwriting above, after we've decoded it
2356 into the runoff.) */
2357 read_size = Lstream_read (str->other_end, data, size);
2364 /* There might be some more end data produced in the translation.
2365 See the comment above. */
2366 str->flags |= CODING_STATE_END;
2367 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2370 if (data - orig_data == 0)
2371 return error_occurred ? -1 : 0;
2373 return data - orig_data;
2377 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2379 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2382 /* Decode all our data into the runoff, and then attempt to write
2383 it all out to the other end. Remove whatever chunk we succeeded
2385 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2386 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2387 Dynarr_length (str->runoff));
2389 Dynarr_delete_many (str->runoff, 0, retval);
2390 /* Do NOT return retval. The return value indicates how much
2391 of the incoming data was written, not how many bytes were
2397 reset_decoding_stream (struct decoding_stream *str)
2400 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2402 Lisp_Object coding_system;
2403 XSETCODING_SYSTEM (coding_system, str->codesys);
2404 reset_iso2022 (coding_system, &str->iso2022);
2406 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2408 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2413 str->combined_char_count = 0;
2414 str->combining_table = Qnil;
2416 str->flags = str->cpos = 0;
2420 decoding_rewinder (Lstream *stream)
2422 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2423 reset_decoding_stream (str);
2424 Dynarr_reset (str->runoff);
2425 return Lstream_rewind (str->other_end);
2429 decoding_seekable_p (Lstream *stream)
2431 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2432 return Lstream_seekable_p (str->other_end);
2436 decoding_flusher (Lstream *stream)
2438 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2439 return Lstream_flush (str->other_end);
2443 decoding_closer (Lstream *stream)
2445 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2446 if (stream->flags & LSTREAM_FL_WRITE)
2448 str->flags |= CODING_STATE_END;
2449 decoding_writer (stream, 0, 0);
2451 Dynarr_free (str->runoff);
2453 #ifdef ENABLE_COMPOSITE_CHARS
2454 if (str->iso2022.composite_chars)
2455 Dynarr_free (str->iso2022.composite_chars);
2458 return Lstream_close (str->other_end);
2462 decoding_stream_coding_system (Lstream *stream)
2464 Lisp_Object coding_system;
2465 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2467 XSETCODING_SYSTEM (coding_system, str->codesys);
2468 return subsidiary_coding_system (coding_system, str->eol_type);
2472 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2474 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2475 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2477 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2478 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2479 reset_decoding_stream (str);
2482 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2483 stream for writing, no automatic code detection will be performed.
2484 The reason for this is that automatic code detection requires a
2485 seekable input. Things will also fail if you open a decoding
2486 stream for reading using a non-fully-specified coding system and
2487 a non-seekable input stream. */
2490 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2493 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2494 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2498 str->other_end = stream;
2499 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2500 str->eol_type = EOL_AUTODETECT;
2501 if (!strcmp (mode, "r")
2502 && Lstream_seekable_p (stream))
2503 /* We can determine the coding system now. */
2504 determine_real_coding_system (stream, &codesys, &str->eol_type);
2505 set_decoding_stream_coding_system (lstr, codesys);
2506 str->decst.eol_type = str->eol_type;
2507 str->decst.mask = ~0;
2508 XSETLSTREAM (obj, lstr);
2513 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2515 return make_decoding_stream_1 (stream, codesys, "r");
2519 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2521 return make_decoding_stream_1 (stream, codesys, "w");
2524 /* Note: the decode_coding_* functions all take the same
2525 arguments as mule_decode(), which is to say some SRC data of
2526 size N, which is to be stored into dynamic array DST.
2527 DECODING is the stream within which the decoding is
2528 taking place, but no data is actually read from or
2529 written to that stream; that is handled in decoding_reader()
2530 or decoding_writer(). This allows the same functions to
2531 be used for both reading and writing. */
2534 mule_decode (Lstream *decoding, const Extbyte *src,
2535 unsigned_char_dynarr *dst, size_t n)
2537 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2539 /* If necessary, do encoding-detection now. We do this when
2540 we're a writing stream or a non-seekable reading stream,
2541 meaning that we can't just process the whole input,
2542 rewind, and start over. */
2544 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2545 str->eol_type == EOL_AUTODETECT)
2547 Lisp_Object codesys;
2549 XSETCODING_SYSTEM (codesys, str->codesys);
2550 detect_coding_type (&str->decst, src, n,
2551 CODING_SYSTEM_TYPE (str->codesys) !=
2552 CODESYS_AUTODETECT);
2553 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2554 str->decst.mask != ~0)
2555 /* #### This is cheesy. What we really ought to do is
2556 buffer up a certain amount of data so as to get a
2557 less random result. */
2558 codesys = coding_system_from_mask (str->decst.mask);
2559 str->eol_type = str->decst.eol_type;
2560 if (XCODING_SYSTEM (codesys) != str->codesys)
2562 /* Preserve the CODING_STATE_END flag in case it was set.
2563 If we erase it, bad things might happen. */
2564 int was_end = str->flags & CODING_STATE_END;
2565 set_decoding_stream_coding_system (decoding, codesys);
2567 str->flags |= CODING_STATE_END;
2571 switch (CODING_SYSTEM_TYPE (str->codesys))
2574 case CODESYS_INTERNAL:
2575 Dynarr_add_many (dst, src, n);
2578 case CODESYS_AUTODETECT:
2579 /* If we got this far and still haven't decided on the coding
2580 system, then do no conversion. */
2581 case CODESYS_NO_CONVERSION:
2582 decode_coding_no_conversion (decoding, src, dst, n);
2585 case CODESYS_SHIFT_JIS:
2586 decode_coding_sjis (decoding, src, dst, n);
2589 decode_coding_big5 (decoding, src, dst, n);
2592 decode_coding_ucs4 (decoding, src, dst, n);
2595 decode_coding_utf8 (decoding, src, dst, n);
2598 str->ccl.last_block = str->flags & CODING_STATE_END;
2599 /* When applying ccl program to stream, MUST NOT set NULL
2601 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2602 dst, n, 0, CCL_MODE_DECODING);
2604 case CODESYS_ISO2022:
2605 decode_coding_iso2022 (decoding, src, dst, n);
2613 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2614 Decode the text between START and END which is encoded in CODING-SYSTEM.
2615 This is useful if you've read in encoded text from a file without decoding
2616 it (e.g. you read in a JIS-formatted file but used the `binary' or
2617 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2618 Return length of decoded text.
2619 BUFFER defaults to the current buffer if unspecified.
2621 (start, end, coding_system, buffer))
2624 struct buffer *buf = decode_buffer (buffer, 0);
2625 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2626 Lstream *istr, *ostr;
2627 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2629 get_buffer_range_char (buf, start, end, &b, &e, 0);
2631 barf_if_buffer_read_only (buf, b, e);
2633 coding_system = Fget_coding_system (coding_system);
2634 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2635 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2636 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2638 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2639 Fget_coding_system (Qbinary));
2640 istr = XLSTREAM (instream);
2641 ostr = XLSTREAM (outstream);
2642 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2644 /* The chain of streams looks like this:
2646 [BUFFER] <----- send through
2647 ------> [ENCODE AS BINARY]
2648 ------> [DECODE AS SPECIFIED]
2654 char tempbuf[1024]; /* some random amount */
2655 Bufpos newpos, even_newer_pos;
2656 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2657 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2661 newpos = lisp_buffer_stream_startpos (istr);
2662 Lstream_write (ostr, tempbuf, size_in_bytes);
2663 even_newer_pos = lisp_buffer_stream_startpos (istr);
2664 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2667 Lstream_close (istr);
2668 Lstream_close (ostr);
2670 Lstream_delete (istr);
2671 Lstream_delete (ostr);
2672 Lstream_delete (XLSTREAM (de_outstream));
2673 Lstream_delete (XLSTREAM (lb_outstream));
2678 /************************************************************************/
2679 /* Converting to an external encoding ("encoding") */
2680 /************************************************************************/
2682 /* An encoding stream is an output stream. When you create the
2683 stream, you specify the coding system that governs the encoding
2684 and another stream that the resulting encoded data is to be
2685 sent to, and then start sending data to it. */
2687 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2689 struct encoding_stream
2691 /* Coding system that governs the conversion. */
2692 Lisp_Coding_System *codesys;
2694 /* Stream that we read the encoded data from or
2695 write the decoded data to. */
2698 /* If we are reading, then we can return only a fixed amount of
2699 data, so if the conversion resulted in too much data, we store it
2700 here for retrieval the next time around. */
2701 unsigned_char_dynarr *runoff;
2703 /* FLAGS holds flags indicating the current state of the encoding.
2704 Some of these flags are dependent on the coding system. */
2707 /* CH holds a partially built-up character. Since we only deal
2708 with one- and two-byte characters at the moment, we only use
2709 this to store the first byte of a two-byte character. */
2712 /* Additional information used by the ISO2022 encoder. */
2715 /* CHARSET holds the character sets currently assigned to the G0
2716 through G3 registers. It is initialized from the array
2717 INITIAL_CHARSET in CODESYS. */
2718 Lisp_Object charset[4];
2720 /* Which registers are currently invoked into the left (GL) and
2721 right (GR) halves of the 8-bit encoding space? */
2722 int register_left, register_right;
2724 /* Whether we need to explicitly designate the charset in the
2725 G? register before using it. It is initialized from the
2726 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2727 unsigned char force_charset_on_output[4];
2729 /* Other state variables that need to be preserved across
2731 Lisp_Object current_charset;
2733 int current_char_boundary;
2736 void (*encode_char) (struct encoding_stream *str, Emchar c,
2737 unsigned_char_dynarr *dst, unsigned int *flags);
2738 void (*finish) (struct encoding_stream *str,
2739 unsigned_char_dynarr *dst, unsigned int *flags);
2741 /* Additional information (the state of the running CCL program)
2742 used by the CCL encoder. */
2743 struct ccl_program ccl;
2747 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2748 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2750 static int encoding_rewinder (Lstream *stream);
2751 static int encoding_seekable_p (Lstream *stream);
2752 static int encoding_flusher (Lstream *stream);
2753 static int encoding_closer (Lstream *stream);
2755 static Lisp_Object encoding_marker (Lisp_Object stream);
2757 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2758 sizeof (struct encoding_stream));
2761 encoding_marker (Lisp_Object stream)
2763 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2764 Lisp_Object str_obj;
2766 /* We do not need to mark the coding systems or charsets stored
2767 within the stream because they are stored in a global list
2768 and automatically marked. */
2770 XSETLSTREAM (str_obj, str);
2771 mark_object (str_obj);
2772 if (str->imp->marker)
2773 return (str->imp->marker) (str_obj);
2778 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2779 so we read data from the other end, encode it, and store it into DATA. */
2782 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2784 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2785 unsigned char *orig_data = data;
2787 int error_occurred = 0;
2789 /* We need to interface to mule_encode(), which expects to take some
2790 amount of data and store the result into a Dynarr. We have
2791 mule_encode() store into str->runoff, and take data from there
2794 /* We loop until we have enough data, reading chunks from the other
2795 end and encoding it. */
2798 /* Take data from the runoff if we can. Make sure to take at
2799 most SIZE bytes, and delete the data from the runoff. */
2800 if (Dynarr_length (str->runoff) > 0)
2802 int chunk = min ((int) size, Dynarr_length (str->runoff));
2803 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2804 Dynarr_delete_many (str->runoff, 0, chunk);
2810 break; /* No more room for data */
2812 if (str->flags & CODING_STATE_END)
2813 /* This means that on the previous iteration, we hit the EOF on
2814 the other end. We loop once more so that mule_encode() can
2815 output any final stuff it may be holding, or any "go back
2816 to a sane state" escape sequences. (This latter makes sense
2817 during encoding.) */
2820 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2821 left of storage in it, so it's OK to read directly into it.
2822 (We'll be overwriting above, after we've encoded it into the
2824 read_size = Lstream_read (str->other_end, data, size);
2831 /* There might be some more end data produced in the translation.
2832 See the comment above. */
2833 str->flags |= CODING_STATE_END;
2834 mule_encode (stream, data, str->runoff, read_size);
2837 if (data == orig_data)
2838 return error_occurred ? -1 : 0;
2840 return data - orig_data;
2844 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2846 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2849 /* Encode all our data into the runoff, and then attempt to write
2850 it all out to the other end. Remove whatever chunk we succeeded
2852 mule_encode (stream, data, str->runoff, size);
2853 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2854 Dynarr_length (str->runoff));
2856 Dynarr_delete_many (str->runoff, 0, retval);
2857 /* Do NOT return retval. The return value indicates how much
2858 of the incoming data was written, not how many bytes were
2864 reset_encoding_stream (struct encoding_stream *str)
2867 switch (CODING_SYSTEM_TYPE (str->codesys))
2869 case CODESYS_ISO2022:
2873 str->encode_char = &char_encode_iso2022;
2874 str->finish = &char_finish_iso2022;
2875 for (i = 0; i < 4; i++)
2877 str->iso2022.charset[i] =
2878 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2879 str->iso2022.force_charset_on_output[i] =
2880 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2882 str->iso2022.register_left = 0;
2883 str->iso2022.register_right = 1;
2884 str->iso2022.current_charset = Qnil;
2885 str->iso2022.current_half = 0;
2889 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2892 str->encode_char = &char_encode_utf8;
2893 str->finish = &char_finish_utf8;
2896 str->encode_char = &char_encode_ucs4;
2897 str->finish = &char_finish_ucs4;
2899 case CODESYS_SHIFT_JIS:
2900 str->encode_char = &char_encode_shift_jis;
2901 str->finish = &char_finish_shift_jis;
2907 str->iso2022.current_char_boundary = 0;
2908 str->flags = str->ch = 0;
2912 encoding_rewinder (Lstream *stream)
2914 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2915 reset_encoding_stream (str);
2916 Dynarr_reset (str->runoff);
2917 return Lstream_rewind (str->other_end);
2921 encoding_seekable_p (Lstream *stream)
2923 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2924 return Lstream_seekable_p (str->other_end);
2928 encoding_flusher (Lstream *stream)
2930 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2931 return Lstream_flush (str->other_end);
2935 encoding_closer (Lstream *stream)
2937 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2938 if (stream->flags & LSTREAM_FL_WRITE)
2940 str->flags |= CODING_STATE_END;
2941 encoding_writer (stream, 0, 0);
2943 Dynarr_free (str->runoff);
2944 return Lstream_close (str->other_end);
2948 encoding_stream_coding_system (Lstream *stream)
2950 Lisp_Object coding_system;
2951 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2953 XSETCODING_SYSTEM (coding_system, str->codesys);
2954 return coding_system;
2958 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2960 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2961 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2963 reset_encoding_stream (str);
2967 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2970 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2971 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2975 str->runoff = Dynarr_new (unsigned_char);
2976 str->other_end = stream;
2977 set_encoding_stream_coding_system (lstr, codesys);
2978 XSETLSTREAM (obj, lstr);
2983 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2985 return make_encoding_stream_1 (stream, codesys, "r");
2989 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2991 return make_encoding_stream_1 (stream, codesys, "w");
2994 /* Convert N bytes of internally-formatted data stored in SRC to an
2995 external format, according to the encoding stream ENCODING.
2996 Store the encoded data into DST. */
2999 mule_encode (Lstream *encoding, const Bufbyte *src,
3000 unsigned_char_dynarr *dst, size_t n)
3002 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3004 switch (CODING_SYSTEM_TYPE (str->codesys))
3007 case CODESYS_INTERNAL:
3008 Dynarr_add_many (dst, src, n);
3011 case CODESYS_AUTODETECT:
3012 /* If we got this far and still haven't decided on the coding
3013 system, then do no conversion. */
3014 case CODESYS_NO_CONVERSION:
3015 encode_coding_no_conversion (encoding, src, dst, n);
3019 encode_coding_big5 (encoding, src, dst, n);
3022 str->ccl.last_block = str->flags & CODING_STATE_END;
3023 /* When applying ccl program to stream, MUST NOT set NULL
3025 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3026 dst, n, 0, CCL_MODE_ENCODING);
3030 text_encode_generic (encoding, src, dst, n);
3034 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3035 Encode the text between START and END using CODING-SYSTEM.
3036 This will, for example, convert Japanese characters into stuff such as
3037 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3038 text. BUFFER defaults to the current buffer if unspecified.
3040 (start, end, coding_system, buffer))
3043 struct buffer *buf = decode_buffer (buffer, 0);
3044 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3045 Lstream *istr, *ostr;
3046 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3048 get_buffer_range_char (buf, start, end, &b, &e, 0);
3050 barf_if_buffer_read_only (buf, b, e);
3052 coding_system = Fget_coding_system (coding_system);
3053 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3054 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3055 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3056 Fget_coding_system (Qbinary));
3057 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3059 istr = XLSTREAM (instream);
3060 ostr = XLSTREAM (outstream);
3061 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3062 /* The chain of streams looks like this:
3064 [BUFFER] <----- send through
3065 ------> [ENCODE AS SPECIFIED]
3066 ------> [DECODE AS BINARY]
3071 char tempbuf[1024]; /* some random amount */
3072 Bufpos newpos, even_newer_pos;
3073 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3074 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3078 newpos = lisp_buffer_stream_startpos (istr);
3079 Lstream_write (ostr, tempbuf, size_in_bytes);
3080 even_newer_pos = lisp_buffer_stream_startpos (istr);
3081 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3087 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3088 Lstream_close (istr);
3089 Lstream_close (ostr);
3091 Lstream_delete (istr);
3092 Lstream_delete (ostr);
3093 Lstream_delete (XLSTREAM (de_outstream));
3094 Lstream_delete (XLSTREAM (lb_outstream));
3095 return make_int (retlen);
3102 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3103 unsigned_char_dynarr *dst, size_t n)
3106 unsigned char char_boundary;
3107 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3108 unsigned int flags = str->flags;
3109 Emchar ch = str->ch;
3111 char_boundary = str->iso2022.current_char_boundary;
3117 if (char_boundary == 0)
3145 (*str->encode_char) (str, c, dst, &flags);
3147 else if (char_boundary == 1)
3149 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3155 ch = (ch << 6) | (c & 0x3f);
3160 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3162 (*str->finish) (str, dst, &flags);
3167 str->iso2022.current_char_boundary = char_boundary;
3171 /************************************************************************/
3172 /* Shift-JIS methods */
3173 /************************************************************************/
3175 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3176 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3177 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3178 encoded by "position-code + 0x80". A character of JISX0208
3179 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3180 position-codes are divided and shifted so that it fit in the range
3183 --- CODE RANGE of Shift-JIS ---
3184 (character set) (range)
3186 JISX0201-Kana 0xA0 .. 0xDF
3187 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3188 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3189 -------------------------------
3193 /* Is this the first byte of a Shift-JIS two-byte char? */
3195 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3196 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3198 /* Is this the second byte of a Shift-JIS two-byte char? */
3200 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3201 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3203 #define BYTE_SJIS_KATAKANA_P(c) \
3204 ((c) >= 0xA1 && (c) <= 0xDF)
3207 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3211 unsigned char c = *(unsigned char *)src++;
3212 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3214 if (st->shift_jis.in_second_byte)
3216 st->shift_jis.in_second_byte = 0;
3220 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3221 st->shift_jis.in_second_byte = 1;
3223 return CODING_CATEGORY_SHIFT_JIS_MASK;
3226 /* Convert Shift-JIS data to internal format. */
3229 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3230 unsigned_char_dynarr *dst, size_t n)
3232 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3233 unsigned int flags = str->flags;
3234 unsigned int cpos = str->cpos;
3235 eol_type_t eol_type = str->eol_type;
3239 unsigned char c = *(unsigned char *)src++;
3243 /* Previous character was first byte of Shift-JIS Kanji char. */
3244 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3246 unsigned char e1, e2;
3248 DECODE_SJIS (cpos, c, e1, e2);
3250 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3254 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3255 Dynarr_add (dst, e1);
3256 Dynarr_add (dst, e2);
3261 DECODE_ADD_BINARY_CHAR (cpos, dst);
3262 DECODE_ADD_BINARY_CHAR (c, dst);
3268 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3269 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3271 else if (BYTE_SJIS_KATAKANA_P (c))
3274 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3277 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3278 Dynarr_add (dst, c);
3283 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3287 DECODE_ADD_BINARY_CHAR (c, dst);
3289 label_continue_loop:;
3292 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3298 /* Convert internal character representation to Shift_JIS. */
3301 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3302 unsigned_char_dynarr *dst, unsigned int *flags)
3304 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3308 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3309 Dynarr_add (dst, '\r');
3310 if (eol_type != EOL_CR)
3311 Dynarr_add (dst, ch);
3315 unsigned int s1, s2;
3317 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3319 if (code_point >= 0)
3320 Dynarr_add (dst, code_point);
3321 else if ((code_point
3322 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3325 ENCODE_SJIS ((code_point >> 8) | 0x80,
3326 (code_point & 0xFF) | 0x80, s1, s2);
3327 Dynarr_add (dst, s1);
3328 Dynarr_add (dst, s2);
3330 else if ((code_point
3331 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3333 Dynarr_add (dst, code_point | 0x80);
3334 else if ((code_point
3335 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3338 ENCODE_SJIS ((code_point >> 8) | 0x80,
3339 (code_point & 0xFF) | 0x80, s1, s2);
3340 Dynarr_add (dst, s1);
3341 Dynarr_add (dst, s2);
3343 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3345 Dynarr_add (dst, code_point);
3347 Dynarr_add (dst, '?');
3349 Lisp_Object charset;
3350 unsigned int c1, c2;
3352 BREAKUP_CHAR (ch, charset, c1, c2);
3354 if (EQ(charset, Vcharset_katakana_jisx0201))
3356 Dynarr_add (dst, c1 | 0x80);
3360 Dynarr_add (dst, c1);
3362 else if (EQ(charset, Vcharset_japanese_jisx0208))
3364 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3365 Dynarr_add (dst, s1);
3366 Dynarr_add (dst, s2);
3369 Dynarr_add (dst, '?');
3375 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3376 unsigned int *flags)
3380 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3381 Decode a JISX0208 character of Shift-JIS coding-system.
3382 CODE is the character code in Shift-JIS as a cons of type bytes.
3383 Return the corresponding character.
3387 unsigned char c1, c2, s1, s2;
3390 CHECK_INT (XCAR (code));
3391 CHECK_INT (XCDR (code));
3392 s1 = XINT (XCAR (code));
3393 s2 = XINT (XCDR (code));
3394 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3395 BYTE_SJIS_TWO_BYTE_2_P (s2))
3397 DECODE_SJIS (s1, s2, c1, c2);
3398 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3399 c1 & 0x7F, c2 & 0x7F));
3405 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3406 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3407 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3411 Lisp_Object charset;
3414 CHECK_CHAR_COERCE_INT (character);
3415 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3416 if (EQ (charset, Vcharset_japanese_jisx0208))
3418 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3419 return Fcons (make_int (s1), make_int (s2));
3426 /************************************************************************/
3428 /************************************************************************/
3430 /* BIG5 is a coding system encoding two character sets: ASCII and
3431 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3432 character set and is encoded in two-byte.
3434 --- CODE RANGE of BIG5 ---
3435 (character set) (range)
3437 Big5 (1st byte) 0xA1 .. 0xFE
3438 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3439 --------------------------
3441 Since the number of characters in Big5 is larger than maximum
3442 characters in Emacs' charset (96x96), it can't be handled as one
3443 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3444 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3445 contains frequently used characters and the latter contains less
3446 frequently used characters. */
3448 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3449 ((c) >= 0xA1 && (c) <= 0xFE)
3451 /* Is this the second byte of a Shift-JIS two-byte char? */
3453 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3454 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3456 /* Number of Big5 characters which have the same code in 1st byte. */
3458 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3460 /* Code conversion macros. These are macros because they are used in
3461 inner loops during code conversion.
3463 Note that temporary variables in macros introduce the classic
3464 dynamic-scoping problems with variable names. We use capital-
3465 lettered variables in the assumption that XEmacs does not use
3466 capital letters in variables except in a very formalized way
3469 /* Convert Big5 code (b1, b2) into its internal string representation
3472 /* There is a much simpler way to split the Big5 charset into two.
3473 For the moment I'm going to leave the algorithm as-is because it
3474 claims to separate out the most-used characters into a single
3475 charset, which perhaps will lead to optimizations in various
3478 The way the algorithm works is something like this:
3480 Big5 can be viewed as a 94x157 charset, where the row is
3481 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3482 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3483 the split between low and high column numbers is apparently
3484 meaningless; ascending rows produce less and less frequent chars.
3485 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3486 the first charset, and the upper half (0xC9 .. 0xFE) to the
3487 second. To do the conversion, we convert the character into
3488 a single number where 0 .. 156 is the first row, 157 .. 313
3489 is the second, etc. That way, the characters are ordered by
3490 decreasing frequency. Then we just chop the space in two
3491 and coerce the result into a 94x94 space.
3494 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3496 int B1 = b1, B2 = b2; \
3498 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3502 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3506 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3507 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3509 c1 = I / (0xFF - 0xA1) + 0xA1; \
3510 c2 = I % (0xFF - 0xA1) + 0xA1; \
3513 /* Convert the internal string representation of a Big5 character
3514 (lb, c1, c2) into Big5 code (b1, b2). */
3516 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3518 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3520 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3522 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3524 b1 = I / BIG5_SAME_ROW + 0xA1; \
3525 b2 = I % BIG5_SAME_ROW; \
3526 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3530 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3534 unsigned char c = *(unsigned char *)src++;
3535 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3536 (c >= 0x80 && c <= 0xA0))
3538 if (st->big5.in_second_byte)
3540 st->big5.in_second_byte = 0;
3541 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3545 st->big5.in_second_byte = 1;
3547 return CODING_CATEGORY_BIG5_MASK;
3550 /* Convert Big5 data to internal format. */
3553 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3554 unsigned_char_dynarr *dst, size_t n)
3556 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3557 unsigned int flags = str->flags;
3558 unsigned int cpos = str->cpos;
3559 eol_type_t eol_type = str->eol_type;
3563 unsigned char c = *(unsigned char *)src++;
3566 /* Previous character was first byte of Big5 char. */
3567 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3571 (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c),
3574 unsigned char b1, b2, b3;
3575 DECODE_BIG5 (cpos, c, b1, b2, b3);
3576 Dynarr_add (dst, b1);
3577 Dynarr_add (dst, b2);
3578 Dynarr_add (dst, b3);
3583 DECODE_ADD_BINARY_CHAR (cpos, dst);
3584 DECODE_ADD_BINARY_CHAR (c, dst);
3590 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3591 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3594 DECODE_ADD_BINARY_CHAR (c, dst);
3596 label_continue_loop:;
3599 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3605 /* Convert internally-formatted data to Big5. */
3608 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3609 unsigned_char_dynarr *dst, size_t n)
3613 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3614 unsigned int flags = str->flags;
3615 unsigned int ch = str->ch;
3616 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3623 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3624 Dynarr_add (dst, '\r');
3625 if (eol_type != EOL_CR)
3626 Dynarr_add (dst, '\n');
3628 else if (BYTE_ASCII_P (c))
3631 Dynarr_add (dst, c);
3633 else if (BUFBYTE_LEADING_BYTE_P (c))
3635 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3636 c == LEADING_BYTE_CHINESE_BIG5_2)
3638 /* A recognized leading byte. */
3640 continue; /* not done with this character. */
3642 /* otherwise just ignore this character. */
3644 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3645 ch == LEADING_BYTE_CHINESE_BIG5_2)
3647 /* Previous char was a recognized leading byte. */
3649 continue; /* not done with this character. */
3653 /* Encountering second byte of a Big5 character. */
3654 unsigned char b1, b2;
3656 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3657 Dynarr_add (dst, b1);
3658 Dynarr_add (dst, b2);
3670 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3671 Decode a Big5 character CODE of BIG5 coding-system.
3672 CODE is the character code in BIG5, a cons of two integers.
3673 Return the corresponding character.
3677 unsigned char c1, c2, b1, b2;
3680 CHECK_INT (XCAR (code));
3681 CHECK_INT (XCDR (code));
3682 b1 = XINT (XCAR (code));
3683 b2 = XINT (XCDR (code));
3684 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3685 BYTE_BIG5_TWO_BYTE_2_P (b2))
3687 Charset_ID leading_byte;
3688 Lisp_Object charset;
3689 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3690 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3691 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3697 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3698 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3699 Return the corresponding character code in Big5.
3703 Lisp_Object charset;
3706 CHECK_CHAR_COERCE_INT (character);
3707 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3708 if (EQ (charset, Vcharset_chinese_big5_1) ||
3709 EQ (charset, Vcharset_chinese_big5_2))
3711 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3713 return Fcons (make_int (b1), make_int (b2));
3720 /************************************************************************/
3722 /************************************************************************/
3725 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3729 unsigned char c = *(unsigned char *)src++;
3730 switch (st->ucs4.in_byte)
3739 st->ucs4.in_byte = 0;
3745 return CODING_CATEGORY_UCS4_MASK;
3749 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3750 unsigned_char_dynarr *dst, size_t n)
3752 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3753 unsigned int flags = str->flags;
3754 unsigned int cpos = str->cpos;
3755 unsigned char counter = str->counter;
3759 unsigned char c = *(unsigned char *)src++;
3767 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3772 cpos = ( cpos << 8 ) | c;
3776 if (counter & CODING_STATE_END)
3777 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3781 str->counter = counter;
3785 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3786 unsigned_char_dynarr *dst, unsigned int *flags)
3788 Dynarr_add (dst, ch >> 24);
3789 Dynarr_add (dst, ch >> 16);
3790 Dynarr_add (dst, ch >> 8);
3791 Dynarr_add (dst, ch );
3795 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3796 unsigned int *flags)
3801 /************************************************************************/
3803 /************************************************************************/
3806 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3810 unsigned char c = *(unsigned char *)src++;
3811 switch (st->utf8.in_byte)
3814 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3817 st->utf8.in_byte = 5;
3819 st->utf8.in_byte = 4;
3821 st->utf8.in_byte = 3;
3823 st->utf8.in_byte = 2;
3825 st->utf8.in_byte = 1;
3830 if ((c & 0xc0) != 0x80)
3836 return CODING_CATEGORY_UTF8_MASK;
3840 decode_output_utf8_partial_char (unsigned char counter,
3842 unsigned_char_dynarr *dst)
3845 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
3846 else if (counter == 4)
3848 if (cpos < (1 << 6))
3849 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
3852 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
3853 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3856 else if (counter == 3)
3858 if (cpos < (1 << 6))
3859 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
3860 else if (cpos < (1 << 12))
3862 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
3863 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3867 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
3868 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3869 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3872 else if (counter == 2)
3874 if (cpos < (1 << 6))
3875 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
3876 else if (cpos < (1 << 12))
3878 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
3879 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3881 else if (cpos < (1 << 18))
3883 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
3884 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3885 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3889 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
3890 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3891 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3892 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3897 if (cpos < (1 << 6))
3898 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
3899 else if (cpos < (1 << 12))
3901 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
3902 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3904 else if (cpos < (1 << 18))
3906 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
3907 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3908 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3910 else if (cpos < (1 << 24))
3912 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
3913 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3914 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3915 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3919 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
3920 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
3921 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3922 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3923 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3929 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3930 unsigned_char_dynarr *dst, size_t n)
3932 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3933 unsigned int flags = str->flags;
3934 unsigned int cpos = str->cpos;
3935 eol_type_t eol_type = str->eol_type;
3936 unsigned char counter = str->counter;
3940 unsigned char c = *(unsigned char *)src++;
3945 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3946 DECODE_ADD_UCS_CHAR (c, dst);
3948 else if ( c < 0xE0 )
3953 else if ( c < 0xF0 )
3958 else if ( c < 0xF8 )
3963 else if ( c < 0xFC )
3974 else if ( (c & 0xC0) == 0x80 )
3976 cpos = ( cpos << 6 ) | ( c & 0x3f );
3979 DECODE_ADD_UCS_CHAR (cpos, dst);
3988 decode_output_utf8_partial_char (counter, cpos, dst);
3989 DECODE_ADD_BINARY_CHAR (c, dst);
3993 label_continue_loop:;
3996 if (flags & CODING_STATE_END)
3999 decode_output_utf8_partial_char (counter, cpos, dst);
4005 str->counter = counter;
4009 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4010 unsigned_char_dynarr *dst, unsigned int *flags)
4012 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4016 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4017 Dynarr_add (dst, '\r');
4018 if (eol_type != EOL_CR)
4019 Dynarr_add (dst, ch);
4021 else if (ch <= 0x7f)
4023 Dynarr_add (dst, ch);
4025 else if (ch <= 0x7ff)
4027 Dynarr_add (dst, (ch >> 6) | 0xc0);
4028 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4030 else if (ch <= 0xffff)
4032 Dynarr_add (dst, (ch >> 12) | 0xe0);
4033 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4034 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4036 else if (ch <= 0x1fffff)
4038 Dynarr_add (dst, (ch >> 18) | 0xf0);
4039 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4040 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4041 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4043 else if (ch <= 0x3ffffff)
4045 Dynarr_add (dst, (ch >> 24) | 0xf8);
4046 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4047 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4048 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4049 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4053 Dynarr_add (dst, (ch >> 30) | 0xfc);
4054 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
4055 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4056 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4057 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4058 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4063 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4064 unsigned int *flags)
4069 /************************************************************************/
4070 /* ISO2022 methods */
4071 /************************************************************************/
4073 /* The following note describes the coding system ISO2022 briefly.
4074 Since the intention of this note is to help understand the
4075 functions in this file, some parts are NOT ACCURATE or OVERLY
4076 SIMPLIFIED. For thorough understanding, please refer to the
4077 original document of ISO2022.
4079 ISO2022 provides many mechanisms to encode several character sets
4080 in 7-bit and 8-bit environments. For 7-bit environments, all text
4081 is encoded using bytes less than 128. This may make the encoded
4082 text a little bit longer, but the text passes more easily through
4083 several gateways, some of which strip off MSB (Most Signigant Bit).
4085 There are two kinds of character sets: control character set and
4086 graphic character set. The former contains control characters such
4087 as `newline' and `escape' to provide control functions (control
4088 functions are also provided by escape sequences). The latter
4089 contains graphic characters such as 'A' and '-'. Emacs recognizes
4090 two control character sets and many graphic character sets.
4092 Graphic character sets are classified into one of the following
4093 four classes, according to the number of bytes (DIMENSION) and
4094 number of characters in one dimension (CHARS) of the set:
4095 - DIMENSION1_CHARS94
4096 - DIMENSION1_CHARS96
4097 - DIMENSION2_CHARS94
4098 - DIMENSION2_CHARS96
4100 In addition, each character set is assigned an identification tag,
4101 unique for each set, called "final character" (denoted as <F>
4102 hereafter). The <F> of each character set is decided by ECMA(*)
4103 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4104 (0x30..0x3F are for private use only).
4106 Note (*): ECMA = European Computer Manufacturers Association
4108 Here are examples of graphic character set [NAME(<F>)]:
4109 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4110 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4111 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4112 o DIMENSION2_CHARS96 -- none for the moment
4114 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4115 C0 [0x00..0x1F] -- control character plane 0
4116 GL [0x20..0x7F] -- graphic character plane 0
4117 C1 [0x80..0x9F] -- control character plane 1
4118 GR [0xA0..0xFF] -- graphic character plane 1
4120 A control character set is directly designated and invoked to C0 or
4121 C1 by an escape sequence. The most common case is that:
4122 - ISO646's control character set is designated/invoked to C0, and
4123 - ISO6429's control character set is designated/invoked to C1,
4124 and usually these designations/invocations are omitted in encoded
4125 text. In a 7-bit environment, only C0 can be used, and a control
4126 character for C1 is encoded by an appropriate escape sequence to
4127 fit into the environment. All control characters for C1 are
4128 defined to have corresponding escape sequences.
4130 A graphic character set is at first designated to one of four
4131 graphic registers (G0 through G3), then these graphic registers are
4132 invoked to GL or GR. These designations and invocations can be
4133 done independently. The most common case is that G0 is invoked to
4134 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4135 these invocations and designations are omitted in encoded text.
4136 In a 7-bit environment, only GL can be used.
4138 When a graphic character set of CHARS94 is invoked to GL, codes
4139 0x20 and 0x7F of the GL area work as control characters SPACE and
4140 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4143 There are two ways of invocation: locking-shift and single-shift.
4144 With locking-shift, the invocation lasts until the next different
4145 invocation, whereas with single-shift, the invocation affects the
4146 following character only and doesn't affect the locking-shift
4147 state. Invocations are done by the following control characters or
4150 ----------------------------------------------------------------------
4151 abbrev function cntrl escape seq description
4152 ----------------------------------------------------------------------
4153 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4154 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4155 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4156 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4157 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4158 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4159 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4160 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4161 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4162 ----------------------------------------------------------------------
4163 (*) These are not used by any known coding system.
4165 Control characters for these functions are defined by macros
4166 ISO_CODE_XXX in `coding.h'.
4168 Designations are done by the following escape sequences:
4169 ----------------------------------------------------------------------
4170 escape sequence description
4171 ----------------------------------------------------------------------
4172 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4173 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4174 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4175 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4176 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4177 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4178 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4179 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4180 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4181 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4182 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4183 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4184 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4185 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4186 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4187 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4188 ----------------------------------------------------------------------
4190 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4191 of dimension 1, chars 94, and final character <F>, etc...
4193 Note (*): Although these designations are not allowed in ISO2022,
4194 Emacs accepts them on decoding, and produces them on encoding
4195 CHARS96 character sets in a coding system which is characterized as
4196 7-bit environment, non-locking-shift, and non-single-shift.
4198 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4199 '(' can be omitted. We refer to this as "short-form" hereafter.
4201 Now you may notice that there are a lot of ways for encoding the
4202 same multilingual text in ISO2022. Actually, there exist many
4203 coding systems such as Compound Text (used in X11's inter client
4204 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4205 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4206 localized platforms), and all of these are variants of ISO2022.
4208 In addition to the above, Emacs handles two more kinds of escape
4209 sequences: ISO6429's direction specification and Emacs' private
4210 sequence for specifying character composition.
4212 ISO6429's direction specification takes the following form:
4213 o CSI ']' -- end of the current direction
4214 o CSI '0' ']' -- end of the current direction
4215 o CSI '1' ']' -- start of left-to-right text
4216 o CSI '2' ']' -- start of right-to-left text
4217 The control character CSI (0x9B: control sequence introducer) is
4218 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4220 Character composition specification takes the following form:
4221 o ESC '0' -- start character composition
4222 o ESC '1' -- end character composition
4223 Since these are not standard escape sequences of any ISO standard,
4224 their use with these meanings is restricted to Emacs only. */
4227 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4231 for (i = 0; i < 4; i++)
4233 if (!NILP (coding_system))
4235 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4237 iso->charset[i] = Qt;
4238 iso->invalid_designated[i] = 0;
4240 iso->esc = ISO_ESC_NOTHING;
4241 iso->esc_bytes_index = 0;
4242 iso->register_left = 0;
4243 iso->register_right = 1;
4244 iso->switched_dir_and_no_valid_charset_yet = 0;
4245 iso->invalid_switch_dir = 0;
4246 iso->output_direction_sequence = 0;
4247 iso->output_literally = 0;
4248 #ifdef ENABLE_COMPOSITE_CHARS
4249 if (iso->composite_chars)
4250 Dynarr_reset (iso->composite_chars);
4255 fit_to_be_escape_quoted (unsigned char c)
4272 /* Parse one byte of an ISO2022 escape sequence.
4273 If the result is an invalid escape sequence, return 0 and
4274 do not change anything in STR. Otherwise, if the result is
4275 an incomplete escape sequence, update ISO2022.ESC and
4276 ISO2022.ESC_BYTES and return -1. Otherwise, update
4277 all the state variables (but not ISO2022.ESC_BYTES) and
4280 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4281 or invocation of an invalid character set and treat that as
4282 an unrecognized escape sequence. */
4285 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4286 unsigned char c, unsigned int *flags,
4287 int check_invalid_charsets)
4289 /* (1) If we're at the end of a designation sequence, CS is the
4290 charset being designated and REG is the register to designate
4293 (2) If we're at the end of a locking-shift sequence, REG is
4294 the register to invoke and HALF (0 == left, 1 == right) is
4295 the half to invoke it into.
4297 (3) If we're at the end of a single-shift sequence, REG is
4298 the register to invoke. */
4299 Lisp_Object cs = Qnil;
4302 /* NOTE: This code does goto's all over the fucking place.
4303 The reason for this is that we're basically implementing
4304 a state machine here, and hierarchical languages like C
4305 don't really provide a clean way of doing this. */
4307 if (! (*flags & CODING_STATE_ESCAPE))
4308 /* At beginning of escape sequence; we need to reset our
4309 escape-state variables. */
4310 iso->esc = ISO_ESC_NOTHING;
4312 iso->output_literally = 0;
4313 iso->output_direction_sequence = 0;
4317 case ISO_ESC_NOTHING:
4318 iso->esc_bytes_index = 0;
4321 case ISO_CODE_ESC: /* Start escape sequence */
4322 *flags |= CODING_STATE_ESCAPE;
4326 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4327 *flags |= CODING_STATE_ESCAPE;
4328 iso->esc = ISO_ESC_5_11;
4331 case ISO_CODE_SO: /* locking shift 1 */
4334 case ISO_CODE_SI: /* locking shift 0 */
4338 case ISO_CODE_SS2: /* single shift */
4341 case ISO_CODE_SS3: /* single shift */
4345 default: /* Other control characters */
4352 /**** single shift ****/
4354 case 'N': /* single shift 2 */
4357 case 'O': /* single shift 3 */
4361 /**** locking shift ****/
4363 case '~': /* locking shift 1 right */
4366 case 'n': /* locking shift 2 */
4369 case '}': /* locking shift 2 right */
4372 case 'o': /* locking shift 3 */
4375 case '|': /* locking shift 3 right */
4379 #ifdef ENABLE_COMPOSITE_CHARS
4380 /**** composite ****/
4383 iso->esc = ISO_ESC_START_COMPOSITE;
4384 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4385 CODING_STATE_COMPOSITE;
4389 iso->esc = ISO_ESC_END_COMPOSITE;
4390 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4391 ~CODING_STATE_COMPOSITE;
4393 #endif /* ENABLE_COMPOSITE_CHARS */
4395 /**** directionality ****/
4398 iso->esc = ISO_ESC_5_11;
4401 /**** designation ****/
4403 case '$': /* multibyte charset prefix */
4404 iso->esc = ISO_ESC_2_4;
4408 if (0x28 <= c && c <= 0x2F)
4410 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4414 /* This function is called with CODESYS equal to nil when
4415 doing coding-system detection. */
4417 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4418 && fit_to_be_escape_quoted (c))
4420 iso->esc = ISO_ESC_LITERAL;
4421 *flags &= CODING_STATE_ISO2022_LOCK;
4431 /**** directionality ****/
4433 case ISO_ESC_5_11: /* ISO6429 direction control */
4436 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4437 goto directionality;
4439 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4440 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4441 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4445 case ISO_ESC_5_11_0:
4448 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4449 goto directionality;
4453 case ISO_ESC_5_11_1:
4456 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4457 goto directionality;
4461 case ISO_ESC_5_11_2:
4464 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4465 goto directionality;
4470 iso->esc = ISO_ESC_DIRECTIONALITY;
4471 /* Various junk here to attempt to preserve the direction sequences
4472 literally in the text if they would otherwise be swallowed due
4473 to invalid designations that don't show up as actual charset
4474 changes in the text. */
4475 if (iso->invalid_switch_dir)
4477 /* We already inserted a direction switch literally into the
4478 text. We assume (#### this may not be right) that the
4479 next direction switch is the one going the other way,
4480 and we need to output that literally as well. */
4481 iso->output_literally = 1;
4482 iso->invalid_switch_dir = 0;
4488 /* If we are in the thrall of an invalid designation,
4489 then stick the directionality sequence literally into the
4490 output stream so it ends up in the original text again. */
4491 for (jj = 0; jj < 4; jj++)
4492 if (iso->invalid_designated[jj])
4496 iso->output_literally = 1;
4497 iso->invalid_switch_dir = 1;
4500 /* Indicate that we haven't yet seen a valid designation,
4501 so that if a switch-dir is directly followed by an
4502 invalid designation, both get inserted literally. */
4503 iso->switched_dir_and_no_valid_charset_yet = 1;
4508 /**** designation ****/
4511 if (0x28 <= c && c <= 0x2F)
4513 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4516 if (0x40 <= c && c <= 0x42)
4519 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4520 *flags & CODING_STATE_R2L ?
4521 CHARSET_RIGHT_TO_LEFT :
4522 CHARSET_LEFT_TO_RIGHT);
4533 if (c < '0' || c > '~')
4534 return 0; /* bad final byte */
4536 if (iso->esc >= ISO_ESC_2_8 &&
4537 iso->esc <= ISO_ESC_2_15)
4539 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4540 single = 1; /* single-byte */
4541 reg = (iso->esc - ISO_ESC_2_8) & 3;
4543 else if (iso->esc >= ISO_ESC_2_4_8 &&
4544 iso->esc <= ISO_ESC_2_4_15)
4546 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4547 single = -1; /* multi-byte */
4548 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4552 /* Can this ever be reached? -slb */
4556 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4557 *flags & CODING_STATE_R2L ?
4558 CHARSET_RIGHT_TO_LEFT :
4559 CHARSET_LEFT_TO_RIGHT);
4565 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4569 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4570 /* can't invoke something that ain't there. */
4572 iso->esc = ISO_ESC_SINGLE_SHIFT;
4573 *flags &= CODING_STATE_ISO2022_LOCK;
4575 *flags |= CODING_STATE_SS2;
4577 *flags |= CODING_STATE_SS3;
4581 if (check_invalid_charsets &&
4582 !CHARSETP (iso->charset[reg]))
4583 /* can't invoke something that ain't there. */
4586 iso->register_right = reg;
4588 iso->register_left = reg;
4589 *flags &= CODING_STATE_ISO2022_LOCK;
4590 iso->esc = ISO_ESC_LOCKING_SHIFT;
4594 if (NILP (cs) && check_invalid_charsets)
4596 iso->invalid_designated[reg] = 1;
4597 iso->charset[reg] = Vcharset_ascii;
4598 iso->esc = ISO_ESC_DESIGNATE;
4599 *flags &= CODING_STATE_ISO2022_LOCK;
4600 iso->output_literally = 1;
4601 if (iso->switched_dir_and_no_valid_charset_yet)
4603 /* We encountered a switch-direction followed by an
4604 invalid designation. Ensure that the switch-direction
4605 gets outputted; otherwise it will probably get eaten
4606 when the text is written out again. */
4607 iso->switched_dir_and_no_valid_charset_yet = 0;
4608 iso->output_direction_sequence = 1;
4609 /* And make sure that the switch-dir going the other
4610 way gets outputted, as well. */
4611 iso->invalid_switch_dir = 1;
4615 /* This function is called with CODESYS equal to nil when
4616 doing coding-system detection. */
4617 if (!NILP (codesys))
4619 charset_conversion_spec_dynarr *dyn =
4620 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4626 for (i = 0; i < Dynarr_length (dyn); i++)
4628 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4629 if (EQ (cs, spec->from_charset))
4630 cs = spec->to_charset;
4635 iso->charset[reg] = cs;
4636 iso->esc = ISO_ESC_DESIGNATE;
4637 *flags &= CODING_STATE_ISO2022_LOCK;
4638 if (iso->invalid_designated[reg])
4640 iso->invalid_designated[reg] = 0;
4641 iso->output_literally = 1;
4643 if (iso->switched_dir_and_no_valid_charset_yet)
4644 iso->switched_dir_and_no_valid_charset_yet = 0;
4649 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4653 /* #### There are serious deficiencies in the recognition mechanism
4654 here. This needs to be much smarter if it's going to cut it.
4655 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4656 it should be detected as Latin-1.
4657 All the ISO2022 stuff in this file should be synced up with the
4658 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4659 Perhaps we should wait till R2L works in FSF Emacs? */
4661 if (!st->iso2022.initted)
4663 reset_iso2022 (Qnil, &st->iso2022.iso);
4664 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4665 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4666 CODING_CATEGORY_ISO_8_1_MASK |
4667 CODING_CATEGORY_ISO_8_2_MASK |
4668 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4669 st->iso2022.flags = 0;
4670 st->iso2022.high_byte_count = 0;
4671 st->iso2022.saw_single_shift = 0;
4672 st->iso2022.initted = 1;
4675 mask = st->iso2022.mask;
4679 unsigned char c = *(unsigned char *)src++;
4682 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4683 st->iso2022.high_byte_count++;
4687 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4689 if (st->iso2022.high_byte_count & 1)
4690 /* odd number of high bytes; assume not iso-8-2 */
4691 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4693 st->iso2022.high_byte_count = 0;
4694 st->iso2022.saw_single_shift = 0;
4696 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4698 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4699 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4700 { /* control chars */
4703 /* Allow and ignore control characters that you might
4704 reasonably see in a text file */
4709 case 8: /* backspace */
4710 case 11: /* vertical tab */
4711 case 12: /* form feed */
4712 case 26: /* MS-DOS C-z junk */
4713 case 31: /* '^_' -- for info */
4714 goto label_continue_loop;
4721 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4724 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4725 &st->iso2022.flags, 0))
4727 switch (st->iso2022.iso.esc)
4729 case ISO_ESC_DESIGNATE:
4730 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4731 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4733 case ISO_ESC_LOCKING_SHIFT:
4734 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4735 goto ran_out_of_chars;
4736 case ISO_ESC_SINGLE_SHIFT:
4737 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4738 st->iso2022.saw_single_shift = 1;
4747 goto ran_out_of_chars;
4750 label_continue_loop:;
4759 postprocess_iso2022_mask (int mask)
4761 /* #### kind of cheesy */
4762 /* If seven-bit ISO is allowed, then assume that the encoding is
4763 entirely seven-bit and turn off the eight-bit ones. */
4764 if (mask & CODING_CATEGORY_ISO_7_MASK)
4765 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4766 CODING_CATEGORY_ISO_8_1_MASK |
4767 CODING_CATEGORY_ISO_8_2_MASK);
4771 /* If FLAGS is a null pointer or specifies right-to-left motion,
4772 output a switch-dir-to-left-to-right sequence to DST.
4773 Also update FLAGS if it is not a null pointer.
4774 If INTERNAL_P is set, we are outputting in internal format and
4775 need to handle the CSI differently. */
4778 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4779 unsigned_char_dynarr *dst,
4780 unsigned int *flags,
4783 if (!flags || (*flags & CODING_STATE_R2L))
4785 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4787 Dynarr_add (dst, ISO_CODE_ESC);
4788 Dynarr_add (dst, '[');
4790 else if (internal_p)
4791 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4793 Dynarr_add (dst, ISO_CODE_CSI);
4794 Dynarr_add (dst, '0');
4795 Dynarr_add (dst, ']');
4797 *flags &= ~CODING_STATE_R2L;
4801 /* If FLAGS is a null pointer or specifies a direction different from
4802 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4803 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4804 sequence to DST. Also update FLAGS if it is not a null pointer.
4805 If INTERNAL_P is set, we are outputting in internal format and
4806 need to handle the CSI differently. */
4809 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4810 unsigned_char_dynarr *dst, unsigned int *flags,
4813 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4814 direction == CHARSET_LEFT_TO_RIGHT)
4815 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4816 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4817 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4818 direction == CHARSET_RIGHT_TO_LEFT)
4820 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4822 Dynarr_add (dst, ISO_CODE_ESC);
4823 Dynarr_add (dst, '[');
4825 else if (internal_p)
4826 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4828 Dynarr_add (dst, ISO_CODE_CSI);
4829 Dynarr_add (dst, '2');
4830 Dynarr_add (dst, ']');
4832 *flags |= CODING_STATE_R2L;
4836 /* Convert ISO2022-format data to internal format. */
4839 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4840 unsigned_char_dynarr *dst, size_t n)
4842 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4843 unsigned int flags = str->flags;
4844 unsigned int cpos = str->cpos;
4845 unsigned char counter = str->counter;
4846 eol_type_t eol_type = str->eol_type;
4847 #ifdef ENABLE_COMPOSITE_CHARS
4848 unsigned_char_dynarr *real_dst = dst;
4850 Lisp_Object coding_system;
4852 XSETCODING_SYSTEM (coding_system, str->codesys);
4854 #ifdef ENABLE_COMPOSITE_CHARS
4855 if (flags & CODING_STATE_COMPOSITE)
4856 dst = str->iso2022.composite_chars;
4857 #endif /* ENABLE_COMPOSITE_CHARS */
4861 unsigned char c = *(unsigned char *)src++;
4862 if (flags & CODING_STATE_ESCAPE)
4863 { /* Within ESC sequence */
4864 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4869 switch (str->iso2022.esc)
4871 #ifdef ENABLE_COMPOSITE_CHARS
4872 case ISO_ESC_START_COMPOSITE:
4873 if (str->iso2022.composite_chars)
4874 Dynarr_reset (str->iso2022.composite_chars);
4876 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4877 dst = str->iso2022.composite_chars;
4879 case ISO_ESC_END_COMPOSITE:
4881 Bufbyte comstr[MAX_EMCHAR_LEN];
4883 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4884 Dynarr_length (dst));
4886 len = set_charptr_emchar (comstr, emch);
4887 Dynarr_add_many (dst, comstr, len);
4890 #endif /* ENABLE_COMPOSITE_CHARS */
4892 case ISO_ESC_LITERAL:
4893 COMPOSE_FLUSH_CHARS (str, dst);
4894 DECODE_ADD_BINARY_CHAR (c, dst);
4898 /* Everything else handled already */
4903 /* Attempted error recovery. */
4904 if (str->iso2022.output_direction_sequence)
4905 ensure_correct_direction (flags & CODING_STATE_R2L ?
4906 CHARSET_RIGHT_TO_LEFT :
4907 CHARSET_LEFT_TO_RIGHT,
4908 str->codesys, dst, 0, 1);
4909 /* More error recovery. */
4910 if (!retval || str->iso2022.output_literally)
4912 /* Output the (possibly invalid) sequence */
4914 COMPOSE_FLUSH_CHARS (str, dst);
4915 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4916 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4917 flags &= CODING_STATE_ISO2022_LOCK;
4919 n++, src--;/* Repeat the loop with the same character. */
4922 /* No sense in reprocessing the final byte of the
4923 escape sequence; it could mess things up anyway.
4925 COMPOSE_FLUSH_CHARS (str, dst);
4926 DECODE_ADD_BINARY_CHAR (c, dst);
4932 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4933 { /* Control characters */
4935 /***** Error-handling *****/
4937 /* If we were in the middle of a character, dump out the
4938 partial character. */
4941 COMPOSE_FLUSH_CHARS (str, dst);
4945 DECODE_ADD_BINARY_CHAR
4946 ((unsigned char)(cpos >> (counter * 8)), dst);
4951 /* If we just saw a single-shift character, dump it out.
4952 This may dump out the wrong sort of single-shift character,
4953 but least it will give an indication that something went
4955 if (flags & CODING_STATE_SS2)
4957 COMPOSE_FLUSH_CHARS (str, dst);
4958 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4959 flags &= ~CODING_STATE_SS2;
4961 if (flags & CODING_STATE_SS3)
4963 COMPOSE_FLUSH_CHARS (str, dst);
4964 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4965 flags &= ~CODING_STATE_SS3;
4968 /***** Now handle the control characters. *****/
4974 COMPOSE_FLUSH_CHARS (str, dst);
4975 if (eol_type == EOL_CR)
4976 Dynarr_add (dst, '\n');
4977 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4978 Dynarr_add (dst, c);
4980 flags |= CODING_STATE_CR;
4981 goto label_continue_loop;
4983 else if (flags & CODING_STATE_CR)
4984 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
4986 Dynarr_add (dst, '\r');
4987 flags &= ~CODING_STATE_CR;
4990 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4993 flags &= CODING_STATE_ISO2022_LOCK;
4995 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4997 COMPOSE_FLUSH_CHARS (str, dst);
4998 DECODE_ADD_BINARY_CHAR (c, dst);
5002 { /* Graphic characters */
5003 Lisp_Object charset;
5012 COMPOSE_FLUSH_CHARS (str, dst);
5013 if (eol_type == EOL_CR)
5014 Dynarr_add (dst, '\n');
5015 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5016 Dynarr_add (dst, c);
5018 flags |= CODING_STATE_CR;
5019 goto label_continue_loop;
5021 else if (flags & CODING_STATE_CR)
5022 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5024 Dynarr_add (dst, '\r');
5025 flags &= ~CODING_STATE_CR;
5028 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5031 /* Now determine the charset. */
5032 reg = ((flags & CODING_STATE_SS2) ? 2
5033 : (flags & CODING_STATE_SS3) ? 3
5034 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5035 : str->iso2022.register_left);
5036 charset = str->iso2022.charset[reg];
5038 /* Error checking: */
5039 if (! CHARSETP (charset)
5040 || str->iso2022.invalid_designated[reg]
5041 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5042 && XCHARSET_CHARS (charset) == 94))
5043 /* Mrmph. We are trying to invoke a register that has no
5044 or an invalid charset in it, or trying to add a character
5045 outside the range of the charset. Insert that char literally
5046 to preserve it for the output. */
5048 COMPOSE_FLUSH_CHARS (str, dst);
5052 DECODE_ADD_BINARY_CHAR
5053 ((unsigned char)(cpos >> (counter * 8)), dst);
5056 DECODE_ADD_BINARY_CHAR (c, dst);
5061 /* Things are probably hunky-dorey. */
5063 /* Fetch reverse charset, maybe. */
5064 if (((flags & CODING_STATE_R2L) &&
5065 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5067 (!(flags & CODING_STATE_R2L) &&
5068 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5070 Lisp_Object new_charset =
5071 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5072 if (!NILP (new_charset))
5073 charset = new_charset;
5078 if (XCHARSET_DIMENSION (charset) == counter)
5080 COMPOSE_ADD_CHAR (str,
5081 DECODE_CHAR (charset,
5082 ((cpos & 0x7F7F7F) << 8)
5089 cpos = (cpos << 8) | c;
5091 lb = XCHARSET_LEADING_BYTE (charset);
5092 switch (XCHARSET_REP_BYTES (charset))
5095 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5096 Dynarr_add (dst, c & 0x7F);
5099 case 2: /* one-byte official */
5100 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5101 Dynarr_add (dst, lb);
5102 Dynarr_add (dst, c | 0x80);
5105 case 3: /* one-byte private or two-byte official */
5106 if (XCHARSET_PRIVATE_P (charset))
5108 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5109 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5110 Dynarr_add (dst, lb);
5111 Dynarr_add (dst, c | 0x80);
5117 Dynarr_add (dst, lb);
5118 Dynarr_add (dst, ch | 0x80);
5119 Dynarr_add (dst, c | 0x80);
5127 default: /* two-byte private */
5130 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5131 Dynarr_add (dst, lb);
5132 Dynarr_add (dst, ch | 0x80);
5133 Dynarr_add (dst, c | 0x80);
5143 flags &= CODING_STATE_ISO2022_LOCK;
5146 label_continue_loop:;
5149 if (flags & CODING_STATE_END)
5151 COMPOSE_FLUSH_CHARS (str, dst);
5152 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5156 str->counter = counter;
5160 /***** ISO2022 encoder *****/
5162 /* Designate CHARSET into register REG. */
5165 iso2022_designate (Lisp_Object charset, unsigned char reg,
5166 struct encoding_stream *str, unsigned_char_dynarr *dst)
5168 static const char inter94[] = "()*+";
5169 static const char inter96[] = ",-./";
5170 unsigned short chars;
5171 unsigned char dimension;
5172 unsigned char final;
5173 Lisp_Object old_charset = str->iso2022.charset[reg];
5175 str->iso2022.charset[reg] = charset;
5176 if (!CHARSETP (charset))
5177 /* charset might be an initial nil or t. */
5179 chars = XCHARSET_CHARS (charset);
5180 dimension = XCHARSET_DIMENSION (charset);
5181 final = XCHARSET_FINAL (charset);
5182 if (!str->iso2022.force_charset_on_output[reg] &&
5183 CHARSETP (old_charset) &&
5184 XCHARSET_CHARS (old_charset) == chars &&
5185 XCHARSET_DIMENSION (old_charset) == dimension &&
5186 XCHARSET_FINAL (old_charset) == final)
5189 str->iso2022.force_charset_on_output[reg] = 0;
5192 charset_conversion_spec_dynarr *dyn =
5193 str->codesys->iso2022.output_conv;
5199 for (i = 0; i < Dynarr_length (dyn); i++)
5201 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5202 if (EQ (charset, spec->from_charset))
5203 charset = spec->to_charset;
5208 Dynarr_add (dst, ISO_CODE_ESC);
5213 Dynarr_add (dst, inter94[reg]);
5216 Dynarr_add (dst, '$');
5218 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5221 Dynarr_add (dst, inter94[reg]);
5226 Dynarr_add (dst, inter96[reg]);
5229 Dynarr_add (dst, '$');
5230 Dynarr_add (dst, inter96[reg]);
5234 Dynarr_add (dst, final);
5238 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5240 if (str->iso2022.register_left != 0)
5242 Dynarr_add (dst, ISO_CODE_SI);
5243 str->iso2022.register_left = 0;
5248 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5250 if (str->iso2022.register_left != 1)
5252 Dynarr_add (dst, ISO_CODE_SO);
5253 str->iso2022.register_left = 1;
5258 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5259 unsigned_char_dynarr *dst, unsigned int *flags)
5261 unsigned char charmask;
5262 Lisp_Coding_System* codesys = str->codesys;
5263 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5265 Lisp_Object charset = str->iso2022.current_charset;
5266 int half = str->iso2022.current_half;
5267 int code_point = -1;
5271 restore_left_to_right_direction (codesys, dst, flags, 0);
5273 /* Make sure G0 contains ASCII */
5274 if ((ch > ' ' && ch < ISO_CODE_DEL)
5275 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5277 ensure_normal_shift (str, dst);
5278 iso2022_designate (Vcharset_ascii, 0, str, dst);
5281 /* If necessary, restore everything to the default state
5283 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5285 restore_left_to_right_direction (codesys, dst, flags, 0);
5287 ensure_normal_shift (str, dst);
5289 for (i = 0; i < 4; i++)
5291 Lisp_Object initial_charset =
5292 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5293 iso2022_designate (initial_charset, i, str, dst);
5298 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5299 Dynarr_add (dst, '\r');
5300 if (eol_type != EOL_CR)
5301 Dynarr_add (dst, ch);
5305 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5306 && fit_to_be_escape_quoted (ch))
5307 Dynarr_add (dst, ISO_CODE_ESC);
5308 Dynarr_add (dst, ch);
5311 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5313 charmask = (half == 0 ? 0x00 : 0x80);
5315 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5316 && fit_to_be_escape_quoted (ch))
5317 Dynarr_add (dst, ISO_CODE_ESC);
5318 /* you asked for it ... */
5319 Dynarr_add (dst, ch);
5325 /* Now determine which register to use. */
5327 for (i = 0; i < 4; i++)
5329 if ((CHARSETP (charset = str->iso2022.charset[i])
5330 && ((code_point = charset_code_point (charset, ch)) >= 0))
5334 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5335 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5343 Lisp_Object original_default_coded_charset_priority_list
5344 = Vdefault_coded_charset_priority_list;
5346 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5348 code_point = ENCODE_CHAR (ch, charset);
5349 if (XCHARSET_FINAL (charset))
5351 Vdefault_coded_charset_priority_list
5352 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5353 Vdefault_coded_charset_priority_list));
5355 code_point = ENCODE_CHAR (ch, charset);
5356 if (!XCHARSET_FINAL (charset))
5358 charset = Vcharset_ascii;
5362 Vdefault_coded_charset_priority_list
5363 = original_default_coded_charset_priority_list;
5365 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5366 codesys, dst, flags, 0);
5370 if (XCHARSET_GRAPHIC (charset) != 0)
5372 if (!NILP (str->iso2022.charset[1]) &&
5373 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5374 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5376 else if (!NILP (str->iso2022.charset[2]))
5378 else if (!NILP (str->iso2022.charset[3]))
5387 iso2022_designate (charset, reg, str, dst);
5389 /* Now invoke that register. */
5393 ensure_normal_shift (str, dst);
5397 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5399 ensure_shift_out (str, dst);
5406 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5408 Dynarr_add (dst, ISO_CODE_ESC);
5409 Dynarr_add (dst, 'N');
5414 Dynarr_add (dst, ISO_CODE_SS2);
5419 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5421 Dynarr_add (dst, ISO_CODE_ESC);
5422 Dynarr_add (dst, 'O');
5427 Dynarr_add (dst, ISO_CODE_SS3);
5435 charmask = (half == 0 ? 0x00 : 0x80);
5437 switch (XCHARSET_DIMENSION (charset))
5440 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5443 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5444 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5447 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5448 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5449 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5452 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5453 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5454 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5455 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5461 str->iso2022.current_charset = charset;
5462 str->iso2022.current_half = half;
5466 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5467 unsigned int *flags)
5469 Lisp_Coding_System* codesys = str->codesys;
5472 restore_left_to_right_direction (codesys, dst, flags, 0);
5473 ensure_normal_shift (str, dst);
5474 for (i = 0; i < 4; i++)
5476 Lisp_Object initial_charset
5477 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5478 iso2022_designate (initial_charset, i, str, dst);
5483 /************************************************************************/
5484 /* No-conversion methods */
5485 /************************************************************************/
5487 /* This is used when reading in "binary" files -- i.e. files that may
5488 contain all 256 possible byte values and that are not to be
5489 interpreted as being in any particular decoding. */
5491 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5492 unsigned_char_dynarr *dst, size_t n)
5494 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5495 unsigned int flags = str->flags;
5496 unsigned int cpos = str->cpos;
5497 eol_type_t eol_type = str->eol_type;
5501 unsigned char c = *(unsigned char *)src++;
5503 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5504 DECODE_ADD_BINARY_CHAR (c, dst);
5505 label_continue_loop:;
5508 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5515 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5516 unsigned_char_dynarr *dst, size_t n)
5519 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5520 unsigned int flags = str->flags;
5521 unsigned int ch = str->ch;
5522 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5524 unsigned char char_boundary = str->iso2022.current_char_boundary;
5531 if (char_boundary == 0)
5537 else if ( c >= 0xf8 )
5542 else if ( c >= 0xf0 )
5547 else if ( c >= 0xe0 )
5552 else if ( c >= 0xc0 )
5562 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5563 Dynarr_add (dst, '\r');
5564 if (eol_type != EOL_CR)
5565 Dynarr_add (dst, c);
5568 Dynarr_add (dst, c);
5571 else if (char_boundary == 1)
5573 ch = ( ch << 6 ) | ( c & 0x3f );
5574 Dynarr_add (dst, ch & 0xff);
5579 ch = ( ch << 6 ) | ( c & 0x3f );
5582 #else /* not UTF2000 */
5585 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5586 Dynarr_add (dst, '\r');
5587 if (eol_type != EOL_CR)
5588 Dynarr_add (dst, '\n');
5591 else if (BYTE_ASCII_P (c))
5594 Dynarr_add (dst, c);
5596 else if (BUFBYTE_LEADING_BYTE_P (c))
5599 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5600 c == LEADING_BYTE_CONTROL_1)
5603 Dynarr_add (dst, '~'); /* untranslatable character */
5607 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5608 Dynarr_add (dst, c);
5609 else if (ch == LEADING_BYTE_CONTROL_1)
5612 Dynarr_add (dst, c - 0x20);
5614 /* else it should be the second or third byte of an
5615 untranslatable character, so ignore it */
5618 #endif /* not UTF2000 */
5624 str->iso2022.current_char_boundary = char_boundary;
5630 /************************************************************************/
5631 /* Initialization */
5632 /************************************************************************/
5635 syms_of_file_coding (void)
5637 INIT_LRECORD_IMPLEMENTATION (coding_system);
5639 deferror (&Qcoding_system_error, "coding-system-error",
5640 "Coding-system error", Qio_error);
5642 DEFSUBR (Fcoding_system_p);
5643 DEFSUBR (Ffind_coding_system);
5644 DEFSUBR (Fget_coding_system);
5645 DEFSUBR (Fcoding_system_list);
5646 DEFSUBR (Fcoding_system_name);
5647 DEFSUBR (Fmake_coding_system);
5648 DEFSUBR (Fcopy_coding_system);
5649 DEFSUBR (Fcoding_system_canonical_name_p);
5650 DEFSUBR (Fcoding_system_alias_p);
5651 DEFSUBR (Fcoding_system_aliasee);
5652 DEFSUBR (Fdefine_coding_system_alias);
5653 DEFSUBR (Fsubsidiary_coding_system);
5655 DEFSUBR (Fcoding_system_type);
5656 DEFSUBR (Fcoding_system_doc_string);
5658 DEFSUBR (Fcoding_system_charset);
5660 DEFSUBR (Fcoding_system_property);
5662 DEFSUBR (Fcoding_category_list);
5663 DEFSUBR (Fset_coding_priority_list);
5664 DEFSUBR (Fcoding_priority_list);
5665 DEFSUBR (Fset_coding_category_system);
5666 DEFSUBR (Fcoding_category_system);
5668 DEFSUBR (Fdetect_coding_region);
5669 DEFSUBR (Fdecode_coding_region);
5670 DEFSUBR (Fencode_coding_region);
5672 DEFSUBR (Fdecode_shift_jis_char);
5673 DEFSUBR (Fencode_shift_jis_char);
5674 DEFSUBR (Fdecode_big5_char);
5675 DEFSUBR (Fencode_big5_char);
5677 defsymbol (&Qcoding_systemp, "coding-system-p");
5678 defsymbol (&Qno_conversion, "no-conversion");
5679 defsymbol (&Qraw_text, "raw-text");
5681 defsymbol (&Qbig5, "big5");
5682 defsymbol (&Qshift_jis, "shift-jis");
5683 defsymbol (&Qucs4, "ucs-4");
5684 defsymbol (&Qutf8, "utf-8");
5685 defsymbol (&Qccl, "ccl");
5686 defsymbol (&Qiso2022, "iso2022");
5688 defsymbol (&Qmnemonic, "mnemonic");
5689 defsymbol (&Qeol_type, "eol-type");
5690 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5691 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5693 defsymbol (&Qcr, "cr");
5694 defsymbol (&Qlf, "lf");
5695 defsymbol (&Qcrlf, "crlf");
5696 defsymbol (&Qeol_cr, "eol-cr");
5697 defsymbol (&Qeol_lf, "eol-lf");
5698 defsymbol (&Qeol_crlf, "eol-crlf");
5700 defsymbol (&Qcharset_g0, "charset-g0");
5701 defsymbol (&Qcharset_g1, "charset-g1");
5702 defsymbol (&Qcharset_g2, "charset-g2");
5703 defsymbol (&Qcharset_g3, "charset-g3");
5704 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5705 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5706 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5707 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5708 defsymbol (&Qno_iso6429, "no-iso6429");
5709 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5710 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5712 defsymbol (&Qshort, "short");
5713 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5714 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5715 defsymbol (&Qseven, "seven");
5716 defsymbol (&Qlock_shift, "lock-shift");
5717 defsymbol (&Qescape_quoted, "escape-quoted");
5720 defsymbol (&Qdisable_composition, "disable-composition");
5722 defsymbol (&Qencode, "encode");
5723 defsymbol (&Qdecode, "decode");
5726 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5728 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5730 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5732 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5734 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5736 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5738 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5740 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5742 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5745 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5750 lstream_type_create_file_coding (void)
5752 LSTREAM_HAS_METHOD (decoding, reader);
5753 LSTREAM_HAS_METHOD (decoding, writer);
5754 LSTREAM_HAS_METHOD (decoding, rewinder);
5755 LSTREAM_HAS_METHOD (decoding, seekable_p);
5756 LSTREAM_HAS_METHOD (decoding, flusher);
5757 LSTREAM_HAS_METHOD (decoding, closer);
5758 LSTREAM_HAS_METHOD (decoding, marker);
5760 LSTREAM_HAS_METHOD (encoding, reader);
5761 LSTREAM_HAS_METHOD (encoding, writer);
5762 LSTREAM_HAS_METHOD (encoding, rewinder);
5763 LSTREAM_HAS_METHOD (encoding, seekable_p);
5764 LSTREAM_HAS_METHOD (encoding, flusher);
5765 LSTREAM_HAS_METHOD (encoding, closer);
5766 LSTREAM_HAS_METHOD (encoding, marker);
5770 vars_of_file_coding (void)
5774 fcd = xnew (struct file_coding_dump);
5775 dumpstruct (&fcd, &fcd_description);
5777 /* Initialize to something reasonable ... */
5778 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5780 fcd->coding_category_system[i] = Qnil;
5781 fcd->coding_category_by_priority[i] = i;
5784 Fprovide (intern ("file-coding"));
5786 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5787 Coding system used for TTY keyboard input.
5788 Not used under a windowing system.
5790 Vkeyboard_coding_system = Qnil;
5792 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5793 Coding system used for TTY display output.
5794 Not used under a windowing system.
5796 Vterminal_coding_system = Qnil;
5798 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5799 Overriding coding system used when reading from a file or process.
5800 You should bind this variable with `let', but do not set it globally.
5801 If this is non-nil, it specifies the coding system that will be used
5802 to decode input on read operations, such as from a file or process.
5803 It overrides `buffer-file-coding-system-for-read',
5804 `insert-file-contents-pre-hook', etc. Use those variables instead of
5805 this one for permanent changes to the environment. */ );
5806 Vcoding_system_for_read = Qnil;
5808 DEFVAR_LISP ("coding-system-for-write",
5809 &Vcoding_system_for_write /*
5810 Overriding coding system used when writing to a file or process.
5811 You should bind this variable with `let', but do not set it globally.
5812 If this is non-nil, it specifies the coding system that will be used
5813 to encode output for write operations, such as to a file or process.
5814 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5815 Use those variables instead of this one for permanent changes to the
5817 Vcoding_system_for_write = Qnil;
5819 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5820 Coding system used to convert pathnames when accessing files.
5822 Vfile_name_coding_system = Qnil;
5824 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5825 Non-nil means the buffer contents are regarded as multi-byte form
5826 of characters, not a binary code. This affects the display, file I/O,
5827 and behaviors of various editing commands.
5829 Setting this to nil does not do anything.
5831 enable_multibyte_characters = 1;
5835 complex_vars_of_file_coding (void)
5837 staticpro (&Vcoding_system_hash_table);
5838 Vcoding_system_hash_table =
5839 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5841 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5842 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5844 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5846 struct codesys_prop csp; \
5848 csp.prop_type = (Prop_Type); \
5849 Dynarr_add (the_codesys_prop_dynarr, csp); \
5852 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5853 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5854 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5855 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5856 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5857 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5858 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5860 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5861 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5862 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5863 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5864 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5865 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5866 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5867 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5868 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5869 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5870 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5871 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5872 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5873 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5874 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5875 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5876 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5878 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5879 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5881 /* Need to create this here or we're really screwed. */
5883 (Qraw_text, Qno_conversion,
5884 build_string ("Raw text, which means it converts only line-break-codes."),
5885 list2 (Qmnemonic, build_string ("Raw")));
5888 (Qbinary, Qno_conversion,
5889 build_string ("Binary, which means it does not convert anything."),
5890 list4 (Qeol_type, Qlf,
5891 Qmnemonic, build_string ("Binary")));
5896 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5897 list2 (Qmnemonic, build_string ("UTF8")));
5900 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5902 Fdefine_coding_system_alias (Qfile_name, Qbinary);
5904 Fdefine_coding_system_alias (Qterminal, Qbinary);
5905 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5907 /* Need this for bootstrapping */
5908 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5909 Fget_coding_system (Qraw_text);
5912 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5913 = Fget_coding_system (Qutf8);
5916 #if defined(MULE) && !defined(UTF2000)
5920 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5921 fcd->ucs_to_mule_table[i] = Qnil;
5923 staticpro (&mule_to_ucs_table);
5924 mule_to_ucs_table = Fmake_char_table(Qgeneric);
5925 #endif /* defined(MULE) && !defined(UTF2000) */