1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
39 #include "file-coding.h"
41 Lisp_Object Qcoding_system_error;
43 Lisp_Object Vkeyboard_coding_system;
44 Lisp_Object Vterminal_coding_system;
45 Lisp_Object Vcoding_system_for_read;
46 Lisp_Object Vcoding_system_for_write;
47 Lisp_Object Vfile_name_coding_system;
49 /* Table of symbols identifying each coding category. */
50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
54 struct file_coding_dump {
55 /* Coding system currently associated with each coding category. */
56 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
58 /* Table of all coding categories in decreasing order of priority.
59 This describes a permutation of the possible coding categories. */
60 int coding_category_by_priority[CODING_CATEGORY_LAST];
62 #if defined(MULE) && !defined(UTF2000)
63 Lisp_Object ucs_to_mule_table[65536];
67 static const struct lrecord_description fcd_description_1[] = {
68 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
69 #if defined(MULE) && !defined(UTF2000)
70 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
75 static const struct struct_description fcd_description = {
76 sizeof (struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
105 Lisp_Object Qdisable_composition;
107 Lisp_Object Qencode, Qdecode;
109 Lisp_Object Vcoding_system_hash_table;
111 int enable_multibyte_characters;
114 /* Additional information used by the ISO2022 decoder and detector. */
115 struct iso2022_decoder
117 /* CHARSET holds the character sets currently assigned to the G0
118 through G3 variables. It is initialized from the array
119 INITIAL_CHARSET in CODESYS. */
120 Lisp_Object charset[4];
122 /* Which registers are currently invoked into the left (GL) and
123 right (GR) halves of the 8-bit encoding space? */
124 int register_left, register_right;
126 /* ISO_ESC holds a value indicating part of an escape sequence
127 that has already been seen. */
128 enum iso_esc_flag esc;
130 /* This records the bytes we've seen so far in an escape sequence,
131 in case the sequence is invalid (we spit out the bytes unchanged). */
132 unsigned char esc_bytes[8];
134 /* Index for next byte to store in ISO escape sequence. */
137 #ifdef ENABLE_COMPOSITE_CHARS
138 /* Stuff seen so far when composing a string. */
139 unsigned_char_dynarr *composite_chars;
142 /* If we saw an invalid designation sequence for a particular
143 register, we flag it here and switch to ASCII. The next time we
144 see a valid designation for this register, we turn off the flag
145 and do the designation normally, but pretend the sequence was
146 invalid. The effect of all this is that (most of the time) the
147 escape sequences for both the switch to the unknown charset, and
148 the switch back to the known charset, get inserted literally into
149 the buffer and saved out as such. The hope is that we can
150 preserve the escape sequences so that the resulting written out
151 file makes sense. If we don't do any of this, the designation
152 to the invalid charset will be preserved but that switch back
153 to the known charset will probably get eaten because it was
154 the same charset that was already present in the register. */
155 unsigned char invalid_designated[4];
157 /* We try to do similar things as above for direction-switching
158 sequences. If we encountered a direction switch while an
159 invalid designation was present, or an invalid designation
160 just after a direction switch (i.e. no valid designation
161 encountered yet), we insert the direction-switch escape
162 sequence literally into the output stream, and later on
163 insert the corresponding direction-restoring escape sequence
165 unsigned int switched_dir_and_no_valid_charset_yet :1;
166 unsigned int invalid_switch_dir :1;
168 /* Tells the decoder to output the escape sequence literally
169 even though it was valid. Used in the games we play to
170 avoid lossage when we encounter invalid designations. */
171 unsigned int output_literally :1;
172 /* We encountered a direction switch followed by an invalid
173 designation. We didn't output the direction switch
174 literally because we didn't know about the invalid designation;
175 but we have to do so now. */
176 unsigned int output_direction_sequence :1;
179 EXFUN (Fcopy_coding_system, 2);
181 struct detection_state;
184 text_encode_generic (Lstream *encoding, const Bufbyte *src,
185 unsigned_char_dynarr *dst, size_t n);
187 static int detect_coding_sjis (struct detection_state *st,
188 const Extbyte *src, size_t n);
189 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
190 unsigned_char_dynarr *dst, size_t n);
191 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
192 unsigned_char_dynarr *dst, unsigned int *flags);
193 void char_finish_shift_jis (struct encoding_stream *str,
194 unsigned_char_dynarr *dst, unsigned int *flags);
196 static int detect_coding_big5 (struct detection_state *st,
197 const Extbyte *src, size_t n);
198 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
199 unsigned_char_dynarr *dst, size_t n);
200 void char_encode_big5 (struct encoding_stream *str, Emchar c,
201 unsigned_char_dynarr *dst, unsigned int *flags);
202 void char_finish_big5 (struct encoding_stream *str,
203 unsigned_char_dynarr *dst, unsigned int *flags);
205 static int detect_coding_ucs4 (struct detection_state *st,
206 const Extbyte *src, size_t n);
207 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
208 unsigned_char_dynarr *dst, size_t n);
209 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
210 unsigned_char_dynarr *dst, unsigned int *flags);
211 void char_finish_ucs4 (struct encoding_stream *str,
212 unsigned_char_dynarr *dst, unsigned int *flags);
214 static int detect_coding_utf8 (struct detection_state *st,
215 const Extbyte *src, size_t n);
216 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
217 unsigned_char_dynarr *dst, size_t n);
218 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
219 unsigned_char_dynarr *dst, unsigned int *flags);
220 void char_finish_utf8 (struct encoding_stream *str,
221 unsigned_char_dynarr *dst, unsigned int *flags);
223 static int postprocess_iso2022_mask (int mask);
224 static void reset_iso2022 (Lisp_Object coding_system,
225 struct iso2022_decoder *iso);
226 static int detect_coding_iso2022 (struct detection_state *st,
227 const Extbyte *src, size_t n);
228 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
229 unsigned_char_dynarr *dst, size_t n);
230 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
231 unsigned_char_dynarr *dst, unsigned int *flags);
232 void char_finish_iso2022 (struct encoding_stream *str,
233 unsigned_char_dynarr *dst, unsigned int *flags);
235 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
236 unsigned_char_dynarr *dst, size_t n);
237 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
238 unsigned_char_dynarr *dst, size_t n);
239 static void mule_decode (Lstream *decoding, const Extbyte *src,
240 unsigned_char_dynarr *dst, size_t n);
241 static void mule_encode (Lstream *encoding, const Bufbyte *src,
242 unsigned_char_dynarr *dst, size_t n);
244 typedef struct codesys_prop codesys_prop;
253 Dynarr_declare (codesys_prop);
254 } codesys_prop_dynarr;
256 static const struct lrecord_description codesys_prop_description_1[] = {
257 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
261 static const struct struct_description codesys_prop_description = {
262 sizeof (codesys_prop),
263 codesys_prop_description_1
266 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
267 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
271 static const struct struct_description codesys_prop_dynarr_description = {
272 sizeof (codesys_prop_dynarr),
273 codesys_prop_dynarr_description_1
276 codesys_prop_dynarr *the_codesys_prop_dynarr;
278 enum codesys_prop_enum
281 CODESYS_PROP_ISO2022,
286 /************************************************************************/
287 /* Coding system functions */
288 /************************************************************************/
290 static Lisp_Object mark_coding_system (Lisp_Object);
291 static void print_coding_system (Lisp_Object, Lisp_Object, int);
292 static void finalize_coding_system (void *header, int for_disksave);
295 static const struct lrecord_description ccs_description_1[] = {
296 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
297 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
301 static const struct struct_description ccs_description = {
302 sizeof (charset_conversion_spec),
306 static const struct lrecord_description ccsd_description_1[] = {
307 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
311 static const struct struct_description ccsd_description = {
312 sizeof (charset_conversion_spec_dynarr),
317 static const struct lrecord_description coding_system_description[] = {
318 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
319 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
320 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
321 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
324 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
325 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
327 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
328 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
329 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
330 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
331 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
336 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
337 mark_coding_system, print_coding_system,
338 finalize_coding_system,
339 0, 0, coding_system_description,
343 mark_coding_system (Lisp_Object obj)
345 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
347 mark_object (CODING_SYSTEM_NAME (codesys));
348 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
349 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
350 mark_object (CODING_SYSTEM_EOL_LF (codesys));
351 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
352 mark_object (CODING_SYSTEM_EOL_CR (codesys));
354 switch (CODING_SYSTEM_TYPE (codesys))
358 case CODESYS_ISO2022:
359 for (i = 0; i < 4; i++)
360 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
361 if (codesys->iso2022.input_conv)
363 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
365 struct charset_conversion_spec *ccs =
366 Dynarr_atp (codesys->iso2022.input_conv, i);
367 mark_object (ccs->from_charset);
368 mark_object (ccs->to_charset);
371 if (codesys->iso2022.output_conv)
373 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
375 struct charset_conversion_spec *ccs =
376 Dynarr_atp (codesys->iso2022.output_conv, i);
377 mark_object (ccs->from_charset);
378 mark_object (ccs->to_charset);
384 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
385 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
392 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
393 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
397 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
400 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
402 error ("printing unreadable object #<coding_system 0x%x>",
405 write_c_string ("#<coding_system ", printcharfun);
406 print_internal (c->name, printcharfun, 1);
407 write_c_string (">", printcharfun);
411 finalize_coding_system (void *header, int for_disksave)
413 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
414 /* Since coding systems never go away, this function is not
415 necessary. But it would be necessary if we changed things
416 so that coding systems could go away. */
417 if (!for_disksave) /* see comment in lstream.c */
419 switch (CODING_SYSTEM_TYPE (c))
422 case CODESYS_ISO2022:
423 if (c->iso2022.input_conv)
425 Dynarr_free (c->iso2022.input_conv);
426 c->iso2022.input_conv = 0;
428 if (c->iso2022.output_conv)
430 Dynarr_free (c->iso2022.output_conv);
431 c->iso2022.output_conv = 0;
442 symbol_to_eol_type (Lisp_Object symbol)
444 CHECK_SYMBOL (symbol);
445 if (NILP (symbol)) return EOL_AUTODETECT;
446 if (EQ (symbol, Qlf)) return EOL_LF;
447 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
448 if (EQ (symbol, Qcr)) return EOL_CR;
450 signal_simple_error ("Unrecognized eol type", symbol);
451 return EOL_AUTODETECT; /* not reached */
455 eol_type_to_symbol (eol_type_t type)
460 case EOL_LF: return Qlf;
461 case EOL_CRLF: return Qcrlf;
462 case EOL_CR: return Qcr;
463 case EOL_AUTODETECT: return Qnil;
468 setup_eol_coding_systems (Lisp_Coding_System *codesys)
470 Lisp_Object codesys_obj;
471 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
472 char *codesys_name = (char *) alloca (len + 7);
474 char *codesys_mnemonic=0;
476 Lisp_Object codesys_name_sym, sub_codesys_obj;
480 XSETCODING_SYSTEM (codesys_obj, codesys);
482 memcpy (codesys_name,
483 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
485 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
487 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
488 codesys_mnemonic = (char *) alloca (mlen + 7);
489 memcpy (codesys_mnemonic,
490 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
493 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
494 strcpy (codesys_name + len, "-" op_sys); \
496 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
497 codesys_name_sym = intern (codesys_name); \
498 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
499 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
501 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
502 build_string (codesys_mnemonic); \
503 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
506 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
507 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
508 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
511 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
512 Return t if OBJECT is a coding system.
513 A coding system is an object that defines how text containing multiple
514 character sets is encoded into a stream of (typically 8-bit) bytes.
515 The coding system is used to decode the stream into a series of
516 characters (which may be from multiple charsets) when the text is read
517 from a file or process, and is used to encode the text back into the
518 same format when it is written out to a file or process.
520 For example, many ISO2022-compliant coding systems (such as Compound
521 Text, which is used for inter-client data under the X Window System)
522 use escape sequences to switch between different charsets -- Japanese
523 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
524 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
525 `make-coding-system' for more information.
527 Coding systems are normally identified using a symbol, and the
528 symbol is accepted in place of the actual coding system object whenever
529 a coding system is called for. (This is similar to how faces work.)
533 return CODING_SYSTEMP (object) ? Qt : Qnil;
536 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
537 Retrieve the coding system of the given name.
539 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
540 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
541 If there is no such coding system, nil is returned. Otherwise the
542 associated coding system object is returned.
544 (coding_system_or_name))
546 if (NILP (coding_system_or_name))
547 coding_system_or_name = Qbinary;
548 else if (CODING_SYSTEMP (coding_system_or_name))
549 return coding_system_or_name;
551 CHECK_SYMBOL (coding_system_or_name);
555 coding_system_or_name =
556 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
558 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
559 return coding_system_or_name;
563 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
564 Retrieve the coding system of the given name.
565 Same as `find-coding-system' except that if there is no such
566 coding system, an error is signaled instead of returning nil.
570 Lisp_Object coding_system = Ffind_coding_system (name);
572 if (NILP (coding_system))
573 signal_simple_error ("No such coding system", name);
574 return coding_system;
577 /* We store the coding systems in hash tables with the names as the key and the
578 actual coding system object as the value. Occasionally we need to use them
579 in a list format. These routines provide us with that. */
580 struct coding_system_list_closure
582 Lisp_Object *coding_system_list;
586 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
587 void *coding_system_list_closure)
589 /* This function can GC */
590 struct coding_system_list_closure *cscl =
591 (struct coding_system_list_closure *) coding_system_list_closure;
592 Lisp_Object *coding_system_list = cscl->coding_system_list;
594 *coding_system_list = Fcons (key, *coding_system_list);
598 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
599 Return a list of the names of all defined coding systems.
603 Lisp_Object coding_system_list = Qnil;
605 struct coding_system_list_closure coding_system_list_closure;
607 GCPRO1 (coding_system_list);
608 coding_system_list_closure.coding_system_list = &coding_system_list;
609 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
610 &coding_system_list_closure);
613 return coding_system_list;
616 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
617 Return the name of the given coding system.
621 coding_system = Fget_coding_system (coding_system);
622 return XCODING_SYSTEM_NAME (coding_system);
625 static Lisp_Coding_System *
626 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
628 Lisp_Coding_System *codesys =
629 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
631 zero_lcrecord (codesys);
632 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
633 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
634 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
635 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
636 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
637 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
638 CODING_SYSTEM_TYPE (codesys) = type;
639 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
641 if (type == CODESYS_ISO2022)
644 for (i = 0; i < 4; i++)
645 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
647 else if (type == CODESYS_CCL)
649 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
650 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
653 CODING_SYSTEM_NAME (codesys) = name;
659 /* Given a list of charset conversion specs as specified in a Lisp
660 program, parse it into STORE_HERE. */
663 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
664 Lisp_Object spec_list)
668 EXTERNAL_LIST_LOOP (rest, spec_list)
670 Lisp_Object car = XCAR (rest);
671 Lisp_Object from, to;
672 struct charset_conversion_spec spec;
674 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
675 signal_simple_error ("Invalid charset conversion spec", car);
676 from = Fget_charset (XCAR (car));
677 to = Fget_charset (XCAR (XCDR (car)));
678 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
679 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
680 signal_simple_error_2
681 ("Attempted conversion between different charset types",
683 spec.from_charset = from;
684 spec.to_charset = to;
686 Dynarr_add (store_here, spec);
690 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
691 specs, return the equivalent as the Lisp programmer would see it.
693 If LOAD_HERE is 0, return Qnil. */
696 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
703 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
705 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
706 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
709 return Fnreverse (result);
714 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
715 Register symbol NAME as a coding system.
717 TYPE describes the conversion method used and should be one of
720 Automatic conversion. XEmacs attempts to detect the coding system
723 No conversion. Use this for binary files and such. On output,
724 graphic characters that are not in ASCII or Latin-1 will be
725 replaced by a ?. (For a no-conversion-encoded buffer, these
726 characters will only be present if you explicitly insert them.)
728 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
730 ISO 10646 UCS-4 encoding.
732 ISO 10646 UTF-8 encoding.
734 Any ISO2022-compliant encoding. Among other things, this includes
735 JIS (the Japanese encoding commonly used for e-mail), EUC (the
736 standard Unix encoding for Japanese and other languages), and
737 Compound Text (the encoding used in X11). You can specify more
738 specific information about the conversion with the PROPS argument.
740 Big5 (the encoding commonly used for Taiwanese).
742 The conversion is performed using a user-written pseudo-code
743 program. CCL (Code Conversion Language) is the name of this
746 Write out or read in the raw contents of the memory representing
747 the buffer's text. This is primarily useful for debugging
748 purposes, and is only enabled when XEmacs has been compiled with
749 DEBUG_XEMACS defined (via the --debug configure option).
750 WARNING: Reading in a file using 'internal conversion can result
751 in an internal inconsistency in the memory representing a
752 buffer's text, which will produce unpredictable results and may
753 cause XEmacs to crash. Under normal circumstances you should
754 never use 'internal conversion.
756 DOC-STRING is a string describing the coding system.
758 PROPS is a property list, describing the specific nature of the
759 character set. Recognized properties are:
762 String to be displayed in the modeline when this coding system is
766 End-of-line conversion to be used. It should be one of
769 Automatically detect the end-of-line type (LF, CRLF,
770 or CR). Also generate subsidiary coding systems named
771 `NAME-unix', `NAME-dos', and `NAME-mac', that are
772 identical to this coding system but have an EOL-TYPE
773 value of 'lf, 'crlf, and 'cr, respectively.
775 The end of a line is marked externally using ASCII LF.
776 Since this is also the way that XEmacs represents an
777 end-of-line internally, specifying this option results
778 in no end-of-line conversion. This is the standard
779 format for Unix text files.
781 The end of a line is marked externally using ASCII
782 CRLF. This is the standard format for MS-DOS text
785 The end of a line is marked externally using ASCII CR.
786 This is the standard format for Macintosh text files.
788 Automatically detect the end-of-line type but do not
789 generate subsidiary coding systems. (This value is
790 converted to nil when stored internally, and
791 `coding-system-property' will return nil.)
794 If non-nil, composition/decomposition for combining characters
797 'post-read-conversion
798 Function called after a file has been read in, to perform the
799 decoding. Called with two arguments, START and END, denoting
800 a region of the current buffer to be decoded.
802 'pre-write-conversion
803 Function called before a file is written out, to perform the
804 encoding. Called with two arguments, START and END, denoting
805 a region of the current buffer to be encoded.
808 The following additional properties are recognized if TYPE is 'iso2022:
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
841 If non-nil, don't designate ASCII to G0 before control chars on output.
844 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
852 If non-nil, don't use ISO6429's direction specification.
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files. Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
880 The following additional properties are recognized (and required)
884 CCL program used for decoding (converting to internal format).
887 CCL program used for encoding (converting to external format).
889 (name, type, doc_string, props))
891 Lisp_Coding_System *codesys;
892 enum coding_system_type ty;
893 int need_to_setup_eol_systems = 1;
895 /* Convert type to constant */
896 if (NILP (type) || EQ (type, Qundecided))
897 { ty = CODESYS_AUTODETECT; }
899 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
900 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
901 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
902 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
903 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
904 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
906 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
908 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
911 signal_simple_error ("Invalid coding system type", type);
915 codesys = allocate_coding_system (ty, name);
917 if (NILP (doc_string))
918 doc_string = build_string ("");
920 CHECK_STRING (doc_string);
921 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
924 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
926 if (EQ (key, Qmnemonic))
929 CHECK_STRING (value);
930 CODING_SYSTEM_MNEMONIC (codesys) = value;
933 else if (EQ (key, Qeol_type))
935 need_to_setup_eol_systems = NILP (value);
938 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
941 else if (EQ (key, Qpost_read_conversion))
942 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
943 else if (EQ (key, Qpre_write_conversion))
944 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
946 else if (EQ (key, Qdisable_composition))
947 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
950 else if (ty == CODESYS_ISO2022)
952 #define FROB_INITIAL_CHARSET(charset_num) \
953 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
954 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
956 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
957 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
958 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
959 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
961 #define FROB_FORCE_CHARSET(charset_num) \
962 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
964 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
965 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
966 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
967 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
969 #define FROB_BOOLEAN_PROPERTY(prop) \
970 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
972 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
973 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
974 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
975 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
976 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
977 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
978 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
980 else if (EQ (key, Qinput_charset_conversion))
982 codesys->iso2022.input_conv =
983 Dynarr_new (charset_conversion_spec);
984 parse_charset_conversion_specs (codesys->iso2022.input_conv,
987 else if (EQ (key, Qoutput_charset_conversion))
989 codesys->iso2022.output_conv =
990 Dynarr_new (charset_conversion_spec);
991 parse_charset_conversion_specs (codesys->iso2022.output_conv,
995 signal_simple_error ("Unrecognized property", key);
997 else if (EQ (type, Qccl))
1000 struct ccl_program test_ccl;
1003 /* Check key first. */
1004 if (EQ (key, Qdecode))
1005 suffix = "-ccl-decode";
1006 else if (EQ (key, Qencode))
1007 suffix = "-ccl-encode";
1009 signal_simple_error ("Unrecognized property", key);
1011 /* If value is vector, register it as a ccl program
1012 associated with an newly created symbol for
1013 backward compatibility. */
1014 if (VECTORP (value))
1016 sym = Fintern (concat2 (Fsymbol_name (name),
1017 build_string (suffix)),
1019 Fregister_ccl_program (sym, value);
1023 CHECK_SYMBOL (value);
1026 /* check if the given ccl programs are valid. */
1027 if (setup_ccl_program (&test_ccl, sym) < 0)
1028 signal_simple_error ("Invalid CCL program", value);
1030 if (EQ (key, Qdecode))
1031 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1032 else if (EQ (key, Qencode))
1033 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1038 signal_simple_error ("Unrecognized property", key);
1042 if (need_to_setup_eol_systems)
1043 setup_eol_coding_systems (codesys);
1046 Lisp_Object codesys_obj;
1047 XSETCODING_SYSTEM (codesys_obj, codesys);
1048 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1053 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1054 Copy OLD-CODING-SYSTEM to NEW-NAME.
1055 If NEW-NAME does not name an existing coding system, a new one will
1058 (old_coding_system, new_name))
1060 Lisp_Object new_coding_system;
1061 old_coding_system = Fget_coding_system (old_coding_system);
1062 new_coding_system = Ffind_coding_system (new_name);
1063 if (NILP (new_coding_system))
1065 XSETCODING_SYSTEM (new_coding_system,
1066 allocate_coding_system
1067 (XCODING_SYSTEM_TYPE (old_coding_system),
1069 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1073 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1074 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1075 memcpy (((char *) to ) + sizeof (to->header),
1076 ((char *) from) + sizeof (from->header),
1077 sizeof (*from) - sizeof (from->header));
1078 to->name = new_name;
1080 return new_coding_system;
1083 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1084 Return t if OBJECT names a coding system, and is not a coding system alias.
1088 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1092 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1093 Return t if OBJECT is a coding system alias.
1094 All coding system aliases are created by `define-coding-system-alias'.
1098 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1102 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1103 Return the coding-system symbol for which symbol ALIAS is an alias.
1107 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1108 if (SYMBOLP (aliasee))
1111 signal_simple_error ("Symbol is not a coding system alias", alias);
1112 return Qnil; /* To keep the compiler happy */
1116 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1118 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1122 /* A maphash function, for removing dangling coding system aliases. */
1124 dangling_coding_system_alias_p (Lisp_Object alias,
1125 Lisp_Object aliasee,
1126 void *dangling_aliases)
1128 if (SYMBOLP (aliasee)
1129 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1131 (*(int *) dangling_aliases)++;
1138 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1139 Define symbol ALIAS as an alias for coding system ALIASEE.
1141 You can use this function to redefine an alias that has already been defined,
1142 but you cannot redefine a name which is the canonical name for a coding system.
1143 \(a canonical name of a coding system is what is returned when you call
1144 `coding-system-name' on a coding system).
1146 ALIASEE itself can be an alias, which allows you to define nested aliases.
1148 You are forbidden, however, from creating alias loops or `dangling' aliases.
1149 These will be detected, and an error will be signaled if you attempt to do so.
1151 If ALIASEE is nil, then ALIAS will simply be undefined.
1153 See also `coding-system-alias-p', `coding-system-aliasee',
1154 and `coding-system-canonical-name-p'.
1158 Lisp_Object real_coding_system, probe;
1160 CHECK_SYMBOL (alias);
1162 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1164 ("Symbol is the canonical name of a coding system and cannot be redefined",
1169 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1170 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1171 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1173 Fremhash (alias, Vcoding_system_hash_table);
1175 /* Undefine subsidiary aliases,
1176 presumably created by a previous call to this function */
1177 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1178 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1179 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1181 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1182 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1183 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1186 /* Undefine dangling coding system aliases. */
1188 int dangling_aliases;
1191 dangling_aliases = 0;
1192 elisp_map_remhash (dangling_coding_system_alias_p,
1193 Vcoding_system_hash_table,
1195 } while (dangling_aliases > 0);
1201 if (CODING_SYSTEMP (aliasee))
1202 aliasee = XCODING_SYSTEM_NAME (aliasee);
1204 /* Checks that aliasee names a coding-system */
1205 real_coding_system = Fget_coding_system (aliasee);
1207 /* Check for coding system alias loops */
1208 if (EQ (alias, aliasee))
1209 alias_loop: signal_simple_error_2
1210 ("Attempt to create a coding system alias loop", alias, aliasee);
1212 for (probe = aliasee;
1214 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1216 if (EQ (probe, alias))
1220 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1222 /* Set up aliases for subsidiaries.
1223 #### There must be a better way to handle subsidiary coding systems. */
1225 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1227 for (i = 0; i < countof (suffixes); i++)
1229 Lisp_Object alias_subsidiary =
1230 append_suffix_to_symbol (alias, suffixes[i]);
1231 Lisp_Object aliasee_subsidiary =
1232 append_suffix_to_symbol (aliasee, suffixes[i]);
1234 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1235 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1238 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1239 but it doesn't look intentional, so I'd rather return something
1240 meaningful or nothing at all. */
1245 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1247 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1248 Lisp_Object new_coding_system;
1250 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1251 return coding_system;
1255 case EOL_AUTODETECT: return coding_system;
1256 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1257 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1258 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1259 default: abort (); return Qnil;
1262 return NILP (new_coding_system) ? coding_system : new_coding_system;
1265 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1266 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1268 (coding_system, eol_type))
1270 coding_system = Fget_coding_system (coding_system);
1272 return subsidiary_coding_system (coding_system,
1273 symbol_to_eol_type (eol_type));
1277 /************************************************************************/
1278 /* Coding system accessors */
1279 /************************************************************************/
1281 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1282 Return the doc string for CODING-SYSTEM.
1286 coding_system = Fget_coding_system (coding_system);
1287 return XCODING_SYSTEM_DOC_STRING (coding_system);
1290 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1291 Return the type of CODING-SYSTEM.
1295 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1298 case CODESYS_AUTODETECT: return Qundecided;
1300 case CODESYS_SHIFT_JIS: return Qshift_jis;
1301 case CODESYS_ISO2022: return Qiso2022;
1302 case CODESYS_BIG5: return Qbig5;
1303 case CODESYS_UCS4: return Qucs4;
1304 case CODESYS_UTF8: return Qutf8;
1305 case CODESYS_CCL: return Qccl;
1307 case CODESYS_NO_CONVERSION: return Qno_conversion;
1309 case CODESYS_INTERNAL: return Qinternal;
1316 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1319 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1321 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1324 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1325 Return initial charset of CODING-SYSTEM designated to GNUM.
1328 (coding_system, gnum))
1330 coding_system = Fget_coding_system (coding_system);
1333 return coding_system_charset (coding_system, XINT (gnum));
1337 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1338 Return the PROP property of CODING-SYSTEM.
1340 (coding_system, prop))
1343 enum coding_system_type type;
1345 coding_system = Fget_coding_system (coding_system);
1346 CHECK_SYMBOL (prop);
1347 type = XCODING_SYSTEM_TYPE (coding_system);
1349 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1350 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1353 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1355 case CODESYS_PROP_ALL_OK:
1358 case CODESYS_PROP_ISO2022:
1359 if (type != CODESYS_ISO2022)
1361 ("Property only valid in ISO2022 coding systems",
1365 case CODESYS_PROP_CCL:
1366 if (type != CODESYS_CCL)
1368 ("Property only valid in CCL coding systems",
1378 signal_simple_error ("Unrecognized property", prop);
1380 if (EQ (prop, Qname))
1381 return XCODING_SYSTEM_NAME (coding_system);
1382 else if (EQ (prop, Qtype))
1383 return Fcoding_system_type (coding_system);
1384 else if (EQ (prop, Qdoc_string))
1385 return XCODING_SYSTEM_DOC_STRING (coding_system);
1386 else if (EQ (prop, Qmnemonic))
1387 return XCODING_SYSTEM_MNEMONIC (coding_system);
1388 else if (EQ (prop, Qeol_type))
1389 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1390 else if (EQ (prop, Qeol_lf))
1391 return XCODING_SYSTEM_EOL_LF (coding_system);
1392 else if (EQ (prop, Qeol_crlf))
1393 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1394 else if (EQ (prop, Qeol_cr))
1395 return XCODING_SYSTEM_EOL_CR (coding_system);
1396 else if (EQ (prop, Qpost_read_conversion))
1397 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1398 else if (EQ (prop, Qpre_write_conversion))
1399 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1401 else if (type == CODESYS_ISO2022)
1403 if (EQ (prop, Qcharset_g0))
1404 return coding_system_charset (coding_system, 0);
1405 else if (EQ (prop, Qcharset_g1))
1406 return coding_system_charset (coding_system, 1);
1407 else if (EQ (prop, Qcharset_g2))
1408 return coding_system_charset (coding_system, 2);
1409 else if (EQ (prop, Qcharset_g3))
1410 return coding_system_charset (coding_system, 3);
1412 #define FORCE_CHARSET(charset_num) \
1413 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1414 (coding_system, charset_num) ? Qt : Qnil)
1416 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1417 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1418 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1419 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1421 #define LISP_BOOLEAN(prop) \
1422 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1424 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1425 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1426 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1427 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1428 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1429 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1430 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1432 else if (EQ (prop, Qinput_charset_conversion))
1434 unparse_charset_conversion_specs
1435 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1436 else if (EQ (prop, Qoutput_charset_conversion))
1438 unparse_charset_conversion_specs
1439 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1443 else if (type == CODESYS_CCL)
1445 if (EQ (prop, Qdecode))
1446 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1447 else if (EQ (prop, Qencode))
1448 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1456 return Qnil; /* not reached */
1460 /************************************************************************/
1461 /* Coding category functions */
1462 /************************************************************************/
1465 decode_coding_category (Lisp_Object symbol)
1469 CHECK_SYMBOL (symbol);
1470 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1471 if (EQ (coding_category_symbol[i], symbol))
1474 signal_simple_error ("Unrecognized coding category", symbol);
1475 return 0; /* not reached */
1478 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1479 Return a list of all recognized coding categories.
1484 Lisp_Object list = Qnil;
1486 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1487 list = Fcons (coding_category_symbol[i], list);
1491 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1492 Change the priority order of the coding categories.
1493 LIST should be list of coding categories, in descending order of
1494 priority. Unspecified coding categories will be lower in priority
1495 than all specified ones, in the same relative order they were in
1500 int category_to_priority[CODING_CATEGORY_LAST];
1504 /* First generate a list that maps coding categories to priorities. */
1506 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1507 category_to_priority[i] = -1;
1509 /* Highest priority comes from the specified list. */
1511 EXTERNAL_LIST_LOOP (rest, list)
1513 int cat = decode_coding_category (XCAR (rest));
1515 if (category_to_priority[cat] >= 0)
1516 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1517 category_to_priority[cat] = i++;
1520 /* Now go through the existing categories by priority to retrieve
1521 the categories not yet specified and preserve their priority
1523 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1525 int cat = fcd->coding_category_by_priority[j];
1526 if (category_to_priority[cat] < 0)
1527 category_to_priority[cat] = i++;
1530 /* Now we need to construct the inverse of the mapping we just
1533 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1534 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1536 /* Phew! That was confusing. */
1540 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1541 Return a list of coding categories in descending order of priority.
1546 Lisp_Object list = Qnil;
1548 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1549 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1554 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1555 Change the coding system associated with a coding category.
1557 (coding_category, coding_system))
1559 int cat = decode_coding_category (coding_category);
1561 coding_system = Fget_coding_system (coding_system);
1562 fcd->coding_category_system[cat] = coding_system;
1566 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1567 Return the coding system associated with a coding category.
1571 int cat = decode_coding_category (coding_category);
1572 Lisp_Object sys = fcd->coding_category_system[cat];
1575 return XCODING_SYSTEM_NAME (sys);
1580 /************************************************************************/
1581 /* Detecting the encoding of data */
1582 /************************************************************************/
1584 struct detection_state
1586 eol_type_t eol_type;
1622 struct iso2022_decoder iso;
1624 int high_byte_count;
1625 unsigned int saw_single_shift:1;
1638 acceptable_control_char_p (int c)
1642 /* Allow and ignore control characters that you might
1643 reasonably see in a text file */
1648 case 8: /* backspace */
1649 case 11: /* vertical tab */
1650 case 12: /* form feed */
1651 case 26: /* MS-DOS C-z junk */
1652 case 31: /* '^_' -- for info */
1660 mask_has_at_most_one_bit_p (int mask)
1662 /* Perhaps the only thing useful you learn from intensive Microsoft
1663 technical interviews */
1664 return (mask & (mask - 1)) == 0;
1668 detect_eol_type (struct detection_state *st, const Extbyte *src,
1673 unsigned char c = *(unsigned char *)src++;
1676 if (st->eol.just_saw_cr)
1678 else if (st->eol.seen_anything)
1681 else if (st->eol.just_saw_cr)
1684 st->eol.just_saw_cr = 1;
1686 st->eol.just_saw_cr = 0;
1687 st->eol.seen_anything = 1;
1690 return EOL_AUTODETECT;
1693 /* Attempt to determine the encoding and EOL type of the given text.
1694 Before calling this function for the first type, you must initialize
1695 st->eol_type as appropriate and initialize st->mask to ~0.
1697 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1700 st->mask holds the determined coding category mask, or ~0 if only
1701 ASCII has been seen so far.
1705 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1706 is present in st->mask
1707 1 == definitive answers are here for both st->eol_type and st->mask
1711 detect_coding_type (struct detection_state *st, const Extbyte *src,
1712 size_t n, int just_do_eol)
1714 if (st->eol_type == EOL_AUTODETECT)
1715 st->eol_type = detect_eol_type (st, src, n);
1718 return st->eol_type != EOL_AUTODETECT;
1720 if (!st->seen_non_ascii)
1722 for (; n; n--, src++)
1724 unsigned char c = *(unsigned char *) src;
1725 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1727 st->seen_non_ascii = 1;
1729 st->shift_jis.mask = ~0;
1733 st->iso2022.mask = ~0;
1743 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1744 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1745 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1746 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1747 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1748 st->big5.mask = detect_coding_big5 (st, src, n);
1749 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1750 st->utf8.mask = detect_coding_utf8 (st, src, n);
1751 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1752 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1755 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1756 | st->utf8.mask | st->ucs4.mask;
1759 int retval = mask_has_at_most_one_bit_p (st->mask);
1760 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1761 return retval && st->eol_type != EOL_AUTODETECT;
1766 coding_system_from_mask (int mask)
1770 /* If the file was entirely or basically ASCII, use the
1771 default value of `buffer-file-coding-system'. */
1772 Lisp_Object retval =
1773 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1776 retval = Ffind_coding_system (retval);
1780 (Qbad_variable, Qwarning,
1781 "Invalid `default-buffer-file-coding-system', set to nil");
1782 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1786 retval = Fget_coding_system (Qraw_text);
1794 mask = postprocess_iso2022_mask (mask);
1796 /* Look through the coding categories by priority and find
1797 the first one that is allowed. */
1798 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1800 cat = fcd->coding_category_by_priority[i];
1801 if ((mask & (1 << cat)) &&
1802 !NILP (fcd->coding_category_system[cat]))
1806 return fcd->coding_category_system[cat];
1808 return Fget_coding_system (Qraw_text);
1812 /* Given a seekable read stream and potential coding system and EOL type
1813 as specified, do any autodetection that is called for. If the
1814 coding system and/or EOL type are not `autodetect', they will be left
1815 alone; but this function will never return an autodetect coding system
1818 This function does not automatically fetch subsidiary coding systems;
1819 that should be unnecessary with the explicit eol-type argument. */
1821 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1824 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1825 eol_type_t *eol_type_in_out)
1827 struct detection_state decst;
1829 if (*eol_type_in_out == EOL_AUTODETECT)
1830 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1833 decst.eol_type = *eol_type_in_out;
1836 /* If autodetection is called for, do it now. */
1837 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1838 || *eol_type_in_out == EOL_AUTODETECT)
1841 Lisp_Object coding_system = Qnil;
1843 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1846 /* Look for initial "-*-"; mode line prefix */
1848 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1853 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1855 Extbyte *local_vars_beg = p + 3;
1856 /* Look for final "-*-"; mode line suffix */
1857 for (p = local_vars_beg,
1858 scan_end = buf + nread - LENGTH ("-*-");
1863 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1865 Extbyte *suffix = p;
1866 /* Look for "coding:" */
1867 for (p = local_vars_beg,
1868 scan_end = suffix - LENGTH ("coding:?");
1871 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1872 && (p == local_vars_beg
1873 || (*(p-1) == ' ' ||
1879 p += LENGTH ("coding:");
1880 while (*p == ' ' || *p == '\t') p++;
1882 /* Get coding system name */
1883 save = *suffix; *suffix = '\0';
1884 /* Characters valid in a MIME charset name (rfc 1521),
1885 and in a Lisp symbol name. */
1886 n = strspn ( (char *) p,
1887 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1888 "abcdefghijklmnopqrstuvwxyz"
1894 save = p[n]; p[n] = '\0';
1896 Ffind_coding_system (intern ((char *) p));
1906 if (NILP (coding_system))
1909 if (detect_coding_type (&decst, buf, nread,
1910 XCODING_SYSTEM_TYPE (*codesys_in_out)
1911 != CODESYS_AUTODETECT))
1913 nread = Lstream_read (stream, buf, sizeof (buf));
1919 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1920 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1923 if (detect_coding_type (&decst, buf, nread, 1))
1925 nread = Lstream_read (stream, buf, sizeof (buf));
1931 *eol_type_in_out = decst.eol_type;
1932 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1934 if (NILP (coding_system))
1935 *codesys_in_out = coding_system_from_mask (decst.mask);
1937 *codesys_in_out = coding_system;
1941 /* If we absolutely can't determine the EOL type, just assume LF. */
1942 if (*eol_type_in_out == EOL_AUTODETECT)
1943 *eol_type_in_out = EOL_LF;
1945 Lstream_rewind (stream);
1948 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1949 Detect coding system of the text in the region between START and END.
1950 Return a list of possible coding systems ordered by priority.
1951 If only ASCII characters are found, return 'undecided or one of
1952 its subsidiary coding systems according to a detected end-of-line
1953 type. Optional arg BUFFER defaults to the current buffer.
1955 (start, end, buffer))
1957 Lisp_Object val = Qnil;
1958 struct buffer *buf = decode_buffer (buffer, 0);
1960 Lisp_Object instream, lb_instream;
1961 Lstream *istr, *lb_istr;
1962 struct detection_state decst;
1963 struct gcpro gcpro1, gcpro2;
1965 get_buffer_range_char (buf, start, end, &b, &e, 0);
1966 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1967 lb_istr = XLSTREAM (lb_instream);
1968 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1969 istr = XLSTREAM (instream);
1970 GCPRO2 (instream, lb_instream);
1972 decst.eol_type = EOL_AUTODETECT;
1976 Extbyte random_buffer[4096];
1977 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1981 if (detect_coding_type (&decst, random_buffer, nread, 0))
1985 if (decst.mask == ~0)
1986 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1994 decst.mask = postprocess_iso2022_mask (decst.mask);
1996 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1998 int sys = fcd->coding_category_by_priority[i];
1999 if (decst.mask & (1 << sys))
2001 Lisp_Object codesys = fcd->coding_category_system[sys];
2002 if (!NILP (codesys))
2003 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2004 val = Fcons (codesys, val);
2008 Lstream_close (istr);
2010 Lstream_delete (istr);
2011 Lstream_delete (lb_istr);
2016 /************************************************************************/
2017 /* Converting to internal Mule format ("decoding") */
2018 /************************************************************************/
2020 /* A decoding stream is a stream used for decoding text (i.e.
2021 converting from some external format to internal format).
2022 The decoding-stream object keeps track of the actual coding
2023 stream, the stream that is at the other end, and data that
2024 needs to be persistent across the lifetime of the stream. */
2026 /* Handle the EOL stuff related to just-read-in character C.
2027 EOL_TYPE is the EOL type of the coding stream.
2028 FLAGS is the current value of FLAGS in the coding stream, and may
2029 be modified by this macro. (The macro only looks at the
2030 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2031 bytes are to be written. You need to also define a local goto
2032 label "label_continue_loop" that is at the end of the main
2033 character-reading loop.
2035 If C is a CR character, then this macro handles it entirely and
2036 jumps to label_continue_loop. Otherwise, this macro does not add
2037 anything to DST, and continues normally. You should continue
2038 processing C normally after this macro. */
2040 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2044 if (eol_type == EOL_CR) \
2045 Dynarr_add (dst, '\n'); \
2046 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2047 Dynarr_add (dst, c); \
2049 flags |= CODING_STATE_CR; \
2050 goto label_continue_loop; \
2052 else if (flags & CODING_STATE_CR) \
2053 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2055 Dynarr_add (dst, '\r'); \
2056 flags &= ~CODING_STATE_CR; \
2060 /* C should be a binary character in the range 0 - 255; convert
2061 to internal format and add to Dynarr DST. */
2064 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2066 if (BYTE_ASCII_P (c)) \
2067 Dynarr_add (dst, c); \
2070 Dynarr_add (dst, (c >> 6) | 0xc0); \
2071 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2075 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2077 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2081 Dynarr_add (dst, c);
2083 else if ( c <= 0x7ff )
2085 Dynarr_add (dst, (c >> 6) | 0xc0);
2086 Dynarr_add (dst, (c & 0x3f) | 0x80);
2088 else if ( c <= 0xffff )
2090 Dynarr_add (dst, (c >> 12) | 0xe0);
2091 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2092 Dynarr_add (dst, (c & 0x3f) | 0x80);
2094 else if ( c <= 0x1fffff )
2096 Dynarr_add (dst, (c >> 18) | 0xf0);
2097 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2098 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2099 Dynarr_add (dst, (c & 0x3f) | 0x80);
2101 else if ( c <= 0x3ffffff )
2103 Dynarr_add (dst, (c >> 24) | 0xf8);
2104 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2105 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2106 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2107 Dynarr_add (dst, (c & 0x3f) | 0x80);
2111 Dynarr_add (dst, (c >> 30) | 0xfc);
2112 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2113 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2114 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2115 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2116 Dynarr_add (dst, (c & 0x3f) | 0x80);
2120 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2122 if (BYTE_ASCII_P (c)) \
2123 Dynarr_add (dst, c); \
2124 else if (BYTE_C1_P (c)) \
2126 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2127 Dynarr_add (dst, c + 0x20); \
2131 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2132 Dynarr_add (dst, c); \
2137 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2141 DECODE_ADD_BINARY_CHAR (ch, dst); \
2146 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2148 if (flags & CODING_STATE_END) \
2150 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2151 if (flags & CODING_STATE_CR) \
2152 Dynarr_add (dst, '\r'); \
2156 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2158 struct decoding_stream
2160 /* Coding system that governs the conversion. */
2161 Lisp_Coding_System *codesys;
2163 /* Stream that we read the encoded data from or
2164 write the decoded data to. */
2167 /* If we are reading, then we can return only a fixed amount of
2168 data, so if the conversion resulted in too much data, we store it
2169 here for retrieval the next time around. */
2170 unsigned_char_dynarr *runoff;
2172 /* FLAGS holds flags indicating the current state of the decoding.
2173 Some of these flags are dependent on the coding system. */
2176 /* CPOS holds a partially built-up code-point of character. */
2179 /* EOL_TYPE specifies the type of end-of-line conversion that
2180 currently applies. We need to keep this separate from the
2181 EOL type stored in CODESYS because the latter might indicate
2182 automatic EOL-type detection while the former will always
2183 indicate a particular EOL type. */
2184 eol_type_t eol_type;
2186 /* Additional ISO2022 information. We define the structure above
2187 because it's also needed by the detection routines. */
2188 struct iso2022_decoder iso2022;
2190 /* Additional information (the state of the running CCL program)
2191 used by the CCL decoder. */
2192 struct ccl_program ccl;
2194 /* counter for UTF-8 or UCS-4 */
2195 unsigned char counter;
2198 unsigned combined_char_count;
2199 Emchar combined_chars[16];
2200 Lisp_Object combining_table;
2202 struct detection_state decst;
2206 extern Lisp_Object Vcharacter_composition_table;
2209 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2211 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2215 for (i = 0; i < str->combined_char_count; i++)
2216 DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2217 str->combined_char_count = 0;
2218 str->combining_table = Qnil;
2221 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2222 unsigned_char_dynarr* dst);
2224 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2225 Emchar character, unsigned_char_dynarr* dst)
2227 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2228 DECODE_ADD_UCS_CHAR (character, dst);
2229 else if (!CHAR_TABLEP (str->combining_table))
2232 = get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2236 DECODE_ADD_UCS_CHAR (character, dst);
2239 str->combined_chars[0] = character;
2240 str->combined_char_count = 1;
2241 str->combining_table = ret;
2247 = get_char_id_table (XCHAR_TABLE(str->combining_table),
2252 Emchar char2 = XCHARVAL (ret);
2254 get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2258 DECODE_ADD_UCS_CHAR (char2, dst);
2259 str->combined_char_count = 0;
2260 str->combining_table = Qnil;
2264 str->combined_chars[0] = char2;
2265 str->combined_char_count = 1;
2266 str->combining_table = ret;
2269 else if (CHAR_TABLEP (ret))
2271 str->combined_chars[str->combined_char_count++] = character;
2272 str->combining_table = ret;
2276 COMPOSE_FLUSH_CHARS (str, dst);
2277 DECODE_ADD_UCS_CHAR (character, dst);
2281 #else /* not UTF2000 */
2282 #define COMPOSE_FLUSH_CHARS(str, dst)
2283 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2284 #endif /* UTF2000 */
2286 static ssize_t decoding_reader (Lstream *stream,
2287 unsigned char *data, size_t size);
2288 static ssize_t decoding_writer (Lstream *stream,
2289 const unsigned char *data, size_t size);
2290 static int decoding_rewinder (Lstream *stream);
2291 static int decoding_seekable_p (Lstream *stream);
2292 static int decoding_flusher (Lstream *stream);
2293 static int decoding_closer (Lstream *stream);
2295 static Lisp_Object decoding_marker (Lisp_Object stream);
2297 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2298 sizeof (struct decoding_stream));
2301 decoding_marker (Lisp_Object stream)
2303 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2304 Lisp_Object str_obj;
2306 /* We do not need to mark the coding systems or charsets stored
2307 within the stream because they are stored in a global list
2308 and automatically marked. */
2310 XSETLSTREAM (str_obj, str);
2311 mark_object (str_obj);
2312 if (str->imp->marker)
2313 return (str->imp->marker) (str_obj);
2318 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2319 so we read data from the other end, decode it, and store it into DATA. */
2322 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2324 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2325 unsigned char *orig_data = data;
2327 int error_occurred = 0;
2329 /* We need to interface to mule_decode(), which expects to take some
2330 amount of data and store the result into a Dynarr. We have
2331 mule_decode() store into str->runoff, and take data from there
2334 /* We loop until we have enough data, reading chunks from the other
2335 end and decoding it. */
2338 /* Take data from the runoff if we can. Make sure to take at
2339 most SIZE bytes, and delete the data from the runoff. */
2340 if (Dynarr_length (str->runoff) > 0)
2342 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2343 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2344 Dynarr_delete_many (str->runoff, 0, chunk);
2350 break; /* No more room for data */
2352 if (str->flags & CODING_STATE_END)
2353 /* This means that on the previous iteration, we hit the EOF on
2354 the other end. We loop once more so that mule_decode() can
2355 output any final stuff it may be holding, or any "go back
2356 to a sane state" escape sequences. (This latter makes sense
2357 during encoding.) */
2360 /* Exhausted the runoff, so get some more. DATA has at least
2361 SIZE bytes left of storage in it, so it's OK to read directly
2362 into it. (We'll be overwriting above, after we've decoded it
2363 into the runoff.) */
2364 read_size = Lstream_read (str->other_end, data, size);
2371 /* There might be some more end data produced in the translation.
2372 See the comment above. */
2373 str->flags |= CODING_STATE_END;
2374 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2377 if (data - orig_data == 0)
2378 return error_occurred ? -1 : 0;
2380 return data - orig_data;
2384 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2386 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2389 /* Decode all our data into the runoff, and then attempt to write
2390 it all out to the other end. Remove whatever chunk we succeeded
2392 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2393 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2394 Dynarr_length (str->runoff));
2396 Dynarr_delete_many (str->runoff, 0, retval);
2397 /* Do NOT return retval. The return value indicates how much
2398 of the incoming data was written, not how many bytes were
2404 reset_decoding_stream (struct decoding_stream *str)
2407 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2409 Lisp_Object coding_system;
2410 XSETCODING_SYSTEM (coding_system, str->codesys);
2411 reset_iso2022 (coding_system, &str->iso2022);
2413 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2415 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2420 str->combined_char_count = 0;
2421 str->combining_table = Qnil;
2423 str->flags = str->cpos = 0;
2427 decoding_rewinder (Lstream *stream)
2429 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2430 reset_decoding_stream (str);
2431 Dynarr_reset (str->runoff);
2432 return Lstream_rewind (str->other_end);
2436 decoding_seekable_p (Lstream *stream)
2438 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2439 return Lstream_seekable_p (str->other_end);
2443 decoding_flusher (Lstream *stream)
2445 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2446 return Lstream_flush (str->other_end);
2450 decoding_closer (Lstream *stream)
2452 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2453 if (stream->flags & LSTREAM_FL_WRITE)
2455 str->flags |= CODING_STATE_END;
2456 decoding_writer (stream, 0, 0);
2458 Dynarr_free (str->runoff);
2460 #ifdef ENABLE_COMPOSITE_CHARS
2461 if (str->iso2022.composite_chars)
2462 Dynarr_free (str->iso2022.composite_chars);
2465 return Lstream_close (str->other_end);
2469 decoding_stream_coding_system (Lstream *stream)
2471 Lisp_Object coding_system;
2472 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2474 XSETCODING_SYSTEM (coding_system, str->codesys);
2475 return subsidiary_coding_system (coding_system, str->eol_type);
2479 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2481 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2482 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2484 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2485 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2486 reset_decoding_stream (str);
2489 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2490 stream for writing, no automatic code detection will be performed.
2491 The reason for this is that automatic code detection requires a
2492 seekable input. Things will also fail if you open a decoding
2493 stream for reading using a non-fully-specified coding system and
2494 a non-seekable input stream. */
2497 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2500 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2501 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2505 str->other_end = stream;
2506 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2507 str->eol_type = EOL_AUTODETECT;
2508 if (!strcmp (mode, "r")
2509 && Lstream_seekable_p (stream))
2510 /* We can determine the coding system now. */
2511 determine_real_coding_system (stream, &codesys, &str->eol_type);
2512 set_decoding_stream_coding_system (lstr, codesys);
2513 str->decst.eol_type = str->eol_type;
2514 str->decst.mask = ~0;
2515 XSETLSTREAM (obj, lstr);
2520 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2522 return make_decoding_stream_1 (stream, codesys, "r");
2526 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2528 return make_decoding_stream_1 (stream, codesys, "w");
2531 /* Note: the decode_coding_* functions all take the same
2532 arguments as mule_decode(), which is to say some SRC data of
2533 size N, which is to be stored into dynamic array DST.
2534 DECODING is the stream within which the decoding is
2535 taking place, but no data is actually read from or
2536 written to that stream; that is handled in decoding_reader()
2537 or decoding_writer(). This allows the same functions to
2538 be used for both reading and writing. */
2541 mule_decode (Lstream *decoding, const Extbyte *src,
2542 unsigned_char_dynarr *dst, size_t n)
2544 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2546 /* If necessary, do encoding-detection now. We do this when
2547 we're a writing stream or a non-seekable reading stream,
2548 meaning that we can't just process the whole input,
2549 rewind, and start over. */
2551 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2552 str->eol_type == EOL_AUTODETECT)
2554 Lisp_Object codesys;
2556 XSETCODING_SYSTEM (codesys, str->codesys);
2557 detect_coding_type (&str->decst, src, n,
2558 CODING_SYSTEM_TYPE (str->codesys) !=
2559 CODESYS_AUTODETECT);
2560 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2561 str->decst.mask != ~0)
2562 /* #### This is cheesy. What we really ought to do is
2563 buffer up a certain amount of data so as to get a
2564 less random result. */
2565 codesys = coding_system_from_mask (str->decst.mask);
2566 str->eol_type = str->decst.eol_type;
2567 if (XCODING_SYSTEM (codesys) != str->codesys)
2569 /* Preserve the CODING_STATE_END flag in case it was set.
2570 If we erase it, bad things might happen. */
2571 int was_end = str->flags & CODING_STATE_END;
2572 set_decoding_stream_coding_system (decoding, codesys);
2574 str->flags |= CODING_STATE_END;
2578 switch (CODING_SYSTEM_TYPE (str->codesys))
2581 case CODESYS_INTERNAL:
2582 Dynarr_add_many (dst, src, n);
2585 case CODESYS_AUTODETECT:
2586 /* If we got this far and still haven't decided on the coding
2587 system, then do no conversion. */
2588 case CODESYS_NO_CONVERSION:
2589 decode_coding_no_conversion (decoding, src, dst, n);
2592 case CODESYS_SHIFT_JIS:
2593 decode_coding_sjis (decoding, src, dst, n);
2596 decode_coding_big5 (decoding, src, dst, n);
2599 decode_coding_ucs4 (decoding, src, dst, n);
2602 decode_coding_utf8 (decoding, src, dst, n);
2605 str->ccl.last_block = str->flags & CODING_STATE_END;
2606 /* When applying ccl program to stream, MUST NOT set NULL
2608 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2609 dst, n, 0, CCL_MODE_DECODING);
2611 case CODESYS_ISO2022:
2612 decode_coding_iso2022 (decoding, src, dst, n);
2620 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2621 Decode the text between START and END which is encoded in CODING-SYSTEM.
2622 This is useful if you've read in encoded text from a file without decoding
2623 it (e.g. you read in a JIS-formatted file but used the `binary' or
2624 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2625 Return length of decoded text.
2626 BUFFER defaults to the current buffer if unspecified.
2628 (start, end, coding_system, buffer))
2631 struct buffer *buf = decode_buffer (buffer, 0);
2632 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2633 Lstream *istr, *ostr;
2634 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2636 get_buffer_range_char (buf, start, end, &b, &e, 0);
2638 barf_if_buffer_read_only (buf, b, e);
2640 coding_system = Fget_coding_system (coding_system);
2641 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2642 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2643 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2645 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2646 Fget_coding_system (Qbinary));
2647 istr = XLSTREAM (instream);
2648 ostr = XLSTREAM (outstream);
2649 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2651 /* The chain of streams looks like this:
2653 [BUFFER] <----- send through
2654 ------> [ENCODE AS BINARY]
2655 ------> [DECODE AS SPECIFIED]
2661 char tempbuf[1024]; /* some random amount */
2662 Bufpos newpos, even_newer_pos;
2663 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2664 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2668 newpos = lisp_buffer_stream_startpos (istr);
2669 Lstream_write (ostr, tempbuf, size_in_bytes);
2670 even_newer_pos = lisp_buffer_stream_startpos (istr);
2671 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2674 Lstream_close (istr);
2675 Lstream_close (ostr);
2677 Lstream_delete (istr);
2678 Lstream_delete (ostr);
2679 Lstream_delete (XLSTREAM (de_outstream));
2680 Lstream_delete (XLSTREAM (lb_outstream));
2685 /************************************************************************/
2686 /* Converting to an external encoding ("encoding") */
2687 /************************************************************************/
2689 /* An encoding stream is an output stream. When you create the
2690 stream, you specify the coding system that governs the encoding
2691 and another stream that the resulting encoded data is to be
2692 sent to, and then start sending data to it. */
2694 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2696 struct encoding_stream
2698 /* Coding system that governs the conversion. */
2699 Lisp_Coding_System *codesys;
2701 /* Stream that we read the encoded data from or
2702 write the decoded data to. */
2705 /* If we are reading, then we can return only a fixed amount of
2706 data, so if the conversion resulted in too much data, we store it
2707 here for retrieval the next time around. */
2708 unsigned_char_dynarr *runoff;
2710 /* FLAGS holds flags indicating the current state of the encoding.
2711 Some of these flags are dependent on the coding system. */
2714 /* CH holds a partially built-up character. Since we only deal
2715 with one- and two-byte characters at the moment, we only use
2716 this to store the first byte of a two-byte character. */
2719 /* Additional information used by the ISO2022 encoder. */
2722 /* CHARSET holds the character sets currently assigned to the G0
2723 through G3 registers. It is initialized from the array
2724 INITIAL_CHARSET in CODESYS. */
2725 Lisp_Object charset[4];
2727 /* Which registers are currently invoked into the left (GL) and
2728 right (GR) halves of the 8-bit encoding space? */
2729 int register_left, register_right;
2731 /* Whether we need to explicitly designate the charset in the
2732 G? register before using it. It is initialized from the
2733 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2734 unsigned char force_charset_on_output[4];
2736 /* Other state variables that need to be preserved across
2738 Lisp_Object current_charset;
2740 int current_char_boundary;
2743 void (*encode_char) (struct encoding_stream *str, Emchar c,
2744 unsigned_char_dynarr *dst, unsigned int *flags);
2745 void (*finish) (struct encoding_stream *str,
2746 unsigned_char_dynarr *dst, unsigned int *flags);
2748 /* Additional information (the state of the running CCL program)
2749 used by the CCL encoder. */
2750 struct ccl_program ccl;
2754 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2755 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2757 static int encoding_rewinder (Lstream *stream);
2758 static int encoding_seekable_p (Lstream *stream);
2759 static int encoding_flusher (Lstream *stream);
2760 static int encoding_closer (Lstream *stream);
2762 static Lisp_Object encoding_marker (Lisp_Object stream);
2764 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2765 sizeof (struct encoding_stream));
2768 encoding_marker (Lisp_Object stream)
2770 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2771 Lisp_Object str_obj;
2773 /* We do not need to mark the coding systems or charsets stored
2774 within the stream because they are stored in a global list
2775 and automatically marked. */
2777 XSETLSTREAM (str_obj, str);
2778 mark_object (str_obj);
2779 if (str->imp->marker)
2780 return (str->imp->marker) (str_obj);
2785 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2786 so we read data from the other end, encode it, and store it into DATA. */
2789 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2791 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2792 unsigned char *orig_data = data;
2794 int error_occurred = 0;
2796 /* We need to interface to mule_encode(), which expects to take some
2797 amount of data and store the result into a Dynarr. We have
2798 mule_encode() store into str->runoff, and take data from there
2801 /* We loop until we have enough data, reading chunks from the other
2802 end and encoding it. */
2805 /* Take data from the runoff if we can. Make sure to take at
2806 most SIZE bytes, and delete the data from the runoff. */
2807 if (Dynarr_length (str->runoff) > 0)
2809 int chunk = min ((int) size, Dynarr_length (str->runoff));
2810 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2811 Dynarr_delete_many (str->runoff, 0, chunk);
2817 break; /* No more room for data */
2819 if (str->flags & CODING_STATE_END)
2820 /* This means that on the previous iteration, we hit the EOF on
2821 the other end. We loop once more so that mule_encode() can
2822 output any final stuff it may be holding, or any "go back
2823 to a sane state" escape sequences. (This latter makes sense
2824 during encoding.) */
2827 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2828 left of storage in it, so it's OK to read directly into it.
2829 (We'll be overwriting above, after we've encoded it into the
2831 read_size = Lstream_read (str->other_end, data, size);
2838 /* There might be some more end data produced in the translation.
2839 See the comment above. */
2840 str->flags |= CODING_STATE_END;
2841 mule_encode (stream, data, str->runoff, read_size);
2844 if (data == orig_data)
2845 return error_occurred ? -1 : 0;
2847 return data - orig_data;
2851 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2853 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2856 /* Encode all our data into the runoff, and then attempt to write
2857 it all out to the other end. Remove whatever chunk we succeeded
2859 mule_encode (stream, data, str->runoff, size);
2860 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2861 Dynarr_length (str->runoff));
2863 Dynarr_delete_many (str->runoff, 0, retval);
2864 /* Do NOT return retval. The return value indicates how much
2865 of the incoming data was written, not how many bytes were
2871 reset_encoding_stream (struct encoding_stream *str)
2874 switch (CODING_SYSTEM_TYPE (str->codesys))
2876 case CODESYS_ISO2022:
2880 str->encode_char = &char_encode_iso2022;
2881 str->finish = &char_finish_iso2022;
2882 for (i = 0; i < 4; i++)
2884 str->iso2022.charset[i] =
2885 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2886 str->iso2022.force_charset_on_output[i] =
2887 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2889 str->iso2022.register_left = 0;
2890 str->iso2022.register_right = 1;
2891 str->iso2022.current_charset = Qnil;
2892 str->iso2022.current_half = 0;
2896 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2899 str->encode_char = &char_encode_utf8;
2900 str->finish = &char_finish_utf8;
2903 str->encode_char = &char_encode_ucs4;
2904 str->finish = &char_finish_ucs4;
2906 case CODESYS_SHIFT_JIS:
2907 str->encode_char = &char_encode_shift_jis;
2908 str->finish = &char_finish_shift_jis;
2911 str->encode_char = &char_encode_big5;
2912 str->finish = &char_finish_big5;
2918 str->iso2022.current_char_boundary = 0;
2919 str->flags = str->ch = 0;
2923 encoding_rewinder (Lstream *stream)
2925 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2926 reset_encoding_stream (str);
2927 Dynarr_reset (str->runoff);
2928 return Lstream_rewind (str->other_end);
2932 encoding_seekable_p (Lstream *stream)
2934 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2935 return Lstream_seekable_p (str->other_end);
2939 encoding_flusher (Lstream *stream)
2941 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2942 return Lstream_flush (str->other_end);
2946 encoding_closer (Lstream *stream)
2948 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2949 if (stream->flags & LSTREAM_FL_WRITE)
2951 str->flags |= CODING_STATE_END;
2952 encoding_writer (stream, 0, 0);
2954 Dynarr_free (str->runoff);
2955 return Lstream_close (str->other_end);
2959 encoding_stream_coding_system (Lstream *stream)
2961 Lisp_Object coding_system;
2962 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2964 XSETCODING_SYSTEM (coding_system, str->codesys);
2965 return coding_system;
2969 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2971 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2972 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2974 reset_encoding_stream (str);
2978 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2981 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2982 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2986 str->runoff = Dynarr_new (unsigned_char);
2987 str->other_end = stream;
2988 set_encoding_stream_coding_system (lstr, codesys);
2989 XSETLSTREAM (obj, lstr);
2994 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2996 return make_encoding_stream_1 (stream, codesys, "r");
3000 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3002 return make_encoding_stream_1 (stream, codesys, "w");
3005 /* Convert N bytes of internally-formatted data stored in SRC to an
3006 external format, according to the encoding stream ENCODING.
3007 Store the encoded data into DST. */
3010 mule_encode (Lstream *encoding, const Bufbyte *src,
3011 unsigned_char_dynarr *dst, size_t n)
3013 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3015 switch (CODING_SYSTEM_TYPE (str->codesys))
3018 case CODESYS_INTERNAL:
3019 Dynarr_add_many (dst, src, n);
3022 case CODESYS_AUTODETECT:
3023 /* If we got this far and still haven't decided on the coding
3024 system, then do no conversion. */
3025 case CODESYS_NO_CONVERSION:
3026 encode_coding_no_conversion (encoding, src, dst, n);
3030 str->ccl.last_block = str->flags & CODING_STATE_END;
3031 /* When applying ccl program to stream, MUST NOT set NULL
3033 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3034 dst, n, 0, CCL_MODE_ENCODING);
3038 text_encode_generic (encoding, src, dst, n);
3042 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3043 Encode the text between START and END using CODING-SYSTEM.
3044 This will, for example, convert Japanese characters into stuff such as
3045 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3046 text. BUFFER defaults to the current buffer if unspecified.
3048 (start, end, coding_system, buffer))
3051 struct buffer *buf = decode_buffer (buffer, 0);
3052 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3053 Lstream *istr, *ostr;
3054 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3056 get_buffer_range_char (buf, start, end, &b, &e, 0);
3058 barf_if_buffer_read_only (buf, b, e);
3060 coding_system = Fget_coding_system (coding_system);
3061 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3062 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3063 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3064 Fget_coding_system (Qbinary));
3065 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3067 istr = XLSTREAM (instream);
3068 ostr = XLSTREAM (outstream);
3069 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3070 /* The chain of streams looks like this:
3072 [BUFFER] <----- send through
3073 ------> [ENCODE AS SPECIFIED]
3074 ------> [DECODE AS BINARY]
3079 char tempbuf[1024]; /* some random amount */
3080 Bufpos newpos, even_newer_pos;
3081 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3082 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3086 newpos = lisp_buffer_stream_startpos (istr);
3087 Lstream_write (ostr, tempbuf, size_in_bytes);
3088 even_newer_pos = lisp_buffer_stream_startpos (istr);
3089 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3095 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3096 Lstream_close (istr);
3097 Lstream_close (ostr);
3099 Lstream_delete (istr);
3100 Lstream_delete (ostr);
3101 Lstream_delete (XLSTREAM (de_outstream));
3102 Lstream_delete (XLSTREAM (lb_outstream));
3103 return make_int (retlen);
3110 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3111 unsigned_char_dynarr *dst, size_t n)
3114 unsigned char char_boundary;
3115 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3116 unsigned int flags = str->flags;
3117 Emchar ch = str->ch;
3119 char_boundary = str->iso2022.current_char_boundary;
3125 if (char_boundary == 0)
3153 (*str->encode_char) (str, c, dst, &flags);
3155 else if (char_boundary == 1)
3157 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3163 ch = (ch << 6) | (c & 0x3f);
3168 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3170 (*str->finish) (str, dst, &flags);
3175 str->iso2022.current_char_boundary = char_boundary;
3179 /************************************************************************/
3180 /* Shift-JIS methods */
3181 /************************************************************************/
3183 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3184 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3185 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3186 encoded by "position-code + 0x80". A character of JISX0208
3187 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3188 position-codes are divided and shifted so that it fit in the range
3191 --- CODE RANGE of Shift-JIS ---
3192 (character set) (range)
3194 JISX0201-Kana 0xA0 .. 0xDF
3195 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3196 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3197 -------------------------------
3201 /* Is this the first byte of a Shift-JIS two-byte char? */
3203 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3204 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3206 /* Is this the second byte of a Shift-JIS two-byte char? */
3208 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3209 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3211 #define BYTE_SJIS_KATAKANA_P(c) \
3212 ((c) >= 0xA1 && (c) <= 0xDF)
3215 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3219 unsigned char c = *(unsigned char *)src++;
3220 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3222 if (st->shift_jis.in_second_byte)
3224 st->shift_jis.in_second_byte = 0;
3228 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3229 st->shift_jis.in_second_byte = 1;
3231 return CODING_CATEGORY_SHIFT_JIS_MASK;
3234 /* Convert Shift-JIS data to internal format. */
3237 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3238 unsigned_char_dynarr *dst, size_t n)
3240 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3241 unsigned int flags = str->flags;
3242 unsigned int cpos = str->cpos;
3243 eol_type_t eol_type = str->eol_type;
3247 unsigned char c = *(unsigned char *)src++;
3251 /* Previous character was first byte of Shift-JIS Kanji char. */
3252 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3254 unsigned char e1, e2;
3256 DECODE_SJIS (cpos, c, e1, e2);
3258 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3262 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3263 Dynarr_add (dst, e1);
3264 Dynarr_add (dst, e2);
3269 DECODE_ADD_BINARY_CHAR (cpos, dst);
3270 DECODE_ADD_BINARY_CHAR (c, dst);
3276 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3277 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3279 else if (BYTE_SJIS_KATAKANA_P (c))
3282 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3285 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3286 Dynarr_add (dst, c);
3291 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3295 DECODE_ADD_BINARY_CHAR (c, dst);
3297 label_continue_loop:;
3300 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3306 /* Convert internal character representation to Shift_JIS. */
3309 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3310 unsigned_char_dynarr *dst, unsigned int *flags)
3312 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3316 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3317 Dynarr_add (dst, '\r');
3318 if (eol_type != EOL_CR)
3319 Dynarr_add (dst, ch);
3323 unsigned int s1, s2;
3325 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3327 if (code_point >= 0)
3328 Dynarr_add (dst, code_point);
3329 else if ((code_point
3330 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3333 ENCODE_SJIS ((code_point >> 8) | 0x80,
3334 (code_point & 0xFF) | 0x80, s1, s2);
3335 Dynarr_add (dst, s1);
3336 Dynarr_add (dst, s2);
3338 else if ((code_point
3339 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3341 Dynarr_add (dst, code_point | 0x80);
3342 else if ((code_point
3343 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3346 ENCODE_SJIS ((code_point >> 8) | 0x80,
3347 (code_point & 0xFF) | 0x80, s1, s2);
3348 Dynarr_add (dst, s1);
3349 Dynarr_add (dst, s2);
3351 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3353 Dynarr_add (dst, code_point);
3355 Dynarr_add (dst, '?');
3357 Lisp_Object charset;
3358 unsigned int c1, c2;
3360 BREAKUP_CHAR (ch, charset, c1, c2);
3362 if (EQ(charset, Vcharset_katakana_jisx0201))
3364 Dynarr_add (dst, c1 | 0x80);
3368 Dynarr_add (dst, c1);
3370 else if (EQ(charset, Vcharset_japanese_jisx0208))
3372 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3373 Dynarr_add (dst, s1);
3374 Dynarr_add (dst, s2);
3377 Dynarr_add (dst, '?');
3383 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3384 unsigned int *flags)
3388 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3389 Decode a JISX0208 character of Shift-JIS coding-system.
3390 CODE is the character code in Shift-JIS as a cons of type bytes.
3391 Return the corresponding character.
3395 unsigned char c1, c2, s1, s2;
3398 CHECK_INT (XCAR (code));
3399 CHECK_INT (XCDR (code));
3400 s1 = XINT (XCAR (code));
3401 s2 = XINT (XCDR (code));
3402 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3403 BYTE_SJIS_TWO_BYTE_2_P (s2))
3405 DECODE_SJIS (s1, s2, c1, c2);
3406 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3407 c1 & 0x7F, c2 & 0x7F));
3413 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3414 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3415 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3419 Lisp_Object charset;
3422 CHECK_CHAR_COERCE_INT (character);
3423 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3424 if (EQ (charset, Vcharset_japanese_jisx0208))
3426 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3427 return Fcons (make_int (s1), make_int (s2));
3434 /************************************************************************/
3436 /************************************************************************/
3438 /* BIG5 is a coding system encoding two character sets: ASCII and
3439 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3440 character set and is encoded in two-byte.
3442 --- CODE RANGE of BIG5 ---
3443 (character set) (range)
3445 Big5 (1st byte) 0xA1 .. 0xFE
3446 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3447 --------------------------
3449 Since the number of characters in Big5 is larger than maximum
3450 characters in Emacs' charset (96x96), it can't be handled as one
3451 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3452 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3453 contains frequently used characters and the latter contains less
3454 frequently used characters. */
3457 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3458 ((c) >= 0x81 && (c) <= 0xFE)
3460 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3461 ((c) >= 0xA1 && (c) <= 0xFE)
3464 /* Is this the second byte of a Shift-JIS two-byte char? */
3466 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3467 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3469 /* Number of Big5 characters which have the same code in 1st byte. */
3471 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3473 /* Code conversion macros. These are macros because they are used in
3474 inner loops during code conversion.
3476 Note that temporary variables in macros introduce the classic
3477 dynamic-scoping problems with variable names. We use capital-
3478 lettered variables in the assumption that XEmacs does not use
3479 capital letters in variables except in a very formalized way
3482 /* Convert Big5 code (b1, b2) into its internal string representation
3485 /* There is a much simpler way to split the Big5 charset into two.
3486 For the moment I'm going to leave the algorithm as-is because it
3487 claims to separate out the most-used characters into a single
3488 charset, which perhaps will lead to optimizations in various
3491 The way the algorithm works is something like this:
3493 Big5 can be viewed as a 94x157 charset, where the row is
3494 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3495 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3496 the split between low and high column numbers is apparently
3497 meaningless; ascending rows produce less and less frequent chars.
3498 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3499 the first charset, and the upper half (0xC9 .. 0xFE) to the
3500 second. To do the conversion, we convert the character into
3501 a single number where 0 .. 156 is the first row, 157 .. 313
3502 is the second, etc. That way, the characters are ordered by
3503 decreasing frequency. Then we just chop the space in two
3504 and coerce the result into a 94x94 space.
3507 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3509 int B1 = b1, B2 = b2; \
3511 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3515 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3519 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3520 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3522 c1 = I / (0xFF - 0xA1) + 0xA1; \
3523 c2 = I % (0xFF - 0xA1) + 0xA1; \
3526 /* Convert the internal string representation of a Big5 character
3527 (lb, c1, c2) into Big5 code (b1, b2). */
3529 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3531 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3533 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3535 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3537 b1 = I / BIG5_SAME_ROW + 0xA1; \
3538 b2 = I % BIG5_SAME_ROW; \
3539 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3543 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3547 unsigned char c = *(unsigned char *)src++;
3548 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3550 || (c >= 0x80 && c <= 0xA0)
3554 if (st->big5.in_second_byte)
3556 st->big5.in_second_byte = 0;
3557 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3567 st->big5.in_second_byte = 1;
3569 return CODING_CATEGORY_BIG5_MASK;
3572 /* Convert Big5 data to internal format. */
3575 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3576 unsigned_char_dynarr *dst, size_t n)
3578 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3579 unsigned int flags = str->flags;
3580 unsigned int cpos = str->cpos;
3581 eol_type_t eol_type = str->eol_type;
3585 unsigned char c = *(unsigned char *)src++;
3588 /* Previous character was first byte of Big5 char. */
3589 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3593 (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c),
3596 unsigned char b1, b2, b3;
3597 DECODE_BIG5 (cpos, c, b1, b2, b3);
3598 Dynarr_add (dst, b1);
3599 Dynarr_add (dst, b2);
3600 Dynarr_add (dst, b3);
3605 DECODE_ADD_BINARY_CHAR (cpos, dst);
3606 DECODE_ADD_BINARY_CHAR (c, dst);
3612 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3613 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3616 DECODE_ADD_BINARY_CHAR (c, dst);
3618 label_continue_loop:;
3621 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3627 /* Convert internally-formatted data to Big5. */
3630 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3631 unsigned_char_dynarr *dst, unsigned int *flags)
3633 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3637 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3638 Dynarr_add (dst, '\r');
3639 if (eol_type != EOL_CR)
3640 Dynarr_add (dst, ch);
3647 if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3648 Dynarr_add (dst, code_point);
3649 else if ((code_point
3650 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3652 Dynarr_add (dst, code_point >> 8);
3653 Dynarr_add (dst, code_point & 0xFF);
3655 else if ((code_point
3656 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3659 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3660 + ((code_point & 0xFF) - 33);
3661 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3662 unsigned char b2 = I % BIG5_SAME_ROW;
3664 b2 += b2 < 0x3F ? 0x40 : 0x62;
3665 Dynarr_add (dst, b1);
3666 Dynarr_add (dst, b2);
3668 else if ((code_point
3669 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3672 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3673 + ((code_point & 0xFF) - 33);
3674 unsigned char b1, b2;
3676 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3677 b1 = I / BIG5_SAME_ROW + 0xA1;
3678 b2 = I % BIG5_SAME_ROW;
3679 b2 += b2 < 0x3F ? 0x40 : 0x62;
3680 Dynarr_add (dst, b1);
3681 Dynarr_add (dst, b2);
3684 Dynarr_add (dst, '?');
3691 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3692 unsigned int *flags)
3697 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3698 Decode a Big5 character CODE of BIG5 coding-system.
3699 CODE is the character code in BIG5, a cons of two integers.
3700 Return the corresponding character.
3704 unsigned char c1, c2, b1, b2;
3707 CHECK_INT (XCAR (code));
3708 CHECK_INT (XCDR (code));
3709 b1 = XINT (XCAR (code));
3710 b2 = XINT (XCDR (code));
3711 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3712 BYTE_BIG5_TWO_BYTE_2_P (b2))
3714 Charset_ID leading_byte;
3715 Lisp_Object charset;
3716 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3717 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3718 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3724 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3725 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3726 Return the corresponding character code in Big5.
3730 Lisp_Object charset;
3733 CHECK_CHAR_COERCE_INT (character);
3734 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3735 if (EQ (charset, Vcharset_chinese_big5_1) ||
3736 EQ (charset, Vcharset_chinese_big5_2))
3738 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3740 return Fcons (make_int (b1), make_int (b2));
3747 /************************************************************************/
3749 /************************************************************************/
3752 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3756 unsigned char c = *(unsigned char *)src++;
3757 switch (st->ucs4.in_byte)
3766 st->ucs4.in_byte = 0;
3772 return CODING_CATEGORY_UCS4_MASK;
3776 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3777 unsigned_char_dynarr *dst, size_t n)
3779 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3780 unsigned int flags = str->flags;
3781 unsigned int cpos = str->cpos;
3782 unsigned char counter = str->counter;
3786 unsigned char c = *(unsigned char *)src++;
3794 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3799 cpos = ( cpos << 8 ) | c;
3803 if (counter & CODING_STATE_END)
3804 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3808 str->counter = counter;
3812 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3813 unsigned_char_dynarr *dst, unsigned int *flags)
3815 Dynarr_add (dst, ch >> 24);
3816 Dynarr_add (dst, ch >> 16);
3817 Dynarr_add (dst, ch >> 8);
3818 Dynarr_add (dst, ch );
3822 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3823 unsigned int *flags)
3828 /************************************************************************/
3830 /************************************************************************/
3833 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3837 unsigned char c = *(unsigned char *)src++;
3838 switch (st->utf8.in_byte)
3841 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3844 st->utf8.in_byte = 5;
3846 st->utf8.in_byte = 4;
3848 st->utf8.in_byte = 3;
3850 st->utf8.in_byte = 2;
3852 st->utf8.in_byte = 1;
3857 if ((c & 0xc0) != 0x80)
3863 return CODING_CATEGORY_UTF8_MASK;
3867 decode_output_utf8_partial_char (unsigned char counter,
3869 unsigned_char_dynarr *dst)
3872 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
3873 else if (counter == 4)
3875 if (cpos < (1 << 6))
3876 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
3879 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
3880 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3883 else if (counter == 3)
3885 if (cpos < (1 << 6))
3886 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
3887 else if (cpos < (1 << 12))
3889 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
3890 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3894 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
3895 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3896 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3899 else if (counter == 2)
3901 if (cpos < (1 << 6))
3902 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
3903 else if (cpos < (1 << 12))
3905 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
3906 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3908 else if (cpos < (1 << 18))
3910 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
3911 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3912 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3916 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
3917 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3918 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3919 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3924 if (cpos < (1 << 6))
3925 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
3926 else if (cpos < (1 << 12))
3928 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
3929 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3931 else if (cpos < (1 << 18))
3933 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
3934 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3935 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3937 else if (cpos < (1 << 24))
3939 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
3940 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3941 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3942 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3946 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
3947 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
3948 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3949 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3950 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3956 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3957 unsigned_char_dynarr *dst, size_t n)
3959 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3960 unsigned int flags = str->flags;
3961 unsigned int cpos = str->cpos;
3962 eol_type_t eol_type = str->eol_type;
3963 unsigned char counter = str->counter;
3967 unsigned char c = *(unsigned char *)src++;
3972 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3973 DECODE_ADD_UCS_CHAR (c, dst);
3975 else if ( c < 0xE0 )
3980 else if ( c < 0xF0 )
3985 else if ( c < 0xF8 )
3990 else if ( c < 0xFC )
4001 else if ( (c & 0xC0) == 0x80 )
4003 cpos = ( cpos << 6 ) | ( c & 0x3f );
4006 DECODE_ADD_UCS_CHAR (cpos, dst);
4015 decode_output_utf8_partial_char (counter, cpos, dst);
4016 DECODE_ADD_BINARY_CHAR (c, dst);
4020 label_continue_loop:;
4023 if (flags & CODING_STATE_END)
4026 decode_output_utf8_partial_char (counter, cpos, dst);
4032 str->counter = counter;
4036 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4037 unsigned_char_dynarr *dst, unsigned int *flags)
4039 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4043 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4044 Dynarr_add (dst, '\r');
4045 if (eol_type != EOL_CR)
4046 Dynarr_add (dst, ch);
4048 else if (ch <= 0x7f)
4050 Dynarr_add (dst, ch);
4052 else if (ch <= 0x7ff)
4054 Dynarr_add (dst, (ch >> 6) | 0xc0);
4055 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4057 else if (ch <= 0xffff)
4059 Dynarr_add (dst, (ch >> 12) | 0xe0);
4060 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4061 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4063 else if (ch <= 0x1fffff)
4065 Dynarr_add (dst, (ch >> 18) | 0xf0);
4066 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4067 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4068 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4070 else if (ch <= 0x3ffffff)
4072 Dynarr_add (dst, (ch >> 24) | 0xf8);
4073 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4074 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4075 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4076 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4080 Dynarr_add (dst, (ch >> 30) | 0xfc);
4081 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
4082 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4083 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4084 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4085 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4090 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4091 unsigned int *flags)
4096 /************************************************************************/
4097 /* ISO2022 methods */
4098 /************************************************************************/
4100 /* The following note describes the coding system ISO2022 briefly.
4101 Since the intention of this note is to help understand the
4102 functions in this file, some parts are NOT ACCURATE or OVERLY
4103 SIMPLIFIED. For thorough understanding, please refer to the
4104 original document of ISO2022.
4106 ISO2022 provides many mechanisms to encode several character sets
4107 in 7-bit and 8-bit environments. For 7-bit environments, all text
4108 is encoded using bytes less than 128. This may make the encoded
4109 text a little bit longer, but the text passes more easily through
4110 several gateways, some of which strip off MSB (Most Signigant Bit).
4112 There are two kinds of character sets: control character set and
4113 graphic character set. The former contains control characters such
4114 as `newline' and `escape' to provide control functions (control
4115 functions are also provided by escape sequences). The latter
4116 contains graphic characters such as 'A' and '-'. Emacs recognizes
4117 two control character sets and many graphic character sets.
4119 Graphic character sets are classified into one of the following
4120 four classes, according to the number of bytes (DIMENSION) and
4121 number of characters in one dimension (CHARS) of the set:
4122 - DIMENSION1_CHARS94
4123 - DIMENSION1_CHARS96
4124 - DIMENSION2_CHARS94
4125 - DIMENSION2_CHARS96
4127 In addition, each character set is assigned an identification tag,
4128 unique for each set, called "final character" (denoted as <F>
4129 hereafter). The <F> of each character set is decided by ECMA(*)
4130 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4131 (0x30..0x3F are for private use only).
4133 Note (*): ECMA = European Computer Manufacturers Association
4135 Here are examples of graphic character set [NAME(<F>)]:
4136 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4137 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4138 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4139 o DIMENSION2_CHARS96 -- none for the moment
4141 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4142 C0 [0x00..0x1F] -- control character plane 0
4143 GL [0x20..0x7F] -- graphic character plane 0
4144 C1 [0x80..0x9F] -- control character plane 1
4145 GR [0xA0..0xFF] -- graphic character plane 1
4147 A control character set is directly designated and invoked to C0 or
4148 C1 by an escape sequence. The most common case is that:
4149 - ISO646's control character set is designated/invoked to C0, and
4150 - ISO6429's control character set is designated/invoked to C1,
4151 and usually these designations/invocations are omitted in encoded
4152 text. In a 7-bit environment, only C0 can be used, and a control
4153 character for C1 is encoded by an appropriate escape sequence to
4154 fit into the environment. All control characters for C1 are
4155 defined to have corresponding escape sequences.
4157 A graphic character set is at first designated to one of four
4158 graphic registers (G0 through G3), then these graphic registers are
4159 invoked to GL or GR. These designations and invocations can be
4160 done independently. The most common case is that G0 is invoked to
4161 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4162 these invocations and designations are omitted in encoded text.
4163 In a 7-bit environment, only GL can be used.
4165 When a graphic character set of CHARS94 is invoked to GL, codes
4166 0x20 and 0x7F of the GL area work as control characters SPACE and
4167 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4170 There are two ways of invocation: locking-shift and single-shift.
4171 With locking-shift, the invocation lasts until the next different
4172 invocation, whereas with single-shift, the invocation affects the
4173 following character only and doesn't affect the locking-shift
4174 state. Invocations are done by the following control characters or
4177 ----------------------------------------------------------------------
4178 abbrev function cntrl escape seq description
4179 ----------------------------------------------------------------------
4180 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4181 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4182 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4183 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4184 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4185 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4186 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4187 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4188 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4189 ----------------------------------------------------------------------
4190 (*) These are not used by any known coding system.
4192 Control characters for these functions are defined by macros
4193 ISO_CODE_XXX in `coding.h'.
4195 Designations are done by the following escape sequences:
4196 ----------------------------------------------------------------------
4197 escape sequence description
4198 ----------------------------------------------------------------------
4199 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4200 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4201 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4202 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4203 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4204 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4205 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4206 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4207 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4208 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4209 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4210 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4211 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4212 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4213 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4214 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4215 ----------------------------------------------------------------------
4217 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4218 of dimension 1, chars 94, and final character <F>, etc...
4220 Note (*): Although these designations are not allowed in ISO2022,
4221 Emacs accepts them on decoding, and produces them on encoding
4222 CHARS96 character sets in a coding system which is characterized as
4223 7-bit environment, non-locking-shift, and non-single-shift.
4225 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4226 '(' can be omitted. We refer to this as "short-form" hereafter.
4228 Now you may notice that there are a lot of ways for encoding the
4229 same multilingual text in ISO2022. Actually, there exist many
4230 coding systems such as Compound Text (used in X11's inter client
4231 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4232 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4233 localized platforms), and all of these are variants of ISO2022.
4235 In addition to the above, Emacs handles two more kinds of escape
4236 sequences: ISO6429's direction specification and Emacs' private
4237 sequence for specifying character composition.
4239 ISO6429's direction specification takes the following form:
4240 o CSI ']' -- end of the current direction
4241 o CSI '0' ']' -- end of the current direction
4242 o CSI '1' ']' -- start of left-to-right text
4243 o CSI '2' ']' -- start of right-to-left text
4244 The control character CSI (0x9B: control sequence introducer) is
4245 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4247 Character composition specification takes the following form:
4248 o ESC '0' -- start character composition
4249 o ESC '1' -- end character composition
4250 Since these are not standard escape sequences of any ISO standard,
4251 their use with these meanings is restricted to Emacs only. */
4254 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4258 for (i = 0; i < 4; i++)
4260 if (!NILP (coding_system))
4262 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4264 iso->charset[i] = Qt;
4265 iso->invalid_designated[i] = 0;
4267 iso->esc = ISO_ESC_NOTHING;
4268 iso->esc_bytes_index = 0;
4269 iso->register_left = 0;
4270 iso->register_right = 1;
4271 iso->switched_dir_and_no_valid_charset_yet = 0;
4272 iso->invalid_switch_dir = 0;
4273 iso->output_direction_sequence = 0;
4274 iso->output_literally = 0;
4275 #ifdef ENABLE_COMPOSITE_CHARS
4276 if (iso->composite_chars)
4277 Dynarr_reset (iso->composite_chars);
4282 fit_to_be_escape_quoted (unsigned char c)
4299 /* Parse one byte of an ISO2022 escape sequence.
4300 If the result is an invalid escape sequence, return 0 and
4301 do not change anything in STR. Otherwise, if the result is
4302 an incomplete escape sequence, update ISO2022.ESC and
4303 ISO2022.ESC_BYTES and return -1. Otherwise, update
4304 all the state variables (but not ISO2022.ESC_BYTES) and
4307 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4308 or invocation of an invalid character set and treat that as
4309 an unrecognized escape sequence. */
4312 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4313 unsigned char c, unsigned int *flags,
4314 int check_invalid_charsets)
4316 /* (1) If we're at the end of a designation sequence, CS is the
4317 charset being designated and REG is the register to designate
4320 (2) If we're at the end of a locking-shift sequence, REG is
4321 the register to invoke and HALF (0 == left, 1 == right) is
4322 the half to invoke it into.
4324 (3) If we're at the end of a single-shift sequence, REG is
4325 the register to invoke. */
4326 Lisp_Object cs = Qnil;
4329 /* NOTE: This code does goto's all over the fucking place.
4330 The reason for this is that we're basically implementing
4331 a state machine here, and hierarchical languages like C
4332 don't really provide a clean way of doing this. */
4334 if (! (*flags & CODING_STATE_ESCAPE))
4335 /* At beginning of escape sequence; we need to reset our
4336 escape-state variables. */
4337 iso->esc = ISO_ESC_NOTHING;
4339 iso->output_literally = 0;
4340 iso->output_direction_sequence = 0;
4344 case ISO_ESC_NOTHING:
4345 iso->esc_bytes_index = 0;
4348 case ISO_CODE_ESC: /* Start escape sequence */
4349 *flags |= CODING_STATE_ESCAPE;
4353 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4354 *flags |= CODING_STATE_ESCAPE;
4355 iso->esc = ISO_ESC_5_11;
4358 case ISO_CODE_SO: /* locking shift 1 */
4361 case ISO_CODE_SI: /* locking shift 0 */
4365 case ISO_CODE_SS2: /* single shift */
4368 case ISO_CODE_SS3: /* single shift */
4372 default: /* Other control characters */
4379 /**** single shift ****/
4381 case 'N': /* single shift 2 */
4384 case 'O': /* single shift 3 */
4388 /**** locking shift ****/
4390 case '~': /* locking shift 1 right */
4393 case 'n': /* locking shift 2 */
4396 case '}': /* locking shift 2 right */
4399 case 'o': /* locking shift 3 */
4402 case '|': /* locking shift 3 right */
4406 #ifdef ENABLE_COMPOSITE_CHARS
4407 /**** composite ****/
4410 iso->esc = ISO_ESC_START_COMPOSITE;
4411 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4412 CODING_STATE_COMPOSITE;
4416 iso->esc = ISO_ESC_END_COMPOSITE;
4417 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4418 ~CODING_STATE_COMPOSITE;
4420 #endif /* ENABLE_COMPOSITE_CHARS */
4422 /**** directionality ****/
4425 iso->esc = ISO_ESC_5_11;
4428 /**** designation ****/
4430 case '$': /* multibyte charset prefix */
4431 iso->esc = ISO_ESC_2_4;
4435 if (0x28 <= c && c <= 0x2F)
4437 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4441 /* This function is called with CODESYS equal to nil when
4442 doing coding-system detection. */
4444 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4445 && fit_to_be_escape_quoted (c))
4447 iso->esc = ISO_ESC_LITERAL;
4448 *flags &= CODING_STATE_ISO2022_LOCK;
4458 /**** directionality ****/
4460 case ISO_ESC_5_11: /* ISO6429 direction control */
4463 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4464 goto directionality;
4466 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4467 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4468 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4472 case ISO_ESC_5_11_0:
4475 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4476 goto directionality;
4480 case ISO_ESC_5_11_1:
4483 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4484 goto directionality;
4488 case ISO_ESC_5_11_2:
4491 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4492 goto directionality;
4497 iso->esc = ISO_ESC_DIRECTIONALITY;
4498 /* Various junk here to attempt to preserve the direction sequences
4499 literally in the text if they would otherwise be swallowed due
4500 to invalid designations that don't show up as actual charset
4501 changes in the text. */
4502 if (iso->invalid_switch_dir)
4504 /* We already inserted a direction switch literally into the
4505 text. We assume (#### this may not be right) that the
4506 next direction switch is the one going the other way,
4507 and we need to output that literally as well. */
4508 iso->output_literally = 1;
4509 iso->invalid_switch_dir = 0;
4515 /* If we are in the thrall of an invalid designation,
4516 then stick the directionality sequence literally into the
4517 output stream so it ends up in the original text again. */
4518 for (jj = 0; jj < 4; jj++)
4519 if (iso->invalid_designated[jj])
4523 iso->output_literally = 1;
4524 iso->invalid_switch_dir = 1;
4527 /* Indicate that we haven't yet seen a valid designation,
4528 so that if a switch-dir is directly followed by an
4529 invalid designation, both get inserted literally. */
4530 iso->switched_dir_and_no_valid_charset_yet = 1;
4535 /**** designation ****/
4538 if (0x28 <= c && c <= 0x2F)
4540 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4543 if (0x40 <= c && c <= 0x42)
4546 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4547 *flags & CODING_STATE_R2L ?
4548 CHARSET_RIGHT_TO_LEFT :
4549 CHARSET_LEFT_TO_RIGHT);
4560 if (c < '0' || c > '~')
4561 return 0; /* bad final byte */
4563 if (iso->esc >= ISO_ESC_2_8 &&
4564 iso->esc <= ISO_ESC_2_15)
4566 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4567 single = 1; /* single-byte */
4568 reg = (iso->esc - ISO_ESC_2_8) & 3;
4570 else if (iso->esc >= ISO_ESC_2_4_8 &&
4571 iso->esc <= ISO_ESC_2_4_15)
4573 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4574 single = -1; /* multi-byte */
4575 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4579 /* Can this ever be reached? -slb */
4583 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4584 *flags & CODING_STATE_R2L ?
4585 CHARSET_RIGHT_TO_LEFT :
4586 CHARSET_LEFT_TO_RIGHT);
4592 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4596 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4597 /* can't invoke something that ain't there. */
4599 iso->esc = ISO_ESC_SINGLE_SHIFT;
4600 *flags &= CODING_STATE_ISO2022_LOCK;
4602 *flags |= CODING_STATE_SS2;
4604 *flags |= CODING_STATE_SS3;
4608 if (check_invalid_charsets &&
4609 !CHARSETP (iso->charset[reg]))
4610 /* can't invoke something that ain't there. */
4613 iso->register_right = reg;
4615 iso->register_left = reg;
4616 *flags &= CODING_STATE_ISO2022_LOCK;
4617 iso->esc = ISO_ESC_LOCKING_SHIFT;
4621 if (NILP (cs) && check_invalid_charsets)
4623 iso->invalid_designated[reg] = 1;
4624 iso->charset[reg] = Vcharset_ascii;
4625 iso->esc = ISO_ESC_DESIGNATE;
4626 *flags &= CODING_STATE_ISO2022_LOCK;
4627 iso->output_literally = 1;
4628 if (iso->switched_dir_and_no_valid_charset_yet)
4630 /* We encountered a switch-direction followed by an
4631 invalid designation. Ensure that the switch-direction
4632 gets outputted; otherwise it will probably get eaten
4633 when the text is written out again. */
4634 iso->switched_dir_and_no_valid_charset_yet = 0;
4635 iso->output_direction_sequence = 1;
4636 /* And make sure that the switch-dir going the other
4637 way gets outputted, as well. */
4638 iso->invalid_switch_dir = 1;
4642 /* This function is called with CODESYS equal to nil when
4643 doing coding-system detection. */
4644 if (!NILP (codesys))
4646 charset_conversion_spec_dynarr *dyn =
4647 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4653 for (i = 0; i < Dynarr_length (dyn); i++)
4655 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4656 if (EQ (cs, spec->from_charset))
4657 cs = spec->to_charset;
4662 iso->charset[reg] = cs;
4663 iso->esc = ISO_ESC_DESIGNATE;
4664 *flags &= CODING_STATE_ISO2022_LOCK;
4665 if (iso->invalid_designated[reg])
4667 iso->invalid_designated[reg] = 0;
4668 iso->output_literally = 1;
4670 if (iso->switched_dir_and_no_valid_charset_yet)
4671 iso->switched_dir_and_no_valid_charset_yet = 0;
4676 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4680 /* #### There are serious deficiencies in the recognition mechanism
4681 here. This needs to be much smarter if it's going to cut it.
4682 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4683 it should be detected as Latin-1.
4684 All the ISO2022 stuff in this file should be synced up with the
4685 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4686 Perhaps we should wait till R2L works in FSF Emacs? */
4688 if (!st->iso2022.initted)
4690 reset_iso2022 (Qnil, &st->iso2022.iso);
4691 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4692 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4693 CODING_CATEGORY_ISO_8_1_MASK |
4694 CODING_CATEGORY_ISO_8_2_MASK |
4695 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4696 st->iso2022.flags = 0;
4697 st->iso2022.high_byte_count = 0;
4698 st->iso2022.saw_single_shift = 0;
4699 st->iso2022.initted = 1;
4702 mask = st->iso2022.mask;
4706 unsigned char c = *(unsigned char *)src++;
4709 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4710 st->iso2022.high_byte_count++;
4714 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4716 if (st->iso2022.high_byte_count & 1)
4717 /* odd number of high bytes; assume not iso-8-2 */
4718 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4720 st->iso2022.high_byte_count = 0;
4721 st->iso2022.saw_single_shift = 0;
4723 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4725 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4726 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4727 { /* control chars */
4730 /* Allow and ignore control characters that you might
4731 reasonably see in a text file */
4736 case 8: /* backspace */
4737 case 11: /* vertical tab */
4738 case 12: /* form feed */
4739 case 26: /* MS-DOS C-z junk */
4740 case 31: /* '^_' -- for info */
4741 goto label_continue_loop;
4748 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4751 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4752 &st->iso2022.flags, 0))
4754 switch (st->iso2022.iso.esc)
4756 case ISO_ESC_DESIGNATE:
4757 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4758 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4760 case ISO_ESC_LOCKING_SHIFT:
4761 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4762 goto ran_out_of_chars;
4763 case ISO_ESC_SINGLE_SHIFT:
4764 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4765 st->iso2022.saw_single_shift = 1;
4774 goto ran_out_of_chars;
4777 label_continue_loop:;
4786 postprocess_iso2022_mask (int mask)
4788 /* #### kind of cheesy */
4789 /* If seven-bit ISO is allowed, then assume that the encoding is
4790 entirely seven-bit and turn off the eight-bit ones. */
4791 if (mask & CODING_CATEGORY_ISO_7_MASK)
4792 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4793 CODING_CATEGORY_ISO_8_1_MASK |
4794 CODING_CATEGORY_ISO_8_2_MASK);
4798 /* If FLAGS is a null pointer or specifies right-to-left motion,
4799 output a switch-dir-to-left-to-right sequence to DST.
4800 Also update FLAGS if it is not a null pointer.
4801 If INTERNAL_P is set, we are outputting in internal format and
4802 need to handle the CSI differently. */
4805 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4806 unsigned_char_dynarr *dst,
4807 unsigned int *flags,
4810 if (!flags || (*flags & CODING_STATE_R2L))
4812 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4814 Dynarr_add (dst, ISO_CODE_ESC);
4815 Dynarr_add (dst, '[');
4817 else if (internal_p)
4818 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4820 Dynarr_add (dst, ISO_CODE_CSI);
4821 Dynarr_add (dst, '0');
4822 Dynarr_add (dst, ']');
4824 *flags &= ~CODING_STATE_R2L;
4828 /* If FLAGS is a null pointer or specifies a direction different from
4829 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4830 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4831 sequence to DST. Also update FLAGS if it is not a null pointer.
4832 If INTERNAL_P is set, we are outputting in internal format and
4833 need to handle the CSI differently. */
4836 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4837 unsigned_char_dynarr *dst, unsigned int *flags,
4840 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4841 direction == CHARSET_LEFT_TO_RIGHT)
4842 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4843 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4844 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4845 direction == CHARSET_RIGHT_TO_LEFT)
4847 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4849 Dynarr_add (dst, ISO_CODE_ESC);
4850 Dynarr_add (dst, '[');
4852 else if (internal_p)
4853 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4855 Dynarr_add (dst, ISO_CODE_CSI);
4856 Dynarr_add (dst, '2');
4857 Dynarr_add (dst, ']');
4859 *flags |= CODING_STATE_R2L;
4863 /* Convert ISO2022-format data to internal format. */
4866 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4867 unsigned_char_dynarr *dst, size_t n)
4869 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4870 unsigned int flags = str->flags;
4871 unsigned int cpos = str->cpos;
4872 unsigned char counter = str->counter;
4873 eol_type_t eol_type = str->eol_type;
4874 #ifdef ENABLE_COMPOSITE_CHARS
4875 unsigned_char_dynarr *real_dst = dst;
4877 Lisp_Object coding_system;
4879 XSETCODING_SYSTEM (coding_system, str->codesys);
4881 #ifdef ENABLE_COMPOSITE_CHARS
4882 if (flags & CODING_STATE_COMPOSITE)
4883 dst = str->iso2022.composite_chars;
4884 #endif /* ENABLE_COMPOSITE_CHARS */
4888 unsigned char c = *(unsigned char *)src++;
4889 if (flags & CODING_STATE_ESCAPE)
4890 { /* Within ESC sequence */
4891 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4896 switch (str->iso2022.esc)
4898 #ifdef ENABLE_COMPOSITE_CHARS
4899 case ISO_ESC_START_COMPOSITE:
4900 if (str->iso2022.composite_chars)
4901 Dynarr_reset (str->iso2022.composite_chars);
4903 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4904 dst = str->iso2022.composite_chars;
4906 case ISO_ESC_END_COMPOSITE:
4908 Bufbyte comstr[MAX_EMCHAR_LEN];
4910 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4911 Dynarr_length (dst));
4913 len = set_charptr_emchar (comstr, emch);
4914 Dynarr_add_many (dst, comstr, len);
4917 #endif /* ENABLE_COMPOSITE_CHARS */
4919 case ISO_ESC_LITERAL:
4920 COMPOSE_FLUSH_CHARS (str, dst);
4921 DECODE_ADD_BINARY_CHAR (c, dst);
4925 /* Everything else handled already */
4930 /* Attempted error recovery. */
4931 if (str->iso2022.output_direction_sequence)
4932 ensure_correct_direction (flags & CODING_STATE_R2L ?
4933 CHARSET_RIGHT_TO_LEFT :
4934 CHARSET_LEFT_TO_RIGHT,
4935 str->codesys, dst, 0, 1);
4936 /* More error recovery. */
4937 if (!retval || str->iso2022.output_literally)
4939 /* Output the (possibly invalid) sequence */
4941 COMPOSE_FLUSH_CHARS (str, dst);
4942 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4943 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4944 flags &= CODING_STATE_ISO2022_LOCK;
4946 n++, src--;/* Repeat the loop with the same character. */
4949 /* No sense in reprocessing the final byte of the
4950 escape sequence; it could mess things up anyway.
4952 COMPOSE_FLUSH_CHARS (str, dst);
4953 DECODE_ADD_BINARY_CHAR (c, dst);
4959 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4960 { /* Control characters */
4962 /***** Error-handling *****/
4964 /* If we were in the middle of a character, dump out the
4965 partial character. */
4968 COMPOSE_FLUSH_CHARS (str, dst);
4972 DECODE_ADD_BINARY_CHAR
4973 ((unsigned char)(cpos >> (counter * 8)), dst);
4978 /* If we just saw a single-shift character, dump it out.
4979 This may dump out the wrong sort of single-shift character,
4980 but least it will give an indication that something went
4982 if (flags & CODING_STATE_SS2)
4984 COMPOSE_FLUSH_CHARS (str, dst);
4985 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4986 flags &= ~CODING_STATE_SS2;
4988 if (flags & CODING_STATE_SS3)
4990 COMPOSE_FLUSH_CHARS (str, dst);
4991 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4992 flags &= ~CODING_STATE_SS3;
4995 /***** Now handle the control characters. *****/
5001 COMPOSE_FLUSH_CHARS (str, dst);
5002 if (eol_type == EOL_CR)
5003 Dynarr_add (dst, '\n');
5004 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5005 Dynarr_add (dst, c);
5007 flags |= CODING_STATE_CR;
5008 goto label_continue_loop;
5010 else if (flags & CODING_STATE_CR)
5011 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5013 Dynarr_add (dst, '\r');
5014 flags &= ~CODING_STATE_CR;
5017 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5020 flags &= CODING_STATE_ISO2022_LOCK;
5022 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5024 COMPOSE_FLUSH_CHARS (str, dst);
5025 DECODE_ADD_BINARY_CHAR (c, dst);
5029 { /* Graphic characters */
5030 Lisp_Object charset;
5039 COMPOSE_FLUSH_CHARS (str, dst);
5040 if (eol_type == EOL_CR)
5041 Dynarr_add (dst, '\n');
5042 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5043 Dynarr_add (dst, c);
5045 flags |= CODING_STATE_CR;
5046 goto label_continue_loop;
5048 else if (flags & CODING_STATE_CR)
5049 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5051 Dynarr_add (dst, '\r');
5052 flags &= ~CODING_STATE_CR;
5055 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5058 /* Now determine the charset. */
5059 reg = ((flags & CODING_STATE_SS2) ? 2
5060 : (flags & CODING_STATE_SS3) ? 3
5061 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5062 : str->iso2022.register_left);
5063 charset = str->iso2022.charset[reg];
5065 /* Error checking: */
5066 if (! CHARSETP (charset)
5067 || str->iso2022.invalid_designated[reg]
5068 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5069 && XCHARSET_CHARS (charset) == 94))
5070 /* Mrmph. We are trying to invoke a register that has no
5071 or an invalid charset in it, or trying to add a character
5072 outside the range of the charset. Insert that char literally
5073 to preserve it for the output. */
5075 COMPOSE_FLUSH_CHARS (str, dst);
5079 DECODE_ADD_BINARY_CHAR
5080 ((unsigned char)(cpos >> (counter * 8)), dst);
5083 DECODE_ADD_BINARY_CHAR (c, dst);
5088 /* Things are probably hunky-dorey. */
5090 /* Fetch reverse charset, maybe. */
5091 if (((flags & CODING_STATE_R2L) &&
5092 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5094 (!(flags & CODING_STATE_R2L) &&
5095 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5097 Lisp_Object new_charset =
5098 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5099 if (!NILP (new_charset))
5100 charset = new_charset;
5105 if (XCHARSET_DIMENSION (charset) == counter)
5107 COMPOSE_ADD_CHAR (str,
5108 DECODE_CHAR (charset,
5109 ((cpos & 0x7F7F7F) << 8)
5116 cpos = (cpos << 8) | c;
5118 lb = XCHARSET_LEADING_BYTE (charset);
5119 switch (XCHARSET_REP_BYTES (charset))
5122 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5123 Dynarr_add (dst, c & 0x7F);
5126 case 2: /* one-byte official */
5127 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5128 Dynarr_add (dst, lb);
5129 Dynarr_add (dst, c | 0x80);
5132 case 3: /* one-byte private or two-byte official */
5133 if (XCHARSET_PRIVATE_P (charset))
5135 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5136 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5137 Dynarr_add (dst, lb);
5138 Dynarr_add (dst, c | 0x80);
5144 Dynarr_add (dst, lb);
5145 Dynarr_add (dst, ch | 0x80);
5146 Dynarr_add (dst, c | 0x80);
5154 default: /* two-byte private */
5157 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5158 Dynarr_add (dst, lb);
5159 Dynarr_add (dst, ch | 0x80);
5160 Dynarr_add (dst, c | 0x80);
5170 flags &= CODING_STATE_ISO2022_LOCK;
5173 label_continue_loop:;
5176 if (flags & CODING_STATE_END)
5178 COMPOSE_FLUSH_CHARS (str, dst);
5179 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5183 str->counter = counter;
5187 /***** ISO2022 encoder *****/
5189 /* Designate CHARSET into register REG. */
5192 iso2022_designate (Lisp_Object charset, unsigned char reg,
5193 struct encoding_stream *str, unsigned_char_dynarr *dst)
5195 static const char inter94[] = "()*+";
5196 static const char inter96[] = ",-./";
5197 unsigned short chars;
5198 unsigned char dimension;
5199 unsigned char final;
5200 Lisp_Object old_charset = str->iso2022.charset[reg];
5202 str->iso2022.charset[reg] = charset;
5203 if (!CHARSETP (charset))
5204 /* charset might be an initial nil or t. */
5206 chars = XCHARSET_CHARS (charset);
5207 dimension = XCHARSET_DIMENSION (charset);
5208 final = XCHARSET_FINAL (charset);
5209 if (!str->iso2022.force_charset_on_output[reg] &&
5210 CHARSETP (old_charset) &&
5211 XCHARSET_CHARS (old_charset) == chars &&
5212 XCHARSET_DIMENSION (old_charset) == dimension &&
5213 XCHARSET_FINAL (old_charset) == final)
5216 str->iso2022.force_charset_on_output[reg] = 0;
5219 charset_conversion_spec_dynarr *dyn =
5220 str->codesys->iso2022.output_conv;
5226 for (i = 0; i < Dynarr_length (dyn); i++)
5228 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5229 if (EQ (charset, spec->from_charset))
5230 charset = spec->to_charset;
5235 Dynarr_add (dst, ISO_CODE_ESC);
5240 Dynarr_add (dst, inter94[reg]);
5243 Dynarr_add (dst, '$');
5245 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5248 Dynarr_add (dst, inter94[reg]);
5253 Dynarr_add (dst, inter96[reg]);
5256 Dynarr_add (dst, '$');
5257 Dynarr_add (dst, inter96[reg]);
5261 Dynarr_add (dst, final);
5265 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5267 if (str->iso2022.register_left != 0)
5269 Dynarr_add (dst, ISO_CODE_SI);
5270 str->iso2022.register_left = 0;
5275 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5277 if (str->iso2022.register_left != 1)
5279 Dynarr_add (dst, ISO_CODE_SO);
5280 str->iso2022.register_left = 1;
5285 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5286 unsigned_char_dynarr *dst, unsigned int *flags)
5288 unsigned char charmask;
5289 Lisp_Coding_System* codesys = str->codesys;
5290 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5292 Lisp_Object charset = str->iso2022.current_charset;
5293 int half = str->iso2022.current_half;
5294 int code_point = -1;
5298 restore_left_to_right_direction (codesys, dst, flags, 0);
5300 /* Make sure G0 contains ASCII */
5301 if ((ch > ' ' && ch < ISO_CODE_DEL)
5302 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5304 ensure_normal_shift (str, dst);
5305 iso2022_designate (Vcharset_ascii, 0, str, dst);
5308 /* If necessary, restore everything to the default state
5310 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5312 restore_left_to_right_direction (codesys, dst, flags, 0);
5314 ensure_normal_shift (str, dst);
5316 for (i = 0; i < 4; i++)
5318 Lisp_Object initial_charset =
5319 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5320 iso2022_designate (initial_charset, i, str, dst);
5325 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5326 Dynarr_add (dst, '\r');
5327 if (eol_type != EOL_CR)
5328 Dynarr_add (dst, ch);
5332 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5333 && fit_to_be_escape_quoted (ch))
5334 Dynarr_add (dst, ISO_CODE_ESC);
5335 Dynarr_add (dst, ch);
5338 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5340 charmask = (half == 0 ? 0x00 : 0x80);
5342 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5343 && fit_to_be_escape_quoted (ch))
5344 Dynarr_add (dst, ISO_CODE_ESC);
5345 /* you asked for it ... */
5346 Dynarr_add (dst, ch);
5352 /* Now determine which register to use. */
5354 for (i = 0; i < 4; i++)
5356 if ((CHARSETP (charset = str->iso2022.charset[i])
5357 && ((code_point = charset_code_point (charset, ch)) >= 0))
5361 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5362 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5370 Lisp_Object original_default_coded_charset_priority_list
5371 = Vdefault_coded_charset_priority_list;
5373 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5375 code_point = ENCODE_CHAR (ch, charset);
5376 if (XCHARSET_FINAL (charset))
5378 Vdefault_coded_charset_priority_list
5379 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5380 Vdefault_coded_charset_priority_list));
5382 code_point = ENCODE_CHAR (ch, charset);
5383 if (!XCHARSET_FINAL (charset))
5385 charset = Vcharset_ascii;
5389 Vdefault_coded_charset_priority_list
5390 = original_default_coded_charset_priority_list;
5392 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5393 codesys, dst, flags, 0);
5397 if (XCHARSET_GRAPHIC (charset) != 0)
5399 if (!NILP (str->iso2022.charset[1]) &&
5400 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5401 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5403 else if (!NILP (str->iso2022.charset[2]))
5405 else if (!NILP (str->iso2022.charset[3]))
5414 iso2022_designate (charset, reg, str, dst);
5416 /* Now invoke that register. */
5420 ensure_normal_shift (str, dst);
5424 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5426 ensure_shift_out (str, dst);
5433 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5435 Dynarr_add (dst, ISO_CODE_ESC);
5436 Dynarr_add (dst, 'N');
5441 Dynarr_add (dst, ISO_CODE_SS2);
5446 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5448 Dynarr_add (dst, ISO_CODE_ESC);
5449 Dynarr_add (dst, 'O');
5454 Dynarr_add (dst, ISO_CODE_SS3);
5462 charmask = (half == 0 ? 0x00 : 0x80);
5464 switch (XCHARSET_DIMENSION (charset))
5467 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5470 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5471 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5474 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5475 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5476 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5479 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5480 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5481 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5482 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5488 str->iso2022.current_charset = charset;
5489 str->iso2022.current_half = half;
5493 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5494 unsigned int *flags)
5496 Lisp_Coding_System* codesys = str->codesys;
5499 restore_left_to_right_direction (codesys, dst, flags, 0);
5500 ensure_normal_shift (str, dst);
5501 for (i = 0; i < 4; i++)
5503 Lisp_Object initial_charset
5504 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5505 iso2022_designate (initial_charset, i, str, dst);
5510 /************************************************************************/
5511 /* No-conversion methods */
5512 /************************************************************************/
5514 /* This is used when reading in "binary" files -- i.e. files that may
5515 contain all 256 possible byte values and that are not to be
5516 interpreted as being in any particular decoding. */
5518 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5519 unsigned_char_dynarr *dst, size_t n)
5521 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5522 unsigned int flags = str->flags;
5523 unsigned int cpos = str->cpos;
5524 eol_type_t eol_type = str->eol_type;
5528 unsigned char c = *(unsigned char *)src++;
5530 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5531 DECODE_ADD_BINARY_CHAR (c, dst);
5532 label_continue_loop:;
5535 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5542 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5543 unsigned_char_dynarr *dst, size_t n)
5546 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5547 unsigned int flags = str->flags;
5548 unsigned int ch = str->ch;
5549 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5551 unsigned char char_boundary = str->iso2022.current_char_boundary;
5558 if (char_boundary == 0)
5564 else if ( c >= 0xf8 )
5569 else if ( c >= 0xf0 )
5574 else if ( c >= 0xe0 )
5579 else if ( c >= 0xc0 )
5589 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5590 Dynarr_add (dst, '\r');
5591 if (eol_type != EOL_CR)
5592 Dynarr_add (dst, c);
5595 Dynarr_add (dst, c);
5598 else if (char_boundary == 1)
5600 ch = ( ch << 6 ) | ( c & 0x3f );
5601 Dynarr_add (dst, ch & 0xff);
5606 ch = ( ch << 6 ) | ( c & 0x3f );
5609 #else /* not UTF2000 */
5612 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5613 Dynarr_add (dst, '\r');
5614 if (eol_type != EOL_CR)
5615 Dynarr_add (dst, '\n');
5618 else if (BYTE_ASCII_P (c))
5621 Dynarr_add (dst, c);
5623 else if (BUFBYTE_LEADING_BYTE_P (c))
5626 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5627 c == LEADING_BYTE_CONTROL_1)
5630 Dynarr_add (dst, '~'); /* untranslatable character */
5634 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5635 Dynarr_add (dst, c);
5636 else if (ch == LEADING_BYTE_CONTROL_1)
5639 Dynarr_add (dst, c - 0x20);
5641 /* else it should be the second or third byte of an
5642 untranslatable character, so ignore it */
5645 #endif /* not UTF2000 */
5651 str->iso2022.current_char_boundary = char_boundary;
5657 /************************************************************************/
5658 /* Initialization */
5659 /************************************************************************/
5662 syms_of_file_coding (void)
5664 INIT_LRECORD_IMPLEMENTATION (coding_system);
5666 deferror (&Qcoding_system_error, "coding-system-error",
5667 "Coding-system error", Qio_error);
5669 DEFSUBR (Fcoding_system_p);
5670 DEFSUBR (Ffind_coding_system);
5671 DEFSUBR (Fget_coding_system);
5672 DEFSUBR (Fcoding_system_list);
5673 DEFSUBR (Fcoding_system_name);
5674 DEFSUBR (Fmake_coding_system);
5675 DEFSUBR (Fcopy_coding_system);
5676 DEFSUBR (Fcoding_system_canonical_name_p);
5677 DEFSUBR (Fcoding_system_alias_p);
5678 DEFSUBR (Fcoding_system_aliasee);
5679 DEFSUBR (Fdefine_coding_system_alias);
5680 DEFSUBR (Fsubsidiary_coding_system);
5682 DEFSUBR (Fcoding_system_type);
5683 DEFSUBR (Fcoding_system_doc_string);
5685 DEFSUBR (Fcoding_system_charset);
5687 DEFSUBR (Fcoding_system_property);
5689 DEFSUBR (Fcoding_category_list);
5690 DEFSUBR (Fset_coding_priority_list);
5691 DEFSUBR (Fcoding_priority_list);
5692 DEFSUBR (Fset_coding_category_system);
5693 DEFSUBR (Fcoding_category_system);
5695 DEFSUBR (Fdetect_coding_region);
5696 DEFSUBR (Fdecode_coding_region);
5697 DEFSUBR (Fencode_coding_region);
5699 DEFSUBR (Fdecode_shift_jis_char);
5700 DEFSUBR (Fencode_shift_jis_char);
5701 DEFSUBR (Fdecode_big5_char);
5702 DEFSUBR (Fencode_big5_char);
5704 defsymbol (&Qcoding_systemp, "coding-system-p");
5705 defsymbol (&Qno_conversion, "no-conversion");
5706 defsymbol (&Qraw_text, "raw-text");
5708 defsymbol (&Qbig5, "big5");
5709 defsymbol (&Qshift_jis, "shift-jis");
5710 defsymbol (&Qucs4, "ucs-4");
5711 defsymbol (&Qutf8, "utf-8");
5712 defsymbol (&Qccl, "ccl");
5713 defsymbol (&Qiso2022, "iso2022");
5715 defsymbol (&Qmnemonic, "mnemonic");
5716 defsymbol (&Qeol_type, "eol-type");
5717 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5718 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5720 defsymbol (&Qcr, "cr");
5721 defsymbol (&Qlf, "lf");
5722 defsymbol (&Qcrlf, "crlf");
5723 defsymbol (&Qeol_cr, "eol-cr");
5724 defsymbol (&Qeol_lf, "eol-lf");
5725 defsymbol (&Qeol_crlf, "eol-crlf");
5727 defsymbol (&Qcharset_g0, "charset-g0");
5728 defsymbol (&Qcharset_g1, "charset-g1");
5729 defsymbol (&Qcharset_g2, "charset-g2");
5730 defsymbol (&Qcharset_g3, "charset-g3");
5731 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5732 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5733 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5734 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5735 defsymbol (&Qno_iso6429, "no-iso6429");
5736 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5737 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5739 defsymbol (&Qshort, "short");
5740 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5741 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5742 defsymbol (&Qseven, "seven");
5743 defsymbol (&Qlock_shift, "lock-shift");
5744 defsymbol (&Qescape_quoted, "escape-quoted");
5747 defsymbol (&Qdisable_composition, "disable-composition");
5749 defsymbol (&Qencode, "encode");
5750 defsymbol (&Qdecode, "decode");
5753 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5755 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5757 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5759 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5761 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5763 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5765 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5767 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5769 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5772 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5777 lstream_type_create_file_coding (void)
5779 LSTREAM_HAS_METHOD (decoding, reader);
5780 LSTREAM_HAS_METHOD (decoding, writer);
5781 LSTREAM_HAS_METHOD (decoding, rewinder);
5782 LSTREAM_HAS_METHOD (decoding, seekable_p);
5783 LSTREAM_HAS_METHOD (decoding, flusher);
5784 LSTREAM_HAS_METHOD (decoding, closer);
5785 LSTREAM_HAS_METHOD (decoding, marker);
5787 LSTREAM_HAS_METHOD (encoding, reader);
5788 LSTREAM_HAS_METHOD (encoding, writer);
5789 LSTREAM_HAS_METHOD (encoding, rewinder);
5790 LSTREAM_HAS_METHOD (encoding, seekable_p);
5791 LSTREAM_HAS_METHOD (encoding, flusher);
5792 LSTREAM_HAS_METHOD (encoding, closer);
5793 LSTREAM_HAS_METHOD (encoding, marker);
5797 vars_of_file_coding (void)
5801 fcd = xnew (struct file_coding_dump);
5802 dump_add_root_struct_ptr (&fcd, &fcd_description);
5804 /* Initialize to something reasonable ... */
5805 for (i = 0; i < CODING_CATEGORY_LAST; i++)
5807 fcd->coding_category_system[i] = Qnil;
5808 fcd->coding_category_by_priority[i] = i;
5811 Fprovide (intern ("file-coding"));
5813 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5814 Coding system used for TTY keyboard input.
5815 Not used under a windowing system.
5817 Vkeyboard_coding_system = Qnil;
5819 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5820 Coding system used for TTY display output.
5821 Not used under a windowing system.
5823 Vterminal_coding_system = Qnil;
5825 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5826 Overriding coding system used when reading from a file or process.
5827 You should bind this variable with `let', but do not set it globally.
5828 If this is non-nil, it specifies the coding system that will be used
5829 to decode input on read operations, such as from a file or process.
5830 It overrides `buffer-file-coding-system-for-read',
5831 `insert-file-contents-pre-hook', etc. Use those variables instead of
5832 this one for permanent changes to the environment. */ );
5833 Vcoding_system_for_read = Qnil;
5835 DEFVAR_LISP ("coding-system-for-write",
5836 &Vcoding_system_for_write /*
5837 Overriding coding system used when writing to a file or process.
5838 You should bind this variable with `let', but do not set it globally.
5839 If this is non-nil, it specifies the coding system that will be used
5840 to encode output for write operations, such as to a file or process.
5841 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5842 Use those variables instead of this one for permanent changes to the
5844 Vcoding_system_for_write = Qnil;
5846 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5847 Coding system used to convert pathnames when accessing files.
5849 Vfile_name_coding_system = Qnil;
5851 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5852 Non-nil means the buffer contents are regarded as multi-byte form
5853 of characters, not a binary code. This affects the display, file I/O,
5854 and behaviors of various editing commands.
5856 Setting this to nil does not do anything.
5858 enable_multibyte_characters = 1;
5862 complex_vars_of_file_coding (void)
5864 staticpro (&Vcoding_system_hash_table);
5865 Vcoding_system_hash_table =
5866 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5868 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5869 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5871 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5873 struct codesys_prop csp; \
5875 csp.prop_type = (Prop_Type); \
5876 Dynarr_add (the_codesys_prop_dynarr, csp); \
5879 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5880 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5881 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5882 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5883 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5884 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5885 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5887 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5888 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5889 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5890 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5891 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5892 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5893 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5894 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5895 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5896 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5897 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5898 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5899 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5900 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5901 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5902 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5903 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5905 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5906 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5908 /* Need to create this here or we're really screwed. */
5910 (Qraw_text, Qno_conversion,
5911 build_string ("Raw text, which means it converts only line-break-codes."),
5912 list2 (Qmnemonic, build_string ("Raw")));
5915 (Qbinary, Qno_conversion,
5916 build_string ("Binary, which means it does not convert anything."),
5917 list4 (Qeol_type, Qlf,
5918 Qmnemonic, build_string ("Binary")));
5923 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5924 list2 (Qmnemonic, build_string ("UTF8")));
5927 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5929 Fdefine_coding_system_alias (Qfile_name, Qbinary);
5931 Fdefine_coding_system_alias (Qterminal, Qbinary);
5932 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5934 /* Need this for bootstrapping */
5935 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5936 Fget_coding_system (Qraw_text);
5939 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5940 = Fget_coding_system (Qutf8);
5943 #if defined(MULE) && !defined(UTF2000)
5947 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5948 fcd->ucs_to_mule_table[i] = Qnil;
5950 staticpro (&mule_to_ucs_table);
5951 mule_to_ucs_table = Fmake_char_table(Qgeneric);
5952 #endif /* defined(MULE) && !defined(UTF2000) */