1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
39 #include "file-coding.h"
41 Lisp_Object Qcoding_system_error;
43 Lisp_Object Vkeyboard_coding_system;
44 Lisp_Object Vterminal_coding_system;
45 Lisp_Object Vcoding_system_for_read;
46 Lisp_Object Vcoding_system_for_write;
47 Lisp_Object Vfile_name_coding_system;
49 /* Table of symbols identifying each coding category. */
50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
54 struct file_coding_dump {
55 /* Coding system currently associated with each coding category. */
56 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
58 /* Table of all coding categories in decreasing order of priority.
59 This describes a permutation of the possible coding categories. */
60 int coding_category_by_priority[CODING_CATEGORY_LAST];
62 #if defined(MULE) && !defined(UTF2000)
63 Lisp_Object ucs_to_mule_table[65536];
67 static const struct lrecord_description fcd_description_1[] = {
68 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
69 #if defined(MULE) && !defined(UTF2000)
70 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
75 static const struct struct_description fcd_description = {
76 sizeof (struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
105 Lisp_Object Qdisable_composition;
107 Lisp_Object Qencode, Qdecode;
109 Lisp_Object Vcoding_system_hash_table;
111 int enable_multibyte_characters;
114 /* Additional information used by the ISO2022 decoder and detector. */
115 struct iso2022_decoder
117 /* CHARSET holds the character sets currently assigned to the G0
118 through G3 variables. It is initialized from the array
119 INITIAL_CHARSET in CODESYS. */
120 Lisp_Object charset[4];
122 /* Which registers are currently invoked into the left (GL) and
123 right (GR) halves of the 8-bit encoding space? */
124 int register_left, register_right;
126 /* ISO_ESC holds a value indicating part of an escape sequence
127 that has already been seen. */
128 enum iso_esc_flag esc;
130 /* This records the bytes we've seen so far in an escape sequence,
131 in case the sequence is invalid (we spit out the bytes unchanged). */
132 unsigned char esc_bytes[8];
134 /* Index for next byte to store in ISO escape sequence. */
137 #ifdef ENABLE_COMPOSITE_CHARS
138 /* Stuff seen so far when composing a string. */
139 unsigned_char_dynarr *composite_chars;
142 /* If we saw an invalid designation sequence for a particular
143 register, we flag it here and switch to ASCII. The next time we
144 see a valid designation for this register, we turn off the flag
145 and do the designation normally, but pretend the sequence was
146 invalid. The effect of all this is that (most of the time) the
147 escape sequences for both the switch to the unknown charset, and
148 the switch back to the known charset, get inserted literally into
149 the buffer and saved out as such. The hope is that we can
150 preserve the escape sequences so that the resulting written out
151 file makes sense. If we don't do any of this, the designation
152 to the invalid charset will be preserved but that switch back
153 to the known charset will probably get eaten because it was
154 the same charset that was already present in the register. */
155 unsigned char invalid_designated[4];
157 /* We try to do similar things as above for direction-switching
158 sequences. If we encountered a direction switch while an
159 invalid designation was present, or an invalid designation
160 just after a direction switch (i.e. no valid designation
161 encountered yet), we insert the direction-switch escape
162 sequence literally into the output stream, and later on
163 insert the corresponding direction-restoring escape sequence
165 unsigned int switched_dir_and_no_valid_charset_yet :1;
166 unsigned int invalid_switch_dir :1;
168 /* Tells the decoder to output the escape sequence literally
169 even though it was valid. Used in the games we play to
170 avoid lossage when we encounter invalid designations. */
171 unsigned int output_literally :1;
172 /* We encountered a direction switch followed by an invalid
173 designation. We didn't output the direction switch
174 literally because we didn't know about the invalid designation;
175 but we have to do so now. */
176 unsigned int output_direction_sequence :1;
179 EXFUN (Fcopy_coding_system, 2);
181 struct detection_state;
184 text_encode_generic (Lstream *encoding, const Bufbyte *src,
185 unsigned_char_dynarr *dst, size_t n);
187 static int detect_coding_sjis (struct detection_state *st,
188 const Extbyte *src, size_t n);
189 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
190 unsigned_char_dynarr *dst, size_t n);
191 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
192 unsigned_char_dynarr *dst, unsigned int *flags);
193 void char_finish_shift_jis (struct encoding_stream *str,
194 unsigned_char_dynarr *dst, unsigned int *flags);
196 static int detect_coding_big5 (struct detection_state *st,
197 const Extbyte *src, size_t n);
198 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
199 unsigned_char_dynarr *dst, size_t n);
200 void char_encode_big5 (struct encoding_stream *str, Emchar c,
201 unsigned_char_dynarr *dst, unsigned int *flags);
202 void char_finish_big5 (struct encoding_stream *str,
203 unsigned_char_dynarr *dst, unsigned int *flags);
205 static int detect_coding_ucs4 (struct detection_state *st,
206 const Extbyte *src, size_t n);
207 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
208 unsigned_char_dynarr *dst, size_t n);
209 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
210 unsigned_char_dynarr *dst, unsigned int *flags);
211 void char_finish_ucs4 (struct encoding_stream *str,
212 unsigned_char_dynarr *dst, unsigned int *flags);
214 static int detect_coding_utf8 (struct detection_state *st,
215 const Extbyte *src, size_t n);
216 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
217 unsigned_char_dynarr *dst, size_t n);
218 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
219 unsigned_char_dynarr *dst, unsigned int *flags);
220 void char_finish_utf8 (struct encoding_stream *str,
221 unsigned_char_dynarr *dst, unsigned int *flags);
223 static int postprocess_iso2022_mask (int mask);
224 static void reset_iso2022 (Lisp_Object coding_system,
225 struct iso2022_decoder *iso);
226 static int detect_coding_iso2022 (struct detection_state *st,
227 const Extbyte *src, size_t n);
228 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
229 unsigned_char_dynarr *dst, size_t n);
230 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
231 unsigned_char_dynarr *dst, unsigned int *flags);
232 void char_finish_iso2022 (struct encoding_stream *str,
233 unsigned_char_dynarr *dst, unsigned int *flags);
235 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
236 unsigned_char_dynarr *dst, size_t n);
237 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
238 unsigned_char_dynarr *dst, size_t n);
239 static void mule_decode (Lstream *decoding, const Extbyte *src,
240 unsigned_char_dynarr *dst, size_t n);
241 static void mule_encode (Lstream *encoding, const Bufbyte *src,
242 unsigned_char_dynarr *dst, size_t n);
244 typedef struct codesys_prop codesys_prop;
253 Dynarr_declare (codesys_prop);
254 } codesys_prop_dynarr;
256 static const struct lrecord_description codesys_prop_description_1[] = {
257 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
261 static const struct struct_description codesys_prop_description = {
262 sizeof (codesys_prop),
263 codesys_prop_description_1
266 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
267 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
271 static const struct struct_description codesys_prop_dynarr_description = {
272 sizeof (codesys_prop_dynarr),
273 codesys_prop_dynarr_description_1
276 codesys_prop_dynarr *the_codesys_prop_dynarr;
278 enum codesys_prop_enum
281 CODESYS_PROP_ISO2022,
286 /************************************************************************/
287 /* Coding system functions */
288 /************************************************************************/
290 static Lisp_Object mark_coding_system (Lisp_Object);
291 static void print_coding_system (Lisp_Object, Lisp_Object, int);
292 static void finalize_coding_system (void *header, int for_disksave);
295 static const struct lrecord_description ccs_description_1[] = {
296 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
297 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
301 static const struct struct_description ccs_description = {
302 sizeof (charset_conversion_spec),
306 static const struct lrecord_description ccsd_description_1[] = {
307 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
311 static const struct struct_description ccsd_description = {
312 sizeof (charset_conversion_spec_dynarr),
317 static const struct lrecord_description coding_system_description[] = {
318 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
319 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
320 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
321 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
324 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
325 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
327 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
328 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
329 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
330 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
331 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
336 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
337 mark_coding_system, print_coding_system,
338 finalize_coding_system,
339 0, 0, coding_system_description,
343 mark_coding_system (Lisp_Object obj)
345 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
347 mark_object (CODING_SYSTEM_NAME (codesys));
348 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
349 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
350 mark_object (CODING_SYSTEM_EOL_LF (codesys));
351 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
352 mark_object (CODING_SYSTEM_EOL_CR (codesys));
354 switch (CODING_SYSTEM_TYPE (codesys))
358 case CODESYS_ISO2022:
359 for (i = 0; i < 4; i++)
360 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
361 if (codesys->iso2022.input_conv)
363 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
365 struct charset_conversion_spec *ccs =
366 Dynarr_atp (codesys->iso2022.input_conv, i);
367 mark_object (ccs->from_charset);
368 mark_object (ccs->to_charset);
371 if (codesys->iso2022.output_conv)
373 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
375 struct charset_conversion_spec *ccs =
376 Dynarr_atp (codesys->iso2022.output_conv, i);
377 mark_object (ccs->from_charset);
378 mark_object (ccs->to_charset);
385 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
386 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
391 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
392 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
399 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
400 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
404 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
407 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
409 error ("printing unreadable object #<coding_system 0x%x>",
412 write_c_string ("#<coding_system ", printcharfun);
413 print_internal (c->name, printcharfun, 1);
414 write_c_string (">", printcharfun);
418 finalize_coding_system (void *header, int for_disksave)
420 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
421 /* Since coding systems never go away, this function is not
422 necessary. But it would be necessary if we changed things
423 so that coding systems could go away. */
424 if (!for_disksave) /* see comment in lstream.c */
426 switch (CODING_SYSTEM_TYPE (c))
429 case CODESYS_ISO2022:
430 if (c->iso2022.input_conv)
432 Dynarr_free (c->iso2022.input_conv);
433 c->iso2022.input_conv = 0;
435 if (c->iso2022.output_conv)
437 Dynarr_free (c->iso2022.output_conv);
438 c->iso2022.output_conv = 0;
449 symbol_to_eol_type (Lisp_Object symbol)
451 CHECK_SYMBOL (symbol);
452 if (NILP (symbol)) return EOL_AUTODETECT;
453 if (EQ (symbol, Qlf)) return EOL_LF;
454 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
455 if (EQ (symbol, Qcr)) return EOL_CR;
457 signal_simple_error ("Unrecognized eol type", symbol);
458 return EOL_AUTODETECT; /* not reached */
462 eol_type_to_symbol (eol_type_t type)
467 case EOL_LF: return Qlf;
468 case EOL_CRLF: return Qcrlf;
469 case EOL_CR: return Qcr;
470 case EOL_AUTODETECT: return Qnil;
475 setup_eol_coding_systems (Lisp_Coding_System *codesys)
477 Lisp_Object codesys_obj;
478 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
479 char *codesys_name = (char *) alloca (len + 7);
481 char *codesys_mnemonic=0;
483 Lisp_Object codesys_name_sym, sub_codesys_obj;
487 XSETCODING_SYSTEM (codesys_obj, codesys);
489 memcpy (codesys_name,
490 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
492 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
494 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
495 codesys_mnemonic = (char *) alloca (mlen + 7);
496 memcpy (codesys_mnemonic,
497 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
500 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
501 strcpy (codesys_name + len, "-" op_sys); \
503 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
504 codesys_name_sym = intern (codesys_name); \
505 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
506 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
508 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
509 build_string (codesys_mnemonic); \
510 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
513 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
514 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
515 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
518 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
519 Return t if OBJECT is a coding system.
520 A coding system is an object that defines how text containing multiple
521 character sets is encoded into a stream of (typically 8-bit) bytes.
522 The coding system is used to decode the stream into a series of
523 characters (which may be from multiple charsets) when the text is read
524 from a file or process, and is used to encode the text back into the
525 same format when it is written out to a file or process.
527 For example, many ISO2022-compliant coding systems (such as Compound
528 Text, which is used for inter-client data under the X Window System)
529 use escape sequences to switch between different charsets -- Japanese
530 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
531 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
532 `make-coding-system' for more information.
534 Coding systems are normally identified using a symbol, and the
535 symbol is accepted in place of the actual coding system object whenever
536 a coding system is called for. (This is similar to how faces work.)
540 return CODING_SYSTEMP (object) ? Qt : Qnil;
543 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
544 Retrieve the coding system of the given name.
546 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
547 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
548 If there is no such coding system, nil is returned. Otherwise the
549 associated coding system object is returned.
551 (coding_system_or_name))
553 if (NILP (coding_system_or_name))
554 coding_system_or_name = Qbinary;
555 else if (CODING_SYSTEMP (coding_system_or_name))
556 return coding_system_or_name;
558 CHECK_SYMBOL (coding_system_or_name);
562 coding_system_or_name =
563 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
565 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
566 return coding_system_or_name;
570 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
571 Retrieve the coding system of the given name.
572 Same as `find-coding-system' except that if there is no such
573 coding system, an error is signaled instead of returning nil.
577 Lisp_Object coding_system = Ffind_coding_system (name);
579 if (NILP (coding_system))
580 signal_simple_error ("No such coding system", name);
581 return coding_system;
584 /* We store the coding systems in hash tables with the names as the key and the
585 actual coding system object as the value. Occasionally we need to use them
586 in a list format. These routines provide us with that. */
587 struct coding_system_list_closure
589 Lisp_Object *coding_system_list;
593 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
594 void *coding_system_list_closure)
596 /* This function can GC */
597 struct coding_system_list_closure *cscl =
598 (struct coding_system_list_closure *) coding_system_list_closure;
599 Lisp_Object *coding_system_list = cscl->coding_system_list;
601 *coding_system_list = Fcons (key, *coding_system_list);
605 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
606 Return a list of the names of all defined coding systems.
610 Lisp_Object coding_system_list = Qnil;
612 struct coding_system_list_closure coding_system_list_closure;
614 GCPRO1 (coding_system_list);
615 coding_system_list_closure.coding_system_list = &coding_system_list;
616 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
617 &coding_system_list_closure);
620 return coding_system_list;
623 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
624 Return the name of the given coding system.
628 coding_system = Fget_coding_system (coding_system);
629 return XCODING_SYSTEM_NAME (coding_system);
632 static Lisp_Coding_System *
633 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
635 Lisp_Coding_System *codesys =
636 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
638 zero_lcrecord (codesys);
639 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
640 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
641 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
642 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
643 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
644 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
645 CODING_SYSTEM_TYPE (codesys) = type;
646 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
648 if (type == CODESYS_ISO2022)
651 for (i = 0; i < 4; i++)
652 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
655 if (type == CODESYS_BIG5)
657 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
659 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
660 = Vcharset_chinese_big5;
661 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
663 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
667 else if (type == CODESYS_CCL)
669 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
670 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
673 CODING_SYSTEM_NAME (codesys) = name;
679 /* Given a list of charset conversion specs as specified in a Lisp
680 program, parse it into STORE_HERE. */
683 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
684 Lisp_Object spec_list)
688 EXTERNAL_LIST_LOOP (rest, spec_list)
690 Lisp_Object car = XCAR (rest);
691 Lisp_Object from, to;
692 struct charset_conversion_spec spec;
694 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
695 signal_simple_error ("Invalid charset conversion spec", car);
696 from = Fget_charset (XCAR (car));
697 to = Fget_charset (XCAR (XCDR (car)));
698 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
699 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
700 signal_simple_error_2
701 ("Attempted conversion between different charset types",
703 spec.from_charset = from;
704 spec.to_charset = to;
706 Dynarr_add (store_here, spec);
710 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
711 specs, return the equivalent as the Lisp programmer would see it.
713 If LOAD_HERE is 0, return Qnil. */
716 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
723 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
725 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
726 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
729 return Fnreverse (result);
734 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
735 Register symbol NAME as a coding system.
737 TYPE describes the conversion method used and should be one of
740 Automatic conversion. XEmacs attempts to detect the coding system
743 No conversion. Use this for binary files and such. On output,
744 graphic characters that are not in ASCII or Latin-1 will be
745 replaced by a ?. (For a no-conversion-encoded buffer, these
746 characters will only be present if you explicitly insert them.)
748 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
750 ISO 10646 UCS-4 encoding.
752 ISO 10646 UTF-8 encoding.
754 Any ISO2022-compliant encoding. Among other things, this includes
755 JIS (the Japanese encoding commonly used for e-mail), EUC (the
756 standard Unix encoding for Japanese and other languages), and
757 Compound Text (the encoding used in X11). You can specify more
758 specific information about the conversion with the PROPS argument.
760 Big5 (the encoding commonly used for Taiwanese).
762 The conversion is performed using a user-written pseudo-code
763 program. CCL (Code Conversion Language) is the name of this
766 Write out or read in the raw contents of the memory representing
767 the buffer's text. This is primarily useful for debugging
768 purposes, and is only enabled when XEmacs has been compiled with
769 DEBUG_XEMACS defined (via the --debug configure option).
770 WARNING: Reading in a file using 'internal conversion can result
771 in an internal inconsistency in the memory representing a
772 buffer's text, which will produce unpredictable results and may
773 cause XEmacs to crash. Under normal circumstances you should
774 never use 'internal conversion.
776 DOC-STRING is a string describing the coding system.
778 PROPS is a property list, describing the specific nature of the
779 character set. Recognized properties are:
782 String to be displayed in the modeline when this coding system is
786 End-of-line conversion to be used. It should be one of
789 Automatically detect the end-of-line type (LF, CRLF,
790 or CR). Also generate subsidiary coding systems named
791 `NAME-unix', `NAME-dos', and `NAME-mac', that are
792 identical to this coding system but have an EOL-TYPE
793 value of 'lf, 'crlf, and 'cr, respectively.
795 The end of a line is marked externally using ASCII LF.
796 Since this is also the way that XEmacs represents an
797 end-of-line internally, specifying this option results
798 in no end-of-line conversion. This is the standard
799 format for Unix text files.
801 The end of a line is marked externally using ASCII
802 CRLF. This is the standard format for MS-DOS text
805 The end of a line is marked externally using ASCII CR.
806 This is the standard format for Macintosh text files.
808 Automatically detect the end-of-line type but do not
809 generate subsidiary coding systems. (This value is
810 converted to nil when stored internally, and
811 `coding-system-property' will return nil.)
814 If non-nil, composition/decomposition for combining characters
817 'post-read-conversion
818 Function called after a file has been read in, to perform the
819 decoding. Called with two arguments, START and END, denoting
820 a region of the current buffer to be decoded.
822 'pre-write-conversion
823 Function called before a file is written out, to perform the
824 encoding. Called with two arguments, START and END, denoting
825 a region of the current buffer to be encoded.
828 The following additional properties are recognized if TYPE is 'iso2022:
834 The character set initially designated to the G0 - G3 registers.
835 The value should be one of
837 -- A charset object (designate that character set)
838 -- nil (do not ever use this register)
839 -- t (no character set is initially designated to
840 the register, but may be later on; this automatically
841 sets the corresponding `force-g*-on-output' property)
847 If non-nil, send an explicit designation sequence on output before
848 using the specified register.
851 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
852 "ESC $ B" on output in place of the full designation sequences
853 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
856 If non-nil, don't designate ASCII to G0 at each end of line on output.
857 Setting this to non-nil also suppresses other state-resetting that
858 normally happens at the end of a line.
861 If non-nil, don't designate ASCII to G0 before control chars on output.
864 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
868 If non-nil, use locking-shift (SO/SI) instead of single-shift
869 or designation by escape sequence.
872 If non-nil, don't use ISO6429's direction specification.
875 If non-nil, literal control characters that are the same as
876 the beginning of a recognized ISO2022 or ISO6429 escape sequence
877 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
878 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
879 so that they can be properly distinguished from an escape sequence.
880 (Note that doing this results in a non-portable encoding.) This
881 encoding flag is used for byte-compiled files. Note that ESC
882 is a good choice for a quoting character because there are no
883 escape sequences whose second byte is a character from the Control-0
884 or Control-1 character sets; this is explicitly disallowed by the
887 'input-charset-conversion
888 A list of conversion specifications, specifying conversion of
889 characters in one charset to another when decoding is performed.
890 Each specification is a list of two elements: the source charset,
891 and the destination charset.
893 'output-charset-conversion
894 A list of conversion specifications, specifying conversion of
895 characters in one charset to another when encoding is performed.
896 The form of each specification is the same as for
897 'input-charset-conversion.
900 The following additional properties are recognized (and required)
904 CCL program used for decoding (converting to internal format).
907 CCL program used for encoding (converting to external format).
909 (name, type, doc_string, props))
911 Lisp_Coding_System *codesys;
912 enum coding_system_type ty;
913 int need_to_setup_eol_systems = 1;
915 /* Convert type to constant */
916 if (NILP (type) || EQ (type, Qundecided))
917 { ty = CODESYS_AUTODETECT; }
919 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
920 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
921 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
922 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
923 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
924 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
926 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
928 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
931 signal_simple_error ("Invalid coding system type", type);
935 codesys = allocate_coding_system (ty, name);
937 if (NILP (doc_string))
938 doc_string = build_string ("");
940 CHECK_STRING (doc_string);
941 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
944 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
946 if (EQ (key, Qmnemonic))
949 CHECK_STRING (value);
950 CODING_SYSTEM_MNEMONIC (codesys) = value;
953 else if (EQ (key, Qeol_type))
955 need_to_setup_eol_systems = NILP (value);
958 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
961 else if (EQ (key, Qpost_read_conversion))
962 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
963 else if (EQ (key, Qpre_write_conversion))
964 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
966 else if (EQ (key, Qdisable_composition))
967 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
970 else if (ty == CODESYS_ISO2022)
972 #define FROB_INITIAL_CHARSET(charset_num) \
973 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
974 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
976 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
977 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
978 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
979 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
981 #define FROB_FORCE_CHARSET(charset_num) \
982 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
984 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
985 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
986 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
987 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
989 #define FROB_BOOLEAN_PROPERTY(prop) \
990 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
992 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
993 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
994 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
995 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
996 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
997 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
998 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1000 else if (EQ (key, Qinput_charset_conversion))
1002 codesys->iso2022.input_conv =
1003 Dynarr_new (charset_conversion_spec);
1004 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1007 else if (EQ (key, Qoutput_charset_conversion))
1009 codesys->iso2022.output_conv =
1010 Dynarr_new (charset_conversion_spec);
1011 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1015 signal_simple_error ("Unrecognized property", key);
1018 else if (ty == CODESYS_BIG5)
1020 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1021 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1023 signal_simple_error ("Unrecognized property", key);
1026 else if (EQ (type, Qccl))
1029 struct ccl_program test_ccl;
1032 /* Check key first. */
1033 if (EQ (key, Qdecode))
1034 suffix = "-ccl-decode";
1035 else if (EQ (key, Qencode))
1036 suffix = "-ccl-encode";
1038 signal_simple_error ("Unrecognized property", key);
1040 /* If value is vector, register it as a ccl program
1041 associated with an newly created symbol for
1042 backward compatibility. */
1043 if (VECTORP (value))
1045 sym = Fintern (concat2 (Fsymbol_name (name),
1046 build_string (suffix)),
1048 Fregister_ccl_program (sym, value);
1052 CHECK_SYMBOL (value);
1055 /* check if the given ccl programs are valid. */
1056 if (setup_ccl_program (&test_ccl, sym) < 0)
1057 signal_simple_error ("Invalid CCL program", value);
1059 if (EQ (key, Qdecode))
1060 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1061 else if (EQ (key, Qencode))
1062 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1067 signal_simple_error ("Unrecognized property", key);
1071 if (need_to_setup_eol_systems)
1072 setup_eol_coding_systems (codesys);
1075 Lisp_Object codesys_obj;
1076 XSETCODING_SYSTEM (codesys_obj, codesys);
1077 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1082 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1083 Copy OLD-CODING-SYSTEM to NEW-NAME.
1084 If NEW-NAME does not name an existing coding system, a new one will
1087 (old_coding_system, new_name))
1089 Lisp_Object new_coding_system;
1090 old_coding_system = Fget_coding_system (old_coding_system);
1091 new_coding_system = Ffind_coding_system (new_name);
1092 if (NILP (new_coding_system))
1094 XSETCODING_SYSTEM (new_coding_system,
1095 allocate_coding_system
1096 (XCODING_SYSTEM_TYPE (old_coding_system),
1098 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1102 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1103 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1104 memcpy (((char *) to ) + sizeof (to->header),
1105 ((char *) from) + sizeof (from->header),
1106 sizeof (*from) - sizeof (from->header));
1107 to->name = new_name;
1109 return new_coding_system;
1112 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1113 Return t if OBJECT names a coding system, and is not a coding system alias.
1117 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1121 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1122 Return t if OBJECT is a coding system alias.
1123 All coding system aliases are created by `define-coding-system-alias'.
1127 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1131 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1132 Return the coding-system symbol for which symbol ALIAS is an alias.
1136 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1137 if (SYMBOLP (aliasee))
1140 signal_simple_error ("Symbol is not a coding system alias", alias);
1141 return Qnil; /* To keep the compiler happy */
1145 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1147 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1151 /* A maphash function, for removing dangling coding system aliases. */
1153 dangling_coding_system_alias_p (Lisp_Object alias,
1154 Lisp_Object aliasee,
1155 void *dangling_aliases)
1157 if (SYMBOLP (aliasee)
1158 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1160 (*(int *) dangling_aliases)++;
1167 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1168 Define symbol ALIAS as an alias for coding system ALIASEE.
1170 You can use this function to redefine an alias that has already been defined,
1171 but you cannot redefine a name which is the canonical name for a coding system.
1172 \(a canonical name of a coding system is what is returned when you call
1173 `coding-system-name' on a coding system).
1175 ALIASEE itself can be an alias, which allows you to define nested aliases.
1177 You are forbidden, however, from creating alias loops or `dangling' aliases.
1178 These will be detected, and an error will be signaled if you attempt to do so.
1180 If ALIASEE is nil, then ALIAS will simply be undefined.
1182 See also `coding-system-alias-p', `coding-system-aliasee',
1183 and `coding-system-canonical-name-p'.
1187 Lisp_Object real_coding_system, probe;
1189 CHECK_SYMBOL (alias);
1191 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1193 ("Symbol is the canonical name of a coding system and cannot be redefined",
1198 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1199 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1200 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1202 Fremhash (alias, Vcoding_system_hash_table);
1204 /* Undefine subsidiary aliases,
1205 presumably created by a previous call to this function */
1206 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1207 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1208 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1210 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1211 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1212 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1215 /* Undefine dangling coding system aliases. */
1217 int dangling_aliases;
1220 dangling_aliases = 0;
1221 elisp_map_remhash (dangling_coding_system_alias_p,
1222 Vcoding_system_hash_table,
1224 } while (dangling_aliases > 0);
1230 if (CODING_SYSTEMP (aliasee))
1231 aliasee = XCODING_SYSTEM_NAME (aliasee);
1233 /* Checks that aliasee names a coding-system */
1234 real_coding_system = Fget_coding_system (aliasee);
1236 /* Check for coding system alias loops */
1237 if (EQ (alias, aliasee))
1238 alias_loop: signal_simple_error_2
1239 ("Attempt to create a coding system alias loop", alias, aliasee);
1241 for (probe = aliasee;
1243 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1245 if (EQ (probe, alias))
1249 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1251 /* Set up aliases for subsidiaries.
1252 #### There must be a better way to handle subsidiary coding systems. */
1254 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1256 for (i = 0; i < countof (suffixes); i++)
1258 Lisp_Object alias_subsidiary =
1259 append_suffix_to_symbol (alias, suffixes[i]);
1260 Lisp_Object aliasee_subsidiary =
1261 append_suffix_to_symbol (aliasee, suffixes[i]);
1263 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1264 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1267 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1268 but it doesn't look intentional, so I'd rather return something
1269 meaningful or nothing at all. */
1274 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1276 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1277 Lisp_Object new_coding_system;
1279 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1280 return coding_system;
1284 case EOL_AUTODETECT: return coding_system;
1285 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1286 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1287 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1288 default: abort (); return Qnil;
1291 return NILP (new_coding_system) ? coding_system : new_coding_system;
1294 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1295 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1297 (coding_system, eol_type))
1299 coding_system = Fget_coding_system (coding_system);
1301 return subsidiary_coding_system (coding_system,
1302 symbol_to_eol_type (eol_type));
1306 /************************************************************************/
1307 /* Coding system accessors */
1308 /************************************************************************/
1310 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1311 Return the doc string for CODING-SYSTEM.
1315 coding_system = Fget_coding_system (coding_system);
1316 return XCODING_SYSTEM_DOC_STRING (coding_system);
1319 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1320 Return the type of CODING-SYSTEM.
1324 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1327 case CODESYS_AUTODETECT: return Qundecided;
1329 case CODESYS_SHIFT_JIS: return Qshift_jis;
1330 case CODESYS_ISO2022: return Qiso2022;
1331 case CODESYS_BIG5: return Qbig5;
1332 case CODESYS_UCS4: return Qucs4;
1333 case CODESYS_UTF8: return Qutf8;
1334 case CODESYS_CCL: return Qccl;
1336 case CODESYS_NO_CONVERSION: return Qno_conversion;
1338 case CODESYS_INTERNAL: return Qinternal;
1345 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1348 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1350 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1353 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1354 Return initial charset of CODING-SYSTEM designated to GNUM.
1357 (coding_system, gnum))
1359 coding_system = Fget_coding_system (coding_system);
1362 return coding_system_charset (coding_system, XINT (gnum));
1366 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1367 Return the PROP property of CODING-SYSTEM.
1369 (coding_system, prop))
1372 enum coding_system_type type;
1374 coding_system = Fget_coding_system (coding_system);
1375 CHECK_SYMBOL (prop);
1376 type = XCODING_SYSTEM_TYPE (coding_system);
1378 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1379 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1382 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1384 case CODESYS_PROP_ALL_OK:
1387 case CODESYS_PROP_ISO2022:
1388 if (type != CODESYS_ISO2022)
1390 ("Property only valid in ISO2022 coding systems",
1394 case CODESYS_PROP_CCL:
1395 if (type != CODESYS_CCL)
1397 ("Property only valid in CCL coding systems",
1407 signal_simple_error ("Unrecognized property", prop);
1409 if (EQ (prop, Qname))
1410 return XCODING_SYSTEM_NAME (coding_system);
1411 else if (EQ (prop, Qtype))
1412 return Fcoding_system_type (coding_system);
1413 else if (EQ (prop, Qdoc_string))
1414 return XCODING_SYSTEM_DOC_STRING (coding_system);
1415 else if (EQ (prop, Qmnemonic))
1416 return XCODING_SYSTEM_MNEMONIC (coding_system);
1417 else if (EQ (prop, Qeol_type))
1418 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1419 else if (EQ (prop, Qeol_lf))
1420 return XCODING_SYSTEM_EOL_LF (coding_system);
1421 else if (EQ (prop, Qeol_crlf))
1422 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1423 else if (EQ (prop, Qeol_cr))
1424 return XCODING_SYSTEM_EOL_CR (coding_system);
1425 else if (EQ (prop, Qpost_read_conversion))
1426 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1427 else if (EQ (prop, Qpre_write_conversion))
1428 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1430 else if (type == CODESYS_ISO2022)
1432 if (EQ (prop, Qcharset_g0))
1433 return coding_system_charset (coding_system, 0);
1434 else if (EQ (prop, Qcharset_g1))
1435 return coding_system_charset (coding_system, 1);
1436 else if (EQ (prop, Qcharset_g2))
1437 return coding_system_charset (coding_system, 2);
1438 else if (EQ (prop, Qcharset_g3))
1439 return coding_system_charset (coding_system, 3);
1441 #define FORCE_CHARSET(charset_num) \
1442 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1443 (coding_system, charset_num) ? Qt : Qnil)
1445 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1446 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1447 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1448 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1450 #define LISP_BOOLEAN(prop) \
1451 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1453 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1454 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1455 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1456 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1457 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1458 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1459 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1461 else if (EQ (prop, Qinput_charset_conversion))
1463 unparse_charset_conversion_specs
1464 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1465 else if (EQ (prop, Qoutput_charset_conversion))
1467 unparse_charset_conversion_specs
1468 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1472 else if (type == CODESYS_CCL)
1474 if (EQ (prop, Qdecode))
1475 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1476 else if (EQ (prop, Qencode))
1477 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1485 return Qnil; /* not reached */
1489 /************************************************************************/
1490 /* Coding category functions */
1491 /************************************************************************/
1494 decode_coding_category (Lisp_Object symbol)
1498 CHECK_SYMBOL (symbol);
1499 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1500 if (EQ (coding_category_symbol[i], symbol))
1503 signal_simple_error ("Unrecognized coding category", symbol);
1504 return 0; /* not reached */
1507 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1508 Return a list of all recognized coding categories.
1513 Lisp_Object list = Qnil;
1515 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1516 list = Fcons (coding_category_symbol[i], list);
1520 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1521 Change the priority order of the coding categories.
1522 LIST should be list of coding categories, in descending order of
1523 priority. Unspecified coding categories will be lower in priority
1524 than all specified ones, in the same relative order they were in
1529 int category_to_priority[CODING_CATEGORY_LAST];
1533 /* First generate a list that maps coding categories to priorities. */
1535 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1536 category_to_priority[i] = -1;
1538 /* Highest priority comes from the specified list. */
1540 EXTERNAL_LIST_LOOP (rest, list)
1542 int cat = decode_coding_category (XCAR (rest));
1544 if (category_to_priority[cat] >= 0)
1545 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1546 category_to_priority[cat] = i++;
1549 /* Now go through the existing categories by priority to retrieve
1550 the categories not yet specified and preserve their priority
1552 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1554 int cat = fcd->coding_category_by_priority[j];
1555 if (category_to_priority[cat] < 0)
1556 category_to_priority[cat] = i++;
1559 /* Now we need to construct the inverse of the mapping we just
1562 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1563 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1565 /* Phew! That was confusing. */
1569 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1570 Return a list of coding categories in descending order of priority.
1575 Lisp_Object list = Qnil;
1577 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1578 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1583 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1584 Change the coding system associated with a coding category.
1586 (coding_category, coding_system))
1588 int cat = decode_coding_category (coding_category);
1590 coding_system = Fget_coding_system (coding_system);
1591 fcd->coding_category_system[cat] = coding_system;
1595 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1596 Return the coding system associated with a coding category.
1600 int cat = decode_coding_category (coding_category);
1601 Lisp_Object sys = fcd->coding_category_system[cat];
1604 return XCODING_SYSTEM_NAME (sys);
1609 /************************************************************************/
1610 /* Detecting the encoding of data */
1611 /************************************************************************/
1613 struct detection_state
1615 eol_type_t eol_type;
1651 struct iso2022_decoder iso;
1653 int high_byte_count;
1654 unsigned int saw_single_shift:1;
1667 acceptable_control_char_p (int c)
1671 /* Allow and ignore control characters that you might
1672 reasonably see in a text file */
1677 case 8: /* backspace */
1678 case 11: /* vertical tab */
1679 case 12: /* form feed */
1680 case 26: /* MS-DOS C-z junk */
1681 case 31: /* '^_' -- for info */
1689 mask_has_at_most_one_bit_p (int mask)
1691 /* Perhaps the only thing useful you learn from intensive Microsoft
1692 technical interviews */
1693 return (mask & (mask - 1)) == 0;
1697 detect_eol_type (struct detection_state *st, const Extbyte *src,
1702 unsigned char c = *(unsigned char *)src++;
1705 if (st->eol.just_saw_cr)
1707 else if (st->eol.seen_anything)
1710 else if (st->eol.just_saw_cr)
1713 st->eol.just_saw_cr = 1;
1715 st->eol.just_saw_cr = 0;
1716 st->eol.seen_anything = 1;
1719 return EOL_AUTODETECT;
1722 /* Attempt to determine the encoding and EOL type of the given text.
1723 Before calling this function for the first type, you must initialize
1724 st->eol_type as appropriate and initialize st->mask to ~0.
1726 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1729 st->mask holds the determined coding category mask, or ~0 if only
1730 ASCII has been seen so far.
1734 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1735 is present in st->mask
1736 1 == definitive answers are here for both st->eol_type and st->mask
1740 detect_coding_type (struct detection_state *st, const Extbyte *src,
1741 size_t n, int just_do_eol)
1743 if (st->eol_type == EOL_AUTODETECT)
1744 st->eol_type = detect_eol_type (st, src, n);
1747 return st->eol_type != EOL_AUTODETECT;
1749 if (!st->seen_non_ascii)
1751 for (; n; n--, src++)
1753 unsigned char c = *(unsigned char *) src;
1754 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1756 st->seen_non_ascii = 1;
1758 st->shift_jis.mask = ~0;
1762 st->iso2022.mask = ~0;
1772 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1773 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1774 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1775 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1776 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1777 st->big5.mask = detect_coding_big5 (st, src, n);
1778 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1779 st->utf8.mask = detect_coding_utf8 (st, src, n);
1780 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1781 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1784 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1785 | st->utf8.mask | st->ucs4.mask;
1788 int retval = mask_has_at_most_one_bit_p (st->mask);
1789 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1790 return retval && st->eol_type != EOL_AUTODETECT;
1795 coding_system_from_mask (int mask)
1799 /* If the file was entirely or basically ASCII, use the
1800 default value of `buffer-file-coding-system'. */
1801 Lisp_Object retval =
1802 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1805 retval = Ffind_coding_system (retval);
1809 (Qbad_variable, Qwarning,
1810 "Invalid `default-buffer-file-coding-system', set to nil");
1811 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1815 retval = Fget_coding_system (Qraw_text);
1823 mask = postprocess_iso2022_mask (mask);
1825 /* Look through the coding categories by priority and find
1826 the first one that is allowed. */
1827 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1829 cat = fcd->coding_category_by_priority[i];
1830 if ((mask & (1 << cat)) &&
1831 !NILP (fcd->coding_category_system[cat]))
1835 return fcd->coding_category_system[cat];
1837 return Fget_coding_system (Qraw_text);
1841 /* Given a seekable read stream and potential coding system and EOL type
1842 as specified, do any autodetection that is called for. If the
1843 coding system and/or EOL type are not `autodetect', they will be left
1844 alone; but this function will never return an autodetect coding system
1847 This function does not automatically fetch subsidiary coding systems;
1848 that should be unnecessary with the explicit eol-type argument. */
1850 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1853 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1854 eol_type_t *eol_type_in_out)
1856 struct detection_state decst;
1858 if (*eol_type_in_out == EOL_AUTODETECT)
1859 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1862 decst.eol_type = *eol_type_in_out;
1865 /* If autodetection is called for, do it now. */
1866 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1867 || *eol_type_in_out == EOL_AUTODETECT)
1870 Lisp_Object coding_system = Qnil;
1872 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1875 /* Look for initial "-*-"; mode line prefix */
1877 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1882 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1884 Extbyte *local_vars_beg = p + 3;
1885 /* Look for final "-*-"; mode line suffix */
1886 for (p = local_vars_beg,
1887 scan_end = buf + nread - LENGTH ("-*-");
1892 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1894 Extbyte *suffix = p;
1895 /* Look for "coding:" */
1896 for (p = local_vars_beg,
1897 scan_end = suffix - LENGTH ("coding:?");
1900 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1901 && (p == local_vars_beg
1902 || (*(p-1) == ' ' ||
1908 p += LENGTH ("coding:");
1909 while (*p == ' ' || *p == '\t') p++;
1911 /* Get coding system name */
1912 save = *suffix; *suffix = '\0';
1913 /* Characters valid in a MIME charset name (rfc 1521),
1914 and in a Lisp symbol name. */
1915 n = strspn ( (char *) p,
1916 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1917 "abcdefghijklmnopqrstuvwxyz"
1923 save = p[n]; p[n] = '\0';
1925 Ffind_coding_system (intern ((char *) p));
1935 if (NILP (coding_system))
1938 if (detect_coding_type (&decst, buf, nread,
1939 XCODING_SYSTEM_TYPE (*codesys_in_out)
1940 != CODESYS_AUTODETECT))
1942 nread = Lstream_read (stream, buf, sizeof (buf));
1948 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1949 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1952 if (detect_coding_type (&decst, buf, nread, 1))
1954 nread = Lstream_read (stream, buf, sizeof (buf));
1960 *eol_type_in_out = decst.eol_type;
1961 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1963 if (NILP (coding_system))
1964 *codesys_in_out = coding_system_from_mask (decst.mask);
1966 *codesys_in_out = coding_system;
1970 /* If we absolutely can't determine the EOL type, just assume LF. */
1971 if (*eol_type_in_out == EOL_AUTODETECT)
1972 *eol_type_in_out = EOL_LF;
1974 Lstream_rewind (stream);
1977 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1978 Detect coding system of the text in the region between START and END.
1979 Return a list of possible coding systems ordered by priority.
1980 If only ASCII characters are found, return 'undecided or one of
1981 its subsidiary coding systems according to a detected end-of-line
1982 type. Optional arg BUFFER defaults to the current buffer.
1984 (start, end, buffer))
1986 Lisp_Object val = Qnil;
1987 struct buffer *buf = decode_buffer (buffer, 0);
1989 Lisp_Object instream, lb_instream;
1990 Lstream *istr, *lb_istr;
1991 struct detection_state decst;
1992 struct gcpro gcpro1, gcpro2;
1994 get_buffer_range_char (buf, start, end, &b, &e, 0);
1995 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1996 lb_istr = XLSTREAM (lb_instream);
1997 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1998 istr = XLSTREAM (instream);
1999 GCPRO2 (instream, lb_instream);
2001 decst.eol_type = EOL_AUTODETECT;
2005 Extbyte random_buffer[4096];
2006 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2010 if (detect_coding_type (&decst, random_buffer, nread, 0))
2014 if (decst.mask == ~0)
2015 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2023 decst.mask = postprocess_iso2022_mask (decst.mask);
2025 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2027 int sys = fcd->coding_category_by_priority[i];
2028 if (decst.mask & (1 << sys))
2030 Lisp_Object codesys = fcd->coding_category_system[sys];
2031 if (!NILP (codesys))
2032 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2033 val = Fcons (codesys, val);
2037 Lstream_close (istr);
2039 Lstream_delete (istr);
2040 Lstream_delete (lb_istr);
2045 /************************************************************************/
2046 /* Converting to internal Mule format ("decoding") */
2047 /************************************************************************/
2049 /* A decoding stream is a stream used for decoding text (i.e.
2050 converting from some external format to internal format).
2051 The decoding-stream object keeps track of the actual coding
2052 stream, the stream that is at the other end, and data that
2053 needs to be persistent across the lifetime of the stream. */
2055 /* Handle the EOL stuff related to just-read-in character C.
2056 EOL_TYPE is the EOL type of the coding stream.
2057 FLAGS is the current value of FLAGS in the coding stream, and may
2058 be modified by this macro. (The macro only looks at the
2059 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2060 bytes are to be written. You need to also define a local goto
2061 label "label_continue_loop" that is at the end of the main
2062 character-reading loop.
2064 If C is a CR character, then this macro handles it entirely and
2065 jumps to label_continue_loop. Otherwise, this macro does not add
2066 anything to DST, and continues normally. You should continue
2067 processing C normally after this macro. */
2069 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2073 if (eol_type == EOL_CR) \
2074 Dynarr_add (dst, '\n'); \
2075 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2076 Dynarr_add (dst, c); \
2078 flags |= CODING_STATE_CR; \
2079 goto label_continue_loop; \
2081 else if (flags & CODING_STATE_CR) \
2082 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2084 Dynarr_add (dst, '\r'); \
2085 flags &= ~CODING_STATE_CR; \
2089 /* C should be a binary character in the range 0 - 255; convert
2090 to internal format and add to Dynarr DST. */
2093 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2095 if (BYTE_ASCII_P (c)) \
2096 Dynarr_add (dst, c); \
2099 Dynarr_add (dst, (c >> 6) | 0xc0); \
2100 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2104 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2106 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2110 Dynarr_add (dst, c);
2112 else if ( c <= 0x7ff )
2114 Dynarr_add (dst, (c >> 6) | 0xc0);
2115 Dynarr_add (dst, (c & 0x3f) | 0x80);
2117 else if ( c <= 0xffff )
2119 Dynarr_add (dst, (c >> 12) | 0xe0);
2120 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2121 Dynarr_add (dst, (c & 0x3f) | 0x80);
2123 else if ( c <= 0x1fffff )
2125 Dynarr_add (dst, (c >> 18) | 0xf0);
2126 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2127 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2128 Dynarr_add (dst, (c & 0x3f) | 0x80);
2130 else if ( c <= 0x3ffffff )
2132 Dynarr_add (dst, (c >> 24) | 0xf8);
2133 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2134 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2135 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2136 Dynarr_add (dst, (c & 0x3f) | 0x80);
2140 Dynarr_add (dst, (c >> 30) | 0xfc);
2141 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2142 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2143 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2144 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2145 Dynarr_add (dst, (c & 0x3f) | 0x80);
2149 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2151 if (BYTE_ASCII_P (c)) \
2152 Dynarr_add (dst, c); \
2153 else if (BYTE_C1_P (c)) \
2155 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2156 Dynarr_add (dst, c + 0x20); \
2160 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2161 Dynarr_add (dst, c); \
2166 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2170 DECODE_ADD_BINARY_CHAR (ch, dst); \
2175 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2177 if (flags & CODING_STATE_END) \
2179 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2180 if (flags & CODING_STATE_CR) \
2181 Dynarr_add (dst, '\r'); \
2185 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2187 struct decoding_stream
2189 /* Coding system that governs the conversion. */
2190 Lisp_Coding_System *codesys;
2192 /* Stream that we read the encoded data from or
2193 write the decoded data to. */
2196 /* If we are reading, then we can return only a fixed amount of
2197 data, so if the conversion resulted in too much data, we store it
2198 here for retrieval the next time around. */
2199 unsigned_char_dynarr *runoff;
2201 /* FLAGS holds flags indicating the current state of the decoding.
2202 Some of these flags are dependent on the coding system. */
2205 /* CPOS holds a partially built-up code-point of character. */
2208 /* EOL_TYPE specifies the type of end-of-line conversion that
2209 currently applies. We need to keep this separate from the
2210 EOL type stored in CODESYS because the latter might indicate
2211 automatic EOL-type detection while the former will always
2212 indicate a particular EOL type. */
2213 eol_type_t eol_type;
2215 /* Additional ISO2022 information. We define the structure above
2216 because it's also needed by the detection routines. */
2217 struct iso2022_decoder iso2022;
2219 /* Additional information (the state of the running CCL program)
2220 used by the CCL decoder. */
2221 struct ccl_program ccl;
2223 /* counter for UTF-8 or UCS-4 */
2224 unsigned char counter;
2227 unsigned combined_char_count;
2228 Emchar combined_chars[16];
2229 Lisp_Object combining_table;
2231 struct detection_state decst;
2235 extern Lisp_Object Vcharacter_composition_table;
2238 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2240 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2244 for (i = 0; i < str->combined_char_count; i++)
2245 DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2246 str->combined_char_count = 0;
2247 str->combining_table = Qnil;
2250 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2251 unsigned_char_dynarr* dst);
2253 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2254 Emchar character, unsigned_char_dynarr* dst)
2256 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2257 DECODE_ADD_UCS_CHAR (character, dst);
2258 else if (!CHAR_TABLEP (str->combining_table))
2261 = get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2265 DECODE_ADD_UCS_CHAR (character, dst);
2268 str->combined_chars[0] = character;
2269 str->combined_char_count = 1;
2270 str->combining_table = ret;
2276 = get_char_id_table (XCHAR_TABLE(str->combining_table),
2281 Emchar char2 = XCHARVAL (ret);
2283 get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2287 DECODE_ADD_UCS_CHAR (char2, dst);
2288 str->combined_char_count = 0;
2289 str->combining_table = Qnil;
2293 str->combined_chars[0] = char2;
2294 str->combined_char_count = 1;
2295 str->combining_table = ret;
2298 else if (CHAR_TABLEP (ret))
2300 str->combined_chars[str->combined_char_count++] = character;
2301 str->combining_table = ret;
2305 COMPOSE_FLUSH_CHARS (str, dst);
2306 DECODE_ADD_UCS_CHAR (character, dst);
2310 #else /* not UTF2000 */
2311 #define COMPOSE_FLUSH_CHARS(str, dst)
2312 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2313 #endif /* UTF2000 */
2315 static ssize_t decoding_reader (Lstream *stream,
2316 unsigned char *data, size_t size);
2317 static ssize_t decoding_writer (Lstream *stream,
2318 const unsigned char *data, size_t size);
2319 static int decoding_rewinder (Lstream *stream);
2320 static int decoding_seekable_p (Lstream *stream);
2321 static int decoding_flusher (Lstream *stream);
2322 static int decoding_closer (Lstream *stream);
2324 static Lisp_Object decoding_marker (Lisp_Object stream);
2326 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2327 sizeof (struct decoding_stream));
2330 decoding_marker (Lisp_Object stream)
2332 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2333 Lisp_Object str_obj;
2335 /* We do not need to mark the coding systems or charsets stored
2336 within the stream because they are stored in a global list
2337 and automatically marked. */
2339 XSETLSTREAM (str_obj, str);
2340 mark_object (str_obj);
2341 if (str->imp->marker)
2342 return (str->imp->marker) (str_obj);
2347 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2348 so we read data from the other end, decode it, and store it into DATA. */
2351 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2353 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2354 unsigned char *orig_data = data;
2356 int error_occurred = 0;
2358 /* We need to interface to mule_decode(), which expects to take some
2359 amount of data and store the result into a Dynarr. We have
2360 mule_decode() store into str->runoff, and take data from there
2363 /* We loop until we have enough data, reading chunks from the other
2364 end and decoding it. */
2367 /* Take data from the runoff if we can. Make sure to take at
2368 most SIZE bytes, and delete the data from the runoff. */
2369 if (Dynarr_length (str->runoff) > 0)
2371 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2372 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2373 Dynarr_delete_many (str->runoff, 0, chunk);
2379 break; /* No more room for data */
2381 if (str->flags & CODING_STATE_END)
2382 /* This means that on the previous iteration, we hit the EOF on
2383 the other end. We loop once more so that mule_decode() can
2384 output any final stuff it may be holding, or any "go back
2385 to a sane state" escape sequences. (This latter makes sense
2386 during encoding.) */
2389 /* Exhausted the runoff, so get some more. DATA has at least
2390 SIZE bytes left of storage in it, so it's OK to read directly
2391 into it. (We'll be overwriting above, after we've decoded it
2392 into the runoff.) */
2393 read_size = Lstream_read (str->other_end, data, size);
2400 /* There might be some more end data produced in the translation.
2401 See the comment above. */
2402 str->flags |= CODING_STATE_END;
2403 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2406 if (data - orig_data == 0)
2407 return error_occurred ? -1 : 0;
2409 return data - orig_data;
2413 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2415 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2418 /* Decode all our data into the runoff, and then attempt to write
2419 it all out to the other end. Remove whatever chunk we succeeded
2421 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2422 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2423 Dynarr_length (str->runoff));
2425 Dynarr_delete_many (str->runoff, 0, retval);
2426 /* Do NOT return retval. The return value indicates how much
2427 of the incoming data was written, not how many bytes were
2433 reset_decoding_stream (struct decoding_stream *str)
2436 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2438 Lisp_Object coding_system;
2439 XSETCODING_SYSTEM (coding_system, str->codesys);
2440 reset_iso2022 (coding_system, &str->iso2022);
2442 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2444 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2449 str->combined_char_count = 0;
2450 str->combining_table = Qnil;
2452 str->flags = str->cpos = 0;
2456 decoding_rewinder (Lstream *stream)
2458 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2459 reset_decoding_stream (str);
2460 Dynarr_reset (str->runoff);
2461 return Lstream_rewind (str->other_end);
2465 decoding_seekable_p (Lstream *stream)
2467 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2468 return Lstream_seekable_p (str->other_end);
2472 decoding_flusher (Lstream *stream)
2474 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2475 return Lstream_flush (str->other_end);
2479 decoding_closer (Lstream *stream)
2481 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2482 if (stream->flags & LSTREAM_FL_WRITE)
2484 str->flags |= CODING_STATE_END;
2485 decoding_writer (stream, 0, 0);
2487 Dynarr_free (str->runoff);
2489 #ifdef ENABLE_COMPOSITE_CHARS
2490 if (str->iso2022.composite_chars)
2491 Dynarr_free (str->iso2022.composite_chars);
2494 return Lstream_close (str->other_end);
2498 decoding_stream_coding_system (Lstream *stream)
2500 Lisp_Object coding_system;
2501 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2503 XSETCODING_SYSTEM (coding_system, str->codesys);
2504 return subsidiary_coding_system (coding_system, str->eol_type);
2508 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2510 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2511 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2513 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2514 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2515 reset_decoding_stream (str);
2518 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2519 stream for writing, no automatic code detection will be performed.
2520 The reason for this is that automatic code detection requires a
2521 seekable input. Things will also fail if you open a decoding
2522 stream for reading using a non-fully-specified coding system and
2523 a non-seekable input stream. */
2526 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2529 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2530 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2534 str->other_end = stream;
2535 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2536 str->eol_type = EOL_AUTODETECT;
2537 if (!strcmp (mode, "r")
2538 && Lstream_seekable_p (stream))
2539 /* We can determine the coding system now. */
2540 determine_real_coding_system (stream, &codesys, &str->eol_type);
2541 set_decoding_stream_coding_system (lstr, codesys);
2542 str->decst.eol_type = str->eol_type;
2543 str->decst.mask = ~0;
2544 XSETLSTREAM (obj, lstr);
2549 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2551 return make_decoding_stream_1 (stream, codesys, "r");
2555 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2557 return make_decoding_stream_1 (stream, codesys, "w");
2560 /* Note: the decode_coding_* functions all take the same
2561 arguments as mule_decode(), which is to say some SRC data of
2562 size N, which is to be stored into dynamic array DST.
2563 DECODING is the stream within which the decoding is
2564 taking place, but no data is actually read from or
2565 written to that stream; that is handled in decoding_reader()
2566 or decoding_writer(). This allows the same functions to
2567 be used for both reading and writing. */
2570 mule_decode (Lstream *decoding, const Extbyte *src,
2571 unsigned_char_dynarr *dst, size_t n)
2573 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2575 /* If necessary, do encoding-detection now. We do this when
2576 we're a writing stream or a non-seekable reading stream,
2577 meaning that we can't just process the whole input,
2578 rewind, and start over. */
2580 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2581 str->eol_type == EOL_AUTODETECT)
2583 Lisp_Object codesys;
2585 XSETCODING_SYSTEM (codesys, str->codesys);
2586 detect_coding_type (&str->decst, src, n,
2587 CODING_SYSTEM_TYPE (str->codesys) !=
2588 CODESYS_AUTODETECT);
2589 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2590 str->decst.mask != ~0)
2591 /* #### This is cheesy. What we really ought to do is
2592 buffer up a certain amount of data so as to get a
2593 less random result. */
2594 codesys = coding_system_from_mask (str->decst.mask);
2595 str->eol_type = str->decst.eol_type;
2596 if (XCODING_SYSTEM (codesys) != str->codesys)
2598 /* Preserve the CODING_STATE_END flag in case it was set.
2599 If we erase it, bad things might happen. */
2600 int was_end = str->flags & CODING_STATE_END;
2601 set_decoding_stream_coding_system (decoding, codesys);
2603 str->flags |= CODING_STATE_END;
2607 switch (CODING_SYSTEM_TYPE (str->codesys))
2610 case CODESYS_INTERNAL:
2611 Dynarr_add_many (dst, src, n);
2614 case CODESYS_AUTODETECT:
2615 /* If we got this far and still haven't decided on the coding
2616 system, then do no conversion. */
2617 case CODESYS_NO_CONVERSION:
2618 decode_coding_no_conversion (decoding, src, dst, n);
2621 case CODESYS_SHIFT_JIS:
2622 decode_coding_sjis (decoding, src, dst, n);
2625 decode_coding_big5 (decoding, src, dst, n);
2628 decode_coding_ucs4 (decoding, src, dst, n);
2631 decode_coding_utf8 (decoding, src, dst, n);
2634 str->ccl.last_block = str->flags & CODING_STATE_END;
2635 /* When applying ccl program to stream, MUST NOT set NULL
2637 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2638 dst, n, 0, CCL_MODE_DECODING);
2640 case CODESYS_ISO2022:
2641 decode_coding_iso2022 (decoding, src, dst, n);
2649 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2650 Decode the text between START and END which is encoded in CODING-SYSTEM.
2651 This is useful if you've read in encoded text from a file without decoding
2652 it (e.g. you read in a JIS-formatted file but used the `binary' or
2653 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2654 Return length of decoded text.
2655 BUFFER defaults to the current buffer if unspecified.
2657 (start, end, coding_system, buffer))
2660 struct buffer *buf = decode_buffer (buffer, 0);
2661 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2662 Lstream *istr, *ostr;
2663 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2665 get_buffer_range_char (buf, start, end, &b, &e, 0);
2667 barf_if_buffer_read_only (buf, b, e);
2669 coding_system = Fget_coding_system (coding_system);
2670 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2671 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2672 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2674 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2675 Fget_coding_system (Qbinary));
2676 istr = XLSTREAM (instream);
2677 ostr = XLSTREAM (outstream);
2678 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2680 /* The chain of streams looks like this:
2682 [BUFFER] <----- send through
2683 ------> [ENCODE AS BINARY]
2684 ------> [DECODE AS SPECIFIED]
2690 char tempbuf[1024]; /* some random amount */
2691 Bufpos newpos, even_newer_pos;
2692 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2693 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2697 newpos = lisp_buffer_stream_startpos (istr);
2698 Lstream_write (ostr, tempbuf, size_in_bytes);
2699 even_newer_pos = lisp_buffer_stream_startpos (istr);
2700 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2703 Lstream_close (istr);
2704 Lstream_close (ostr);
2706 Lstream_delete (istr);
2707 Lstream_delete (ostr);
2708 Lstream_delete (XLSTREAM (de_outstream));
2709 Lstream_delete (XLSTREAM (lb_outstream));
2714 /************************************************************************/
2715 /* Converting to an external encoding ("encoding") */
2716 /************************************************************************/
2718 /* An encoding stream is an output stream. When you create the
2719 stream, you specify the coding system that governs the encoding
2720 and another stream that the resulting encoded data is to be
2721 sent to, and then start sending data to it. */
2723 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2725 struct encoding_stream
2727 /* Coding system that governs the conversion. */
2728 Lisp_Coding_System *codesys;
2730 /* Stream that we read the encoded data from or
2731 write the decoded data to. */
2734 /* If we are reading, then we can return only a fixed amount of
2735 data, so if the conversion resulted in too much data, we store it
2736 here for retrieval the next time around. */
2737 unsigned_char_dynarr *runoff;
2739 /* FLAGS holds flags indicating the current state of the encoding.
2740 Some of these flags are dependent on the coding system. */
2743 /* CH holds a partially built-up character. Since we only deal
2744 with one- and two-byte characters at the moment, we only use
2745 this to store the first byte of a two-byte character. */
2748 /* Additional information used by the ISO2022 encoder. */
2751 /* CHARSET holds the character sets currently assigned to the G0
2752 through G3 registers. It is initialized from the array
2753 INITIAL_CHARSET in CODESYS. */
2754 Lisp_Object charset[4];
2756 /* Which registers are currently invoked into the left (GL) and
2757 right (GR) halves of the 8-bit encoding space? */
2758 int register_left, register_right;
2760 /* Whether we need to explicitly designate the charset in the
2761 G? register before using it. It is initialized from the
2762 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2763 unsigned char force_charset_on_output[4];
2765 /* Other state variables that need to be preserved across
2767 Lisp_Object current_charset;
2769 int current_char_boundary;
2772 void (*encode_char) (struct encoding_stream *str, Emchar c,
2773 unsigned_char_dynarr *dst, unsigned int *flags);
2774 void (*finish) (struct encoding_stream *str,
2775 unsigned_char_dynarr *dst, unsigned int *flags);
2777 /* Additional information (the state of the running CCL program)
2778 used by the CCL encoder. */
2779 struct ccl_program ccl;
2783 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2784 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2786 static int encoding_rewinder (Lstream *stream);
2787 static int encoding_seekable_p (Lstream *stream);
2788 static int encoding_flusher (Lstream *stream);
2789 static int encoding_closer (Lstream *stream);
2791 static Lisp_Object encoding_marker (Lisp_Object stream);
2793 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2794 sizeof (struct encoding_stream));
2797 encoding_marker (Lisp_Object stream)
2799 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2800 Lisp_Object str_obj;
2802 /* We do not need to mark the coding systems or charsets stored
2803 within the stream because they are stored in a global list
2804 and automatically marked. */
2806 XSETLSTREAM (str_obj, str);
2807 mark_object (str_obj);
2808 if (str->imp->marker)
2809 return (str->imp->marker) (str_obj);
2814 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2815 so we read data from the other end, encode it, and store it into DATA. */
2818 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2820 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2821 unsigned char *orig_data = data;
2823 int error_occurred = 0;
2825 /* We need to interface to mule_encode(), which expects to take some
2826 amount of data and store the result into a Dynarr. We have
2827 mule_encode() store into str->runoff, and take data from there
2830 /* We loop until we have enough data, reading chunks from the other
2831 end and encoding it. */
2834 /* Take data from the runoff if we can. Make sure to take at
2835 most SIZE bytes, and delete the data from the runoff. */
2836 if (Dynarr_length (str->runoff) > 0)
2838 int chunk = min ((int) size, Dynarr_length (str->runoff));
2839 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2840 Dynarr_delete_many (str->runoff, 0, chunk);
2846 break; /* No more room for data */
2848 if (str->flags & CODING_STATE_END)
2849 /* This means that on the previous iteration, we hit the EOF on
2850 the other end. We loop once more so that mule_encode() can
2851 output any final stuff it may be holding, or any "go back
2852 to a sane state" escape sequences. (This latter makes sense
2853 during encoding.) */
2856 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2857 left of storage in it, so it's OK to read directly into it.
2858 (We'll be overwriting above, after we've encoded it into the
2860 read_size = Lstream_read (str->other_end, data, size);
2867 /* There might be some more end data produced in the translation.
2868 See the comment above. */
2869 str->flags |= CODING_STATE_END;
2870 mule_encode (stream, data, str->runoff, read_size);
2873 if (data == orig_data)
2874 return error_occurred ? -1 : 0;
2876 return data - orig_data;
2880 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2882 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2885 /* Encode all our data into the runoff, and then attempt to write
2886 it all out to the other end. Remove whatever chunk we succeeded
2888 mule_encode (stream, data, str->runoff, size);
2889 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2890 Dynarr_length (str->runoff));
2892 Dynarr_delete_many (str->runoff, 0, retval);
2893 /* Do NOT return retval. The return value indicates how much
2894 of the incoming data was written, not how many bytes were
2900 reset_encoding_stream (struct encoding_stream *str)
2903 switch (CODING_SYSTEM_TYPE (str->codesys))
2905 case CODESYS_ISO2022:
2909 str->encode_char = &char_encode_iso2022;
2910 str->finish = &char_finish_iso2022;
2911 for (i = 0; i < 4; i++)
2913 str->iso2022.charset[i] =
2914 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2915 str->iso2022.force_charset_on_output[i] =
2916 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2918 str->iso2022.register_left = 0;
2919 str->iso2022.register_right = 1;
2920 str->iso2022.current_charset = Qnil;
2921 str->iso2022.current_half = 0;
2925 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2928 str->encode_char = &char_encode_utf8;
2929 str->finish = &char_finish_utf8;
2932 str->encode_char = &char_encode_ucs4;
2933 str->finish = &char_finish_ucs4;
2935 case CODESYS_SHIFT_JIS:
2936 str->encode_char = &char_encode_shift_jis;
2937 str->finish = &char_finish_shift_jis;
2940 str->encode_char = &char_encode_big5;
2941 str->finish = &char_finish_big5;
2947 str->iso2022.current_char_boundary = 0;
2948 str->flags = str->ch = 0;
2952 encoding_rewinder (Lstream *stream)
2954 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2955 reset_encoding_stream (str);
2956 Dynarr_reset (str->runoff);
2957 return Lstream_rewind (str->other_end);
2961 encoding_seekable_p (Lstream *stream)
2963 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2964 return Lstream_seekable_p (str->other_end);
2968 encoding_flusher (Lstream *stream)
2970 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2971 return Lstream_flush (str->other_end);
2975 encoding_closer (Lstream *stream)
2977 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2978 if (stream->flags & LSTREAM_FL_WRITE)
2980 str->flags |= CODING_STATE_END;
2981 encoding_writer (stream, 0, 0);
2983 Dynarr_free (str->runoff);
2984 return Lstream_close (str->other_end);
2988 encoding_stream_coding_system (Lstream *stream)
2990 Lisp_Object coding_system;
2991 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2993 XSETCODING_SYSTEM (coding_system, str->codesys);
2994 return coding_system;
2998 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3000 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3001 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3003 reset_encoding_stream (str);
3007 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3010 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3011 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3015 str->runoff = Dynarr_new (unsigned_char);
3016 str->other_end = stream;
3017 set_encoding_stream_coding_system (lstr, codesys);
3018 XSETLSTREAM (obj, lstr);
3023 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3025 return make_encoding_stream_1 (stream, codesys, "r");
3029 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3031 return make_encoding_stream_1 (stream, codesys, "w");
3034 /* Convert N bytes of internally-formatted data stored in SRC to an
3035 external format, according to the encoding stream ENCODING.
3036 Store the encoded data into DST. */
3039 mule_encode (Lstream *encoding, const Bufbyte *src,
3040 unsigned_char_dynarr *dst, size_t n)
3042 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3044 switch (CODING_SYSTEM_TYPE (str->codesys))
3047 case CODESYS_INTERNAL:
3048 Dynarr_add_many (dst, src, n);
3051 case CODESYS_AUTODETECT:
3052 /* If we got this far and still haven't decided on the coding
3053 system, then do no conversion. */
3054 case CODESYS_NO_CONVERSION:
3055 encode_coding_no_conversion (encoding, src, dst, n);
3059 str->ccl.last_block = str->flags & CODING_STATE_END;
3060 /* When applying ccl program to stream, MUST NOT set NULL
3062 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3063 dst, n, 0, CCL_MODE_ENCODING);
3067 text_encode_generic (encoding, src, dst, n);
3071 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3072 Encode the text between START and END using CODING-SYSTEM.
3073 This will, for example, convert Japanese characters into stuff such as
3074 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3075 text. BUFFER defaults to the current buffer if unspecified.
3077 (start, end, coding_system, buffer))
3080 struct buffer *buf = decode_buffer (buffer, 0);
3081 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3082 Lstream *istr, *ostr;
3083 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3085 get_buffer_range_char (buf, start, end, &b, &e, 0);
3087 barf_if_buffer_read_only (buf, b, e);
3089 coding_system = Fget_coding_system (coding_system);
3090 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3091 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3092 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3093 Fget_coding_system (Qbinary));
3094 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3096 istr = XLSTREAM (instream);
3097 ostr = XLSTREAM (outstream);
3098 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3099 /* The chain of streams looks like this:
3101 [BUFFER] <----- send through
3102 ------> [ENCODE AS SPECIFIED]
3103 ------> [DECODE AS BINARY]
3108 char tempbuf[1024]; /* some random amount */
3109 Bufpos newpos, even_newer_pos;
3110 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3111 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3115 newpos = lisp_buffer_stream_startpos (istr);
3116 Lstream_write (ostr, tempbuf, size_in_bytes);
3117 even_newer_pos = lisp_buffer_stream_startpos (istr);
3118 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3124 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3125 Lstream_close (istr);
3126 Lstream_close (ostr);
3128 Lstream_delete (istr);
3129 Lstream_delete (ostr);
3130 Lstream_delete (XLSTREAM (de_outstream));
3131 Lstream_delete (XLSTREAM (lb_outstream));
3132 return make_int (retlen);
3139 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3140 unsigned_char_dynarr *dst, size_t n)
3143 unsigned char char_boundary;
3144 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3145 unsigned int flags = str->flags;
3146 Emchar ch = str->ch;
3148 char_boundary = str->iso2022.current_char_boundary;
3154 if (char_boundary == 0)
3182 (*str->encode_char) (str, c, dst, &flags);
3184 else if (char_boundary == 1)
3186 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3192 ch = (ch << 6) | (c & 0x3f);
3197 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3199 (*str->finish) (str, dst, &flags);
3204 str->iso2022.current_char_boundary = char_boundary;
3208 /************************************************************************/
3209 /* Shift-JIS methods */
3210 /************************************************************************/
3212 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3213 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3214 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3215 encoded by "position-code + 0x80". A character of JISX0208
3216 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3217 position-codes are divided and shifted so that it fit in the range
3220 --- CODE RANGE of Shift-JIS ---
3221 (character set) (range)
3223 JISX0201-Kana 0xA0 .. 0xDF
3224 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3225 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3226 -------------------------------
3230 /* Is this the first byte of a Shift-JIS two-byte char? */
3232 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3233 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3235 /* Is this the second byte of a Shift-JIS two-byte char? */
3237 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3238 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3240 #define BYTE_SJIS_KATAKANA_P(c) \
3241 ((c) >= 0xA1 && (c) <= 0xDF)
3244 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3248 unsigned char c = *(unsigned char *)src++;
3249 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3251 if (st->shift_jis.in_second_byte)
3253 st->shift_jis.in_second_byte = 0;
3257 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3258 st->shift_jis.in_second_byte = 1;
3260 return CODING_CATEGORY_SHIFT_JIS_MASK;
3263 /* Convert Shift-JIS data to internal format. */
3266 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3267 unsigned_char_dynarr *dst, size_t n)
3269 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3270 unsigned int flags = str->flags;
3271 unsigned int cpos = str->cpos;
3272 eol_type_t eol_type = str->eol_type;
3276 unsigned char c = *(unsigned char *)src++;
3280 /* Previous character was first byte of Shift-JIS Kanji char. */
3281 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3283 unsigned char e1, e2;
3285 DECODE_SJIS (cpos, c, e1, e2);
3287 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3291 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3292 Dynarr_add (dst, e1);
3293 Dynarr_add (dst, e2);
3298 DECODE_ADD_BINARY_CHAR (cpos, dst);
3299 DECODE_ADD_BINARY_CHAR (c, dst);
3305 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3306 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3308 else if (BYTE_SJIS_KATAKANA_P (c))
3311 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3314 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3315 Dynarr_add (dst, c);
3320 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3324 DECODE_ADD_BINARY_CHAR (c, dst);
3326 label_continue_loop:;
3329 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3335 /* Convert internal character representation to Shift_JIS. */
3338 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3339 unsigned_char_dynarr *dst, unsigned int *flags)
3341 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3345 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3346 Dynarr_add (dst, '\r');
3347 if (eol_type != EOL_CR)
3348 Dynarr_add (dst, ch);
3352 unsigned int s1, s2;
3354 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3356 if (code_point >= 0)
3357 Dynarr_add (dst, code_point);
3358 else if ((code_point
3359 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3362 ENCODE_SJIS ((code_point >> 8) | 0x80,
3363 (code_point & 0xFF) | 0x80, s1, s2);
3364 Dynarr_add (dst, s1);
3365 Dynarr_add (dst, s2);
3367 else if ((code_point
3368 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3370 Dynarr_add (dst, code_point | 0x80);
3371 else if ((code_point
3372 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3375 ENCODE_SJIS ((code_point >> 8) | 0x80,
3376 (code_point & 0xFF) | 0x80, s1, s2);
3377 Dynarr_add (dst, s1);
3378 Dynarr_add (dst, s2);
3380 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3382 Dynarr_add (dst, code_point);
3384 Dynarr_add (dst, '?');
3386 Lisp_Object charset;
3387 unsigned int c1, c2;
3389 BREAKUP_CHAR (ch, charset, c1, c2);
3391 if (EQ(charset, Vcharset_katakana_jisx0201))
3393 Dynarr_add (dst, c1 | 0x80);
3397 Dynarr_add (dst, c1);
3399 else if (EQ(charset, Vcharset_japanese_jisx0208))
3401 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3402 Dynarr_add (dst, s1);
3403 Dynarr_add (dst, s2);
3406 Dynarr_add (dst, '?');
3412 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3413 unsigned int *flags)
3417 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3418 Decode a JISX0208 character of Shift-JIS coding-system.
3419 CODE is the character code in Shift-JIS as a cons of type bytes.
3420 Return the corresponding character.
3424 unsigned char c1, c2, s1, s2;
3427 CHECK_INT (XCAR (code));
3428 CHECK_INT (XCDR (code));
3429 s1 = XINT (XCAR (code));
3430 s2 = XINT (XCDR (code));
3431 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3432 BYTE_SJIS_TWO_BYTE_2_P (s2))
3434 DECODE_SJIS (s1, s2, c1, c2);
3435 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3436 c1 & 0x7F, c2 & 0x7F));
3442 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3443 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3444 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3448 Lisp_Object charset;
3451 CHECK_CHAR_COERCE_INT (character);
3452 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3453 if (EQ (charset, Vcharset_japanese_jisx0208))
3455 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3456 return Fcons (make_int (s1), make_int (s2));
3463 /************************************************************************/
3465 /************************************************************************/
3467 /* BIG5 is a coding system encoding two character sets: ASCII and
3468 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3469 character set and is encoded in two-byte.
3471 --- CODE RANGE of BIG5 ---
3472 (character set) (range)
3474 Big5 (1st byte) 0xA1 .. 0xFE
3475 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3476 --------------------------
3478 Since the number of characters in Big5 is larger than maximum
3479 characters in Emacs' charset (96x96), it can't be handled as one
3480 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3481 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3482 contains frequently used characters and the latter contains less
3483 frequently used characters. */
3486 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3487 ((c) >= 0x81 && (c) <= 0xFE)
3489 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3490 ((c) >= 0xA1 && (c) <= 0xFE)
3493 /* Is this the second byte of a Shift-JIS two-byte char? */
3495 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3496 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3498 /* Number of Big5 characters which have the same code in 1st byte. */
3500 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3502 /* Code conversion macros. These are macros because they are used in
3503 inner loops during code conversion.
3505 Note that temporary variables in macros introduce the classic
3506 dynamic-scoping problems with variable names. We use capital-
3507 lettered variables in the assumption that XEmacs does not use
3508 capital letters in variables except in a very formalized way
3511 /* Convert Big5 code (b1, b2) into its internal string representation
3514 /* There is a much simpler way to split the Big5 charset into two.
3515 For the moment I'm going to leave the algorithm as-is because it
3516 claims to separate out the most-used characters into a single
3517 charset, which perhaps will lead to optimizations in various
3520 The way the algorithm works is something like this:
3522 Big5 can be viewed as a 94x157 charset, where the row is
3523 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3524 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3525 the split between low and high column numbers is apparently
3526 meaningless; ascending rows produce less and less frequent chars.
3527 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3528 the first charset, and the upper half (0xC9 .. 0xFE) to the
3529 second. To do the conversion, we convert the character into
3530 a single number where 0 .. 156 is the first row, 157 .. 313
3531 is the second, etc. That way, the characters are ordered by
3532 decreasing frequency. Then we just chop the space in two
3533 and coerce the result into a 94x94 space.
3536 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3538 int B1 = b1, B2 = b2; \
3540 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3544 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3548 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3549 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3551 c1 = I / (0xFF - 0xA1) + 0xA1; \
3552 c2 = I % (0xFF - 0xA1) + 0xA1; \
3555 /* Convert the internal string representation of a Big5 character
3556 (lb, c1, c2) into Big5 code (b1, b2). */
3558 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3560 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3562 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3564 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3566 b1 = I / BIG5_SAME_ROW + 0xA1; \
3567 b2 = I % BIG5_SAME_ROW; \
3568 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3572 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3576 unsigned char c = *(unsigned char *)src++;
3577 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3579 || (c >= 0x80 && c <= 0xA0)
3583 if (st->big5.in_second_byte)
3585 st->big5.in_second_byte = 0;
3586 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3596 st->big5.in_second_byte = 1;
3598 return CODING_CATEGORY_BIG5_MASK;
3601 /* Convert Big5 data to internal format. */
3604 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3605 unsigned_char_dynarr *dst, size_t n)
3607 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3608 unsigned int flags = str->flags;
3609 unsigned int cpos = str->cpos;
3610 eol_type_t eol_type = str->eol_type;
3613 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3614 (decoding)->codesys, 1);
3619 unsigned char c = *(unsigned char *)src++;
3622 /* Previous character was first byte of Big5 char. */
3623 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3626 int code_point = (cpos << 8) | c;
3627 Emchar char_id = DECODE_DEFINED_CHAR (ccs, code_point);
3630 char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
3631 DECODE_ADD_UCS_CHAR (char_id, dst);
3633 unsigned char b1, b2, b3;
3634 DECODE_BIG5 (cpos, c, b1, b2, b3);
3635 Dynarr_add (dst, b1);
3636 Dynarr_add (dst, b2);
3637 Dynarr_add (dst, b3);
3642 DECODE_ADD_BINARY_CHAR (cpos, dst);
3643 DECODE_ADD_BINARY_CHAR (c, dst);
3649 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3650 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3653 DECODE_ADD_BINARY_CHAR (c, dst);
3655 label_continue_loop:;
3658 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3664 /* Convert internally-formatted data to Big5. */
3667 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3668 unsigned_char_dynarr *dst, unsigned int *flags)
3670 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3674 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3675 Dynarr_add (dst, '\r');
3676 if (eol_type != EOL_CR)
3677 Dynarr_add (dst, ch);
3684 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
3686 if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3687 Dynarr_add (dst, code_point);
3688 else if ((code_point = charset_code_point (ccs, ch)) >= 0)
3690 Dynarr_add (dst, code_point >> 8);
3691 Dynarr_add (dst, code_point & 0xFF);
3693 else if ((code_point
3694 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3696 Dynarr_add (dst, code_point >> 8);
3697 Dynarr_add (dst, code_point & 0xFF);
3699 else if ((code_point
3700 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3703 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3704 + ((code_point & 0xFF) - 33);
3705 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3706 unsigned char b2 = I % BIG5_SAME_ROW;
3708 b2 += b2 < 0x3F ? 0x40 : 0x62;
3709 Dynarr_add (dst, b1);
3710 Dynarr_add (dst, b2);
3712 else if ((code_point
3713 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3716 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3717 + ((code_point & 0xFF) - 33);
3718 unsigned char b1, b2;
3720 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3721 b1 = I / BIG5_SAME_ROW + 0xA1;
3722 b2 = I % BIG5_SAME_ROW;
3723 b2 += b2 < 0x3F ? 0x40 : 0x62;
3724 Dynarr_add (dst, b1);
3725 Dynarr_add (dst, b2);
3728 Dynarr_add (dst, '?');
3735 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3736 unsigned int *flags)
3741 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3742 Decode a Big5 character CODE of BIG5 coding-system.
3743 CODE is the character code in BIG5, a cons of two integers.
3744 Return the corresponding character.
3748 unsigned char c1, c2, b1, b2;
3751 CHECK_INT (XCAR (code));
3752 CHECK_INT (XCDR (code));
3753 b1 = XINT (XCAR (code));
3754 b2 = XINT (XCDR (code));
3755 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3756 BYTE_BIG5_TWO_BYTE_2_P (b2))
3758 Charset_ID leading_byte;
3759 Lisp_Object charset;
3760 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3761 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3762 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3768 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3769 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3770 Return the corresponding character code in Big5.
3774 Lisp_Object charset;
3777 CHECK_CHAR_COERCE_INT (character);
3778 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3779 if (EQ (charset, Vcharset_chinese_big5_1) ||
3780 EQ (charset, Vcharset_chinese_big5_2))
3782 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3784 return Fcons (make_int (b1), make_int (b2));
3791 /************************************************************************/
3793 /************************************************************************/
3796 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3800 unsigned char c = *(unsigned char *)src++;
3801 switch (st->ucs4.in_byte)
3810 st->ucs4.in_byte = 0;
3816 return CODING_CATEGORY_UCS4_MASK;
3820 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3821 unsigned_char_dynarr *dst, size_t n)
3823 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3824 unsigned int flags = str->flags;
3825 unsigned int cpos = str->cpos;
3826 unsigned char counter = str->counter;
3830 unsigned char c = *(unsigned char *)src++;
3838 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3843 cpos = ( cpos << 8 ) | c;
3847 if (counter & CODING_STATE_END)
3848 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3852 str->counter = counter;
3856 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3857 unsigned_char_dynarr *dst, unsigned int *flags)
3859 Dynarr_add (dst, ch >> 24);
3860 Dynarr_add (dst, ch >> 16);
3861 Dynarr_add (dst, ch >> 8);
3862 Dynarr_add (dst, ch );
3866 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3867 unsigned int *flags)
3872 /************************************************************************/
3874 /************************************************************************/
3877 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3881 unsigned char c = *(unsigned char *)src++;
3882 switch (st->utf8.in_byte)
3885 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3888 st->utf8.in_byte = 5;
3890 st->utf8.in_byte = 4;
3892 st->utf8.in_byte = 3;
3894 st->utf8.in_byte = 2;
3896 st->utf8.in_byte = 1;
3901 if ((c & 0xc0) != 0x80)
3907 return CODING_CATEGORY_UTF8_MASK;
3911 decode_output_utf8_partial_char (unsigned char counter,
3913 unsigned_char_dynarr *dst)
3916 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
3917 else if (counter == 4)
3919 if (cpos < (1 << 6))
3920 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
3923 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
3924 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3927 else if (counter == 3)
3929 if (cpos < (1 << 6))
3930 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
3931 else if (cpos < (1 << 12))
3933 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
3934 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3938 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
3939 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3940 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3943 else if (counter == 2)
3945 if (cpos < (1 << 6))
3946 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
3947 else if (cpos < (1 << 12))
3949 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
3950 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3952 else if (cpos < (1 << 18))
3954 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
3955 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3956 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3960 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
3961 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3962 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3963 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3968 if (cpos < (1 << 6))
3969 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
3970 else if (cpos < (1 << 12))
3972 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
3973 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3975 else if (cpos < (1 << 18))
3977 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
3978 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3979 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3981 else if (cpos < (1 << 24))
3983 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
3984 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3985 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3986 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3990 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
3991 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
3992 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3993 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3994 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4000 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4001 unsigned_char_dynarr *dst, size_t n)
4003 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4004 unsigned int flags = str->flags;
4005 unsigned int cpos = str->cpos;
4006 eol_type_t eol_type = str->eol_type;
4007 unsigned char counter = str->counter;
4011 unsigned char c = *(unsigned char *)src++;
4016 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4017 DECODE_ADD_UCS_CHAR (c, dst);
4019 else if ( c < 0xE0 )
4024 else if ( c < 0xF0 )
4029 else if ( c < 0xF8 )
4034 else if ( c < 0xFC )
4045 else if ( (c & 0xC0) == 0x80 )
4047 cpos = ( cpos << 6 ) | ( c & 0x3f );
4050 DECODE_ADD_UCS_CHAR (cpos, dst);
4059 decode_output_utf8_partial_char (counter, cpos, dst);
4060 DECODE_ADD_BINARY_CHAR (c, dst);
4064 label_continue_loop:;
4067 if (flags & CODING_STATE_END)
4070 decode_output_utf8_partial_char (counter, cpos, dst);
4076 str->counter = counter;
4080 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4081 unsigned_char_dynarr *dst, unsigned int *flags)
4083 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4087 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4088 Dynarr_add (dst, '\r');
4089 if (eol_type != EOL_CR)
4090 Dynarr_add (dst, ch);
4092 else if (ch <= 0x7f)
4094 Dynarr_add (dst, ch);
4096 else if (ch <= 0x7ff)
4098 Dynarr_add (dst, (ch >> 6) | 0xc0);
4099 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4101 else if (ch <= 0xffff)
4103 Dynarr_add (dst, (ch >> 12) | 0xe0);
4104 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4105 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4107 else if (ch <= 0x1fffff)
4109 Dynarr_add (dst, (ch >> 18) | 0xf0);
4110 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4111 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4112 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4114 else if (ch <= 0x3ffffff)
4116 Dynarr_add (dst, (ch >> 24) | 0xf8);
4117 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4118 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4119 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4120 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4124 Dynarr_add (dst, (ch >> 30) | 0xfc);
4125 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
4126 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4127 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4128 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4129 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4134 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4135 unsigned int *flags)
4140 /************************************************************************/
4141 /* ISO2022 methods */
4142 /************************************************************************/
4144 /* The following note describes the coding system ISO2022 briefly.
4145 Since the intention of this note is to help understand the
4146 functions in this file, some parts are NOT ACCURATE or OVERLY
4147 SIMPLIFIED. For thorough understanding, please refer to the
4148 original document of ISO2022.
4150 ISO2022 provides many mechanisms to encode several character sets
4151 in 7-bit and 8-bit environments. For 7-bit environments, all text
4152 is encoded using bytes less than 128. This may make the encoded
4153 text a little bit longer, but the text passes more easily through
4154 several gateways, some of which strip off MSB (Most Signigant Bit).
4156 There are two kinds of character sets: control character set and
4157 graphic character set. The former contains control characters such
4158 as `newline' and `escape' to provide control functions (control
4159 functions are also provided by escape sequences). The latter
4160 contains graphic characters such as 'A' and '-'. Emacs recognizes
4161 two control character sets and many graphic character sets.
4163 Graphic character sets are classified into one of the following
4164 four classes, according to the number of bytes (DIMENSION) and
4165 number of characters in one dimension (CHARS) of the set:
4166 - DIMENSION1_CHARS94
4167 - DIMENSION1_CHARS96
4168 - DIMENSION2_CHARS94
4169 - DIMENSION2_CHARS96
4171 In addition, each character set is assigned an identification tag,
4172 unique for each set, called "final character" (denoted as <F>
4173 hereafter). The <F> of each character set is decided by ECMA(*)
4174 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4175 (0x30..0x3F are for private use only).
4177 Note (*): ECMA = European Computer Manufacturers Association
4179 Here are examples of graphic character set [NAME(<F>)]:
4180 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4181 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4182 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4183 o DIMENSION2_CHARS96 -- none for the moment
4185 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4186 C0 [0x00..0x1F] -- control character plane 0
4187 GL [0x20..0x7F] -- graphic character plane 0
4188 C1 [0x80..0x9F] -- control character plane 1
4189 GR [0xA0..0xFF] -- graphic character plane 1
4191 A control character set is directly designated and invoked to C0 or
4192 C1 by an escape sequence. The most common case is that:
4193 - ISO646's control character set is designated/invoked to C0, and
4194 - ISO6429's control character set is designated/invoked to C1,
4195 and usually these designations/invocations are omitted in encoded
4196 text. In a 7-bit environment, only C0 can be used, and a control
4197 character for C1 is encoded by an appropriate escape sequence to
4198 fit into the environment. All control characters for C1 are
4199 defined to have corresponding escape sequences.
4201 A graphic character set is at first designated to one of four
4202 graphic registers (G0 through G3), then these graphic registers are
4203 invoked to GL or GR. These designations and invocations can be
4204 done independently. The most common case is that G0 is invoked to
4205 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4206 these invocations and designations are omitted in encoded text.
4207 In a 7-bit environment, only GL can be used.
4209 When a graphic character set of CHARS94 is invoked to GL, codes
4210 0x20 and 0x7F of the GL area work as control characters SPACE and
4211 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4214 There are two ways of invocation: locking-shift and single-shift.
4215 With locking-shift, the invocation lasts until the next different
4216 invocation, whereas with single-shift, the invocation affects the
4217 following character only and doesn't affect the locking-shift
4218 state. Invocations are done by the following control characters or
4221 ----------------------------------------------------------------------
4222 abbrev function cntrl escape seq description
4223 ----------------------------------------------------------------------
4224 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4225 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4226 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4227 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4228 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4229 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4230 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4231 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4232 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4233 ----------------------------------------------------------------------
4234 (*) These are not used by any known coding system.
4236 Control characters for these functions are defined by macros
4237 ISO_CODE_XXX in `coding.h'.
4239 Designations are done by the following escape sequences:
4240 ----------------------------------------------------------------------
4241 escape sequence description
4242 ----------------------------------------------------------------------
4243 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4244 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4245 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4246 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4247 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4248 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4249 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4250 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4251 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4252 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4253 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4254 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4255 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4256 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4257 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4258 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4259 ----------------------------------------------------------------------
4261 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4262 of dimension 1, chars 94, and final character <F>, etc...
4264 Note (*): Although these designations are not allowed in ISO2022,
4265 Emacs accepts them on decoding, and produces them on encoding
4266 CHARS96 character sets in a coding system which is characterized as
4267 7-bit environment, non-locking-shift, and non-single-shift.
4269 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4270 '(' can be omitted. We refer to this as "short-form" hereafter.
4272 Now you may notice that there are a lot of ways for encoding the
4273 same multilingual text in ISO2022. Actually, there exist many
4274 coding systems such as Compound Text (used in X11's inter client
4275 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4276 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4277 localized platforms), and all of these are variants of ISO2022.
4279 In addition to the above, Emacs handles two more kinds of escape
4280 sequences: ISO6429's direction specification and Emacs' private
4281 sequence for specifying character composition.
4283 ISO6429's direction specification takes the following form:
4284 o CSI ']' -- end of the current direction
4285 o CSI '0' ']' -- end of the current direction
4286 o CSI '1' ']' -- start of left-to-right text
4287 o CSI '2' ']' -- start of right-to-left text
4288 The control character CSI (0x9B: control sequence introducer) is
4289 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4291 Character composition specification takes the following form:
4292 o ESC '0' -- start character composition
4293 o ESC '1' -- end character composition
4294 Since these are not standard escape sequences of any ISO standard,
4295 their use with these meanings is restricted to Emacs only. */
4298 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4302 for (i = 0; i < 4; i++)
4304 if (!NILP (coding_system))
4306 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4308 iso->charset[i] = Qt;
4309 iso->invalid_designated[i] = 0;
4311 iso->esc = ISO_ESC_NOTHING;
4312 iso->esc_bytes_index = 0;
4313 iso->register_left = 0;
4314 iso->register_right = 1;
4315 iso->switched_dir_and_no_valid_charset_yet = 0;
4316 iso->invalid_switch_dir = 0;
4317 iso->output_direction_sequence = 0;
4318 iso->output_literally = 0;
4319 #ifdef ENABLE_COMPOSITE_CHARS
4320 if (iso->composite_chars)
4321 Dynarr_reset (iso->composite_chars);
4326 fit_to_be_escape_quoted (unsigned char c)
4343 /* Parse one byte of an ISO2022 escape sequence.
4344 If the result is an invalid escape sequence, return 0 and
4345 do not change anything in STR. Otherwise, if the result is
4346 an incomplete escape sequence, update ISO2022.ESC and
4347 ISO2022.ESC_BYTES and return -1. Otherwise, update
4348 all the state variables (but not ISO2022.ESC_BYTES) and
4351 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4352 or invocation of an invalid character set and treat that as
4353 an unrecognized escape sequence. */
4356 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4357 unsigned char c, unsigned int *flags,
4358 int check_invalid_charsets)
4360 /* (1) If we're at the end of a designation sequence, CS is the
4361 charset being designated and REG is the register to designate
4364 (2) If we're at the end of a locking-shift sequence, REG is
4365 the register to invoke and HALF (0 == left, 1 == right) is
4366 the half to invoke it into.
4368 (3) If we're at the end of a single-shift sequence, REG is
4369 the register to invoke. */
4370 Lisp_Object cs = Qnil;
4373 /* NOTE: This code does goto's all over the fucking place.
4374 The reason for this is that we're basically implementing
4375 a state machine here, and hierarchical languages like C
4376 don't really provide a clean way of doing this. */
4378 if (! (*flags & CODING_STATE_ESCAPE))
4379 /* At beginning of escape sequence; we need to reset our
4380 escape-state variables. */
4381 iso->esc = ISO_ESC_NOTHING;
4383 iso->output_literally = 0;
4384 iso->output_direction_sequence = 0;
4388 case ISO_ESC_NOTHING:
4389 iso->esc_bytes_index = 0;
4392 case ISO_CODE_ESC: /* Start escape sequence */
4393 *flags |= CODING_STATE_ESCAPE;
4397 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4398 *flags |= CODING_STATE_ESCAPE;
4399 iso->esc = ISO_ESC_5_11;
4402 case ISO_CODE_SO: /* locking shift 1 */
4405 case ISO_CODE_SI: /* locking shift 0 */
4409 case ISO_CODE_SS2: /* single shift */
4412 case ISO_CODE_SS3: /* single shift */
4416 default: /* Other control characters */
4423 /**** single shift ****/
4425 case 'N': /* single shift 2 */
4428 case 'O': /* single shift 3 */
4432 /**** locking shift ****/
4434 case '~': /* locking shift 1 right */
4437 case 'n': /* locking shift 2 */
4440 case '}': /* locking shift 2 right */
4443 case 'o': /* locking shift 3 */
4446 case '|': /* locking shift 3 right */
4450 #ifdef ENABLE_COMPOSITE_CHARS
4451 /**** composite ****/
4454 iso->esc = ISO_ESC_START_COMPOSITE;
4455 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4456 CODING_STATE_COMPOSITE;
4460 iso->esc = ISO_ESC_END_COMPOSITE;
4461 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4462 ~CODING_STATE_COMPOSITE;
4464 #endif /* ENABLE_COMPOSITE_CHARS */
4466 /**** directionality ****/
4469 iso->esc = ISO_ESC_5_11;
4472 /**** designation ****/
4474 case '$': /* multibyte charset prefix */
4475 iso->esc = ISO_ESC_2_4;
4479 if (0x28 <= c && c <= 0x2F)
4481 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4485 /* This function is called with CODESYS equal to nil when
4486 doing coding-system detection. */
4488 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4489 && fit_to_be_escape_quoted (c))
4491 iso->esc = ISO_ESC_LITERAL;
4492 *flags &= CODING_STATE_ISO2022_LOCK;
4502 /**** directionality ****/
4504 case ISO_ESC_5_11: /* ISO6429 direction control */
4507 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4508 goto directionality;
4510 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4511 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4512 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4516 case ISO_ESC_5_11_0:
4519 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4520 goto directionality;
4524 case ISO_ESC_5_11_1:
4527 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4528 goto directionality;
4532 case ISO_ESC_5_11_2:
4535 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4536 goto directionality;
4541 iso->esc = ISO_ESC_DIRECTIONALITY;
4542 /* Various junk here to attempt to preserve the direction sequences
4543 literally in the text if they would otherwise be swallowed due
4544 to invalid designations that don't show up as actual charset
4545 changes in the text. */
4546 if (iso->invalid_switch_dir)
4548 /* We already inserted a direction switch literally into the
4549 text. We assume (#### this may not be right) that the
4550 next direction switch is the one going the other way,
4551 and we need to output that literally as well. */
4552 iso->output_literally = 1;
4553 iso->invalid_switch_dir = 0;
4559 /* If we are in the thrall of an invalid designation,
4560 then stick the directionality sequence literally into the
4561 output stream so it ends up in the original text again. */
4562 for (jj = 0; jj < 4; jj++)
4563 if (iso->invalid_designated[jj])
4567 iso->output_literally = 1;
4568 iso->invalid_switch_dir = 1;
4571 /* Indicate that we haven't yet seen a valid designation,
4572 so that if a switch-dir is directly followed by an
4573 invalid designation, both get inserted literally. */
4574 iso->switched_dir_and_no_valid_charset_yet = 1;
4579 /**** designation ****/
4582 if (0x28 <= c && c <= 0x2F)
4584 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4587 if (0x40 <= c && c <= 0x42)
4590 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4591 *flags & CODING_STATE_R2L ?
4592 CHARSET_RIGHT_TO_LEFT :
4593 CHARSET_LEFT_TO_RIGHT);
4604 if (c < '0' || c > '~')
4605 return 0; /* bad final byte */
4607 if (iso->esc >= ISO_ESC_2_8 &&
4608 iso->esc <= ISO_ESC_2_15)
4610 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4611 single = 1; /* single-byte */
4612 reg = (iso->esc - ISO_ESC_2_8) & 3;
4614 else if (iso->esc >= ISO_ESC_2_4_8 &&
4615 iso->esc <= ISO_ESC_2_4_15)
4617 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4618 single = -1; /* multi-byte */
4619 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4623 /* Can this ever be reached? -slb */
4627 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4628 *flags & CODING_STATE_R2L ?
4629 CHARSET_RIGHT_TO_LEFT :
4630 CHARSET_LEFT_TO_RIGHT);
4636 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4640 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4641 /* can't invoke something that ain't there. */
4643 iso->esc = ISO_ESC_SINGLE_SHIFT;
4644 *flags &= CODING_STATE_ISO2022_LOCK;
4646 *flags |= CODING_STATE_SS2;
4648 *flags |= CODING_STATE_SS3;
4652 if (check_invalid_charsets &&
4653 !CHARSETP (iso->charset[reg]))
4654 /* can't invoke something that ain't there. */
4657 iso->register_right = reg;
4659 iso->register_left = reg;
4660 *flags &= CODING_STATE_ISO2022_LOCK;
4661 iso->esc = ISO_ESC_LOCKING_SHIFT;
4665 if (NILP (cs) && check_invalid_charsets)
4667 iso->invalid_designated[reg] = 1;
4668 iso->charset[reg] = Vcharset_ascii;
4669 iso->esc = ISO_ESC_DESIGNATE;
4670 *flags &= CODING_STATE_ISO2022_LOCK;
4671 iso->output_literally = 1;
4672 if (iso->switched_dir_and_no_valid_charset_yet)
4674 /* We encountered a switch-direction followed by an
4675 invalid designation. Ensure that the switch-direction
4676 gets outputted; otherwise it will probably get eaten
4677 when the text is written out again. */
4678 iso->switched_dir_and_no_valid_charset_yet = 0;
4679 iso->output_direction_sequence = 1;
4680 /* And make sure that the switch-dir going the other
4681 way gets outputted, as well. */
4682 iso->invalid_switch_dir = 1;
4686 /* This function is called with CODESYS equal to nil when
4687 doing coding-system detection. */
4688 if (!NILP (codesys))
4690 charset_conversion_spec_dynarr *dyn =
4691 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4697 for (i = 0; i < Dynarr_length (dyn); i++)
4699 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4700 if (EQ (cs, spec->from_charset))
4701 cs = spec->to_charset;
4706 iso->charset[reg] = cs;
4707 iso->esc = ISO_ESC_DESIGNATE;
4708 *flags &= CODING_STATE_ISO2022_LOCK;
4709 if (iso->invalid_designated[reg])
4711 iso->invalid_designated[reg] = 0;
4712 iso->output_literally = 1;
4714 if (iso->switched_dir_and_no_valid_charset_yet)
4715 iso->switched_dir_and_no_valid_charset_yet = 0;
4720 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4724 /* #### There are serious deficiencies in the recognition mechanism
4725 here. This needs to be much smarter if it's going to cut it.
4726 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4727 it should be detected as Latin-1.
4728 All the ISO2022 stuff in this file should be synced up with the
4729 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4730 Perhaps we should wait till R2L works in FSF Emacs? */
4732 if (!st->iso2022.initted)
4734 reset_iso2022 (Qnil, &st->iso2022.iso);
4735 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4736 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4737 CODING_CATEGORY_ISO_8_1_MASK |
4738 CODING_CATEGORY_ISO_8_2_MASK |
4739 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4740 st->iso2022.flags = 0;
4741 st->iso2022.high_byte_count = 0;
4742 st->iso2022.saw_single_shift = 0;
4743 st->iso2022.initted = 1;
4746 mask = st->iso2022.mask;
4750 unsigned char c = *(unsigned char *)src++;
4753 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4754 st->iso2022.high_byte_count++;
4758 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4760 if (st->iso2022.high_byte_count & 1)
4761 /* odd number of high bytes; assume not iso-8-2 */
4762 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4764 st->iso2022.high_byte_count = 0;
4765 st->iso2022.saw_single_shift = 0;
4767 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4769 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4770 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4771 { /* control chars */
4774 /* Allow and ignore control characters that you might
4775 reasonably see in a text file */
4780 case 8: /* backspace */
4781 case 11: /* vertical tab */
4782 case 12: /* form feed */
4783 case 26: /* MS-DOS C-z junk */
4784 case 31: /* '^_' -- for info */
4785 goto label_continue_loop;
4792 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4795 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4796 &st->iso2022.flags, 0))
4798 switch (st->iso2022.iso.esc)
4800 case ISO_ESC_DESIGNATE:
4801 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4802 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4804 case ISO_ESC_LOCKING_SHIFT:
4805 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4806 goto ran_out_of_chars;
4807 case ISO_ESC_SINGLE_SHIFT:
4808 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4809 st->iso2022.saw_single_shift = 1;
4818 goto ran_out_of_chars;
4821 label_continue_loop:;
4830 postprocess_iso2022_mask (int mask)
4832 /* #### kind of cheesy */
4833 /* If seven-bit ISO is allowed, then assume that the encoding is
4834 entirely seven-bit and turn off the eight-bit ones. */
4835 if (mask & CODING_CATEGORY_ISO_7_MASK)
4836 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4837 CODING_CATEGORY_ISO_8_1_MASK |
4838 CODING_CATEGORY_ISO_8_2_MASK);
4842 /* If FLAGS is a null pointer or specifies right-to-left motion,
4843 output a switch-dir-to-left-to-right sequence to DST.
4844 Also update FLAGS if it is not a null pointer.
4845 If INTERNAL_P is set, we are outputting in internal format and
4846 need to handle the CSI differently. */
4849 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4850 unsigned_char_dynarr *dst,
4851 unsigned int *flags,
4854 if (!flags || (*flags & CODING_STATE_R2L))
4856 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4858 Dynarr_add (dst, ISO_CODE_ESC);
4859 Dynarr_add (dst, '[');
4861 else if (internal_p)
4862 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4864 Dynarr_add (dst, ISO_CODE_CSI);
4865 Dynarr_add (dst, '0');
4866 Dynarr_add (dst, ']');
4868 *flags &= ~CODING_STATE_R2L;
4872 /* If FLAGS is a null pointer or specifies a direction different from
4873 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4874 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4875 sequence to DST. Also update FLAGS if it is not a null pointer.
4876 If INTERNAL_P is set, we are outputting in internal format and
4877 need to handle the CSI differently. */
4880 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4881 unsigned_char_dynarr *dst, unsigned int *flags,
4884 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4885 direction == CHARSET_LEFT_TO_RIGHT)
4886 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4887 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4888 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4889 direction == CHARSET_RIGHT_TO_LEFT)
4891 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4893 Dynarr_add (dst, ISO_CODE_ESC);
4894 Dynarr_add (dst, '[');
4896 else if (internal_p)
4897 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4899 Dynarr_add (dst, ISO_CODE_CSI);
4900 Dynarr_add (dst, '2');
4901 Dynarr_add (dst, ']');
4903 *flags |= CODING_STATE_R2L;
4907 /* Convert ISO2022-format data to internal format. */
4910 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4911 unsigned_char_dynarr *dst, size_t n)
4913 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4914 unsigned int flags = str->flags;
4915 unsigned int cpos = str->cpos;
4916 unsigned char counter = str->counter;
4917 eol_type_t eol_type = str->eol_type;
4918 #ifdef ENABLE_COMPOSITE_CHARS
4919 unsigned_char_dynarr *real_dst = dst;
4921 Lisp_Object coding_system;
4923 XSETCODING_SYSTEM (coding_system, str->codesys);
4925 #ifdef ENABLE_COMPOSITE_CHARS
4926 if (flags & CODING_STATE_COMPOSITE)
4927 dst = str->iso2022.composite_chars;
4928 #endif /* ENABLE_COMPOSITE_CHARS */
4932 unsigned char c = *(unsigned char *)src++;
4933 if (flags & CODING_STATE_ESCAPE)
4934 { /* Within ESC sequence */
4935 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4940 switch (str->iso2022.esc)
4942 #ifdef ENABLE_COMPOSITE_CHARS
4943 case ISO_ESC_START_COMPOSITE:
4944 if (str->iso2022.composite_chars)
4945 Dynarr_reset (str->iso2022.composite_chars);
4947 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4948 dst = str->iso2022.composite_chars;
4950 case ISO_ESC_END_COMPOSITE:
4952 Bufbyte comstr[MAX_EMCHAR_LEN];
4954 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4955 Dynarr_length (dst));
4957 len = set_charptr_emchar (comstr, emch);
4958 Dynarr_add_many (dst, comstr, len);
4961 #endif /* ENABLE_COMPOSITE_CHARS */
4963 case ISO_ESC_LITERAL:
4964 COMPOSE_FLUSH_CHARS (str, dst);
4965 DECODE_ADD_BINARY_CHAR (c, dst);
4969 /* Everything else handled already */
4974 /* Attempted error recovery. */
4975 if (str->iso2022.output_direction_sequence)
4976 ensure_correct_direction (flags & CODING_STATE_R2L ?
4977 CHARSET_RIGHT_TO_LEFT :
4978 CHARSET_LEFT_TO_RIGHT,
4979 str->codesys, dst, 0, 1);
4980 /* More error recovery. */
4981 if (!retval || str->iso2022.output_literally)
4983 /* Output the (possibly invalid) sequence */
4985 COMPOSE_FLUSH_CHARS (str, dst);
4986 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4987 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4988 flags &= CODING_STATE_ISO2022_LOCK;
4990 n++, src--;/* Repeat the loop with the same character. */
4993 /* No sense in reprocessing the final byte of the
4994 escape sequence; it could mess things up anyway.
4996 COMPOSE_FLUSH_CHARS (str, dst);
4997 DECODE_ADD_BINARY_CHAR (c, dst);
5003 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5004 { /* Control characters */
5006 /***** Error-handling *****/
5008 /* If we were in the middle of a character, dump out the
5009 partial character. */
5012 COMPOSE_FLUSH_CHARS (str, dst);
5016 DECODE_ADD_BINARY_CHAR
5017 ((unsigned char)(cpos >> (counter * 8)), dst);
5022 /* If we just saw a single-shift character, dump it out.
5023 This may dump out the wrong sort of single-shift character,
5024 but least it will give an indication that something went
5026 if (flags & CODING_STATE_SS2)
5028 COMPOSE_FLUSH_CHARS (str, dst);
5029 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5030 flags &= ~CODING_STATE_SS2;
5032 if (flags & CODING_STATE_SS3)
5034 COMPOSE_FLUSH_CHARS (str, dst);
5035 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5036 flags &= ~CODING_STATE_SS3;
5039 /***** Now handle the control characters. *****/
5045 COMPOSE_FLUSH_CHARS (str, dst);
5046 if (eol_type == EOL_CR)
5047 Dynarr_add (dst, '\n');
5048 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5049 Dynarr_add (dst, c);
5051 flags |= CODING_STATE_CR;
5052 goto label_continue_loop;
5054 else if (flags & CODING_STATE_CR)
5055 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5057 Dynarr_add (dst, '\r');
5058 flags &= ~CODING_STATE_CR;
5061 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5064 flags &= CODING_STATE_ISO2022_LOCK;
5066 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5068 COMPOSE_FLUSH_CHARS (str, dst);
5069 DECODE_ADD_BINARY_CHAR (c, dst);
5073 { /* Graphic characters */
5074 Lisp_Object charset;
5083 COMPOSE_FLUSH_CHARS (str, dst);
5084 if (eol_type == EOL_CR)
5085 Dynarr_add (dst, '\n');
5086 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5087 Dynarr_add (dst, c);
5089 flags |= CODING_STATE_CR;
5090 goto label_continue_loop;
5092 else if (flags & CODING_STATE_CR)
5093 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5095 Dynarr_add (dst, '\r');
5096 flags &= ~CODING_STATE_CR;
5099 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5102 /* Now determine the charset. */
5103 reg = ((flags & CODING_STATE_SS2) ? 2
5104 : (flags & CODING_STATE_SS3) ? 3
5105 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5106 : str->iso2022.register_left);
5107 charset = str->iso2022.charset[reg];
5109 /* Error checking: */
5110 if (! CHARSETP (charset)
5111 || str->iso2022.invalid_designated[reg]
5112 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5113 && XCHARSET_CHARS (charset) == 94))
5114 /* Mrmph. We are trying to invoke a register that has no
5115 or an invalid charset in it, or trying to add a character
5116 outside the range of the charset. Insert that char literally
5117 to preserve it for the output. */
5119 COMPOSE_FLUSH_CHARS (str, dst);
5123 DECODE_ADD_BINARY_CHAR
5124 ((unsigned char)(cpos >> (counter * 8)), dst);
5127 DECODE_ADD_BINARY_CHAR (c, dst);
5132 /* Things are probably hunky-dorey. */
5134 /* Fetch reverse charset, maybe. */
5135 if (((flags & CODING_STATE_R2L) &&
5136 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5138 (!(flags & CODING_STATE_R2L) &&
5139 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5141 Lisp_Object new_charset =
5142 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5143 if (!NILP (new_charset))
5144 charset = new_charset;
5149 if (XCHARSET_DIMENSION (charset) == counter)
5151 COMPOSE_ADD_CHAR (str,
5152 DECODE_CHAR (charset,
5153 ((cpos & 0x7F7F7F) << 8)
5160 cpos = (cpos << 8) | c;
5162 lb = XCHARSET_LEADING_BYTE (charset);
5163 switch (XCHARSET_REP_BYTES (charset))
5166 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5167 Dynarr_add (dst, c & 0x7F);
5170 case 2: /* one-byte official */
5171 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5172 Dynarr_add (dst, lb);
5173 Dynarr_add (dst, c | 0x80);
5176 case 3: /* one-byte private or two-byte official */
5177 if (XCHARSET_PRIVATE_P (charset))
5179 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5180 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5181 Dynarr_add (dst, lb);
5182 Dynarr_add (dst, c | 0x80);
5188 Dynarr_add (dst, lb);
5189 Dynarr_add (dst, ch | 0x80);
5190 Dynarr_add (dst, c | 0x80);
5198 default: /* two-byte private */
5201 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5202 Dynarr_add (dst, lb);
5203 Dynarr_add (dst, ch | 0x80);
5204 Dynarr_add (dst, c | 0x80);
5214 flags &= CODING_STATE_ISO2022_LOCK;
5217 label_continue_loop:;
5220 if (flags & CODING_STATE_END)
5222 COMPOSE_FLUSH_CHARS (str, dst);
5223 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5227 str->counter = counter;
5231 /***** ISO2022 encoder *****/
5233 /* Designate CHARSET into register REG. */
5236 iso2022_designate (Lisp_Object charset, unsigned char reg,
5237 struct encoding_stream *str, unsigned_char_dynarr *dst)
5239 static const char inter94[] = "()*+";
5240 static const char inter96[] = ",-./";
5241 unsigned short chars;
5242 unsigned char dimension;
5243 unsigned char final;
5244 Lisp_Object old_charset = str->iso2022.charset[reg];
5246 str->iso2022.charset[reg] = charset;
5247 if (!CHARSETP (charset))
5248 /* charset might be an initial nil or t. */
5250 chars = XCHARSET_CHARS (charset);
5251 dimension = XCHARSET_DIMENSION (charset);
5252 final = XCHARSET_FINAL (charset);
5253 if (!str->iso2022.force_charset_on_output[reg] &&
5254 CHARSETP (old_charset) &&
5255 XCHARSET_CHARS (old_charset) == chars &&
5256 XCHARSET_DIMENSION (old_charset) == dimension &&
5257 XCHARSET_FINAL (old_charset) == final)
5260 str->iso2022.force_charset_on_output[reg] = 0;
5263 charset_conversion_spec_dynarr *dyn =
5264 str->codesys->iso2022.output_conv;
5270 for (i = 0; i < Dynarr_length (dyn); i++)
5272 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5273 if (EQ (charset, spec->from_charset))
5274 charset = spec->to_charset;
5279 Dynarr_add (dst, ISO_CODE_ESC);
5284 Dynarr_add (dst, inter94[reg]);
5287 Dynarr_add (dst, '$');
5289 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5292 Dynarr_add (dst, inter94[reg]);
5297 Dynarr_add (dst, inter96[reg]);
5300 Dynarr_add (dst, '$');
5301 Dynarr_add (dst, inter96[reg]);
5305 Dynarr_add (dst, final);
5309 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5311 if (str->iso2022.register_left != 0)
5313 Dynarr_add (dst, ISO_CODE_SI);
5314 str->iso2022.register_left = 0;
5319 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5321 if (str->iso2022.register_left != 1)
5323 Dynarr_add (dst, ISO_CODE_SO);
5324 str->iso2022.register_left = 1;
5329 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5330 unsigned_char_dynarr *dst, unsigned int *flags)
5332 unsigned char charmask;
5333 Lisp_Coding_System* codesys = str->codesys;
5334 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5336 Lisp_Object charset = str->iso2022.current_charset;
5337 int half = str->iso2022.current_half;
5338 int code_point = -1;
5342 restore_left_to_right_direction (codesys, dst, flags, 0);
5344 /* Make sure G0 contains ASCII */
5345 if ((ch > ' ' && ch < ISO_CODE_DEL)
5346 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5348 ensure_normal_shift (str, dst);
5349 iso2022_designate (Vcharset_ascii, 0, str, dst);
5352 /* If necessary, restore everything to the default state
5354 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5356 restore_left_to_right_direction (codesys, dst, flags, 0);
5358 ensure_normal_shift (str, dst);
5360 for (i = 0; i < 4; i++)
5362 Lisp_Object initial_charset =
5363 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5364 iso2022_designate (initial_charset, i, str, dst);
5369 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5370 Dynarr_add (dst, '\r');
5371 if (eol_type != EOL_CR)
5372 Dynarr_add (dst, ch);
5376 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5377 && fit_to_be_escape_quoted (ch))
5378 Dynarr_add (dst, ISO_CODE_ESC);
5379 Dynarr_add (dst, ch);
5382 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5384 charmask = (half == 0 ? 0x00 : 0x80);
5386 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5387 && fit_to_be_escape_quoted (ch))
5388 Dynarr_add (dst, ISO_CODE_ESC);
5389 /* you asked for it ... */
5390 Dynarr_add (dst, ch);
5396 /* Now determine which register to use. */
5398 for (i = 0; i < 4; i++)
5400 if ((CHARSETP (charset = str->iso2022.charset[i])
5401 && ((code_point = charset_code_point (charset, ch)) >= 0))
5405 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5406 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5414 Lisp_Object original_default_coded_charset_priority_list
5415 = Vdefault_coded_charset_priority_list;
5417 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5419 code_point = ENCODE_CHAR (ch, charset);
5420 if (XCHARSET_FINAL (charset))
5422 Vdefault_coded_charset_priority_list
5423 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5424 Vdefault_coded_charset_priority_list));
5426 code_point = ENCODE_CHAR (ch, charset);
5427 if (!XCHARSET_FINAL (charset))
5429 charset = Vcharset_ascii;
5433 Vdefault_coded_charset_priority_list
5434 = original_default_coded_charset_priority_list;
5436 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5437 codesys, dst, flags, 0);
5441 if (XCHARSET_GRAPHIC (charset) != 0)
5443 if (!NILP (str->iso2022.charset[1]) &&
5444 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5445 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5447 else if (!NILP (str->iso2022.charset[2]))
5449 else if (!NILP (str->iso2022.charset[3]))
5458 iso2022_designate (charset, reg, str, dst);
5460 /* Now invoke that register. */
5464 ensure_normal_shift (str, dst);
5468 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5470 ensure_shift_out (str, dst);
5477 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5479 Dynarr_add (dst, ISO_CODE_ESC);
5480 Dynarr_add (dst, 'N');
5485 Dynarr_add (dst, ISO_CODE_SS2);
5490 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5492 Dynarr_add (dst, ISO_CODE_ESC);
5493 Dynarr_add (dst, 'O');
5498 Dynarr_add (dst, ISO_CODE_SS3);
5506 charmask = (half == 0 ? 0x00 : 0x80);
5508 switch (XCHARSET_DIMENSION (charset))
5511 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5514 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5515 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5518 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5519 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5520 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5523 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5524 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5525 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5526 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5532 str->iso2022.current_charset = charset;
5533 str->iso2022.current_half = half;
5537 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5538 unsigned int *flags)
5540 Lisp_Coding_System* codesys = str->codesys;
5543 restore_left_to_right_direction (codesys, dst, flags, 0);
5544 ensure_normal_shift (str, dst);
5545 for (i = 0; i < 4; i++)
5547 Lisp_Object initial_charset
5548 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5549 iso2022_designate (initial_charset, i, str, dst);
5554 /************************************************************************/
5555 /* No-conversion methods */
5556 /************************************************************************/
5558 /* This is used when reading in "binary" files -- i.e. files that may
5559 contain all 256 possible byte values and that are not to be
5560 interpreted as being in any particular decoding. */
5562 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5563 unsigned_char_dynarr *dst, size_t n)
5565 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5566 unsigned int flags = str->flags;
5567 unsigned int cpos = str->cpos;
5568 eol_type_t eol_type = str->eol_type;
5572 unsigned char c = *(unsigned char *)src++;
5574 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5575 DECODE_ADD_BINARY_CHAR (c, dst);
5576 label_continue_loop:;
5579 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5586 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5587 unsigned_char_dynarr *dst, size_t n)
5590 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5591 unsigned int flags = str->flags;
5592 unsigned int ch = str->ch;
5593 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5595 unsigned char char_boundary = str->iso2022.current_char_boundary;
5602 if (char_boundary == 0)
5608 else if ( c >= 0xf8 )
5613 else if ( c >= 0xf0 )
5618 else if ( c >= 0xe0 )
5623 else if ( c >= 0xc0 )
5633 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5634 Dynarr_add (dst, '\r');
5635 if (eol_type != EOL_CR)
5636 Dynarr_add (dst, c);
5639 Dynarr_add (dst, c);
5642 else if (char_boundary == 1)
5644 ch = ( ch << 6 ) | ( c & 0x3f );
5645 Dynarr_add (dst, ch & 0xff);
5650 ch = ( ch << 6 ) | ( c & 0x3f );
5653 #else /* not UTF2000 */
5656 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5657 Dynarr_add (dst, '\r');
5658 if (eol_type != EOL_CR)
5659 Dynarr_add (dst, '\n');
5662 else if (BYTE_ASCII_P (c))
5665 Dynarr_add (dst, c);
5667 else if (BUFBYTE_LEADING_BYTE_P (c))
5670 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5671 c == LEADING_BYTE_CONTROL_1)
5674 Dynarr_add (dst, '~'); /* untranslatable character */
5678 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5679 Dynarr_add (dst, c);
5680 else if (ch == LEADING_BYTE_CONTROL_1)
5683 Dynarr_add (dst, c - 0x20);
5685 /* else it should be the second or third byte of an
5686 untranslatable character, so ignore it */
5689 #endif /* not UTF2000 */
5695 str->iso2022.current_char_boundary = char_boundary;
5701 /************************************************************************/
5702 /* Initialization */
5703 /************************************************************************/
5706 syms_of_file_coding (void)
5708 INIT_LRECORD_IMPLEMENTATION (coding_system);
5710 deferror (&Qcoding_system_error, "coding-system-error",
5711 "Coding-system error", Qio_error);
5713 DEFSUBR (Fcoding_system_p);
5714 DEFSUBR (Ffind_coding_system);
5715 DEFSUBR (Fget_coding_system);
5716 DEFSUBR (Fcoding_system_list);
5717 DEFSUBR (Fcoding_system_name);
5718 DEFSUBR (Fmake_coding_system);
5719 DEFSUBR (Fcopy_coding_system);
5720 DEFSUBR (Fcoding_system_canonical_name_p);
5721 DEFSUBR (Fcoding_system_alias_p);
5722 DEFSUBR (Fcoding_system_aliasee);
5723 DEFSUBR (Fdefine_coding_system_alias);
5724 DEFSUBR (Fsubsidiary_coding_system);
5726 DEFSUBR (Fcoding_system_type);
5727 DEFSUBR (Fcoding_system_doc_string);
5729 DEFSUBR (Fcoding_system_charset);
5731 DEFSUBR (Fcoding_system_property);
5733 DEFSUBR (Fcoding_category_list);
5734 DEFSUBR (Fset_coding_priority_list);
5735 DEFSUBR (Fcoding_priority_list);
5736 DEFSUBR (Fset_coding_category_system);
5737 DEFSUBR (Fcoding_category_system);
5739 DEFSUBR (Fdetect_coding_region);
5740 DEFSUBR (Fdecode_coding_region);
5741 DEFSUBR (Fencode_coding_region);
5743 DEFSUBR (Fdecode_shift_jis_char);
5744 DEFSUBR (Fencode_shift_jis_char);
5745 DEFSUBR (Fdecode_big5_char);
5746 DEFSUBR (Fencode_big5_char);
5748 defsymbol (&Qcoding_systemp, "coding-system-p");
5749 defsymbol (&Qno_conversion, "no-conversion");
5750 defsymbol (&Qraw_text, "raw-text");
5752 defsymbol (&Qbig5, "big5");
5753 defsymbol (&Qshift_jis, "shift-jis");
5754 defsymbol (&Qucs4, "ucs-4");
5755 defsymbol (&Qutf8, "utf-8");
5756 defsymbol (&Qccl, "ccl");
5757 defsymbol (&Qiso2022, "iso2022");
5759 defsymbol (&Qmnemonic, "mnemonic");
5760 defsymbol (&Qeol_type, "eol-type");
5761 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5762 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5764 defsymbol (&Qcr, "cr");
5765 defsymbol (&Qlf, "lf");
5766 defsymbol (&Qcrlf, "crlf");
5767 defsymbol (&Qeol_cr, "eol-cr");
5768 defsymbol (&Qeol_lf, "eol-lf");
5769 defsymbol (&Qeol_crlf, "eol-crlf");
5771 defsymbol (&Qcharset_g0, "charset-g0");
5772 defsymbol (&Qcharset_g1, "charset-g1");
5773 defsymbol (&Qcharset_g2, "charset-g2");
5774 defsymbol (&Qcharset_g3, "charset-g3");
5775 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5776 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5777 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5778 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5779 defsymbol (&Qno_iso6429, "no-iso6429");
5780 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5781 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5783 defsymbol (&Qshort, "short");
5784 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5785 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5786 defsymbol (&Qseven, "seven");
5787 defsymbol (&Qlock_shift, "lock-shift");
5788 defsymbol (&Qescape_quoted, "escape-quoted");
5791 defsymbol (&Qdisable_composition, "disable-composition");
5793 defsymbol (&Qencode, "encode");
5794 defsymbol (&Qdecode, "decode");
5797 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5799 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5801 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5803 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5805 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5807 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5809 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5811 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5813 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5816 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5821 lstream_type_create_file_coding (void)
5823 LSTREAM_HAS_METHOD (decoding, reader);
5824 LSTREAM_HAS_METHOD (decoding, writer);
5825 LSTREAM_HAS_METHOD (decoding, rewinder);
5826 LSTREAM_HAS_METHOD (decoding, seekable_p);
5827 LSTREAM_HAS_METHOD (decoding, flusher);
5828 LSTREAM_HAS_METHOD (decoding, closer);
5829 LSTREAM_HAS_METHOD (decoding, marker);
5831 LSTREAM_HAS_METHOD (encoding, reader);
5832 LSTREAM_HAS_METHOD (encoding, writer);
5833 LSTREAM_HAS_METHOD (encoding, rewinder);
5834 LSTREAM_HAS_METHOD (encoding, seekable_p);
5835 LSTREAM_HAS_METHOD (encoding, flusher);
5836 LSTREAM_HAS_METHOD (encoding, closer);
5837 LSTREAM_HAS_METHOD (encoding, marker);
5841 vars_of_file_coding (void)
5845 fcd = xnew (struct file_coding_dump);
5846 dump_add_root_struct_ptr (&fcd, &fcd_description);
5848 /* Initialize to something reasonable ... */
5849 for (i = 0; i < CODING_CATEGORY_LAST; i++)
5851 fcd->coding_category_system[i] = Qnil;
5852 fcd->coding_category_by_priority[i] = i;
5855 Fprovide (intern ("file-coding"));
5857 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5858 Coding system used for TTY keyboard input.
5859 Not used under a windowing system.
5861 Vkeyboard_coding_system = Qnil;
5863 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5864 Coding system used for TTY display output.
5865 Not used under a windowing system.
5867 Vterminal_coding_system = Qnil;
5869 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5870 Overriding coding system used when reading from a file or process.
5871 You should bind this variable with `let', but do not set it globally.
5872 If this is non-nil, it specifies the coding system that will be used
5873 to decode input on read operations, such as from a file or process.
5874 It overrides `buffer-file-coding-system-for-read',
5875 `insert-file-contents-pre-hook', etc. Use those variables instead of
5876 this one for permanent changes to the environment. */ );
5877 Vcoding_system_for_read = Qnil;
5879 DEFVAR_LISP ("coding-system-for-write",
5880 &Vcoding_system_for_write /*
5881 Overriding coding system used when writing to a file or process.
5882 You should bind this variable with `let', but do not set it globally.
5883 If this is non-nil, it specifies the coding system that will be used
5884 to encode output for write operations, such as to a file or process.
5885 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5886 Use those variables instead of this one for permanent changes to the
5888 Vcoding_system_for_write = Qnil;
5890 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5891 Coding system used to convert pathnames when accessing files.
5893 Vfile_name_coding_system = Qnil;
5895 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5896 Non-nil means the buffer contents are regarded as multi-byte form
5897 of characters, not a binary code. This affects the display, file I/O,
5898 and behaviors of various editing commands.
5900 Setting this to nil does not do anything.
5902 enable_multibyte_characters = 1;
5906 complex_vars_of_file_coding (void)
5908 staticpro (&Vcoding_system_hash_table);
5909 Vcoding_system_hash_table =
5910 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5912 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5913 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5915 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5917 struct codesys_prop csp; \
5919 csp.prop_type = (Prop_Type); \
5920 Dynarr_add (the_codesys_prop_dynarr, csp); \
5923 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5924 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5925 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5926 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5927 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5928 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5929 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5931 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5932 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5933 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5934 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5935 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5936 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5937 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5938 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5939 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5940 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5941 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5942 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5943 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5944 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5945 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5946 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5947 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5949 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5950 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5952 /* Need to create this here or we're really screwed. */
5954 (Qraw_text, Qno_conversion,
5955 build_string ("Raw text, which means it converts only line-break-codes."),
5956 list2 (Qmnemonic, build_string ("Raw")));
5959 (Qbinary, Qno_conversion,
5960 build_string ("Binary, which means it does not convert anything."),
5961 list4 (Qeol_type, Qlf,
5962 Qmnemonic, build_string ("Binary")));
5967 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5968 list2 (Qmnemonic, build_string ("UTF8")));
5971 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5973 Fdefine_coding_system_alias (Qfile_name, Qbinary);
5975 Fdefine_coding_system_alias (Qterminal, Qbinary);
5976 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5978 /* Need this for bootstrapping */
5979 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5980 Fget_coding_system (Qraw_text);
5983 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5984 = Fget_coding_system (Qutf8);
5987 #if defined(MULE) && !defined(UTF2000)
5991 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5992 fcd->ucs_to_mule_table[i] = Qnil;
5994 staticpro (&mule_to_ucs_table);
5995 mule_to_ucs_table = Fmake_char_table(Qgeneric);
5996 #endif /* defined(MULE) && !defined(UTF2000) */