1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
39 #include "file-coding.h"
41 Lisp_Object Qcoding_system_error;
43 Lisp_Object Vkeyboard_coding_system;
44 Lisp_Object Vterminal_coding_system;
45 Lisp_Object Vcoding_system_for_read;
46 Lisp_Object Vcoding_system_for_write;
47 Lisp_Object Vfile_name_coding_system;
49 /* Table of symbols identifying each coding category. */
50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
54 struct file_coding_dump {
55 /* Coding system currently associated with each coding category. */
56 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
58 /* Table of all coding categories in decreasing order of priority.
59 This describes a permutation of the possible coding categories. */
60 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
62 #if defined(MULE) && !defined(UTF2000)
63 Lisp_Object ucs_to_mule_table[65536];
67 static const struct lrecord_description fcd_description_1[] = {
68 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
69 #if defined(MULE) && !defined(UTF2000)
70 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
75 static const struct struct_description fcd_description = {
76 sizeof (struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
105 Lisp_Object Qdisable_composition;
107 Lisp_Object Qencode, Qdecode;
109 Lisp_Object Vcoding_system_hash_table;
111 int enable_multibyte_characters;
114 /* Additional information used by the ISO2022 decoder and detector. */
115 struct iso2022_decoder
117 /* CHARSET holds the character sets currently assigned to the G0
118 through G3 variables. It is initialized from the array
119 INITIAL_CHARSET in CODESYS. */
120 Lisp_Object charset[4];
122 /* Which registers are currently invoked into the left (GL) and
123 right (GR) halves of the 8-bit encoding space? */
124 int register_left, register_right;
126 /* ISO_ESC holds a value indicating part of an escape sequence
127 that has already been seen. */
128 enum iso_esc_flag esc;
130 /* This records the bytes we've seen so far in an escape sequence,
131 in case the sequence is invalid (we spit out the bytes unchanged). */
132 unsigned char esc_bytes[8];
134 /* Index for next byte to store in ISO escape sequence. */
137 #ifdef ENABLE_COMPOSITE_CHARS
138 /* Stuff seen so far when composing a string. */
139 unsigned_char_dynarr *composite_chars;
142 /* If we saw an invalid designation sequence for a particular
143 register, we flag it here and switch to ASCII. The next time we
144 see a valid designation for this register, we turn off the flag
145 and do the designation normally, but pretend the sequence was
146 invalid. The effect of all this is that (most of the time) the
147 escape sequences for both the switch to the unknown charset, and
148 the switch back to the known charset, get inserted literally into
149 the buffer and saved out as such. The hope is that we can
150 preserve the escape sequences so that the resulting written out
151 file makes sense. If we don't do any of this, the designation
152 to the invalid charset will be preserved but that switch back
153 to the known charset will probably get eaten because it was
154 the same charset that was already present in the register. */
155 unsigned char invalid_designated[4];
157 /* We try to do similar things as above for direction-switching
158 sequences. If we encountered a direction switch while an
159 invalid designation was present, or an invalid designation
160 just after a direction switch (i.e. no valid designation
161 encountered yet), we insert the direction-switch escape
162 sequence literally into the output stream, and later on
163 insert the corresponding direction-restoring escape sequence
165 unsigned int switched_dir_and_no_valid_charset_yet :1;
166 unsigned int invalid_switch_dir :1;
168 /* Tells the decoder to output the escape sequence literally
169 even though it was valid. Used in the games we play to
170 avoid lossage when we encounter invalid designations. */
171 unsigned int output_literally :1;
172 /* We encountered a direction switch followed by an invalid
173 designation. We didn't output the direction switch
174 literally because we didn't know about the invalid designation;
175 but we have to do so now. */
176 unsigned int output_direction_sequence :1;
179 EXFUN (Fcopy_coding_system, 2);
181 struct detection_state;
184 text_encode_generic (Lstream *encoding, const Bufbyte *src,
185 unsigned_char_dynarr *dst, size_t n);
187 static int detect_coding_sjis (struct detection_state *st,
188 const Extbyte *src, size_t n);
189 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
190 unsigned_char_dynarr *dst, size_t n);
191 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
192 unsigned_char_dynarr *dst, unsigned int *flags);
193 void char_finish_shift_jis (struct encoding_stream *str,
194 unsigned_char_dynarr *dst, unsigned int *flags);
196 static int detect_coding_big5 (struct detection_state *st,
197 const Extbyte *src, size_t n);
198 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
199 unsigned_char_dynarr *dst, size_t n);
200 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
201 unsigned_char_dynarr *dst, size_t n);
203 static int detect_coding_ucs4 (struct detection_state *st,
204 const Extbyte *src, size_t n);
205 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
206 unsigned_char_dynarr *dst, size_t n);
207 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
208 unsigned_char_dynarr *dst, unsigned int *flags);
209 void char_finish_ucs4 (struct encoding_stream *str,
210 unsigned_char_dynarr *dst, unsigned int *flags);
212 static int detect_coding_utf8 (struct detection_state *st,
213 const Extbyte *src, size_t n);
214 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
215 unsigned_char_dynarr *dst, size_t n);
216 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
217 unsigned_char_dynarr *dst, unsigned int *flags);
218 void char_finish_utf8 (struct encoding_stream *str,
219 unsigned_char_dynarr *dst, unsigned int *flags);
221 static int postprocess_iso2022_mask (int mask);
222 static void reset_iso2022 (Lisp_Object coding_system,
223 struct iso2022_decoder *iso);
224 static int detect_coding_iso2022 (struct detection_state *st,
225 const Extbyte *src, size_t n);
226 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
227 unsigned_char_dynarr *dst, size_t n);
228 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
229 unsigned_char_dynarr *dst, unsigned int *flags);
230 void char_finish_iso2022 (struct encoding_stream *str,
231 unsigned_char_dynarr *dst, unsigned int *flags);
233 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
234 unsigned_char_dynarr *dst, size_t n);
235 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
236 unsigned_char_dynarr *dst, size_t n);
237 static void mule_decode (Lstream *decoding, const Extbyte *src,
238 unsigned_char_dynarr *dst, size_t n);
239 static void mule_encode (Lstream *encoding, const Bufbyte *src,
240 unsigned_char_dynarr *dst, size_t n);
242 typedef struct codesys_prop codesys_prop;
251 Dynarr_declare (codesys_prop);
252 } codesys_prop_dynarr;
254 static const struct lrecord_description codesys_prop_description_1[] = {
255 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
259 static const struct struct_description codesys_prop_description = {
260 sizeof (codesys_prop),
261 codesys_prop_description_1
264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
265 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
269 static const struct struct_description codesys_prop_dynarr_description = {
270 sizeof (codesys_prop_dynarr),
271 codesys_prop_dynarr_description_1
274 codesys_prop_dynarr *the_codesys_prop_dynarr;
276 enum codesys_prop_enum
279 CODESYS_PROP_ISO2022,
284 /************************************************************************/
285 /* Coding system functions */
286 /************************************************************************/
288 static Lisp_Object mark_coding_system (Lisp_Object);
289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
290 static void finalize_coding_system (void *header, int for_disksave);
293 static const struct lrecord_description ccs_description_1[] = {
294 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
295 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
299 static const struct struct_description ccs_description = {
300 sizeof (charset_conversion_spec),
304 static const struct lrecord_description ccsd_description_1[] = {
305 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
309 static const struct struct_description ccsd_description = {
310 sizeof (charset_conversion_spec_dynarr),
315 static const struct lrecord_description coding_system_description[] = {
316 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
317 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
318 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
319 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
320 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
321 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
325 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
326 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
327 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
335 mark_coding_system, print_coding_system,
336 finalize_coding_system,
337 0, 0, coding_system_description,
341 mark_coding_system (Lisp_Object obj)
343 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
345 mark_object (CODING_SYSTEM_NAME (codesys));
346 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
347 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
348 mark_object (CODING_SYSTEM_EOL_LF (codesys));
349 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
350 mark_object (CODING_SYSTEM_EOL_CR (codesys));
352 switch (CODING_SYSTEM_TYPE (codesys))
356 case CODESYS_ISO2022:
357 for (i = 0; i < 4; i++)
358 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
359 if (codesys->iso2022.input_conv)
361 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
363 struct charset_conversion_spec *ccs =
364 Dynarr_atp (codesys->iso2022.input_conv, i);
365 mark_object (ccs->from_charset);
366 mark_object (ccs->to_charset);
369 if (codesys->iso2022.output_conv)
371 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
373 struct charset_conversion_spec *ccs =
374 Dynarr_atp (codesys->iso2022.output_conv, i);
375 mark_object (ccs->from_charset);
376 mark_object (ccs->to_charset);
382 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
383 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
390 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
398 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
400 error ("printing unreadable object #<coding_system 0x%x>",
403 write_c_string ("#<coding_system ", printcharfun);
404 print_internal (c->name, printcharfun, 1);
405 write_c_string (">", printcharfun);
409 finalize_coding_system (void *header, int for_disksave)
411 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
412 /* Since coding systems never go away, this function is not
413 necessary. But it would be necessary if we changed things
414 so that coding systems could go away. */
415 if (!for_disksave) /* see comment in lstream.c */
417 switch (CODING_SYSTEM_TYPE (c))
420 case CODESYS_ISO2022:
421 if (c->iso2022.input_conv)
423 Dynarr_free (c->iso2022.input_conv);
424 c->iso2022.input_conv = 0;
426 if (c->iso2022.output_conv)
428 Dynarr_free (c->iso2022.output_conv);
429 c->iso2022.output_conv = 0;
440 symbol_to_eol_type (Lisp_Object symbol)
442 CHECK_SYMBOL (symbol);
443 if (NILP (symbol)) return EOL_AUTODETECT;
444 if (EQ (symbol, Qlf)) return EOL_LF;
445 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
446 if (EQ (symbol, Qcr)) return EOL_CR;
448 signal_simple_error ("Unrecognized eol type", symbol);
449 return EOL_AUTODETECT; /* not reached */
453 eol_type_to_symbol (eol_type_t type)
458 case EOL_LF: return Qlf;
459 case EOL_CRLF: return Qcrlf;
460 case EOL_CR: return Qcr;
461 case EOL_AUTODETECT: return Qnil;
466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
468 Lisp_Object codesys_obj;
469 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
470 char *codesys_name = (char *) alloca (len + 7);
472 char *codesys_mnemonic=0;
474 Lisp_Object codesys_name_sym, sub_codesys_obj;
478 XSETCODING_SYSTEM (codesys_obj, codesys);
480 memcpy (codesys_name,
481 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
483 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
485 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
486 codesys_mnemonic = (char *) alloca (mlen + 7);
487 memcpy (codesys_mnemonic,
488 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
492 strcpy (codesys_name + len, "-" op_sys); \
494 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
495 codesys_name_sym = intern (codesys_name); \
496 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
497 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
499 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
500 build_string (codesys_mnemonic); \
501 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
504 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
505 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
506 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
510 Return t if OBJECT is a coding system.
511 A coding system is an object that defines how text containing multiple
512 character sets is encoded into a stream of (typically 8-bit) bytes.
513 The coding system is used to decode the stream into a series of
514 characters (which may be from multiple charsets) when the text is read
515 from a file or process, and is used to encode the text back into the
516 same format when it is written out to a file or process.
518 For example, many ISO2022-compliant coding systems (such as Compound
519 Text, which is used for inter-client data under the X Window System)
520 use escape sequences to switch between different charsets -- Japanese
521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
523 `make-coding-system' for more information.
525 Coding systems are normally identified using a symbol, and the
526 symbol is accepted in place of the actual coding system object whenever
527 a coding system is called for. (This is similar to how faces work.)
531 return CODING_SYSTEMP (object) ? Qt : Qnil;
534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
535 Retrieve the coding system of the given name.
537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
538 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
539 If there is no such coding system, nil is returned. Otherwise the
540 associated coding system object is returned.
542 (coding_system_or_name))
544 if (NILP (coding_system_or_name))
545 coding_system_or_name = Qbinary;
546 else if (CODING_SYSTEMP (coding_system_or_name))
547 return coding_system_or_name;
549 CHECK_SYMBOL (coding_system_or_name);
553 coding_system_or_name =
554 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
556 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
557 return coding_system_or_name;
561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
562 Retrieve the coding system of the given name.
563 Same as `find-coding-system' except that if there is no such
564 coding system, an error is signaled instead of returning nil.
568 Lisp_Object coding_system = Ffind_coding_system (name);
570 if (NILP (coding_system))
571 signal_simple_error ("No such coding system", name);
572 return coding_system;
575 /* We store the coding systems in hash tables with the names as the key and the
576 actual coding system object as the value. Occasionally we need to use them
577 in a list format. These routines provide us with that. */
578 struct coding_system_list_closure
580 Lisp_Object *coding_system_list;
584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
585 void *coding_system_list_closure)
587 /* This function can GC */
588 struct coding_system_list_closure *cscl =
589 (struct coding_system_list_closure *) coding_system_list_closure;
590 Lisp_Object *coding_system_list = cscl->coding_system_list;
592 *coding_system_list = Fcons (key, *coding_system_list);
596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
597 Return a list of the names of all defined coding systems.
601 Lisp_Object coding_system_list = Qnil;
603 struct coding_system_list_closure coding_system_list_closure;
605 GCPRO1 (coding_system_list);
606 coding_system_list_closure.coding_system_list = &coding_system_list;
607 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
608 &coding_system_list_closure);
611 return coding_system_list;
614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
615 Return the name of the given coding system.
619 coding_system = Fget_coding_system (coding_system);
620 return XCODING_SYSTEM_NAME (coding_system);
623 static Lisp_Coding_System *
624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
626 Lisp_Coding_System *codesys =
627 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
629 zero_lcrecord (codesys);
630 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
631 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
632 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
633 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
634 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
635 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
636 CODING_SYSTEM_TYPE (codesys) = type;
637 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
639 if (type == CODESYS_ISO2022)
642 for (i = 0; i < 4; i++)
643 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
645 else if (type == CODESYS_CCL)
647 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
648 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
651 CODING_SYSTEM_NAME (codesys) = name;
657 /* Given a list of charset conversion specs as specified in a Lisp
658 program, parse it into STORE_HERE. */
661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
662 Lisp_Object spec_list)
666 EXTERNAL_LIST_LOOP (rest, spec_list)
668 Lisp_Object car = XCAR (rest);
669 Lisp_Object from, to;
670 struct charset_conversion_spec spec;
672 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
673 signal_simple_error ("Invalid charset conversion spec", car);
674 from = Fget_charset (XCAR (car));
675 to = Fget_charset (XCAR (XCDR (car)));
676 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
677 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
678 signal_simple_error_2
679 ("Attempted conversion between different charset types",
681 spec.from_charset = from;
682 spec.to_charset = to;
684 Dynarr_add (store_here, spec);
688 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
689 specs, return the equivalent as the Lisp programmer would see it.
691 If LOAD_HERE is 0, return Qnil. */
694 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
701 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
703 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
704 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
707 return Fnreverse (result);
712 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
713 Register symbol NAME as a coding system.
715 TYPE describes the conversion method used and should be one of
718 Automatic conversion. XEmacs attempts to detect the coding system
721 No conversion. Use this for binary files and such. On output,
722 graphic characters that are not in ASCII or Latin-1 will be
723 replaced by a ?. (For a no-conversion-encoded buffer, these
724 characters will only be present if you explicitly insert them.)
726 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
728 ISO 10646 UCS-4 encoding.
730 ISO 10646 UTF-8 encoding.
732 Any ISO2022-compliant encoding. Among other things, this includes
733 JIS (the Japanese encoding commonly used for e-mail), EUC (the
734 standard Unix encoding for Japanese and other languages), and
735 Compound Text (the encoding used in X11). You can specify more
736 specific information about the conversion with the PROPS argument.
738 Big5 (the encoding commonly used for Taiwanese).
740 The conversion is performed using a user-written pseudo-code
741 program. CCL (Code Conversion Language) is the name of this
744 Write out or read in the raw contents of the memory representing
745 the buffer's text. This is primarily useful for debugging
746 purposes, and is only enabled when XEmacs has been compiled with
747 DEBUG_XEMACS defined (via the --debug configure option).
748 WARNING: Reading in a file using 'internal conversion can result
749 in an internal inconsistency in the memory representing a
750 buffer's text, which will produce unpredictable results and may
751 cause XEmacs to crash. Under normal circumstances you should
752 never use 'internal conversion.
754 DOC-STRING is a string describing the coding system.
756 PROPS is a property list, describing the specific nature of the
757 character set. Recognized properties are:
760 String to be displayed in the modeline when this coding system is
764 End-of-line conversion to be used. It should be one of
767 Automatically detect the end-of-line type (LF, CRLF,
768 or CR). Also generate subsidiary coding systems named
769 `NAME-unix', `NAME-dos', and `NAME-mac', that are
770 identical to this coding system but have an EOL-TYPE
771 value of 'lf, 'crlf, and 'cr, respectively.
773 The end of a line is marked externally using ASCII LF.
774 Since this is also the way that XEmacs represents an
775 end-of-line internally, specifying this option results
776 in no end-of-line conversion. This is the standard
777 format for Unix text files.
779 The end of a line is marked externally using ASCII
780 CRLF. This is the standard format for MS-DOS text
783 The end of a line is marked externally using ASCII CR.
784 This is the standard format for Macintosh text files.
786 Automatically detect the end-of-line type but do not
787 generate subsidiary coding systems. (This value is
788 converted to nil when stored internally, and
789 `coding-system-property' will return nil.)
792 If non-nil, composition/decomposition for combining characters
795 'post-read-conversion
796 Function called after a file has been read in, to perform the
797 decoding. Called with two arguments, START and END, denoting
798 a region of the current buffer to be decoded.
800 'pre-write-conversion
801 Function called before a file is written out, to perform the
802 encoding. Called with two arguments, START and END, denoting
803 a region of the current buffer to be encoded.
806 The following additional properties are recognized if TYPE is 'iso2022:
812 The character set initially designated to the G0 - G3 registers.
813 The value should be one of
815 -- A charset object (designate that character set)
816 -- nil (do not ever use this register)
817 -- t (no character set is initially designated to
818 the register, but may be later on; this automatically
819 sets the corresponding `force-g*-on-output' property)
825 If non-nil, send an explicit designation sequence on output before
826 using the specified register.
829 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
830 "ESC $ B" on output in place of the full designation sequences
831 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
834 If non-nil, don't designate ASCII to G0 at each end of line on output.
835 Setting this to non-nil also suppresses other state-resetting that
836 normally happens at the end of a line.
839 If non-nil, don't designate ASCII to G0 before control chars on output.
842 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
846 If non-nil, use locking-shift (SO/SI) instead of single-shift
847 or designation by escape sequence.
850 If non-nil, don't use ISO6429's direction specification.
853 If non-nil, literal control characters that are the same as
854 the beginning of a recognized ISO2022 or ISO6429 escape sequence
855 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
856 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
857 so that they can be properly distinguished from an escape sequence.
858 (Note that doing this results in a non-portable encoding.) This
859 encoding flag is used for byte-compiled files. Note that ESC
860 is a good choice for a quoting character because there are no
861 escape sequences whose second byte is a character from the Control-0
862 or Control-1 character sets; this is explicitly disallowed by the
865 'input-charset-conversion
866 A list of conversion specifications, specifying conversion of
867 characters in one charset to another when decoding is performed.
868 Each specification is a list of two elements: the source charset,
869 and the destination charset.
871 'output-charset-conversion
872 A list of conversion specifications, specifying conversion of
873 characters in one charset to another when encoding is performed.
874 The form of each specification is the same as for
875 'input-charset-conversion.
878 The following additional properties are recognized (and required)
882 CCL program used for decoding (converting to internal format).
885 CCL program used for encoding (converting to external format).
887 (name, type, doc_string, props))
889 Lisp_Coding_System *codesys;
890 enum coding_system_type ty;
891 int need_to_setup_eol_systems = 1;
893 /* Convert type to constant */
894 if (NILP (type) || EQ (type, Qundecided))
895 { ty = CODESYS_AUTODETECT; }
897 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
898 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
899 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
900 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
901 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
902 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
904 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
906 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
909 signal_simple_error ("Invalid coding system type", type);
913 codesys = allocate_coding_system (ty, name);
915 if (NILP (doc_string))
916 doc_string = build_string ("");
918 CHECK_STRING (doc_string);
919 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
922 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
924 if (EQ (key, Qmnemonic))
927 CHECK_STRING (value);
928 CODING_SYSTEM_MNEMONIC (codesys) = value;
931 else if (EQ (key, Qeol_type))
933 need_to_setup_eol_systems = NILP (value);
936 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
939 else if (EQ (key, Qpost_read_conversion))
940 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
941 else if (EQ (key, Qpre_write_conversion))
942 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
944 else if (EQ (key, Qdisable_composition))
945 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
948 else if (ty == CODESYS_ISO2022)
950 #define FROB_INITIAL_CHARSET(charset_num) \
951 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
952 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
954 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
955 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
956 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
957 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
959 #define FROB_FORCE_CHARSET(charset_num) \
960 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
962 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
963 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
964 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
965 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
967 #define FROB_BOOLEAN_PROPERTY(prop) \
968 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
970 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
971 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
972 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
973 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
974 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
975 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
976 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
978 else if (EQ (key, Qinput_charset_conversion))
980 codesys->iso2022.input_conv =
981 Dynarr_new (charset_conversion_spec);
982 parse_charset_conversion_specs (codesys->iso2022.input_conv,
985 else if (EQ (key, Qoutput_charset_conversion))
987 codesys->iso2022.output_conv =
988 Dynarr_new (charset_conversion_spec);
989 parse_charset_conversion_specs (codesys->iso2022.output_conv,
993 signal_simple_error ("Unrecognized property", key);
995 else if (EQ (type, Qccl))
998 struct ccl_program test_ccl;
1001 /* Check key first. */
1002 if (EQ (key, Qdecode))
1003 suffix = "-ccl-decode";
1004 else if (EQ (key, Qencode))
1005 suffix = "-ccl-encode";
1007 signal_simple_error ("Unrecognized property", key);
1009 /* If value is vector, register it as a ccl program
1010 associated with an newly created symbol for
1011 backward compatibility. */
1012 if (VECTORP (value))
1014 sym = Fintern (concat2 (Fsymbol_name (name),
1015 build_string (suffix)),
1017 Fregister_ccl_program (sym, value);
1021 CHECK_SYMBOL (value);
1024 /* check if the given ccl programs are valid. */
1025 if (setup_ccl_program (&test_ccl, sym) < 0)
1026 signal_simple_error ("Invalid CCL program", value);
1028 if (EQ (key, Qdecode))
1029 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1030 else if (EQ (key, Qencode))
1031 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1036 signal_simple_error ("Unrecognized property", key);
1040 if (need_to_setup_eol_systems)
1041 setup_eol_coding_systems (codesys);
1044 Lisp_Object codesys_obj;
1045 XSETCODING_SYSTEM (codesys_obj, codesys);
1046 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1051 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1052 Copy OLD-CODING-SYSTEM to NEW-NAME.
1053 If NEW-NAME does not name an existing coding system, a new one will
1056 (old_coding_system, new_name))
1058 Lisp_Object new_coding_system;
1059 old_coding_system = Fget_coding_system (old_coding_system);
1060 new_coding_system = Ffind_coding_system (new_name);
1061 if (NILP (new_coding_system))
1063 XSETCODING_SYSTEM (new_coding_system,
1064 allocate_coding_system
1065 (XCODING_SYSTEM_TYPE (old_coding_system),
1067 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1071 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1072 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1073 memcpy (((char *) to ) + sizeof (to->header),
1074 ((char *) from) + sizeof (from->header),
1075 sizeof (*from) - sizeof (from->header));
1076 to->name = new_name;
1078 return new_coding_system;
1081 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1082 Return t if OBJECT names a coding system, and is not a coding system alias.
1086 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1090 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1091 Return t if OBJECT is a coding system alias.
1092 All coding system aliases are created by `define-coding-system-alias'.
1096 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1100 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1101 Return the coding-system symbol for which symbol ALIAS is an alias.
1105 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1106 if (SYMBOLP (aliasee))
1109 signal_simple_error ("Symbol is not a coding system alias", alias);
1110 return Qnil; /* To keep the compiler happy */
1114 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1116 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1120 /* A maphash function, for removing dangling coding system aliases. */
1122 dangling_coding_system_alias_p (Lisp_Object alias,
1123 Lisp_Object aliasee,
1124 void *dangling_aliases)
1126 if (SYMBOLP (aliasee)
1127 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1129 (*(int *) dangling_aliases)++;
1136 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1137 Define symbol ALIAS as an alias for coding system ALIASEE.
1139 You can use this function to redefine an alias that has already been defined,
1140 but you cannot redefine a name which is the canonical name for a coding system.
1141 \(a canonical name of a coding system is what is returned when you call
1142 `coding-system-name' on a coding system).
1144 ALIASEE itself can be an alias, which allows you to define nested aliases.
1146 You are forbidden, however, from creating alias loops or `dangling' aliases.
1147 These will be detected, and an error will be signaled if you attempt to do so.
1149 If ALIASEE is nil, then ALIAS will simply be undefined.
1151 See also `coding-system-alias-p', `coding-system-aliasee',
1152 and `coding-system-canonical-name-p'.
1156 Lisp_Object real_coding_system, probe;
1158 CHECK_SYMBOL (alias);
1160 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1162 ("Symbol is the canonical name of a coding system and cannot be redefined",
1167 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1168 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1169 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1171 Fremhash (alias, Vcoding_system_hash_table);
1173 /* Undefine subsidiary aliases,
1174 presumably created by a previous call to this function */
1175 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1176 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1177 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1179 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1180 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1181 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1184 /* Undefine dangling coding system aliases. */
1186 int dangling_aliases;
1189 dangling_aliases = 0;
1190 elisp_map_remhash (dangling_coding_system_alias_p,
1191 Vcoding_system_hash_table,
1193 } while (dangling_aliases > 0);
1199 if (CODING_SYSTEMP (aliasee))
1200 aliasee = XCODING_SYSTEM_NAME (aliasee);
1202 /* Checks that aliasee names a coding-system */
1203 real_coding_system = Fget_coding_system (aliasee);
1205 /* Check for coding system alias loops */
1206 if (EQ (alias, aliasee))
1207 alias_loop: signal_simple_error_2
1208 ("Attempt to create a coding system alias loop", alias, aliasee);
1210 for (probe = aliasee;
1212 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1214 if (EQ (probe, alias))
1218 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1220 /* Set up aliases for subsidiaries.
1221 #### There must be a better way to handle subsidiary coding systems. */
1223 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1225 for (i = 0; i < countof (suffixes); i++)
1227 Lisp_Object alias_subsidiary =
1228 append_suffix_to_symbol (alias, suffixes[i]);
1229 Lisp_Object aliasee_subsidiary =
1230 append_suffix_to_symbol (aliasee, suffixes[i]);
1232 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1233 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1236 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1237 but it doesn't look intentional, so I'd rather return something
1238 meaningful or nothing at all. */
1243 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1245 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1246 Lisp_Object new_coding_system;
1248 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1249 return coding_system;
1253 case EOL_AUTODETECT: return coding_system;
1254 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1255 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1256 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1257 default: abort (); return Qnil;
1260 return NILP (new_coding_system) ? coding_system : new_coding_system;
1263 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1264 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1266 (coding_system, eol_type))
1268 coding_system = Fget_coding_system (coding_system);
1270 return subsidiary_coding_system (coding_system,
1271 symbol_to_eol_type (eol_type));
1275 /************************************************************************/
1276 /* Coding system accessors */
1277 /************************************************************************/
1279 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1280 Return the doc string for CODING-SYSTEM.
1284 coding_system = Fget_coding_system (coding_system);
1285 return XCODING_SYSTEM_DOC_STRING (coding_system);
1288 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1289 Return the type of CODING-SYSTEM.
1293 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1296 case CODESYS_AUTODETECT: return Qundecided;
1298 case CODESYS_SHIFT_JIS: return Qshift_jis;
1299 case CODESYS_ISO2022: return Qiso2022;
1300 case CODESYS_BIG5: return Qbig5;
1301 case CODESYS_UCS4: return Qucs4;
1302 case CODESYS_UTF8: return Qutf8;
1303 case CODESYS_CCL: return Qccl;
1305 case CODESYS_NO_CONVERSION: return Qno_conversion;
1307 case CODESYS_INTERNAL: return Qinternal;
1314 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1317 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1319 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1322 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1323 Return initial charset of CODING-SYSTEM designated to GNUM.
1326 (coding_system, gnum))
1328 coding_system = Fget_coding_system (coding_system);
1331 return coding_system_charset (coding_system, XINT (gnum));
1335 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1336 Return the PROP property of CODING-SYSTEM.
1338 (coding_system, prop))
1341 enum coding_system_type type;
1343 coding_system = Fget_coding_system (coding_system);
1344 CHECK_SYMBOL (prop);
1345 type = XCODING_SYSTEM_TYPE (coding_system);
1347 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1348 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1351 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1353 case CODESYS_PROP_ALL_OK:
1356 case CODESYS_PROP_ISO2022:
1357 if (type != CODESYS_ISO2022)
1359 ("Property only valid in ISO2022 coding systems",
1363 case CODESYS_PROP_CCL:
1364 if (type != CODESYS_CCL)
1366 ("Property only valid in CCL coding systems",
1376 signal_simple_error ("Unrecognized property", prop);
1378 if (EQ (prop, Qname))
1379 return XCODING_SYSTEM_NAME (coding_system);
1380 else if (EQ (prop, Qtype))
1381 return Fcoding_system_type (coding_system);
1382 else if (EQ (prop, Qdoc_string))
1383 return XCODING_SYSTEM_DOC_STRING (coding_system);
1384 else if (EQ (prop, Qmnemonic))
1385 return XCODING_SYSTEM_MNEMONIC (coding_system);
1386 else if (EQ (prop, Qeol_type))
1387 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1388 else if (EQ (prop, Qeol_lf))
1389 return XCODING_SYSTEM_EOL_LF (coding_system);
1390 else if (EQ (prop, Qeol_crlf))
1391 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1392 else if (EQ (prop, Qeol_cr))
1393 return XCODING_SYSTEM_EOL_CR (coding_system);
1394 else if (EQ (prop, Qpost_read_conversion))
1395 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1396 else if (EQ (prop, Qpre_write_conversion))
1397 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1399 else if (type == CODESYS_ISO2022)
1401 if (EQ (prop, Qcharset_g0))
1402 return coding_system_charset (coding_system, 0);
1403 else if (EQ (prop, Qcharset_g1))
1404 return coding_system_charset (coding_system, 1);
1405 else if (EQ (prop, Qcharset_g2))
1406 return coding_system_charset (coding_system, 2);
1407 else if (EQ (prop, Qcharset_g3))
1408 return coding_system_charset (coding_system, 3);
1410 #define FORCE_CHARSET(charset_num) \
1411 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1412 (coding_system, charset_num) ? Qt : Qnil)
1414 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1415 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1416 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1417 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1419 #define LISP_BOOLEAN(prop) \
1420 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1422 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1423 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1424 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1425 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1426 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1427 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1428 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1430 else if (EQ (prop, Qinput_charset_conversion))
1432 unparse_charset_conversion_specs
1433 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1434 else if (EQ (prop, Qoutput_charset_conversion))
1436 unparse_charset_conversion_specs
1437 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1441 else if (type == CODESYS_CCL)
1443 if (EQ (prop, Qdecode))
1444 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1445 else if (EQ (prop, Qencode))
1446 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1454 return Qnil; /* not reached */
1458 /************************************************************************/
1459 /* Coding category functions */
1460 /************************************************************************/
1463 decode_coding_category (Lisp_Object symbol)
1467 CHECK_SYMBOL (symbol);
1468 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1469 if (EQ (coding_category_symbol[i], symbol))
1472 signal_simple_error ("Unrecognized coding category", symbol);
1473 return 0; /* not reached */
1476 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1477 Return a list of all recognized coding categories.
1482 Lisp_Object list = Qnil;
1484 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1485 list = Fcons (coding_category_symbol[i], list);
1489 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1490 Change the priority order of the coding categories.
1491 LIST should be list of coding categories, in descending order of
1492 priority. Unspecified coding categories will be lower in priority
1493 than all specified ones, in the same relative order they were in
1498 int category_to_priority[CODING_CATEGORY_LAST + 1];
1502 /* First generate a list that maps coding categories to priorities. */
1504 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1505 category_to_priority[i] = -1;
1507 /* Highest priority comes from the specified list. */
1509 EXTERNAL_LIST_LOOP (rest, list)
1511 int cat = decode_coding_category (XCAR (rest));
1513 if (category_to_priority[cat] >= 0)
1514 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1515 category_to_priority[cat] = i++;
1518 /* Now go through the existing categories by priority to retrieve
1519 the categories not yet specified and preserve their priority
1521 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1523 int cat = fcd->coding_category_by_priority[j];
1524 if (category_to_priority[cat] < 0)
1525 category_to_priority[cat] = i++;
1528 /* Now we need to construct the inverse of the mapping we just
1531 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1532 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1534 /* Phew! That was confusing. */
1538 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1539 Return a list of coding categories in descending order of priority.
1544 Lisp_Object list = Qnil;
1546 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1547 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1552 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1553 Change the coding system associated with a coding category.
1555 (coding_category, coding_system))
1557 int cat = decode_coding_category (coding_category);
1559 coding_system = Fget_coding_system (coding_system);
1560 fcd->coding_category_system[cat] = coding_system;
1564 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1565 Return the coding system associated with a coding category.
1569 int cat = decode_coding_category (coding_category);
1570 Lisp_Object sys = fcd->coding_category_system[cat];
1573 return XCODING_SYSTEM_NAME (sys);
1578 /************************************************************************/
1579 /* Detecting the encoding of data */
1580 /************************************************************************/
1582 struct detection_state
1584 eol_type_t eol_type;
1620 struct iso2022_decoder iso;
1622 int high_byte_count;
1623 unsigned int saw_single_shift:1;
1636 acceptable_control_char_p (int c)
1640 /* Allow and ignore control characters that you might
1641 reasonably see in a text file */
1646 case 8: /* backspace */
1647 case 11: /* vertical tab */
1648 case 12: /* form feed */
1649 case 26: /* MS-DOS C-z junk */
1650 case 31: /* '^_' -- for info */
1658 mask_has_at_most_one_bit_p (int mask)
1660 /* Perhaps the only thing useful you learn from intensive Microsoft
1661 technical interviews */
1662 return (mask & (mask - 1)) == 0;
1666 detect_eol_type (struct detection_state *st, const Extbyte *src,
1671 unsigned char c = *(unsigned char *)src++;
1674 if (st->eol.just_saw_cr)
1676 else if (st->eol.seen_anything)
1679 else if (st->eol.just_saw_cr)
1682 st->eol.just_saw_cr = 1;
1684 st->eol.just_saw_cr = 0;
1685 st->eol.seen_anything = 1;
1688 return EOL_AUTODETECT;
1691 /* Attempt to determine the encoding and EOL type of the given text.
1692 Before calling this function for the first type, you must initialize
1693 st->eol_type as appropriate and initialize st->mask to ~0.
1695 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1698 st->mask holds the determined coding category mask, or ~0 if only
1699 ASCII has been seen so far.
1703 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1704 is present in st->mask
1705 1 == definitive answers are here for both st->eol_type and st->mask
1709 detect_coding_type (struct detection_state *st, const Extbyte *src,
1710 size_t n, int just_do_eol)
1712 if (st->eol_type == EOL_AUTODETECT)
1713 st->eol_type = detect_eol_type (st, src, n);
1716 return st->eol_type != EOL_AUTODETECT;
1718 if (!st->seen_non_ascii)
1720 for (; n; n--, src++)
1722 unsigned char c = *(unsigned char *) src;
1723 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1725 st->seen_non_ascii = 1;
1727 st->shift_jis.mask = ~0;
1731 st->iso2022.mask = ~0;
1741 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1742 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1743 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1744 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1745 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1746 st->big5.mask = detect_coding_big5 (st, src, n);
1747 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1748 st->utf8.mask = detect_coding_utf8 (st, src, n);
1749 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1750 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1753 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1754 | st->utf8.mask | st->ucs4.mask;
1757 int retval = mask_has_at_most_one_bit_p (st->mask);
1758 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1759 return retval && st->eol_type != EOL_AUTODETECT;
1764 coding_system_from_mask (int mask)
1768 /* If the file was entirely or basically ASCII, use the
1769 default value of `buffer-file-coding-system'. */
1770 Lisp_Object retval =
1771 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1774 retval = Ffind_coding_system (retval);
1778 (Qbad_variable, Qwarning,
1779 "Invalid `default-buffer-file-coding-system', set to nil");
1780 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1784 retval = Fget_coding_system (Qraw_text);
1792 mask = postprocess_iso2022_mask (mask);
1794 /* Look through the coding categories by priority and find
1795 the first one that is allowed. */
1796 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1798 cat = fcd->coding_category_by_priority[i];
1799 if ((mask & (1 << cat)) &&
1800 !NILP (fcd->coding_category_system[cat]))
1804 return fcd->coding_category_system[cat];
1806 return Fget_coding_system (Qraw_text);
1810 /* Given a seekable read stream and potential coding system and EOL type
1811 as specified, do any autodetection that is called for. If the
1812 coding system and/or EOL type are not `autodetect', they will be left
1813 alone; but this function will never return an autodetect coding system
1816 This function does not automatically fetch subsidiary coding systems;
1817 that should be unnecessary with the explicit eol-type argument. */
1819 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1822 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1823 eol_type_t *eol_type_in_out)
1825 struct detection_state decst;
1827 if (*eol_type_in_out == EOL_AUTODETECT)
1828 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1831 decst.eol_type = *eol_type_in_out;
1834 /* If autodetection is called for, do it now. */
1835 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1836 || *eol_type_in_out == EOL_AUTODETECT)
1839 Lisp_Object coding_system = Qnil;
1841 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1844 /* Look for initial "-*-"; mode line prefix */
1846 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1851 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1853 Extbyte *local_vars_beg = p + 3;
1854 /* Look for final "-*-"; mode line suffix */
1855 for (p = local_vars_beg,
1856 scan_end = buf + nread - LENGTH ("-*-");
1861 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1863 Extbyte *suffix = p;
1864 /* Look for "coding:" */
1865 for (p = local_vars_beg,
1866 scan_end = suffix - LENGTH ("coding:?");
1869 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1870 && (p == local_vars_beg
1871 || (*(p-1) == ' ' ||
1877 p += LENGTH ("coding:");
1878 while (*p == ' ' || *p == '\t') p++;
1880 /* Get coding system name */
1881 save = *suffix; *suffix = '\0';
1882 /* Characters valid in a MIME charset name (rfc 1521),
1883 and in a Lisp symbol name. */
1884 n = strspn ( (char *) p,
1885 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1886 "abcdefghijklmnopqrstuvwxyz"
1892 save = p[n]; p[n] = '\0';
1894 Ffind_coding_system (intern ((char *) p));
1904 if (NILP (coding_system))
1907 if (detect_coding_type (&decst, buf, nread,
1908 XCODING_SYSTEM_TYPE (*codesys_in_out)
1909 != CODESYS_AUTODETECT))
1911 nread = Lstream_read (stream, buf, sizeof (buf));
1917 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1918 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1921 if (detect_coding_type (&decst, buf, nread, 1))
1923 nread = Lstream_read (stream, buf, sizeof (buf));
1929 *eol_type_in_out = decst.eol_type;
1930 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1932 if (NILP (coding_system))
1933 *codesys_in_out = coding_system_from_mask (decst.mask);
1935 *codesys_in_out = coding_system;
1939 /* If we absolutely can't determine the EOL type, just assume LF. */
1940 if (*eol_type_in_out == EOL_AUTODETECT)
1941 *eol_type_in_out = EOL_LF;
1943 Lstream_rewind (stream);
1946 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1947 Detect coding system of the text in the region between START and END.
1948 Return a list of possible coding systems ordered by priority.
1949 If only ASCII characters are found, return 'undecided or one of
1950 its subsidiary coding systems according to a detected end-of-line
1951 type. Optional arg BUFFER defaults to the current buffer.
1953 (start, end, buffer))
1955 Lisp_Object val = Qnil;
1956 struct buffer *buf = decode_buffer (buffer, 0);
1958 Lisp_Object instream, lb_instream;
1959 Lstream *istr, *lb_istr;
1960 struct detection_state decst;
1961 struct gcpro gcpro1, gcpro2;
1963 get_buffer_range_char (buf, start, end, &b, &e, 0);
1964 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1965 lb_istr = XLSTREAM (lb_instream);
1966 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1967 istr = XLSTREAM (instream);
1968 GCPRO2 (instream, lb_instream);
1970 decst.eol_type = EOL_AUTODETECT;
1974 Extbyte random_buffer[4096];
1975 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1979 if (detect_coding_type (&decst, random_buffer, nread, 0))
1983 if (decst.mask == ~0)
1984 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1992 decst.mask = postprocess_iso2022_mask (decst.mask);
1994 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1996 int sys = fcd->coding_category_by_priority[i];
1997 if (decst.mask & (1 << sys))
1999 Lisp_Object codesys = fcd->coding_category_system[sys];
2000 if (!NILP (codesys))
2001 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2002 val = Fcons (codesys, val);
2006 Lstream_close (istr);
2008 Lstream_delete (istr);
2009 Lstream_delete (lb_istr);
2014 /************************************************************************/
2015 /* Converting to internal Mule format ("decoding") */
2016 /************************************************************************/
2018 /* A decoding stream is a stream used for decoding text (i.e.
2019 converting from some external format to internal format).
2020 The decoding-stream object keeps track of the actual coding
2021 stream, the stream that is at the other end, and data that
2022 needs to be persistent across the lifetime of the stream. */
2024 /* Handle the EOL stuff related to just-read-in character C.
2025 EOL_TYPE is the EOL type of the coding stream.
2026 FLAGS is the current value of FLAGS in the coding stream, and may
2027 be modified by this macro. (The macro only looks at the
2028 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2029 bytes are to be written. You need to also define a local goto
2030 label "label_continue_loop" that is at the end of the main
2031 character-reading loop.
2033 If C is a CR character, then this macro handles it entirely and
2034 jumps to label_continue_loop. Otherwise, this macro does not add
2035 anything to DST, and continues normally. You should continue
2036 processing C normally after this macro. */
2038 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2042 if (eol_type == EOL_CR) \
2043 Dynarr_add (dst, '\n'); \
2044 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2045 Dynarr_add (dst, c); \
2047 flags |= CODING_STATE_CR; \
2048 goto label_continue_loop; \
2050 else if (flags & CODING_STATE_CR) \
2051 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2053 Dynarr_add (dst, '\r'); \
2054 flags &= ~CODING_STATE_CR; \
2058 /* C should be a binary character in the range 0 - 255; convert
2059 to internal format and add to Dynarr DST. */
2062 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2064 if (BYTE_ASCII_P (c)) \
2065 Dynarr_add (dst, c); \
2068 Dynarr_add (dst, (c >> 6) | 0xc0); \
2069 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2073 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2075 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2079 Dynarr_add (dst, c);
2081 else if ( c <= 0x7ff )
2083 Dynarr_add (dst, (c >> 6) | 0xc0);
2084 Dynarr_add (dst, (c & 0x3f) | 0x80);
2086 else if ( c <= 0xffff )
2088 Dynarr_add (dst, (c >> 12) | 0xe0);
2089 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2090 Dynarr_add (dst, (c & 0x3f) | 0x80);
2092 else if ( c <= 0x1fffff )
2094 Dynarr_add (dst, (c >> 18) | 0xf0);
2095 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2096 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2097 Dynarr_add (dst, (c & 0x3f) | 0x80);
2099 else if ( c <= 0x3ffffff )
2101 Dynarr_add (dst, (c >> 24) | 0xf8);
2102 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2103 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2104 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2105 Dynarr_add (dst, (c & 0x3f) | 0x80);
2109 Dynarr_add (dst, (c >> 30) | 0xfc);
2110 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2111 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2112 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2113 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2114 Dynarr_add (dst, (c & 0x3f) | 0x80);
2118 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2120 if (BYTE_ASCII_P (c)) \
2121 Dynarr_add (dst, c); \
2122 else if (BYTE_C1_P (c)) \
2124 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2125 Dynarr_add (dst, c + 0x20); \
2129 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2130 Dynarr_add (dst, c); \
2135 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2139 DECODE_ADD_BINARY_CHAR (ch, dst); \
2144 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2146 if (flags & CODING_STATE_END) \
2148 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2149 if (flags & CODING_STATE_CR) \
2150 Dynarr_add (dst, '\r'); \
2154 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2156 struct decoding_stream
2158 /* Coding system that governs the conversion. */
2159 Lisp_Coding_System *codesys;
2161 /* Stream that we read the encoded data from or
2162 write the decoded data to. */
2165 /* If we are reading, then we can return only a fixed amount of
2166 data, so if the conversion resulted in too much data, we store it
2167 here for retrieval the next time around. */
2168 unsigned_char_dynarr *runoff;
2170 /* FLAGS holds flags indicating the current state of the decoding.
2171 Some of these flags are dependent on the coding system. */
2174 /* CPOS holds a partially built-up code-point of character. */
2177 /* EOL_TYPE specifies the type of end-of-line conversion that
2178 currently applies. We need to keep this separate from the
2179 EOL type stored in CODESYS because the latter might indicate
2180 automatic EOL-type detection while the former will always
2181 indicate a particular EOL type. */
2182 eol_type_t eol_type;
2184 /* Additional ISO2022 information. We define the structure above
2185 because it's also needed by the detection routines. */
2186 struct iso2022_decoder iso2022;
2188 /* Additional information (the state of the running CCL program)
2189 used by the CCL decoder. */
2190 struct ccl_program ccl;
2192 /* counter for UTF-8 or UCS-4 */
2193 unsigned char counter;
2196 unsigned combined_char_count;
2197 Emchar combined_chars[16];
2198 Lisp_Object combining_table;
2200 struct detection_state decst;
2204 extern Lisp_Object Vcharacter_composition_table;
2207 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2209 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2213 for (i = 0; i < str->combined_char_count; i++)
2214 DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2215 str->combined_char_count = 0;
2216 str->combining_table = Qnil;
2219 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2220 unsigned_char_dynarr* dst);
2222 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2223 Emchar character, unsigned_char_dynarr* dst)
2225 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2226 DECODE_ADD_UCS_CHAR (character, dst);
2227 else if (!CHAR_ID_TABLE_P (str->combining_table))
2230 = get_char_id_table (character, Vcharacter_composition_table);
2233 DECODE_ADD_UCS_CHAR (character, dst);
2236 str->combined_chars[0] = character;
2237 str->combined_char_count = 1;
2238 str->combining_table = ret;
2244 = get_char_id_table (character, str->combining_table);
2248 Emchar char2 = XCHARVAL (ret);
2249 ret = get_char_id_table (char2, Vcharacter_composition_table);
2252 DECODE_ADD_UCS_CHAR (char2, dst);
2253 str->combined_char_count = 0;
2254 str->combining_table = Qnil;
2258 str->combined_chars[0] = char2;
2259 str->combined_char_count = 1;
2260 str->combining_table = ret;
2263 else if (CHAR_ID_TABLE_P (ret))
2265 str->combined_chars[str->combined_char_count++] = character;
2266 str->combining_table = ret;
2270 COMPOSE_FLUSH_CHARS (str, dst);
2271 DECODE_ADD_UCS_CHAR (character, dst);
2275 #else /* not UTF2000 */
2276 #define COMPOSE_FLUSH_CHARS(str, dst)
2277 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2278 #endif /* UTF2000 */
2280 static ssize_t decoding_reader (Lstream *stream,
2281 unsigned char *data, size_t size);
2282 static ssize_t decoding_writer (Lstream *stream,
2283 const unsigned char *data, size_t size);
2284 static int decoding_rewinder (Lstream *stream);
2285 static int decoding_seekable_p (Lstream *stream);
2286 static int decoding_flusher (Lstream *stream);
2287 static int decoding_closer (Lstream *stream);
2289 static Lisp_Object decoding_marker (Lisp_Object stream);
2291 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2292 sizeof (struct decoding_stream));
2295 decoding_marker (Lisp_Object stream)
2297 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2298 Lisp_Object str_obj;
2300 /* We do not need to mark the coding systems or charsets stored
2301 within the stream because they are stored in a global list
2302 and automatically marked. */
2304 XSETLSTREAM (str_obj, str);
2305 mark_object (str_obj);
2306 if (str->imp->marker)
2307 return (str->imp->marker) (str_obj);
2312 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2313 so we read data from the other end, decode it, and store it into DATA. */
2316 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2318 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2319 unsigned char *orig_data = data;
2321 int error_occurred = 0;
2323 /* We need to interface to mule_decode(), which expects to take some
2324 amount of data and store the result into a Dynarr. We have
2325 mule_decode() store into str->runoff, and take data from there
2328 /* We loop until we have enough data, reading chunks from the other
2329 end and decoding it. */
2332 /* Take data from the runoff if we can. Make sure to take at
2333 most SIZE bytes, and delete the data from the runoff. */
2334 if (Dynarr_length (str->runoff) > 0)
2336 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2337 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2338 Dynarr_delete_many (str->runoff, 0, chunk);
2344 break; /* No more room for data */
2346 if (str->flags & CODING_STATE_END)
2347 /* This means that on the previous iteration, we hit the EOF on
2348 the other end. We loop once more so that mule_decode() can
2349 output any final stuff it may be holding, or any "go back
2350 to a sane state" escape sequences. (This latter makes sense
2351 during encoding.) */
2354 /* Exhausted the runoff, so get some more. DATA has at least
2355 SIZE bytes left of storage in it, so it's OK to read directly
2356 into it. (We'll be overwriting above, after we've decoded it
2357 into the runoff.) */
2358 read_size = Lstream_read (str->other_end, data, size);
2365 /* There might be some more end data produced in the translation.
2366 See the comment above. */
2367 str->flags |= CODING_STATE_END;
2368 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2371 if (data - orig_data == 0)
2372 return error_occurred ? -1 : 0;
2374 return data - orig_data;
2378 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2380 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2383 /* Decode all our data into the runoff, and then attempt to write
2384 it all out to the other end. Remove whatever chunk we succeeded
2386 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2387 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2388 Dynarr_length (str->runoff));
2390 Dynarr_delete_many (str->runoff, 0, retval);
2391 /* Do NOT return retval. The return value indicates how much
2392 of the incoming data was written, not how many bytes were
2398 reset_decoding_stream (struct decoding_stream *str)
2401 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2403 Lisp_Object coding_system;
2404 XSETCODING_SYSTEM (coding_system, str->codesys);
2405 reset_iso2022 (coding_system, &str->iso2022);
2407 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2409 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2414 str->combined_char_count = 0;
2415 str->combining_table = Qnil;
2417 str->flags = str->cpos = 0;
2421 decoding_rewinder (Lstream *stream)
2423 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2424 reset_decoding_stream (str);
2425 Dynarr_reset (str->runoff);
2426 return Lstream_rewind (str->other_end);
2430 decoding_seekable_p (Lstream *stream)
2432 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2433 return Lstream_seekable_p (str->other_end);
2437 decoding_flusher (Lstream *stream)
2439 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2440 return Lstream_flush (str->other_end);
2444 decoding_closer (Lstream *stream)
2446 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2447 if (stream->flags & LSTREAM_FL_WRITE)
2449 str->flags |= CODING_STATE_END;
2450 decoding_writer (stream, 0, 0);
2452 Dynarr_free (str->runoff);
2454 #ifdef ENABLE_COMPOSITE_CHARS
2455 if (str->iso2022.composite_chars)
2456 Dynarr_free (str->iso2022.composite_chars);
2459 return Lstream_close (str->other_end);
2463 decoding_stream_coding_system (Lstream *stream)
2465 Lisp_Object coding_system;
2466 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2468 XSETCODING_SYSTEM (coding_system, str->codesys);
2469 return subsidiary_coding_system (coding_system, str->eol_type);
2473 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2475 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2476 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2478 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2479 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2480 reset_decoding_stream (str);
2483 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2484 stream for writing, no automatic code detection will be performed.
2485 The reason for this is that automatic code detection requires a
2486 seekable input. Things will also fail if you open a decoding
2487 stream for reading using a non-fully-specified coding system and
2488 a non-seekable input stream. */
2491 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2494 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2495 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2499 str->other_end = stream;
2500 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2501 str->eol_type = EOL_AUTODETECT;
2502 if (!strcmp (mode, "r")
2503 && Lstream_seekable_p (stream))
2504 /* We can determine the coding system now. */
2505 determine_real_coding_system (stream, &codesys, &str->eol_type);
2506 set_decoding_stream_coding_system (lstr, codesys);
2507 str->decst.eol_type = str->eol_type;
2508 str->decst.mask = ~0;
2509 XSETLSTREAM (obj, lstr);
2514 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2516 return make_decoding_stream_1 (stream, codesys, "r");
2520 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2522 return make_decoding_stream_1 (stream, codesys, "w");
2525 /* Note: the decode_coding_* functions all take the same
2526 arguments as mule_decode(), which is to say some SRC data of
2527 size N, which is to be stored into dynamic array DST.
2528 DECODING is the stream within which the decoding is
2529 taking place, but no data is actually read from or
2530 written to that stream; that is handled in decoding_reader()
2531 or decoding_writer(). This allows the same functions to
2532 be used for both reading and writing. */
2535 mule_decode (Lstream *decoding, const Extbyte *src,
2536 unsigned_char_dynarr *dst, size_t n)
2538 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2540 /* If necessary, do encoding-detection now. We do this when
2541 we're a writing stream or a non-seekable reading stream,
2542 meaning that we can't just process the whole input,
2543 rewind, and start over. */
2545 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2546 str->eol_type == EOL_AUTODETECT)
2548 Lisp_Object codesys;
2550 XSETCODING_SYSTEM (codesys, str->codesys);
2551 detect_coding_type (&str->decst, src, n,
2552 CODING_SYSTEM_TYPE (str->codesys) !=
2553 CODESYS_AUTODETECT);
2554 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2555 str->decst.mask != ~0)
2556 /* #### This is cheesy. What we really ought to do is
2557 buffer up a certain amount of data so as to get a
2558 less random result. */
2559 codesys = coding_system_from_mask (str->decst.mask);
2560 str->eol_type = str->decst.eol_type;
2561 if (XCODING_SYSTEM (codesys) != str->codesys)
2563 /* Preserve the CODING_STATE_END flag in case it was set.
2564 If we erase it, bad things might happen. */
2565 int was_end = str->flags & CODING_STATE_END;
2566 set_decoding_stream_coding_system (decoding, codesys);
2568 str->flags |= CODING_STATE_END;
2572 switch (CODING_SYSTEM_TYPE (str->codesys))
2575 case CODESYS_INTERNAL:
2576 Dynarr_add_many (dst, src, n);
2579 case CODESYS_AUTODETECT:
2580 /* If we got this far and still haven't decided on the coding
2581 system, then do no conversion. */
2582 case CODESYS_NO_CONVERSION:
2583 decode_coding_no_conversion (decoding, src, dst, n);
2586 case CODESYS_SHIFT_JIS:
2587 decode_coding_sjis (decoding, src, dst, n);
2590 decode_coding_big5 (decoding, src, dst, n);
2593 decode_coding_ucs4 (decoding, src, dst, n);
2596 decode_coding_utf8 (decoding, src, dst, n);
2599 str->ccl.last_block = str->flags & CODING_STATE_END;
2600 /* When applying ccl program to stream, MUST NOT set NULL
2602 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2603 dst, n, 0, CCL_MODE_DECODING);
2605 case CODESYS_ISO2022:
2606 decode_coding_iso2022 (decoding, src, dst, n);
2614 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2615 Decode the text between START and END which is encoded in CODING-SYSTEM.
2616 This is useful if you've read in encoded text from a file without decoding
2617 it (e.g. you read in a JIS-formatted file but used the `binary' or
2618 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2619 Return length of decoded text.
2620 BUFFER defaults to the current buffer if unspecified.
2622 (start, end, coding_system, buffer))
2625 struct buffer *buf = decode_buffer (buffer, 0);
2626 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2627 Lstream *istr, *ostr;
2628 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2630 get_buffer_range_char (buf, start, end, &b, &e, 0);
2632 barf_if_buffer_read_only (buf, b, e);
2634 coding_system = Fget_coding_system (coding_system);
2635 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2636 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2637 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2639 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2640 Fget_coding_system (Qbinary));
2641 istr = XLSTREAM (instream);
2642 ostr = XLSTREAM (outstream);
2643 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2645 /* The chain of streams looks like this:
2647 [BUFFER] <----- send through
2648 ------> [ENCODE AS BINARY]
2649 ------> [DECODE AS SPECIFIED]
2655 char tempbuf[1024]; /* some random amount */
2656 Bufpos newpos, even_newer_pos;
2657 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2658 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2662 newpos = lisp_buffer_stream_startpos (istr);
2663 Lstream_write (ostr, tempbuf, size_in_bytes);
2664 even_newer_pos = lisp_buffer_stream_startpos (istr);
2665 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2668 Lstream_close (istr);
2669 Lstream_close (ostr);
2671 Lstream_delete (istr);
2672 Lstream_delete (ostr);
2673 Lstream_delete (XLSTREAM (de_outstream));
2674 Lstream_delete (XLSTREAM (lb_outstream));
2679 /************************************************************************/
2680 /* Converting to an external encoding ("encoding") */
2681 /************************************************************************/
2683 /* An encoding stream is an output stream. When you create the
2684 stream, you specify the coding system that governs the encoding
2685 and another stream that the resulting encoded data is to be
2686 sent to, and then start sending data to it. */
2688 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2690 struct encoding_stream
2692 /* Coding system that governs the conversion. */
2693 Lisp_Coding_System *codesys;
2695 /* Stream that we read the encoded data from or
2696 write the decoded data to. */
2699 /* If we are reading, then we can return only a fixed amount of
2700 data, so if the conversion resulted in too much data, we store it
2701 here for retrieval the next time around. */
2702 unsigned_char_dynarr *runoff;
2704 /* FLAGS holds flags indicating the current state of the encoding.
2705 Some of these flags are dependent on the coding system. */
2708 /* CH holds a partially built-up character. Since we only deal
2709 with one- and two-byte characters at the moment, we only use
2710 this to store the first byte of a two-byte character. */
2713 /* Additional information used by the ISO2022 encoder. */
2716 /* CHARSET holds the character sets currently assigned to the G0
2717 through G3 registers. It is initialized from the array
2718 INITIAL_CHARSET in CODESYS. */
2719 Lisp_Object charset[4];
2721 /* Which registers are currently invoked into the left (GL) and
2722 right (GR) halves of the 8-bit encoding space? */
2723 int register_left, register_right;
2725 /* Whether we need to explicitly designate the charset in the
2726 G? register before using it. It is initialized from the
2727 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2728 unsigned char force_charset_on_output[4];
2730 /* Other state variables that need to be preserved across
2732 Lisp_Object current_charset;
2734 int current_char_boundary;
2737 void (*encode_char) (struct encoding_stream *str, Emchar c,
2738 unsigned_char_dynarr *dst, unsigned int *flags);
2739 void (*finish) (struct encoding_stream *str,
2740 unsigned_char_dynarr *dst, unsigned int *flags);
2742 /* Additional information (the state of the running CCL program)
2743 used by the CCL encoder. */
2744 struct ccl_program ccl;
2748 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2749 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2751 static int encoding_rewinder (Lstream *stream);
2752 static int encoding_seekable_p (Lstream *stream);
2753 static int encoding_flusher (Lstream *stream);
2754 static int encoding_closer (Lstream *stream);
2756 static Lisp_Object encoding_marker (Lisp_Object stream);
2758 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2759 sizeof (struct encoding_stream));
2762 encoding_marker (Lisp_Object stream)
2764 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2765 Lisp_Object str_obj;
2767 /* We do not need to mark the coding systems or charsets stored
2768 within the stream because they are stored in a global list
2769 and automatically marked. */
2771 XSETLSTREAM (str_obj, str);
2772 mark_object (str_obj);
2773 if (str->imp->marker)
2774 return (str->imp->marker) (str_obj);
2779 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2780 so we read data from the other end, encode it, and store it into DATA. */
2783 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2785 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2786 unsigned char *orig_data = data;
2788 int error_occurred = 0;
2790 /* We need to interface to mule_encode(), which expects to take some
2791 amount of data and store the result into a Dynarr. We have
2792 mule_encode() store into str->runoff, and take data from there
2795 /* We loop until we have enough data, reading chunks from the other
2796 end and encoding it. */
2799 /* Take data from the runoff if we can. Make sure to take at
2800 most SIZE bytes, and delete the data from the runoff. */
2801 if (Dynarr_length (str->runoff) > 0)
2803 int chunk = min ((int) size, Dynarr_length (str->runoff));
2804 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2805 Dynarr_delete_many (str->runoff, 0, chunk);
2811 break; /* No more room for data */
2813 if (str->flags & CODING_STATE_END)
2814 /* This means that on the previous iteration, we hit the EOF on
2815 the other end. We loop once more so that mule_encode() can
2816 output any final stuff it may be holding, or any "go back
2817 to a sane state" escape sequences. (This latter makes sense
2818 during encoding.) */
2821 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2822 left of storage in it, so it's OK to read directly into it.
2823 (We'll be overwriting above, after we've encoded it into the
2825 read_size = Lstream_read (str->other_end, data, size);
2832 /* There might be some more end data produced in the translation.
2833 See the comment above. */
2834 str->flags |= CODING_STATE_END;
2835 mule_encode (stream, data, str->runoff, read_size);
2838 if (data == orig_data)
2839 return error_occurred ? -1 : 0;
2841 return data - orig_data;
2845 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2847 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2850 /* Encode all our data into the runoff, and then attempt to write
2851 it all out to the other end. Remove whatever chunk we succeeded
2853 mule_encode (stream, data, str->runoff, size);
2854 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2855 Dynarr_length (str->runoff));
2857 Dynarr_delete_many (str->runoff, 0, retval);
2858 /* Do NOT return retval. The return value indicates how much
2859 of the incoming data was written, not how many bytes were
2865 reset_encoding_stream (struct encoding_stream *str)
2868 switch (CODING_SYSTEM_TYPE (str->codesys))
2870 case CODESYS_ISO2022:
2874 str->encode_char = &char_encode_iso2022;
2875 str->finish = &char_finish_iso2022;
2876 for (i = 0; i < 4; i++)
2878 str->iso2022.charset[i] =
2879 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2880 str->iso2022.force_charset_on_output[i] =
2881 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2883 str->iso2022.register_left = 0;
2884 str->iso2022.register_right = 1;
2885 str->iso2022.current_charset = Qnil;
2886 str->iso2022.current_half = 0;
2890 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2893 str->encode_char = &char_encode_utf8;
2894 str->finish = &char_finish_utf8;
2897 str->encode_char = &char_encode_ucs4;
2898 str->finish = &char_finish_ucs4;
2900 case CODESYS_SHIFT_JIS:
2901 str->encode_char = &char_encode_shift_jis;
2902 str->finish = &char_finish_shift_jis;
2908 str->iso2022.current_char_boundary = 0;
2909 str->flags = str->ch = 0;
2913 encoding_rewinder (Lstream *stream)
2915 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2916 reset_encoding_stream (str);
2917 Dynarr_reset (str->runoff);
2918 return Lstream_rewind (str->other_end);
2922 encoding_seekable_p (Lstream *stream)
2924 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2925 return Lstream_seekable_p (str->other_end);
2929 encoding_flusher (Lstream *stream)
2931 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2932 return Lstream_flush (str->other_end);
2936 encoding_closer (Lstream *stream)
2938 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2939 if (stream->flags & LSTREAM_FL_WRITE)
2941 str->flags |= CODING_STATE_END;
2942 encoding_writer (stream, 0, 0);
2944 Dynarr_free (str->runoff);
2945 return Lstream_close (str->other_end);
2949 encoding_stream_coding_system (Lstream *stream)
2951 Lisp_Object coding_system;
2952 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2954 XSETCODING_SYSTEM (coding_system, str->codesys);
2955 return coding_system;
2959 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2961 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2962 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2964 reset_encoding_stream (str);
2968 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2971 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2972 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2976 str->runoff = Dynarr_new (unsigned_char);
2977 str->other_end = stream;
2978 set_encoding_stream_coding_system (lstr, codesys);
2979 XSETLSTREAM (obj, lstr);
2984 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2986 return make_encoding_stream_1 (stream, codesys, "r");
2990 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2992 return make_encoding_stream_1 (stream, codesys, "w");
2995 /* Convert N bytes of internally-formatted data stored in SRC to an
2996 external format, according to the encoding stream ENCODING.
2997 Store the encoded data into DST. */
3000 mule_encode (Lstream *encoding, const Bufbyte *src,
3001 unsigned_char_dynarr *dst, size_t n)
3003 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3005 switch (CODING_SYSTEM_TYPE (str->codesys))
3008 case CODESYS_INTERNAL:
3009 Dynarr_add_many (dst, src, n);
3012 case CODESYS_AUTODETECT:
3013 /* If we got this far and still haven't decided on the coding
3014 system, then do no conversion. */
3015 case CODESYS_NO_CONVERSION:
3016 encode_coding_no_conversion (encoding, src, dst, n);
3020 encode_coding_big5 (encoding, src, dst, n);
3023 str->ccl.last_block = str->flags & CODING_STATE_END;
3024 /* When applying ccl program to stream, MUST NOT set NULL
3026 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3027 dst, n, 0, CCL_MODE_ENCODING);
3031 text_encode_generic (encoding, src, dst, n);
3035 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3036 Encode the text between START and END using CODING-SYSTEM.
3037 This will, for example, convert Japanese characters into stuff such as
3038 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3039 text. BUFFER defaults to the current buffer if unspecified.
3041 (start, end, coding_system, buffer))
3044 struct buffer *buf = decode_buffer (buffer, 0);
3045 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3046 Lstream *istr, *ostr;
3047 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3049 get_buffer_range_char (buf, start, end, &b, &e, 0);
3051 barf_if_buffer_read_only (buf, b, e);
3053 coding_system = Fget_coding_system (coding_system);
3054 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3055 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3056 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3057 Fget_coding_system (Qbinary));
3058 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3060 istr = XLSTREAM (instream);
3061 ostr = XLSTREAM (outstream);
3062 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3063 /* The chain of streams looks like this:
3065 [BUFFER] <----- send through
3066 ------> [ENCODE AS SPECIFIED]
3067 ------> [DECODE AS BINARY]
3072 char tempbuf[1024]; /* some random amount */
3073 Bufpos newpos, even_newer_pos;
3074 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3075 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3079 newpos = lisp_buffer_stream_startpos (istr);
3080 Lstream_write (ostr, tempbuf, size_in_bytes);
3081 even_newer_pos = lisp_buffer_stream_startpos (istr);
3082 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3088 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3089 Lstream_close (istr);
3090 Lstream_close (ostr);
3092 Lstream_delete (istr);
3093 Lstream_delete (ostr);
3094 Lstream_delete (XLSTREAM (de_outstream));
3095 Lstream_delete (XLSTREAM (lb_outstream));
3096 return make_int (retlen);
3103 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3104 unsigned_char_dynarr *dst, size_t n)
3107 unsigned char char_boundary;
3108 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3109 unsigned int flags = str->flags;
3110 Emchar ch = str->ch;
3112 char_boundary = str->iso2022.current_char_boundary;
3118 if (char_boundary == 0)
3146 (*str->encode_char) (str, c, dst, &flags);
3148 else if (char_boundary == 1)
3150 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3156 ch = (ch << 6) | (c & 0x3f);
3161 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3163 (*str->finish) (str, dst, &flags);
3168 str->iso2022.current_char_boundary = char_boundary;
3172 /************************************************************************/
3173 /* Shift-JIS methods */
3174 /************************************************************************/
3176 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3177 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3178 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3179 encoded by "position-code + 0x80". A character of JISX0208
3180 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3181 position-codes are divided and shifted so that it fit in the range
3184 --- CODE RANGE of Shift-JIS ---
3185 (character set) (range)
3187 JISX0201-Kana 0xA0 .. 0xDF
3188 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3189 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3190 -------------------------------
3194 /* Is this the first byte of a Shift-JIS two-byte char? */
3196 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3197 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3199 /* Is this the second byte of a Shift-JIS two-byte char? */
3201 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3202 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3204 #define BYTE_SJIS_KATAKANA_P(c) \
3205 ((c) >= 0xA1 && (c) <= 0xDF)
3208 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3212 unsigned char c = *(unsigned char *)src++;
3213 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3215 if (st->shift_jis.in_second_byte)
3217 st->shift_jis.in_second_byte = 0;
3221 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3222 st->shift_jis.in_second_byte = 1;
3224 return CODING_CATEGORY_SHIFT_JIS_MASK;
3227 /* Convert Shift-JIS data to internal format. */
3230 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3231 unsigned_char_dynarr *dst, size_t n)
3233 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3234 unsigned int flags = str->flags;
3235 unsigned int cpos = str->cpos;
3236 eol_type_t eol_type = str->eol_type;
3240 unsigned char c = *(unsigned char *)src++;
3244 /* Previous character was first byte of Shift-JIS Kanji char. */
3245 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3247 unsigned char e1, e2;
3249 DECODE_SJIS (cpos, c, e1, e2);
3251 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3255 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3256 Dynarr_add (dst, e1);
3257 Dynarr_add (dst, e2);
3262 DECODE_ADD_BINARY_CHAR (cpos, dst);
3263 DECODE_ADD_BINARY_CHAR (c, dst);
3269 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3270 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3272 else if (BYTE_SJIS_KATAKANA_P (c))
3275 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3278 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3279 Dynarr_add (dst, c);
3284 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3288 DECODE_ADD_BINARY_CHAR (c, dst);
3290 label_continue_loop:;
3293 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3299 /* Convert internal character representation to Shift_JIS. */
3302 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3303 unsigned_char_dynarr *dst, unsigned int *flags)
3305 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3309 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3310 Dynarr_add (dst, '\r');
3311 if (eol_type != EOL_CR)
3312 Dynarr_add (dst, ch);
3316 unsigned int s1, s2;
3318 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3320 if (code_point >= 0)
3321 Dynarr_add (dst, code_point);
3322 else if ((code_point
3323 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3326 ENCODE_SJIS ((code_point >> 8) | 0x80,
3327 (code_point & 0xFF) | 0x80, s1, s2);
3328 Dynarr_add (dst, s1);
3329 Dynarr_add (dst, s2);
3331 else if ((code_point
3332 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3334 Dynarr_add (dst, code_point | 0x80);
3335 else if ((code_point
3336 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3339 ENCODE_SJIS ((code_point >> 8) | 0x80,
3340 (code_point & 0xFF) | 0x80, s1, s2);
3341 Dynarr_add (dst, s1);
3342 Dynarr_add (dst, s2);
3344 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3346 Dynarr_add (dst, code_point);
3348 Dynarr_add (dst, '?');
3350 Lisp_Object charset;
3351 unsigned int c1, c2;
3353 BREAKUP_CHAR (ch, charset, c1, c2);
3355 if (EQ(charset, Vcharset_katakana_jisx0201))
3357 Dynarr_add (dst, c1 | 0x80);
3361 Dynarr_add (dst, c1);
3363 else if (EQ(charset, Vcharset_japanese_jisx0208))
3365 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3366 Dynarr_add (dst, s1);
3367 Dynarr_add (dst, s2);
3370 Dynarr_add (dst, '?');
3376 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3377 unsigned int *flags)
3381 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3382 Decode a JISX0208 character of Shift-JIS coding-system.
3383 CODE is the character code in Shift-JIS as a cons of type bytes.
3384 Return the corresponding character.
3388 unsigned char c1, c2, s1, s2;
3391 CHECK_INT (XCAR (code));
3392 CHECK_INT (XCDR (code));
3393 s1 = XINT (XCAR (code));
3394 s2 = XINT (XCDR (code));
3395 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3396 BYTE_SJIS_TWO_BYTE_2_P (s2))
3398 DECODE_SJIS (s1, s2, c1, c2);
3399 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3400 c1 & 0x7F, c2 & 0x7F));
3406 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3407 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3408 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3412 Lisp_Object charset;
3415 CHECK_CHAR_COERCE_INT (character);
3416 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3417 if (EQ (charset, Vcharset_japanese_jisx0208))
3419 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3420 return Fcons (make_int (s1), make_int (s2));
3427 /************************************************************************/
3429 /************************************************************************/
3431 /* BIG5 is a coding system encoding two character sets: ASCII and
3432 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3433 character set and is encoded in two-byte.
3435 --- CODE RANGE of BIG5 ---
3436 (character set) (range)
3438 Big5 (1st byte) 0xA1 .. 0xFE
3439 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3440 --------------------------
3442 Since the number of characters in Big5 is larger than maximum
3443 characters in Emacs' charset (96x96), it can't be handled as one
3444 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3445 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3446 contains frequently used characters and the latter contains less
3447 frequently used characters. */
3450 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3451 ((c) >= 0x81 && (c) <= 0xFE)
3453 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3454 ((c) >= 0xA1 && (c) <= 0xFE)
3457 /* Is this the second byte of a Shift-JIS two-byte char? */
3459 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3460 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3462 /* Number of Big5 characters which have the same code in 1st byte. */
3464 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3466 /* Code conversion macros. These are macros because they are used in
3467 inner loops during code conversion.
3469 Note that temporary variables in macros introduce the classic
3470 dynamic-scoping problems with variable names. We use capital-
3471 lettered variables in the assumption that XEmacs does not use
3472 capital letters in variables except in a very formalized way
3475 /* Convert Big5 code (b1, b2) into its internal string representation
3478 /* There is a much simpler way to split the Big5 charset into two.
3479 For the moment I'm going to leave the algorithm as-is because it
3480 claims to separate out the most-used characters into a single
3481 charset, which perhaps will lead to optimizations in various
3484 The way the algorithm works is something like this:
3486 Big5 can be viewed as a 94x157 charset, where the row is
3487 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3488 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3489 the split between low and high column numbers is apparently
3490 meaningless; ascending rows produce less and less frequent chars.
3491 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3492 the first charset, and the upper half (0xC9 .. 0xFE) to the
3493 second. To do the conversion, we convert the character into
3494 a single number where 0 .. 156 is the first row, 157 .. 313
3495 is the second, etc. That way, the characters are ordered by
3496 decreasing frequency. Then we just chop the space in two
3497 and coerce the result into a 94x94 space.
3500 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3502 int B1 = b1, B2 = b2; \
3504 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3508 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3512 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3513 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3515 c1 = I / (0xFF - 0xA1) + 0xA1; \
3516 c2 = I % (0xFF - 0xA1) + 0xA1; \
3519 /* Convert the internal string representation of a Big5 character
3520 (lb, c1, c2) into Big5 code (b1, b2). */
3522 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3524 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3526 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3528 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3530 b1 = I / BIG5_SAME_ROW + 0xA1; \
3531 b2 = I % BIG5_SAME_ROW; \
3532 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3536 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3540 unsigned char c = *(unsigned char *)src++;
3541 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3543 || (c >= 0x80 && c <= 0xA0)
3547 if (st->big5.in_second_byte)
3549 st->big5.in_second_byte = 0;
3550 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3560 st->big5.in_second_byte = 1;
3562 return CODING_CATEGORY_BIG5_MASK;
3565 /* Convert Big5 data to internal format. */
3568 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3569 unsigned_char_dynarr *dst, size_t n)
3571 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3572 unsigned int flags = str->flags;
3573 unsigned int cpos = str->cpos;
3574 eol_type_t eol_type = str->eol_type;
3578 unsigned char c = *(unsigned char *)src++;
3581 /* Previous character was first byte of Big5 char. */
3582 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3586 (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c),
3589 unsigned char b1, b2, b3;
3590 DECODE_BIG5 (cpos, c, b1, b2, b3);
3591 Dynarr_add (dst, b1);
3592 Dynarr_add (dst, b2);
3593 Dynarr_add (dst, b3);
3598 DECODE_ADD_BINARY_CHAR (cpos, dst);
3599 DECODE_ADD_BINARY_CHAR (c, dst);
3605 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3606 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3609 DECODE_ADD_BINARY_CHAR (c, dst);
3611 label_continue_loop:;
3614 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3620 /* Convert internally-formatted data to Big5. */
3623 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3624 unsigned_char_dynarr *dst, size_t n)
3628 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3629 unsigned int flags = str->flags;
3630 unsigned int ch = str->ch;
3631 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3638 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3639 Dynarr_add (dst, '\r');
3640 if (eol_type != EOL_CR)
3641 Dynarr_add (dst, '\n');
3643 else if (BYTE_ASCII_P (c))
3646 Dynarr_add (dst, c);
3648 else if (BUFBYTE_LEADING_BYTE_P (c))
3650 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3651 c == LEADING_BYTE_CHINESE_BIG5_2)
3653 /* A recognized leading byte. */
3655 continue; /* not done with this character. */
3657 /* otherwise just ignore this character. */
3659 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3660 ch == LEADING_BYTE_CHINESE_BIG5_2)
3662 /* Previous char was a recognized leading byte. */
3664 continue; /* not done with this character. */
3668 /* Encountering second byte of a Big5 character. */
3669 unsigned char b1, b2;
3671 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3672 Dynarr_add (dst, b1);
3673 Dynarr_add (dst, b2);
3685 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3686 Decode a Big5 character CODE of BIG5 coding-system.
3687 CODE is the character code in BIG5, a cons of two integers.
3688 Return the corresponding character.
3692 unsigned char c1, c2, b1, b2;
3695 CHECK_INT (XCAR (code));
3696 CHECK_INT (XCDR (code));
3697 b1 = XINT (XCAR (code));
3698 b2 = XINT (XCDR (code));
3699 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3700 BYTE_BIG5_TWO_BYTE_2_P (b2))
3702 Charset_ID leading_byte;
3703 Lisp_Object charset;
3704 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3705 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3706 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3712 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3713 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3714 Return the corresponding character code in Big5.
3718 Lisp_Object charset;
3721 CHECK_CHAR_COERCE_INT (character);
3722 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3723 if (EQ (charset, Vcharset_chinese_big5_1) ||
3724 EQ (charset, Vcharset_chinese_big5_2))
3726 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3728 return Fcons (make_int (b1), make_int (b2));
3735 /************************************************************************/
3737 /************************************************************************/
3740 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3744 unsigned char c = *(unsigned char *)src++;
3745 switch (st->ucs4.in_byte)
3754 st->ucs4.in_byte = 0;
3760 return CODING_CATEGORY_UCS4_MASK;
3764 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3765 unsigned_char_dynarr *dst, size_t n)
3767 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3768 unsigned int flags = str->flags;
3769 unsigned int cpos = str->cpos;
3770 unsigned char counter = str->counter;
3774 unsigned char c = *(unsigned char *)src++;
3782 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3787 cpos = ( cpos << 8 ) | c;
3791 if (counter & CODING_STATE_END)
3792 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3796 str->counter = counter;
3800 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3801 unsigned_char_dynarr *dst, unsigned int *flags)
3803 Dynarr_add (dst, ch >> 24);
3804 Dynarr_add (dst, ch >> 16);
3805 Dynarr_add (dst, ch >> 8);
3806 Dynarr_add (dst, ch );
3810 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3811 unsigned int *flags)
3816 /************************************************************************/
3818 /************************************************************************/
3821 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3825 unsigned char c = *(unsigned char *)src++;
3826 switch (st->utf8.in_byte)
3829 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3832 st->utf8.in_byte = 5;
3834 st->utf8.in_byte = 4;
3836 st->utf8.in_byte = 3;
3838 st->utf8.in_byte = 2;
3840 st->utf8.in_byte = 1;
3845 if ((c & 0xc0) != 0x80)
3851 return CODING_CATEGORY_UTF8_MASK;
3855 decode_output_utf8_partial_char (unsigned char counter,
3857 unsigned_char_dynarr *dst)
3860 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
3861 else if (counter == 4)
3863 if (cpos < (1 << 6))
3864 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
3867 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
3868 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3871 else if (counter == 3)
3873 if (cpos < (1 << 6))
3874 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
3875 else if (cpos < (1 << 12))
3877 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
3878 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3882 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
3883 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3884 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3887 else if (counter == 2)
3889 if (cpos < (1 << 6))
3890 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
3891 else if (cpos < (1 << 12))
3893 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
3894 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3896 else if (cpos < (1 << 18))
3898 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
3899 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3900 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3904 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
3905 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3906 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3907 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3912 if (cpos < (1 << 6))
3913 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
3914 else if (cpos < (1 << 12))
3916 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
3917 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3919 else if (cpos < (1 << 18))
3921 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
3922 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3923 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3925 else if (cpos < (1 << 24))
3927 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
3928 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3929 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3930 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3934 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
3935 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
3936 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3937 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3938 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3944 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3945 unsigned_char_dynarr *dst, size_t n)
3947 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3948 unsigned int flags = str->flags;
3949 unsigned int cpos = str->cpos;
3950 eol_type_t eol_type = str->eol_type;
3951 unsigned char counter = str->counter;
3955 unsigned char c = *(unsigned char *)src++;
3960 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3961 DECODE_ADD_UCS_CHAR (c, dst);
3963 else if ( c < 0xE0 )
3968 else if ( c < 0xF0 )
3973 else if ( c < 0xF8 )
3978 else if ( c < 0xFC )
3989 else if ( (c & 0xC0) == 0x80 )
3991 cpos = ( cpos << 6 ) | ( c & 0x3f );
3994 DECODE_ADD_UCS_CHAR (cpos, dst);
4003 decode_output_utf8_partial_char (counter, cpos, dst);
4004 DECODE_ADD_BINARY_CHAR (c, dst);
4008 label_continue_loop:;
4011 if (flags & CODING_STATE_END)
4014 decode_output_utf8_partial_char (counter, cpos, dst);
4020 str->counter = counter;
4024 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4025 unsigned_char_dynarr *dst, unsigned int *flags)
4027 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4031 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4032 Dynarr_add (dst, '\r');
4033 if (eol_type != EOL_CR)
4034 Dynarr_add (dst, ch);
4036 else if (ch <= 0x7f)
4038 Dynarr_add (dst, ch);
4040 else if (ch <= 0x7ff)
4042 Dynarr_add (dst, (ch >> 6) | 0xc0);
4043 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4045 else if (ch <= 0xffff)
4047 Dynarr_add (dst, (ch >> 12) | 0xe0);
4048 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4049 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4051 else if (ch <= 0x1fffff)
4053 Dynarr_add (dst, (ch >> 18) | 0xf0);
4054 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4055 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4056 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4058 else if (ch <= 0x3ffffff)
4060 Dynarr_add (dst, (ch >> 24) | 0xf8);
4061 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4062 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4063 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4064 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4068 Dynarr_add (dst, (ch >> 30) | 0xfc);
4069 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
4070 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4071 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4072 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
4073 Dynarr_add (dst, (ch & 0x3f) | 0x80);
4078 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4079 unsigned int *flags)
4084 /************************************************************************/
4085 /* ISO2022 methods */
4086 /************************************************************************/
4088 /* The following note describes the coding system ISO2022 briefly.
4089 Since the intention of this note is to help understand the
4090 functions in this file, some parts are NOT ACCURATE or OVERLY
4091 SIMPLIFIED. For thorough understanding, please refer to the
4092 original document of ISO2022.
4094 ISO2022 provides many mechanisms to encode several character sets
4095 in 7-bit and 8-bit environments. For 7-bit environments, all text
4096 is encoded using bytes less than 128. This may make the encoded
4097 text a little bit longer, but the text passes more easily through
4098 several gateways, some of which strip off MSB (Most Signigant Bit).
4100 There are two kinds of character sets: control character set and
4101 graphic character set. The former contains control characters such
4102 as `newline' and `escape' to provide control functions (control
4103 functions are also provided by escape sequences). The latter
4104 contains graphic characters such as 'A' and '-'. Emacs recognizes
4105 two control character sets and many graphic character sets.
4107 Graphic character sets are classified into one of the following
4108 four classes, according to the number of bytes (DIMENSION) and
4109 number of characters in one dimension (CHARS) of the set:
4110 - DIMENSION1_CHARS94
4111 - DIMENSION1_CHARS96
4112 - DIMENSION2_CHARS94
4113 - DIMENSION2_CHARS96
4115 In addition, each character set is assigned an identification tag,
4116 unique for each set, called "final character" (denoted as <F>
4117 hereafter). The <F> of each character set is decided by ECMA(*)
4118 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4119 (0x30..0x3F are for private use only).
4121 Note (*): ECMA = European Computer Manufacturers Association
4123 Here are examples of graphic character set [NAME(<F>)]:
4124 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4125 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4126 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4127 o DIMENSION2_CHARS96 -- none for the moment
4129 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4130 C0 [0x00..0x1F] -- control character plane 0
4131 GL [0x20..0x7F] -- graphic character plane 0
4132 C1 [0x80..0x9F] -- control character plane 1
4133 GR [0xA0..0xFF] -- graphic character plane 1
4135 A control character set is directly designated and invoked to C0 or
4136 C1 by an escape sequence. The most common case is that:
4137 - ISO646's control character set is designated/invoked to C0, and
4138 - ISO6429's control character set is designated/invoked to C1,
4139 and usually these designations/invocations are omitted in encoded
4140 text. In a 7-bit environment, only C0 can be used, and a control
4141 character for C1 is encoded by an appropriate escape sequence to
4142 fit into the environment. All control characters for C1 are
4143 defined to have corresponding escape sequences.
4145 A graphic character set is at first designated to one of four
4146 graphic registers (G0 through G3), then these graphic registers are
4147 invoked to GL or GR. These designations and invocations can be
4148 done independently. The most common case is that G0 is invoked to
4149 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4150 these invocations and designations are omitted in encoded text.
4151 In a 7-bit environment, only GL can be used.
4153 When a graphic character set of CHARS94 is invoked to GL, codes
4154 0x20 and 0x7F of the GL area work as control characters SPACE and
4155 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4158 There are two ways of invocation: locking-shift and single-shift.
4159 With locking-shift, the invocation lasts until the next different
4160 invocation, whereas with single-shift, the invocation affects the
4161 following character only and doesn't affect the locking-shift
4162 state. Invocations are done by the following control characters or
4165 ----------------------------------------------------------------------
4166 abbrev function cntrl escape seq description
4167 ----------------------------------------------------------------------
4168 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4169 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4170 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4171 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4172 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4173 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4174 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4175 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4176 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4177 ----------------------------------------------------------------------
4178 (*) These are not used by any known coding system.
4180 Control characters for these functions are defined by macros
4181 ISO_CODE_XXX in `coding.h'.
4183 Designations are done by the following escape sequences:
4184 ----------------------------------------------------------------------
4185 escape sequence description
4186 ----------------------------------------------------------------------
4187 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4188 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4189 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4190 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4191 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4192 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4193 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4194 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4195 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4196 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4197 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4198 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4199 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4200 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4201 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4202 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4203 ----------------------------------------------------------------------
4205 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4206 of dimension 1, chars 94, and final character <F>, etc...
4208 Note (*): Although these designations are not allowed in ISO2022,
4209 Emacs accepts them on decoding, and produces them on encoding
4210 CHARS96 character sets in a coding system which is characterized as
4211 7-bit environment, non-locking-shift, and non-single-shift.
4213 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4214 '(' can be omitted. We refer to this as "short-form" hereafter.
4216 Now you may notice that there are a lot of ways for encoding the
4217 same multilingual text in ISO2022. Actually, there exist many
4218 coding systems such as Compound Text (used in X11's inter client
4219 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4220 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4221 localized platforms), and all of these are variants of ISO2022.
4223 In addition to the above, Emacs handles two more kinds of escape
4224 sequences: ISO6429's direction specification and Emacs' private
4225 sequence for specifying character composition.
4227 ISO6429's direction specification takes the following form:
4228 o CSI ']' -- end of the current direction
4229 o CSI '0' ']' -- end of the current direction
4230 o CSI '1' ']' -- start of left-to-right text
4231 o CSI '2' ']' -- start of right-to-left text
4232 The control character CSI (0x9B: control sequence introducer) is
4233 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4235 Character composition specification takes the following form:
4236 o ESC '0' -- start character composition
4237 o ESC '1' -- end character composition
4238 Since these are not standard escape sequences of any ISO standard,
4239 their use with these meanings is restricted to Emacs only. */
4242 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4246 for (i = 0; i < 4; i++)
4248 if (!NILP (coding_system))
4250 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4252 iso->charset[i] = Qt;
4253 iso->invalid_designated[i] = 0;
4255 iso->esc = ISO_ESC_NOTHING;
4256 iso->esc_bytes_index = 0;
4257 iso->register_left = 0;
4258 iso->register_right = 1;
4259 iso->switched_dir_and_no_valid_charset_yet = 0;
4260 iso->invalid_switch_dir = 0;
4261 iso->output_direction_sequence = 0;
4262 iso->output_literally = 0;
4263 #ifdef ENABLE_COMPOSITE_CHARS
4264 if (iso->composite_chars)
4265 Dynarr_reset (iso->composite_chars);
4270 fit_to_be_escape_quoted (unsigned char c)
4287 /* Parse one byte of an ISO2022 escape sequence.
4288 If the result is an invalid escape sequence, return 0 and
4289 do not change anything in STR. Otherwise, if the result is
4290 an incomplete escape sequence, update ISO2022.ESC and
4291 ISO2022.ESC_BYTES and return -1. Otherwise, update
4292 all the state variables (but not ISO2022.ESC_BYTES) and
4295 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4296 or invocation of an invalid character set and treat that as
4297 an unrecognized escape sequence. */
4300 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4301 unsigned char c, unsigned int *flags,
4302 int check_invalid_charsets)
4304 /* (1) If we're at the end of a designation sequence, CS is the
4305 charset being designated and REG is the register to designate
4308 (2) If we're at the end of a locking-shift sequence, REG is
4309 the register to invoke and HALF (0 == left, 1 == right) is
4310 the half to invoke it into.
4312 (3) If we're at the end of a single-shift sequence, REG is
4313 the register to invoke. */
4314 Lisp_Object cs = Qnil;
4317 /* NOTE: This code does goto's all over the fucking place.
4318 The reason for this is that we're basically implementing
4319 a state machine here, and hierarchical languages like C
4320 don't really provide a clean way of doing this. */
4322 if (! (*flags & CODING_STATE_ESCAPE))
4323 /* At beginning of escape sequence; we need to reset our
4324 escape-state variables. */
4325 iso->esc = ISO_ESC_NOTHING;
4327 iso->output_literally = 0;
4328 iso->output_direction_sequence = 0;
4332 case ISO_ESC_NOTHING:
4333 iso->esc_bytes_index = 0;
4336 case ISO_CODE_ESC: /* Start escape sequence */
4337 *flags |= CODING_STATE_ESCAPE;
4341 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4342 *flags |= CODING_STATE_ESCAPE;
4343 iso->esc = ISO_ESC_5_11;
4346 case ISO_CODE_SO: /* locking shift 1 */
4349 case ISO_CODE_SI: /* locking shift 0 */
4353 case ISO_CODE_SS2: /* single shift */
4356 case ISO_CODE_SS3: /* single shift */
4360 default: /* Other control characters */
4367 /**** single shift ****/
4369 case 'N': /* single shift 2 */
4372 case 'O': /* single shift 3 */
4376 /**** locking shift ****/
4378 case '~': /* locking shift 1 right */
4381 case 'n': /* locking shift 2 */
4384 case '}': /* locking shift 2 right */
4387 case 'o': /* locking shift 3 */
4390 case '|': /* locking shift 3 right */
4394 #ifdef ENABLE_COMPOSITE_CHARS
4395 /**** composite ****/
4398 iso->esc = ISO_ESC_START_COMPOSITE;
4399 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4400 CODING_STATE_COMPOSITE;
4404 iso->esc = ISO_ESC_END_COMPOSITE;
4405 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4406 ~CODING_STATE_COMPOSITE;
4408 #endif /* ENABLE_COMPOSITE_CHARS */
4410 /**** directionality ****/
4413 iso->esc = ISO_ESC_5_11;
4416 /**** designation ****/
4418 case '$': /* multibyte charset prefix */
4419 iso->esc = ISO_ESC_2_4;
4423 if (0x28 <= c && c <= 0x2F)
4425 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4429 /* This function is called with CODESYS equal to nil when
4430 doing coding-system detection. */
4432 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4433 && fit_to_be_escape_quoted (c))
4435 iso->esc = ISO_ESC_LITERAL;
4436 *flags &= CODING_STATE_ISO2022_LOCK;
4446 /**** directionality ****/
4448 case ISO_ESC_5_11: /* ISO6429 direction control */
4451 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4452 goto directionality;
4454 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4455 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4456 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4460 case ISO_ESC_5_11_0:
4463 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4464 goto directionality;
4468 case ISO_ESC_5_11_1:
4471 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4472 goto directionality;
4476 case ISO_ESC_5_11_2:
4479 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4480 goto directionality;
4485 iso->esc = ISO_ESC_DIRECTIONALITY;
4486 /* Various junk here to attempt to preserve the direction sequences
4487 literally in the text if they would otherwise be swallowed due
4488 to invalid designations that don't show up as actual charset
4489 changes in the text. */
4490 if (iso->invalid_switch_dir)
4492 /* We already inserted a direction switch literally into the
4493 text. We assume (#### this may not be right) that the
4494 next direction switch is the one going the other way,
4495 and we need to output that literally as well. */
4496 iso->output_literally = 1;
4497 iso->invalid_switch_dir = 0;
4503 /* If we are in the thrall of an invalid designation,
4504 then stick the directionality sequence literally into the
4505 output stream so it ends up in the original text again. */
4506 for (jj = 0; jj < 4; jj++)
4507 if (iso->invalid_designated[jj])
4511 iso->output_literally = 1;
4512 iso->invalid_switch_dir = 1;
4515 /* Indicate that we haven't yet seen a valid designation,
4516 so that if a switch-dir is directly followed by an
4517 invalid designation, both get inserted literally. */
4518 iso->switched_dir_and_no_valid_charset_yet = 1;
4523 /**** designation ****/
4526 if (0x28 <= c && c <= 0x2F)
4528 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4531 if (0x40 <= c && c <= 0x42)
4534 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4535 *flags & CODING_STATE_R2L ?
4536 CHARSET_RIGHT_TO_LEFT :
4537 CHARSET_LEFT_TO_RIGHT);
4548 if (c < '0' || c > '~')
4549 return 0; /* bad final byte */
4551 if (iso->esc >= ISO_ESC_2_8 &&
4552 iso->esc <= ISO_ESC_2_15)
4554 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4555 single = 1; /* single-byte */
4556 reg = (iso->esc - ISO_ESC_2_8) & 3;
4558 else if (iso->esc >= ISO_ESC_2_4_8 &&
4559 iso->esc <= ISO_ESC_2_4_15)
4561 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4562 single = -1; /* multi-byte */
4563 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4567 /* Can this ever be reached? -slb */
4571 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4572 *flags & CODING_STATE_R2L ?
4573 CHARSET_RIGHT_TO_LEFT :
4574 CHARSET_LEFT_TO_RIGHT);
4580 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4584 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4585 /* can't invoke something that ain't there. */
4587 iso->esc = ISO_ESC_SINGLE_SHIFT;
4588 *flags &= CODING_STATE_ISO2022_LOCK;
4590 *flags |= CODING_STATE_SS2;
4592 *flags |= CODING_STATE_SS3;
4596 if (check_invalid_charsets &&
4597 !CHARSETP (iso->charset[reg]))
4598 /* can't invoke something that ain't there. */
4601 iso->register_right = reg;
4603 iso->register_left = reg;
4604 *flags &= CODING_STATE_ISO2022_LOCK;
4605 iso->esc = ISO_ESC_LOCKING_SHIFT;
4609 if (NILP (cs) && check_invalid_charsets)
4611 iso->invalid_designated[reg] = 1;
4612 iso->charset[reg] = Vcharset_ascii;
4613 iso->esc = ISO_ESC_DESIGNATE;
4614 *flags &= CODING_STATE_ISO2022_LOCK;
4615 iso->output_literally = 1;
4616 if (iso->switched_dir_and_no_valid_charset_yet)
4618 /* We encountered a switch-direction followed by an
4619 invalid designation. Ensure that the switch-direction
4620 gets outputted; otherwise it will probably get eaten
4621 when the text is written out again. */
4622 iso->switched_dir_and_no_valid_charset_yet = 0;
4623 iso->output_direction_sequence = 1;
4624 /* And make sure that the switch-dir going the other
4625 way gets outputted, as well. */
4626 iso->invalid_switch_dir = 1;
4630 /* This function is called with CODESYS equal to nil when
4631 doing coding-system detection. */
4632 if (!NILP (codesys))
4634 charset_conversion_spec_dynarr *dyn =
4635 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4641 for (i = 0; i < Dynarr_length (dyn); i++)
4643 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4644 if (EQ (cs, spec->from_charset))
4645 cs = spec->to_charset;
4650 iso->charset[reg] = cs;
4651 iso->esc = ISO_ESC_DESIGNATE;
4652 *flags &= CODING_STATE_ISO2022_LOCK;
4653 if (iso->invalid_designated[reg])
4655 iso->invalid_designated[reg] = 0;
4656 iso->output_literally = 1;
4658 if (iso->switched_dir_and_no_valid_charset_yet)
4659 iso->switched_dir_and_no_valid_charset_yet = 0;
4664 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4668 /* #### There are serious deficiencies in the recognition mechanism
4669 here. This needs to be much smarter if it's going to cut it.
4670 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4671 it should be detected as Latin-1.
4672 All the ISO2022 stuff in this file should be synced up with the
4673 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4674 Perhaps we should wait till R2L works in FSF Emacs? */
4676 if (!st->iso2022.initted)
4678 reset_iso2022 (Qnil, &st->iso2022.iso);
4679 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4680 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4681 CODING_CATEGORY_ISO_8_1_MASK |
4682 CODING_CATEGORY_ISO_8_2_MASK |
4683 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4684 st->iso2022.flags = 0;
4685 st->iso2022.high_byte_count = 0;
4686 st->iso2022.saw_single_shift = 0;
4687 st->iso2022.initted = 1;
4690 mask = st->iso2022.mask;
4694 unsigned char c = *(unsigned char *)src++;
4697 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4698 st->iso2022.high_byte_count++;
4702 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4704 if (st->iso2022.high_byte_count & 1)
4705 /* odd number of high bytes; assume not iso-8-2 */
4706 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4708 st->iso2022.high_byte_count = 0;
4709 st->iso2022.saw_single_shift = 0;
4711 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4713 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4714 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4715 { /* control chars */
4718 /* Allow and ignore control characters that you might
4719 reasonably see in a text file */
4724 case 8: /* backspace */
4725 case 11: /* vertical tab */
4726 case 12: /* form feed */
4727 case 26: /* MS-DOS C-z junk */
4728 case 31: /* '^_' -- for info */
4729 goto label_continue_loop;
4736 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4739 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4740 &st->iso2022.flags, 0))
4742 switch (st->iso2022.iso.esc)
4744 case ISO_ESC_DESIGNATE:
4745 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4746 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4748 case ISO_ESC_LOCKING_SHIFT:
4749 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4750 goto ran_out_of_chars;
4751 case ISO_ESC_SINGLE_SHIFT:
4752 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4753 st->iso2022.saw_single_shift = 1;
4762 goto ran_out_of_chars;
4765 label_continue_loop:;
4774 postprocess_iso2022_mask (int mask)
4776 /* #### kind of cheesy */
4777 /* If seven-bit ISO is allowed, then assume that the encoding is
4778 entirely seven-bit and turn off the eight-bit ones. */
4779 if (mask & CODING_CATEGORY_ISO_7_MASK)
4780 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4781 CODING_CATEGORY_ISO_8_1_MASK |
4782 CODING_CATEGORY_ISO_8_2_MASK);
4786 /* If FLAGS is a null pointer or specifies right-to-left motion,
4787 output a switch-dir-to-left-to-right sequence to DST.
4788 Also update FLAGS if it is not a null pointer.
4789 If INTERNAL_P is set, we are outputting in internal format and
4790 need to handle the CSI differently. */
4793 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4794 unsigned_char_dynarr *dst,
4795 unsigned int *flags,
4798 if (!flags || (*flags & CODING_STATE_R2L))
4800 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4802 Dynarr_add (dst, ISO_CODE_ESC);
4803 Dynarr_add (dst, '[');
4805 else if (internal_p)
4806 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4808 Dynarr_add (dst, ISO_CODE_CSI);
4809 Dynarr_add (dst, '0');
4810 Dynarr_add (dst, ']');
4812 *flags &= ~CODING_STATE_R2L;
4816 /* If FLAGS is a null pointer or specifies a direction different from
4817 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4818 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4819 sequence to DST. Also update FLAGS if it is not a null pointer.
4820 If INTERNAL_P is set, we are outputting in internal format and
4821 need to handle the CSI differently. */
4824 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4825 unsigned_char_dynarr *dst, unsigned int *flags,
4828 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4829 direction == CHARSET_LEFT_TO_RIGHT)
4830 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4831 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4832 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4833 direction == CHARSET_RIGHT_TO_LEFT)
4835 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4837 Dynarr_add (dst, ISO_CODE_ESC);
4838 Dynarr_add (dst, '[');
4840 else if (internal_p)
4841 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4843 Dynarr_add (dst, ISO_CODE_CSI);
4844 Dynarr_add (dst, '2');
4845 Dynarr_add (dst, ']');
4847 *flags |= CODING_STATE_R2L;
4851 /* Convert ISO2022-format data to internal format. */
4854 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4855 unsigned_char_dynarr *dst, size_t n)
4857 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4858 unsigned int flags = str->flags;
4859 unsigned int cpos = str->cpos;
4860 unsigned char counter = str->counter;
4861 eol_type_t eol_type = str->eol_type;
4862 #ifdef ENABLE_COMPOSITE_CHARS
4863 unsigned_char_dynarr *real_dst = dst;
4865 Lisp_Object coding_system;
4867 XSETCODING_SYSTEM (coding_system, str->codesys);
4869 #ifdef ENABLE_COMPOSITE_CHARS
4870 if (flags & CODING_STATE_COMPOSITE)
4871 dst = str->iso2022.composite_chars;
4872 #endif /* ENABLE_COMPOSITE_CHARS */
4876 unsigned char c = *(unsigned char *)src++;
4877 if (flags & CODING_STATE_ESCAPE)
4878 { /* Within ESC sequence */
4879 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4884 switch (str->iso2022.esc)
4886 #ifdef ENABLE_COMPOSITE_CHARS
4887 case ISO_ESC_START_COMPOSITE:
4888 if (str->iso2022.composite_chars)
4889 Dynarr_reset (str->iso2022.composite_chars);
4891 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4892 dst = str->iso2022.composite_chars;
4894 case ISO_ESC_END_COMPOSITE:
4896 Bufbyte comstr[MAX_EMCHAR_LEN];
4898 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4899 Dynarr_length (dst));
4901 len = set_charptr_emchar (comstr, emch);
4902 Dynarr_add_many (dst, comstr, len);
4905 #endif /* ENABLE_COMPOSITE_CHARS */
4907 case ISO_ESC_LITERAL:
4908 COMPOSE_FLUSH_CHARS (str, dst);
4909 DECODE_ADD_BINARY_CHAR (c, dst);
4913 /* Everything else handled already */
4918 /* Attempted error recovery. */
4919 if (str->iso2022.output_direction_sequence)
4920 ensure_correct_direction (flags & CODING_STATE_R2L ?
4921 CHARSET_RIGHT_TO_LEFT :
4922 CHARSET_LEFT_TO_RIGHT,
4923 str->codesys, dst, 0, 1);
4924 /* More error recovery. */
4925 if (!retval || str->iso2022.output_literally)
4927 /* Output the (possibly invalid) sequence */
4929 COMPOSE_FLUSH_CHARS (str, dst);
4930 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4931 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4932 flags &= CODING_STATE_ISO2022_LOCK;
4934 n++, src--;/* Repeat the loop with the same character. */
4937 /* No sense in reprocessing the final byte of the
4938 escape sequence; it could mess things up anyway.
4940 COMPOSE_FLUSH_CHARS (str, dst);
4941 DECODE_ADD_BINARY_CHAR (c, dst);
4947 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4948 { /* Control characters */
4950 /***** Error-handling *****/
4952 /* If we were in the middle of a character, dump out the
4953 partial character. */
4956 COMPOSE_FLUSH_CHARS (str, dst);
4960 DECODE_ADD_BINARY_CHAR
4961 ((unsigned char)(cpos >> (counter * 8)), dst);
4966 /* If we just saw a single-shift character, dump it out.
4967 This may dump out the wrong sort of single-shift character,
4968 but least it will give an indication that something went
4970 if (flags & CODING_STATE_SS2)
4972 COMPOSE_FLUSH_CHARS (str, dst);
4973 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4974 flags &= ~CODING_STATE_SS2;
4976 if (flags & CODING_STATE_SS3)
4978 COMPOSE_FLUSH_CHARS (str, dst);
4979 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4980 flags &= ~CODING_STATE_SS3;
4983 /***** Now handle the control characters. *****/
4989 COMPOSE_FLUSH_CHARS (str, dst);
4990 if (eol_type == EOL_CR)
4991 Dynarr_add (dst, '\n');
4992 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4993 Dynarr_add (dst, c);
4995 flags |= CODING_STATE_CR;
4996 goto label_continue_loop;
4998 else if (flags & CODING_STATE_CR)
4999 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5001 Dynarr_add (dst, '\r');
5002 flags &= ~CODING_STATE_CR;
5005 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5008 flags &= CODING_STATE_ISO2022_LOCK;
5010 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5012 COMPOSE_FLUSH_CHARS (str, dst);
5013 DECODE_ADD_BINARY_CHAR (c, dst);
5017 { /* Graphic characters */
5018 Lisp_Object charset;
5027 COMPOSE_FLUSH_CHARS (str, dst);
5028 if (eol_type == EOL_CR)
5029 Dynarr_add (dst, '\n');
5030 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5031 Dynarr_add (dst, c);
5033 flags |= CODING_STATE_CR;
5034 goto label_continue_loop;
5036 else if (flags & CODING_STATE_CR)
5037 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5039 Dynarr_add (dst, '\r');
5040 flags &= ~CODING_STATE_CR;
5043 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5046 /* Now determine the charset. */
5047 reg = ((flags & CODING_STATE_SS2) ? 2
5048 : (flags & CODING_STATE_SS3) ? 3
5049 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5050 : str->iso2022.register_left);
5051 charset = str->iso2022.charset[reg];
5053 /* Error checking: */
5054 if (! CHARSETP (charset)
5055 || str->iso2022.invalid_designated[reg]
5056 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5057 && XCHARSET_CHARS (charset) == 94))
5058 /* Mrmph. We are trying to invoke a register that has no
5059 or an invalid charset in it, or trying to add a character
5060 outside the range of the charset. Insert that char literally
5061 to preserve it for the output. */
5063 COMPOSE_FLUSH_CHARS (str, dst);
5067 DECODE_ADD_BINARY_CHAR
5068 ((unsigned char)(cpos >> (counter * 8)), dst);
5071 DECODE_ADD_BINARY_CHAR (c, dst);
5076 /* Things are probably hunky-dorey. */
5078 /* Fetch reverse charset, maybe. */
5079 if (((flags & CODING_STATE_R2L) &&
5080 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5082 (!(flags & CODING_STATE_R2L) &&
5083 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5085 Lisp_Object new_charset =
5086 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5087 if (!NILP (new_charset))
5088 charset = new_charset;
5093 if (XCHARSET_DIMENSION (charset) == counter)
5095 COMPOSE_ADD_CHAR (str,
5096 DECODE_CHAR (charset,
5097 ((cpos & 0x7F7F7F) << 8)
5104 cpos = (cpos << 8) | c;
5106 lb = XCHARSET_LEADING_BYTE (charset);
5107 switch (XCHARSET_REP_BYTES (charset))
5110 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5111 Dynarr_add (dst, c & 0x7F);
5114 case 2: /* one-byte official */
5115 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5116 Dynarr_add (dst, lb);
5117 Dynarr_add (dst, c | 0x80);
5120 case 3: /* one-byte private or two-byte official */
5121 if (XCHARSET_PRIVATE_P (charset))
5123 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5124 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5125 Dynarr_add (dst, lb);
5126 Dynarr_add (dst, c | 0x80);
5132 Dynarr_add (dst, lb);
5133 Dynarr_add (dst, ch | 0x80);
5134 Dynarr_add (dst, c | 0x80);
5142 default: /* two-byte private */
5145 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5146 Dynarr_add (dst, lb);
5147 Dynarr_add (dst, ch | 0x80);
5148 Dynarr_add (dst, c | 0x80);
5158 flags &= CODING_STATE_ISO2022_LOCK;
5161 label_continue_loop:;
5164 if (flags & CODING_STATE_END)
5166 COMPOSE_FLUSH_CHARS (str, dst);
5167 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5171 str->counter = counter;
5175 /***** ISO2022 encoder *****/
5177 /* Designate CHARSET into register REG. */
5180 iso2022_designate (Lisp_Object charset, unsigned char reg,
5181 struct encoding_stream *str, unsigned_char_dynarr *dst)
5183 static const char inter94[] = "()*+";
5184 static const char inter96[] = ",-./";
5185 unsigned short chars;
5186 unsigned char dimension;
5187 unsigned char final;
5188 Lisp_Object old_charset = str->iso2022.charset[reg];
5190 str->iso2022.charset[reg] = charset;
5191 if (!CHARSETP (charset))
5192 /* charset might be an initial nil or t. */
5194 chars = XCHARSET_CHARS (charset);
5195 dimension = XCHARSET_DIMENSION (charset);
5196 final = XCHARSET_FINAL (charset);
5197 if (!str->iso2022.force_charset_on_output[reg] &&
5198 CHARSETP (old_charset) &&
5199 XCHARSET_CHARS (old_charset) == chars &&
5200 XCHARSET_DIMENSION (old_charset) == dimension &&
5201 XCHARSET_FINAL (old_charset) == final)
5204 str->iso2022.force_charset_on_output[reg] = 0;
5207 charset_conversion_spec_dynarr *dyn =
5208 str->codesys->iso2022.output_conv;
5214 for (i = 0; i < Dynarr_length (dyn); i++)
5216 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5217 if (EQ (charset, spec->from_charset))
5218 charset = spec->to_charset;
5223 Dynarr_add (dst, ISO_CODE_ESC);
5228 Dynarr_add (dst, inter94[reg]);
5231 Dynarr_add (dst, '$');
5233 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5236 Dynarr_add (dst, inter94[reg]);
5241 Dynarr_add (dst, inter96[reg]);
5244 Dynarr_add (dst, '$');
5245 Dynarr_add (dst, inter96[reg]);
5249 Dynarr_add (dst, final);
5253 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5255 if (str->iso2022.register_left != 0)
5257 Dynarr_add (dst, ISO_CODE_SI);
5258 str->iso2022.register_left = 0;
5263 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5265 if (str->iso2022.register_left != 1)
5267 Dynarr_add (dst, ISO_CODE_SO);
5268 str->iso2022.register_left = 1;
5273 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5274 unsigned_char_dynarr *dst, unsigned int *flags)
5276 unsigned char charmask;
5277 Lisp_Coding_System* codesys = str->codesys;
5278 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5280 Lisp_Object charset = str->iso2022.current_charset;
5281 int half = str->iso2022.current_half;
5282 int code_point = -1;
5286 restore_left_to_right_direction (codesys, dst, flags, 0);
5288 /* Make sure G0 contains ASCII */
5289 if ((ch > ' ' && ch < ISO_CODE_DEL)
5290 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5292 ensure_normal_shift (str, dst);
5293 iso2022_designate (Vcharset_ascii, 0, str, dst);
5296 /* If necessary, restore everything to the default state
5298 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5300 restore_left_to_right_direction (codesys, dst, flags, 0);
5302 ensure_normal_shift (str, dst);
5304 for (i = 0; i < 4; i++)
5306 Lisp_Object initial_charset =
5307 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5308 iso2022_designate (initial_charset, i, str, dst);
5313 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5314 Dynarr_add (dst, '\r');
5315 if (eol_type != EOL_CR)
5316 Dynarr_add (dst, ch);
5320 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5321 && fit_to_be_escape_quoted (ch))
5322 Dynarr_add (dst, ISO_CODE_ESC);
5323 Dynarr_add (dst, ch);
5326 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5328 charmask = (half == 0 ? 0x00 : 0x80);
5330 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5331 && fit_to_be_escape_quoted (ch))
5332 Dynarr_add (dst, ISO_CODE_ESC);
5333 /* you asked for it ... */
5334 Dynarr_add (dst, ch);
5340 /* Now determine which register to use. */
5342 for (i = 0; i < 4; i++)
5344 if ((CHARSETP (charset = str->iso2022.charset[i])
5345 && ((code_point = charset_code_point (charset, ch)) >= 0))
5349 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5350 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5358 Lisp_Object original_default_coded_charset_priority_list
5359 = Vdefault_coded_charset_priority_list;
5361 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5363 code_point = ENCODE_CHAR (ch, charset);
5364 if (XCHARSET_FINAL (charset))
5366 Vdefault_coded_charset_priority_list
5367 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5368 Vdefault_coded_charset_priority_list));
5370 code_point = ENCODE_CHAR (ch, charset);
5371 if (!XCHARSET_FINAL (charset))
5373 charset = Vcharset_ascii;
5377 Vdefault_coded_charset_priority_list
5378 = original_default_coded_charset_priority_list;
5380 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5381 codesys, dst, flags, 0);
5385 if (XCHARSET_GRAPHIC (charset) != 0)
5387 if (!NILP (str->iso2022.charset[1]) &&
5388 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5389 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5391 else if (!NILP (str->iso2022.charset[2]))
5393 else if (!NILP (str->iso2022.charset[3]))
5402 iso2022_designate (charset, reg, str, dst);
5404 /* Now invoke that register. */
5408 ensure_normal_shift (str, dst);
5412 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5414 ensure_shift_out (str, dst);
5421 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5423 Dynarr_add (dst, ISO_CODE_ESC);
5424 Dynarr_add (dst, 'N');
5429 Dynarr_add (dst, ISO_CODE_SS2);
5434 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5436 Dynarr_add (dst, ISO_CODE_ESC);
5437 Dynarr_add (dst, 'O');
5442 Dynarr_add (dst, ISO_CODE_SS3);
5450 charmask = (half == 0 ? 0x00 : 0x80);
5452 switch (XCHARSET_DIMENSION (charset))
5455 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5458 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5459 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5462 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5463 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5464 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5467 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5468 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5469 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5470 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5476 str->iso2022.current_charset = charset;
5477 str->iso2022.current_half = half;
5481 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5482 unsigned int *flags)
5484 Lisp_Coding_System* codesys = str->codesys;
5487 restore_left_to_right_direction (codesys, dst, flags, 0);
5488 ensure_normal_shift (str, dst);
5489 for (i = 0; i < 4; i++)
5491 Lisp_Object initial_charset
5492 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5493 iso2022_designate (initial_charset, i, str, dst);
5498 /************************************************************************/
5499 /* No-conversion methods */
5500 /************************************************************************/
5502 /* This is used when reading in "binary" files -- i.e. files that may
5503 contain all 256 possible byte values and that are not to be
5504 interpreted as being in any particular decoding. */
5506 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5507 unsigned_char_dynarr *dst, size_t n)
5509 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5510 unsigned int flags = str->flags;
5511 unsigned int cpos = str->cpos;
5512 eol_type_t eol_type = str->eol_type;
5516 unsigned char c = *(unsigned char *)src++;
5518 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5519 DECODE_ADD_BINARY_CHAR (c, dst);
5520 label_continue_loop:;
5523 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5530 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5531 unsigned_char_dynarr *dst, size_t n)
5534 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5535 unsigned int flags = str->flags;
5536 unsigned int ch = str->ch;
5537 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5539 unsigned char char_boundary = str->iso2022.current_char_boundary;
5546 if (char_boundary == 0)
5552 else if ( c >= 0xf8 )
5557 else if ( c >= 0xf0 )
5562 else if ( c >= 0xe0 )
5567 else if ( c >= 0xc0 )
5577 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5578 Dynarr_add (dst, '\r');
5579 if (eol_type != EOL_CR)
5580 Dynarr_add (dst, c);
5583 Dynarr_add (dst, c);
5586 else if (char_boundary == 1)
5588 ch = ( ch << 6 ) | ( c & 0x3f );
5589 Dynarr_add (dst, ch & 0xff);
5594 ch = ( ch << 6 ) | ( c & 0x3f );
5597 #else /* not UTF2000 */
5600 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5601 Dynarr_add (dst, '\r');
5602 if (eol_type != EOL_CR)
5603 Dynarr_add (dst, '\n');
5606 else if (BYTE_ASCII_P (c))
5609 Dynarr_add (dst, c);
5611 else if (BUFBYTE_LEADING_BYTE_P (c))
5614 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5615 c == LEADING_BYTE_CONTROL_1)
5618 Dynarr_add (dst, '~'); /* untranslatable character */
5622 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5623 Dynarr_add (dst, c);
5624 else if (ch == LEADING_BYTE_CONTROL_1)
5627 Dynarr_add (dst, c - 0x20);
5629 /* else it should be the second or third byte of an
5630 untranslatable character, so ignore it */
5633 #endif /* not UTF2000 */
5639 str->iso2022.current_char_boundary = char_boundary;
5645 /************************************************************************/
5646 /* Initialization */
5647 /************************************************************************/
5650 syms_of_file_coding (void)
5652 INIT_LRECORD_IMPLEMENTATION (coding_system);
5654 deferror (&Qcoding_system_error, "coding-system-error",
5655 "Coding-system error", Qio_error);
5657 DEFSUBR (Fcoding_system_p);
5658 DEFSUBR (Ffind_coding_system);
5659 DEFSUBR (Fget_coding_system);
5660 DEFSUBR (Fcoding_system_list);
5661 DEFSUBR (Fcoding_system_name);
5662 DEFSUBR (Fmake_coding_system);
5663 DEFSUBR (Fcopy_coding_system);
5664 DEFSUBR (Fcoding_system_canonical_name_p);
5665 DEFSUBR (Fcoding_system_alias_p);
5666 DEFSUBR (Fcoding_system_aliasee);
5667 DEFSUBR (Fdefine_coding_system_alias);
5668 DEFSUBR (Fsubsidiary_coding_system);
5670 DEFSUBR (Fcoding_system_type);
5671 DEFSUBR (Fcoding_system_doc_string);
5673 DEFSUBR (Fcoding_system_charset);
5675 DEFSUBR (Fcoding_system_property);
5677 DEFSUBR (Fcoding_category_list);
5678 DEFSUBR (Fset_coding_priority_list);
5679 DEFSUBR (Fcoding_priority_list);
5680 DEFSUBR (Fset_coding_category_system);
5681 DEFSUBR (Fcoding_category_system);
5683 DEFSUBR (Fdetect_coding_region);
5684 DEFSUBR (Fdecode_coding_region);
5685 DEFSUBR (Fencode_coding_region);
5687 DEFSUBR (Fdecode_shift_jis_char);
5688 DEFSUBR (Fencode_shift_jis_char);
5689 DEFSUBR (Fdecode_big5_char);
5690 DEFSUBR (Fencode_big5_char);
5692 defsymbol (&Qcoding_systemp, "coding-system-p");
5693 defsymbol (&Qno_conversion, "no-conversion");
5694 defsymbol (&Qraw_text, "raw-text");
5696 defsymbol (&Qbig5, "big5");
5697 defsymbol (&Qshift_jis, "shift-jis");
5698 defsymbol (&Qucs4, "ucs-4");
5699 defsymbol (&Qutf8, "utf-8");
5700 defsymbol (&Qccl, "ccl");
5701 defsymbol (&Qiso2022, "iso2022");
5703 defsymbol (&Qmnemonic, "mnemonic");
5704 defsymbol (&Qeol_type, "eol-type");
5705 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5706 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5708 defsymbol (&Qcr, "cr");
5709 defsymbol (&Qlf, "lf");
5710 defsymbol (&Qcrlf, "crlf");
5711 defsymbol (&Qeol_cr, "eol-cr");
5712 defsymbol (&Qeol_lf, "eol-lf");
5713 defsymbol (&Qeol_crlf, "eol-crlf");
5715 defsymbol (&Qcharset_g0, "charset-g0");
5716 defsymbol (&Qcharset_g1, "charset-g1");
5717 defsymbol (&Qcharset_g2, "charset-g2");
5718 defsymbol (&Qcharset_g3, "charset-g3");
5719 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5720 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5721 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5722 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5723 defsymbol (&Qno_iso6429, "no-iso6429");
5724 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5725 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5727 defsymbol (&Qshort, "short");
5728 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5729 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5730 defsymbol (&Qseven, "seven");
5731 defsymbol (&Qlock_shift, "lock-shift");
5732 defsymbol (&Qescape_quoted, "escape-quoted");
5735 defsymbol (&Qdisable_composition, "disable-composition");
5737 defsymbol (&Qencode, "encode");
5738 defsymbol (&Qdecode, "decode");
5741 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5743 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5745 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5747 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5749 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5751 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5753 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5755 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5757 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5760 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5765 lstream_type_create_file_coding (void)
5767 LSTREAM_HAS_METHOD (decoding, reader);
5768 LSTREAM_HAS_METHOD (decoding, writer);
5769 LSTREAM_HAS_METHOD (decoding, rewinder);
5770 LSTREAM_HAS_METHOD (decoding, seekable_p);
5771 LSTREAM_HAS_METHOD (decoding, flusher);
5772 LSTREAM_HAS_METHOD (decoding, closer);
5773 LSTREAM_HAS_METHOD (decoding, marker);
5775 LSTREAM_HAS_METHOD (encoding, reader);
5776 LSTREAM_HAS_METHOD (encoding, writer);
5777 LSTREAM_HAS_METHOD (encoding, rewinder);
5778 LSTREAM_HAS_METHOD (encoding, seekable_p);
5779 LSTREAM_HAS_METHOD (encoding, flusher);
5780 LSTREAM_HAS_METHOD (encoding, closer);
5781 LSTREAM_HAS_METHOD (encoding, marker);
5785 vars_of_file_coding (void)
5789 fcd = xnew (struct file_coding_dump);
5790 dumpstruct (&fcd, &fcd_description);
5792 /* Initialize to something reasonable ... */
5793 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5795 fcd->coding_category_system[i] = Qnil;
5796 fcd->coding_category_by_priority[i] = i;
5799 Fprovide (intern ("file-coding"));
5801 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5802 Coding system used for TTY keyboard input.
5803 Not used under a windowing system.
5805 Vkeyboard_coding_system = Qnil;
5807 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5808 Coding system used for TTY display output.
5809 Not used under a windowing system.
5811 Vterminal_coding_system = Qnil;
5813 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5814 Overriding coding system used when reading from a file or process.
5815 You should bind this variable with `let', but do not set it globally.
5816 If this is non-nil, it specifies the coding system that will be used
5817 to decode input on read operations, such as from a file or process.
5818 It overrides `buffer-file-coding-system-for-read',
5819 `insert-file-contents-pre-hook', etc. Use those variables instead of
5820 this one for permanent changes to the environment. */ );
5821 Vcoding_system_for_read = Qnil;
5823 DEFVAR_LISP ("coding-system-for-write",
5824 &Vcoding_system_for_write /*
5825 Overriding coding system used when writing to a file or process.
5826 You should bind this variable with `let', but do not set it globally.
5827 If this is non-nil, it specifies the coding system that will be used
5828 to encode output for write operations, such as to a file or process.
5829 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5830 Use those variables instead of this one for permanent changes to the
5832 Vcoding_system_for_write = Qnil;
5834 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5835 Coding system used to convert pathnames when accessing files.
5837 Vfile_name_coding_system = Qnil;
5839 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5840 Non-nil means the buffer contents are regarded as multi-byte form
5841 of characters, not a binary code. This affects the display, file I/O,
5842 and behaviors of various editing commands.
5844 Setting this to nil does not do anything.
5846 enable_multibyte_characters = 1;
5850 complex_vars_of_file_coding (void)
5852 staticpro (&Vcoding_system_hash_table);
5853 Vcoding_system_hash_table =
5854 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5856 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5857 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5859 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5861 struct codesys_prop csp; \
5863 csp.prop_type = (Prop_Type); \
5864 Dynarr_add (the_codesys_prop_dynarr, csp); \
5867 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5868 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5869 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5870 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5871 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5872 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5873 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5875 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5876 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5877 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5878 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5879 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5880 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5881 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5882 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5883 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5884 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5885 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5886 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5887 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5888 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5889 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5890 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5891 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5893 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5894 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5896 /* Need to create this here or we're really screwed. */
5898 (Qraw_text, Qno_conversion,
5899 build_string ("Raw text, which means it converts only line-break-codes."),
5900 list2 (Qmnemonic, build_string ("Raw")));
5903 (Qbinary, Qno_conversion,
5904 build_string ("Binary, which means it does not convert anything."),
5905 list4 (Qeol_type, Qlf,
5906 Qmnemonic, build_string ("Binary")));
5911 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5912 list2 (Qmnemonic, build_string ("UTF8")));
5915 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5917 Fdefine_coding_system_alias (Qfile_name, Qbinary);
5919 Fdefine_coding_system_alias (Qterminal, Qbinary);
5920 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5922 /* Need this for bootstrapping */
5923 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5924 Fget_coding_system (Qraw_text);
5927 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5928 = Fget_coding_system (Qutf8);
5931 #if defined(MULE) && !defined(UTF2000)
5935 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5936 fcd->ucs_to_mule_table[i] = Qnil;
5938 staticpro (&mule_to_ucs_table);
5939 mule_to_ucs_table = Fmake_char_table(Qgeneric);
5940 #endif /* defined(MULE) && !defined(UTF2000) */