1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
61 #if defined(MULE) && !defined(UTF2000)
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
68 #if defined(MULE) && !defined(UTF2000)
69 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), 65536 },
74 static const struct struct_description fcd_description = {
75 sizeof (struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis (struct detection_state *st,
179 CONST unsigned char *src,
181 static void decode_coding_sjis (Lstream *decoding,
182 CONST unsigned char *src,
183 unsigned_char_dynarr *dst,
185 static void encode_coding_sjis (Lstream *encoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst,
189 static int detect_coding_big5 (struct detection_state *st,
190 CONST unsigned char *src,
192 static void decode_coding_big5 (Lstream *decoding,
193 CONST unsigned char *src,
194 unsigned_char_dynarr *dst, unsigned int n);
195 static void encode_coding_big5 (Lstream *encoding,
196 CONST unsigned char *src,
197 unsigned_char_dynarr *dst, unsigned int n);
198 static int detect_coding_ucs4 (struct detection_state *st,
199 CONST unsigned char *src,
201 static void decode_coding_ucs4 (Lstream *decoding,
202 CONST unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
204 static void encode_coding_ucs4 (Lstream *encoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst, unsigned int n);
207 static int detect_coding_utf8 (struct detection_state *st,
208 CONST unsigned char *src,
210 static void decode_coding_utf8 (Lstream *decoding,
211 CONST unsigned char *src,
212 unsigned_char_dynarr *dst, unsigned int n);
213 static void encode_coding_utf8 (Lstream *encoding,
214 CONST unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
216 static int postprocess_iso2022_mask (int mask);
217 static void reset_iso2022 (Lisp_Object coding_system,
218 struct iso2022_decoder *iso);
219 static int detect_coding_iso2022 (struct detection_state *st,
220 CONST unsigned char *src,
222 static void decode_coding_iso2022 (Lstream *decoding,
223 CONST unsigned char *src,
224 unsigned_char_dynarr *dst, unsigned int n);
225 static void encode_coding_iso2022 (Lstream *encoding,
226 CONST unsigned char *src,
227 unsigned_char_dynarr *dst, unsigned int n);
229 static void decode_coding_no_conversion (Lstream *decoding,
230 CONST unsigned char *src,
231 unsigned_char_dynarr *dst,
233 static void encode_coding_no_conversion (Lstream *encoding,
234 CONST unsigned char *src,
235 unsigned_char_dynarr *dst,
237 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
238 unsigned_char_dynarr *dst, unsigned int n);
239 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
240 unsigned_char_dynarr *dst, unsigned int n);
242 typedef struct codesys_prop codesys_prop;
251 Dynarr_declare (codesys_prop);
252 } codesys_prop_dynarr;
254 static const struct lrecord_description codesys_prop_description_1[] = {
255 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
259 static const struct struct_description codesys_prop_description = {
260 sizeof (codesys_prop),
261 codesys_prop_description_1
264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
265 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
269 static const struct struct_description codesys_prop_dynarr_description = {
270 sizeof (codesys_prop_dynarr),
271 codesys_prop_dynarr_description_1
274 codesys_prop_dynarr *the_codesys_prop_dynarr;
276 enum codesys_prop_enum
279 CODESYS_PROP_ISO2022,
284 /************************************************************************/
285 /* Coding system functions */
286 /************************************************************************/
288 static Lisp_Object mark_coding_system (Lisp_Object);
289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
290 static void finalize_coding_system (void *header, int for_disksave);
293 static const struct lrecord_description ccs_description_1[] = {
294 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
295 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
299 static const struct struct_description ccs_description = {
300 sizeof (charset_conversion_spec),
304 static const struct lrecord_description ccsd_description_1[] = {
305 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
309 static const struct struct_description ccsd_description = {
310 sizeof (charset_conversion_spec_dynarr),
315 static const struct lrecord_description coding_system_description[] = {
316 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
317 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
318 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
319 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
320 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
321 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
325 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
326 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
327 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
335 mark_coding_system, print_coding_system,
336 finalize_coding_system,
337 0, 0, coding_system_description,
341 mark_coding_system (Lisp_Object obj)
343 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
345 mark_object (CODING_SYSTEM_NAME (codesys));
346 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
347 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
348 mark_object (CODING_SYSTEM_EOL_LF (codesys));
349 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
350 mark_object (CODING_SYSTEM_EOL_CR (codesys));
352 switch (CODING_SYSTEM_TYPE (codesys))
356 case CODESYS_ISO2022:
357 for (i = 0; i < 4; i++)
358 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
359 if (codesys->iso2022.input_conv)
361 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
363 struct charset_conversion_spec *ccs =
364 Dynarr_atp (codesys->iso2022.input_conv, i);
365 mark_object (ccs->from_charset);
366 mark_object (ccs->to_charset);
369 if (codesys->iso2022.output_conv)
371 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
373 struct charset_conversion_spec *ccs =
374 Dynarr_atp (codesys->iso2022.output_conv, i);
375 mark_object (ccs->from_charset);
376 mark_object (ccs->to_charset);
382 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
383 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
390 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
398 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
400 error ("printing unreadable object #<coding_system 0x%x>",
403 write_c_string ("#<coding_system ", printcharfun);
404 print_internal (c->name, printcharfun, 1);
405 write_c_string (">", printcharfun);
409 finalize_coding_system (void *header, int for_disksave)
411 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
412 /* Since coding systems never go away, this function is not
413 necessary. But it would be necessary if we changed things
414 so that coding systems could go away. */
415 if (!for_disksave) /* see comment in lstream.c */
417 switch (CODING_SYSTEM_TYPE (c))
420 case CODESYS_ISO2022:
421 if (c->iso2022.input_conv)
423 Dynarr_free (c->iso2022.input_conv);
424 c->iso2022.input_conv = 0;
426 if (c->iso2022.output_conv)
428 Dynarr_free (c->iso2022.output_conv);
429 c->iso2022.output_conv = 0;
440 symbol_to_eol_type (Lisp_Object symbol)
442 CHECK_SYMBOL (symbol);
443 if (NILP (symbol)) return EOL_AUTODETECT;
444 if (EQ (symbol, Qlf)) return EOL_LF;
445 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
446 if (EQ (symbol, Qcr)) return EOL_CR;
448 signal_simple_error ("Unrecognized eol type", symbol);
449 return EOL_AUTODETECT; /* not reached */
453 eol_type_to_symbol (eol_type_t type)
458 case EOL_LF: return Qlf;
459 case EOL_CRLF: return Qcrlf;
460 case EOL_CR: return Qcr;
461 case EOL_AUTODETECT: return Qnil;
466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
468 Lisp_Object codesys_obj;
469 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
470 char *codesys_name = (char *) alloca (len + 7);
472 char *codesys_mnemonic=0;
474 Lisp_Object codesys_name_sym, sub_codesys_obj;
478 XSETCODING_SYSTEM (codesys_obj, codesys);
480 memcpy (codesys_name,
481 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
483 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
485 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
486 codesys_mnemonic = (char *) alloca (mlen + 7);
487 memcpy (codesys_mnemonic,
488 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
492 strcpy (codesys_name + len, "-" op_sys); \
494 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
495 codesys_name_sym = intern (codesys_name); \
496 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
497 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
499 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
500 build_string (codesys_mnemonic); \
501 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
504 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
505 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
506 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
510 Return t if OBJECT is a coding system.
511 A coding system is an object that defines how text containing multiple
512 character sets is encoded into a stream of (typically 8-bit) bytes.
513 The coding system is used to decode the stream into a series of
514 characters (which may be from multiple charsets) when the text is read
515 from a file or process, and is used to encode the text back into the
516 same format when it is written out to a file or process.
518 For example, many ISO2022-compliant coding systems (such as Compound
519 Text, which is used for inter-client data under the X Window System)
520 use escape sequences to switch between different charsets -- Japanese
521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
523 `make-coding-system' for more information.
525 Coding systems are normally identified using a symbol, and the
526 symbol is accepted in place of the actual coding system object whenever
527 a coding system is called for. (This is similar to how faces work.)
531 return CODING_SYSTEMP (object) ? Qt : Qnil;
534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
535 Retrieve the coding system of the given name.
537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
538 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
539 If there is no such coding system, nil is returned. Otherwise the
540 associated coding system object is returned.
542 (coding_system_or_name))
544 if (NILP (coding_system_or_name))
545 coding_system_or_name = Qbinary;
546 else if (CODING_SYSTEMP (coding_system_or_name))
547 return coding_system_or_name;
549 CHECK_SYMBOL (coding_system_or_name);
553 coding_system_or_name =
554 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
556 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
557 return coding_system_or_name;
561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
562 Retrieve the coding system of the given name.
563 Same as `find-coding-system' except that if there is no such
564 coding system, an error is signaled instead of returning nil.
568 Lisp_Object coding_system = Ffind_coding_system (name);
570 if (NILP (coding_system))
571 signal_simple_error ("No such coding system", name);
572 return coding_system;
575 /* We store the coding systems in hash tables with the names as the key and the
576 actual coding system object as the value. Occasionally we need to use them
577 in a list format. These routines provide us with that. */
578 struct coding_system_list_closure
580 Lisp_Object *coding_system_list;
584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
585 void *coding_system_list_closure)
587 /* This function can GC */
588 struct coding_system_list_closure *cscl =
589 (struct coding_system_list_closure *) coding_system_list_closure;
590 Lisp_Object *coding_system_list = cscl->coding_system_list;
592 *coding_system_list = Fcons (key, *coding_system_list);
596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
597 Return a list of the names of all defined coding systems.
601 Lisp_Object coding_system_list = Qnil;
603 struct coding_system_list_closure coding_system_list_closure;
605 GCPRO1 (coding_system_list);
606 coding_system_list_closure.coding_system_list = &coding_system_list;
607 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
608 &coding_system_list_closure);
611 return coding_system_list;
614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
615 Return the name of the given coding system.
619 coding_system = Fget_coding_system (coding_system);
620 return XCODING_SYSTEM_NAME (coding_system);
623 static Lisp_Coding_System *
624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
626 Lisp_Coding_System *codesys =
627 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
629 zero_lcrecord (codesys);
630 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
631 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
632 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
633 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
634 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
635 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
636 CODING_SYSTEM_TYPE (codesys) = type;
637 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
639 if (type == CODESYS_ISO2022)
642 for (i = 0; i < 4; i++)
643 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
645 else if (type == CODESYS_CCL)
647 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
648 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
651 CODING_SYSTEM_NAME (codesys) = name;
657 /* Given a list of charset conversion specs as specified in a Lisp
658 program, parse it into STORE_HERE. */
661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
662 Lisp_Object spec_list)
666 EXTERNAL_LIST_LOOP (rest, spec_list)
668 Lisp_Object car = XCAR (rest);
669 Lisp_Object from, to;
670 struct charset_conversion_spec spec;
672 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
673 signal_simple_error ("Invalid charset conversion spec", car);
674 from = Fget_charset (XCAR (car));
675 to = Fget_charset (XCAR (XCDR (car)));
676 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
677 signal_simple_error_2
678 ("Attempted conversion between different charset types",
680 spec.from_charset = from;
681 spec.to_charset = to;
683 Dynarr_add (store_here, spec);
687 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
688 specs, return the equivalent as the Lisp programmer would see it.
690 If LOAD_HERE is 0, return Qnil. */
693 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
700 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
702 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
703 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
706 return Fnreverse (result);
711 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
712 Register symbol NAME as a coding system.
714 TYPE describes the conversion method used and should be one of
717 Automatic conversion. XEmacs attempts to detect the coding system
720 No conversion. Use this for binary files and such. On output,
721 graphic characters that are not in ASCII or Latin-1 will be
722 replaced by a ?. (For a no-conversion-encoded buffer, these
723 characters will only be present if you explicitly insert them.)
725 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
727 ISO 10646 UCS-4 encoding.
729 ISO 10646 UTF-8 encoding.
731 Any ISO2022-compliant encoding. Among other things, this includes
732 JIS (the Japanese encoding commonly used for e-mail), EUC (the
733 standard Unix encoding for Japanese and other languages), and
734 Compound Text (the encoding used in X11). You can specify more
735 specific information about the conversion with the FLAGS argument.
737 Big5 (the encoding commonly used for Taiwanese).
739 The conversion is performed using a user-written pseudo-code
740 program. CCL (Code Conversion Language) is the name of this
743 Write out or read in the raw contents of the memory representing
744 the buffer's text. This is primarily useful for debugging
745 purposes, and is only enabled when XEmacs has been compiled with
746 DEBUG_XEMACS defined (via the --debug configure option).
747 WARNING: Reading in a file using 'internal conversion can result
748 in an internal inconsistency in the memory representing a
749 buffer's text, which will produce unpredictable results and may
750 cause XEmacs to crash. Under normal circumstances you should
751 never use 'internal conversion.
753 DOC-STRING is a string describing the coding system.
755 PROPS is a property list, describing the specific nature of the
756 character set. Recognized properties are:
759 String to be displayed in the modeline when this coding system is
763 End-of-line conversion to be used. It should be one of
766 Automatically detect the end-of-line type (LF, CRLF,
767 or CR). Also generate subsidiary coding systems named
768 `NAME-unix', `NAME-dos', and `NAME-mac', that are
769 identical to this coding system but have an EOL-TYPE
770 value of 'lf, 'crlf, and 'cr, respectively.
772 The end of a line is marked externally using ASCII LF.
773 Since this is also the way that XEmacs represents an
774 end-of-line internally, specifying this option results
775 in no end-of-line conversion. This is the standard
776 format for Unix text files.
778 The end of a line is marked externally using ASCII
779 CRLF. This is the standard format for MS-DOS text
782 The end of a line is marked externally using ASCII CR.
783 This is the standard format for Macintosh text files.
785 Automatically detect the end-of-line type but do not
786 generate subsidiary coding systems. (This value is
787 converted to nil when stored internally, and
788 `coding-system-property' will return nil.)
790 'post-read-conversion
791 Function called after a file has been read in, to perform the
792 decoding. Called with two arguments, BEG and END, denoting
793 a region of the current buffer to be decoded.
795 'pre-write-conversion
796 Function called before a file is written out, to perform the
797 encoding. Called with two arguments, BEG and END, denoting
798 a region of the current buffer to be encoded.
801 The following additional properties are recognized if TYPE is 'iso2022:
807 The character set initially designated to the G0 - G3 registers.
808 The value should be one of
810 -- A charset object (designate that character set)
811 -- nil (do not ever use this register)
812 -- t (no character set is initially designated to
813 the register, but may be later on; this automatically
814 sets the corresponding `force-g*-on-output' property)
820 If non-nil, send an explicit designation sequence on output before
821 using the specified register.
824 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
825 "ESC $ B" on output in place of the full designation sequences
826 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
829 If non-nil, don't designate ASCII to G0 at each end of line on output.
830 Setting this to non-nil also suppresses other state-resetting that
831 normally happens at the end of a line.
834 If non-nil, don't designate ASCII to G0 before control chars on output.
837 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
841 If non-nil, use locking-shift (SO/SI) instead of single-shift
842 or designation by escape sequence.
845 If non-nil, don't use ISO6429's direction specification.
848 If non-nil, literal control characters that are the same as
849 the beginning of a recognized ISO2022 or ISO6429 escape sequence
850 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
851 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
852 so that they can be properly distinguished from an escape sequence.
853 (Note that doing this results in a non-portable encoding.) This
854 encoding flag is used for byte-compiled files. Note that ESC
855 is a good choice for a quoting character because there are no
856 escape sequences whose second byte is a character from the Control-0
857 or Control-1 character sets; this is explicitly disallowed by the
860 'input-charset-conversion
861 A list of conversion specifications, specifying conversion of
862 characters in one charset to another when decoding is performed.
863 Each specification is a list of two elements: the source charset,
864 and the destination charset.
866 'output-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when encoding is performed.
869 The form of each specification is the same as for
870 'input-charset-conversion.
873 The following additional properties are recognized (and required)
877 CCL program used for decoding (converting to internal format).
880 CCL program used for encoding (converting to external format).
882 (name, type, doc_string, props))
884 Lisp_Coding_System *codesys;
885 Lisp_Object rest, key, value;
886 enum coding_system_type ty;
887 int need_to_setup_eol_systems = 1;
889 /* Convert type to constant */
890 if (NILP (type) || EQ (type, Qundecided))
891 { ty = CODESYS_AUTODETECT; }
893 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
894 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
895 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
896 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
897 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
898 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
900 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
902 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
905 signal_simple_error ("Invalid coding system type", type);
909 codesys = allocate_coding_system (ty, name);
911 if (NILP (doc_string))
912 doc_string = build_string ("");
914 CHECK_STRING (doc_string);
915 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
917 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
919 if (EQ (key, Qmnemonic))
922 CHECK_STRING (value);
923 CODING_SYSTEM_MNEMONIC (codesys) = value;
926 else if (EQ (key, Qeol_type))
928 need_to_setup_eol_systems = NILP (value);
931 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
934 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
935 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
937 else if (ty == CODESYS_ISO2022)
939 #define FROB_INITIAL_CHARSET(charset_num) \
940 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
941 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
943 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
944 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
945 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
946 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
948 #define FROB_FORCE_CHARSET(charset_num) \
949 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
951 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
952 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
953 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
954 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
956 #define FROB_BOOLEAN_PROPERTY(prop) \
957 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
959 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
960 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
961 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
962 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
963 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
964 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
965 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
967 else if (EQ (key, Qinput_charset_conversion))
969 codesys->iso2022.input_conv =
970 Dynarr_new (charset_conversion_spec);
971 parse_charset_conversion_specs (codesys->iso2022.input_conv,
974 else if (EQ (key, Qoutput_charset_conversion))
976 codesys->iso2022.output_conv =
977 Dynarr_new (charset_conversion_spec);
978 parse_charset_conversion_specs (codesys->iso2022.output_conv,
982 signal_simple_error ("Unrecognized property", key);
984 else if (EQ (type, Qccl))
986 if (EQ (key, Qdecode))
988 CHECK_VECTOR (value);
989 CODING_SYSTEM_CCL_DECODE (codesys) = value;
991 else if (EQ (key, Qencode))
993 CHECK_VECTOR (value);
994 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
997 signal_simple_error ("Unrecognized property", key);
1001 signal_simple_error ("Unrecognized property", key);
1004 if (need_to_setup_eol_systems)
1005 setup_eol_coding_systems (codesys);
1008 Lisp_Object codesys_obj;
1009 XSETCODING_SYSTEM (codesys_obj, codesys);
1010 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1015 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1016 Copy OLD-CODING-SYSTEM to NEW-NAME.
1017 If NEW-NAME does not name an existing coding system, a new one will
1020 (old_coding_system, new_name))
1022 Lisp_Object new_coding_system;
1023 old_coding_system = Fget_coding_system (old_coding_system);
1024 new_coding_system = Ffind_coding_system (new_name);
1025 if (NILP (new_coding_system))
1027 XSETCODING_SYSTEM (new_coding_system,
1028 allocate_coding_system
1029 (XCODING_SYSTEM_TYPE (old_coding_system),
1031 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1035 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1036 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1037 memcpy (((char *) to ) + sizeof (to->header),
1038 ((char *) from) + sizeof (from->header),
1039 sizeof (*from) - sizeof (from->header));
1040 to->name = new_name;
1042 return new_coding_system;
1045 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1046 Return t if OBJECT names a coding system, and is not a coding system alias.
1050 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1054 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1055 Return t if OBJECT is a coding system alias.
1056 All coding system aliases are created by `define-coding-system-alias'.
1060 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1064 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1065 Return the coding-system symbol for which symbol ALIAS is an alias.
1069 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1070 if (SYMBOLP (aliasee))
1073 signal_simple_error ("Symbol is not a coding system alias", alias);
1077 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1079 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1083 /* A maphash function, for removing dangling coding system aliases. */
1085 dangling_coding_system_alias_p (Lisp_Object alias,
1086 Lisp_Object aliasee,
1087 void *dangling_aliases)
1089 if (SYMBOLP (aliasee)
1090 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1092 (*(int *) dangling_aliases)++;
1099 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1100 Define symbol ALIAS as an alias for coding system ALIASEE.
1102 You can use this function to redefine an alias that has already been defined,
1103 but you cannot redefine a name which is the canonical name for a coding system.
1104 \(a canonical name of a coding system is what is returned when you call
1105 `coding-system-name' on a coding system).
1107 ALIASEE itself can be an alias, which allows you to define nested aliases.
1109 You are forbidden, however, from creating alias loops or `dangling' aliases.
1110 These will be detected, and an error will be signaled if you attempt to do so.
1112 If ALIASEE is nil, then ALIAS will simply be undefined.
1114 See also `coding-system-alias-p', `coding-system-aliasee',
1115 and `coding-system-canonical-name-p'.
1119 Lisp_Object real_coding_system, probe;
1121 CHECK_SYMBOL (alias);
1123 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1125 ("Symbol is the canonical name of a coding system and cannot be redefined",
1130 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1131 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1132 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1134 Fremhash (alias, Vcoding_system_hash_table);
1136 /* Undefine subsidiary aliases,
1137 presumably created by a previous call to this function */
1138 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1139 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1140 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1142 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1143 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1144 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1147 /* Undefine dangling coding system aliases. */
1149 int dangling_aliases;
1152 dangling_aliases = 0;
1153 elisp_map_remhash (dangling_coding_system_alias_p,
1154 Vcoding_system_hash_table,
1156 } while (dangling_aliases > 0);
1162 if (CODING_SYSTEMP (aliasee))
1163 aliasee = XCODING_SYSTEM_NAME (aliasee);
1165 /* Checks that aliasee names a coding-system */
1166 real_coding_system = Fget_coding_system (aliasee);
1168 /* Check for coding system alias loops */
1169 if (EQ (alias, aliasee))
1170 alias_loop: signal_simple_error_2
1171 ("Attempt to create a coding system alias loop", alias, aliasee);
1173 for (probe = aliasee;
1175 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1177 if (EQ (probe, alias))
1181 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1183 /* Set up aliases for subsidiaries.
1184 #### There must be a better way to handle subsidiary coding systems. */
1186 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1188 for (i = 0; i < countof (suffixes); i++)
1190 Lisp_Object alias_subsidiary =
1191 append_suffix_to_symbol (alias, suffixes[i]);
1192 Lisp_Object aliasee_subsidiary =
1193 append_suffix_to_symbol (aliasee, suffixes[i]);
1195 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1196 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1199 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1200 but it doesn't look intentional, so I'd rather return something
1201 meaningful or nothing at all. */
1206 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1208 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1209 Lisp_Object new_coding_system;
1211 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1212 return coding_system;
1216 case EOL_AUTODETECT: return coding_system;
1217 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1218 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1219 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1223 return NILP (new_coding_system) ? coding_system : new_coding_system;
1226 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1227 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1229 (coding_system, eol_type))
1231 coding_system = Fget_coding_system (coding_system);
1233 return subsidiary_coding_system (coding_system,
1234 symbol_to_eol_type (eol_type));
1238 /************************************************************************/
1239 /* Coding system accessors */
1240 /************************************************************************/
1242 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1243 Return the doc string for CODING-SYSTEM.
1247 coding_system = Fget_coding_system (coding_system);
1248 return XCODING_SYSTEM_DOC_STRING (coding_system);
1251 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1252 Return the type of CODING-SYSTEM.
1256 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1259 case CODESYS_AUTODETECT: return Qundecided;
1261 case CODESYS_SHIFT_JIS: return Qshift_jis;
1262 case CODESYS_ISO2022: return Qiso2022;
1263 case CODESYS_BIG5: return Qbig5;
1264 case CODESYS_UCS4: return Qucs4;
1265 case CODESYS_UTF8: return Qutf8;
1266 case CODESYS_CCL: return Qccl;
1268 case CODESYS_NO_CONVERSION: return Qno_conversion;
1270 case CODESYS_INTERNAL: return Qinternal;
1277 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1280 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1282 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1285 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1286 Return initial charset of CODING-SYSTEM designated to GNUM.
1289 (coding_system, gnum))
1291 coding_system = Fget_coding_system (coding_system);
1294 return coding_system_charset (coding_system, XINT (gnum));
1298 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1299 Return the PROP property of CODING-SYSTEM.
1301 (coding_system, prop))
1304 enum coding_system_type type;
1306 coding_system = Fget_coding_system (coding_system);
1307 CHECK_SYMBOL (prop);
1308 type = XCODING_SYSTEM_TYPE (coding_system);
1310 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1311 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1314 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1316 case CODESYS_PROP_ALL_OK:
1319 case CODESYS_PROP_ISO2022:
1320 if (type != CODESYS_ISO2022)
1322 ("Property only valid in ISO2022 coding systems",
1326 case CODESYS_PROP_CCL:
1327 if (type != CODESYS_CCL)
1329 ("Property only valid in CCL coding systems",
1339 signal_simple_error ("Unrecognized property", prop);
1341 if (EQ (prop, Qname))
1342 return XCODING_SYSTEM_NAME (coding_system);
1343 else if (EQ (prop, Qtype))
1344 return Fcoding_system_type (coding_system);
1345 else if (EQ (prop, Qdoc_string))
1346 return XCODING_SYSTEM_DOC_STRING (coding_system);
1347 else if (EQ (prop, Qmnemonic))
1348 return XCODING_SYSTEM_MNEMONIC (coding_system);
1349 else if (EQ (prop, Qeol_type))
1350 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1351 else if (EQ (prop, Qeol_lf))
1352 return XCODING_SYSTEM_EOL_LF (coding_system);
1353 else if (EQ (prop, Qeol_crlf))
1354 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1355 else if (EQ (prop, Qeol_cr))
1356 return XCODING_SYSTEM_EOL_CR (coding_system);
1357 else if (EQ (prop, Qpost_read_conversion))
1358 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1359 else if (EQ (prop, Qpre_write_conversion))
1360 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1362 else if (type == CODESYS_ISO2022)
1364 if (EQ (prop, Qcharset_g0))
1365 return coding_system_charset (coding_system, 0);
1366 else if (EQ (prop, Qcharset_g1))
1367 return coding_system_charset (coding_system, 1);
1368 else if (EQ (prop, Qcharset_g2))
1369 return coding_system_charset (coding_system, 2);
1370 else if (EQ (prop, Qcharset_g3))
1371 return coding_system_charset (coding_system, 3);
1373 #define FORCE_CHARSET(charset_num) \
1374 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1375 (coding_system, charset_num) ? Qt : Qnil)
1377 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1378 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1379 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1380 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1382 #define LISP_BOOLEAN(prop) \
1383 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1385 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1386 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1387 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1388 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1389 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1390 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1391 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1393 else if (EQ (prop, Qinput_charset_conversion))
1395 unparse_charset_conversion_specs
1396 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1397 else if (EQ (prop, Qoutput_charset_conversion))
1399 unparse_charset_conversion_specs
1400 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1404 else if (type == CODESYS_CCL)
1406 if (EQ (prop, Qdecode))
1407 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1408 else if (EQ (prop, Qencode))
1409 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1417 return Qnil; /* not reached */
1421 /************************************************************************/
1422 /* Coding category functions */
1423 /************************************************************************/
1426 decode_coding_category (Lisp_Object symbol)
1430 CHECK_SYMBOL (symbol);
1431 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1432 if (EQ (coding_category_symbol[i], symbol))
1435 signal_simple_error ("Unrecognized coding category", symbol);
1436 return 0; /* not reached */
1439 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1440 Return a list of all recognized coding categories.
1445 Lisp_Object list = Qnil;
1447 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1448 list = Fcons (coding_category_symbol[i], list);
1452 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1453 Change the priority order of the coding categories.
1454 LIST should be list of coding categories, in descending order of
1455 priority. Unspecified coding categories will be lower in priority
1456 than all specified ones, in the same relative order they were in
1461 int category_to_priority[CODING_CATEGORY_LAST + 1];
1465 /* First generate a list that maps coding categories to priorities. */
1467 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1468 category_to_priority[i] = -1;
1470 /* Highest priority comes from the specified list. */
1472 EXTERNAL_LIST_LOOP (rest, list)
1474 int cat = decode_coding_category (XCAR (rest));
1476 if (category_to_priority[cat] >= 0)
1477 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1478 category_to_priority[cat] = i++;
1481 /* Now go through the existing categories by priority to retrieve
1482 the categories not yet specified and preserve their priority
1484 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1486 int cat = fcd->coding_category_by_priority[j];
1487 if (category_to_priority[cat] < 0)
1488 category_to_priority[cat] = i++;
1491 /* Now we need to construct the inverse of the mapping we just
1494 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1495 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1497 /* Phew! That was confusing. */
1501 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1502 Return a list of coding categories in descending order of priority.
1507 Lisp_Object list = Qnil;
1509 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1510 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1515 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1516 Change the coding system associated with a coding category.
1518 (coding_category, coding_system))
1520 int cat = decode_coding_category (coding_category);
1522 coding_system = Fget_coding_system (coding_system);
1523 fcd->coding_category_system[cat] = coding_system;
1527 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1528 Return the coding system associated with a coding category.
1532 int cat = decode_coding_category (coding_category);
1533 Lisp_Object sys = fcd->coding_category_system[cat];
1536 return XCODING_SYSTEM_NAME (sys);
1541 /************************************************************************/
1542 /* Detecting the encoding of data */
1543 /************************************************************************/
1545 struct detection_state
1547 eol_type_t eol_type;
1583 struct iso2022_decoder iso;
1585 int high_byte_count;
1586 unsigned int saw_single_shift:1;
1599 acceptable_control_char_p (int c)
1603 /* Allow and ignore control characters that you might
1604 reasonably see in a text file */
1609 case 8: /* backspace */
1610 case 11: /* vertical tab */
1611 case 12: /* form feed */
1612 case 26: /* MS-DOS C-z junk */
1613 case 31: /* '^_' -- for info */
1621 mask_has_at_most_one_bit_p (int mask)
1623 /* Perhaps the only thing useful you learn from intensive Microsoft
1624 technical interviews */
1625 return (mask & (mask - 1)) == 0;
1629 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1639 if (st->eol.just_saw_cr)
1641 else if (st->eol.seen_anything)
1644 else if (st->eol.just_saw_cr)
1647 st->eol.just_saw_cr = 1;
1649 st->eol.just_saw_cr = 0;
1650 st->eol.seen_anything = 1;
1653 return EOL_AUTODETECT;
1656 /* Attempt to determine the encoding and EOL type of the given text.
1657 Before calling this function for the first type, you must initialize
1658 st->eol_type as appropriate and initialize st->mask to ~0.
1660 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1663 st->mask holds the determined coding category mask, or ~0 if only
1664 ASCII has been seen so far.
1668 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1669 is present in st->mask
1670 1 == definitive answers are here for both st->eol_type and st->mask
1674 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1675 unsigned int n, int just_do_eol)
1679 if (st->eol_type == EOL_AUTODETECT)
1680 st->eol_type = detect_eol_type (st, src, n);
1683 return st->eol_type != EOL_AUTODETECT;
1685 if (!st->seen_non_ascii)
1687 for (; n; n--, src++)
1690 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1692 st->seen_non_ascii = 1;
1694 st->shift_jis.mask = ~0;
1698 st->iso2022.mask = ~0;
1708 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1709 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1710 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1711 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1712 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1713 st->big5.mask = detect_coding_big5 (st, src, n);
1714 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1715 st->utf8.mask = detect_coding_utf8 (st, src, n);
1716 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1717 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1720 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1721 | st->utf8.mask | st->ucs4.mask;
1724 int retval = mask_has_at_most_one_bit_p (st->mask);
1725 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1726 return retval && st->eol_type != EOL_AUTODETECT;
1731 coding_system_from_mask (int mask)
1735 /* If the file was entirely or basically ASCII, use the
1736 default value of `buffer-file-coding-system'. */
1737 Lisp_Object retval =
1738 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1741 retval = Ffind_coding_system (retval);
1745 (Qbad_variable, Qwarning,
1746 "Invalid `default-buffer-file-coding-system', set to nil");
1747 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1751 retval = Fget_coding_system (Qraw_text);
1759 mask = postprocess_iso2022_mask (mask);
1761 /* Look through the coding categories by priority and find
1762 the first one that is allowed. */
1763 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1765 cat = fcd->coding_category_by_priority[i];
1766 if ((mask & (1 << cat)) &&
1767 !NILP (fcd->coding_category_system[cat]))
1771 return fcd->coding_category_system[cat];
1773 return Fget_coding_system (Qraw_text);
1777 /* Given a seekable read stream and potential coding system and EOL type
1778 as specified, do any autodetection that is called for. If the
1779 coding system and/or EOL type are not `autodetect', they will be left
1780 alone; but this function will never return an autodetect coding system
1783 This function does not automatically fetch subsidiary coding systems;
1784 that should be unnecessary with the explicit eol-type argument. */
1786 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1789 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1790 eol_type_t *eol_type_in_out)
1792 struct detection_state decst;
1794 if (*eol_type_in_out == EOL_AUTODETECT)
1795 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1798 decst.eol_type = *eol_type_in_out;
1801 /* If autodetection is called for, do it now. */
1802 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1803 || *eol_type_in_out == EOL_AUTODETECT)
1806 Lisp_Object coding_system = Qnil;
1808 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1811 /* Look for initial "-*-"; mode line prefix */
1813 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1818 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1820 Extbyte *local_vars_beg = p + 3;
1821 /* Look for final "-*-"; mode line suffix */
1822 for (p = local_vars_beg,
1823 scan_end = buf + nread - LENGTH ("-*-");
1828 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1830 Extbyte *suffix = p;
1831 /* Look for "coding:" */
1832 for (p = local_vars_beg,
1833 scan_end = suffix - LENGTH ("coding:?");
1836 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1837 && (p == local_vars_beg
1838 || (*(p-1) == ' ' ||
1844 p += LENGTH ("coding:");
1845 while (*p == ' ' || *p == '\t') p++;
1847 /* Get coding system name */
1848 save = *suffix; *suffix = '\0';
1849 /* Characters valid in a MIME charset name (rfc 1521),
1850 and in a Lisp symbol name. */
1851 n = strspn ( (char *) p,
1852 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1853 "abcdefghijklmnopqrstuvwxyz"
1859 save = p[n]; p[n] = '\0';
1861 Ffind_coding_system (intern ((char *) p));
1871 if (NILP (coding_system))
1874 if (detect_coding_type (&decst, buf, nread,
1875 XCODING_SYSTEM_TYPE (*codesys_in_out)
1876 != CODESYS_AUTODETECT))
1878 nread = Lstream_read (stream, buf, sizeof (buf));
1884 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1885 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1888 if (detect_coding_type (&decst, buf, nread, 1))
1890 nread = Lstream_read (stream, buf, sizeof (buf));
1896 *eol_type_in_out = decst.eol_type;
1897 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1899 if (NILP (coding_system))
1900 *codesys_in_out = coding_system_from_mask (decst.mask);
1902 *codesys_in_out = coding_system;
1906 /* If we absolutely can't determine the EOL type, just assume LF. */
1907 if (*eol_type_in_out == EOL_AUTODETECT)
1908 *eol_type_in_out = EOL_LF;
1910 Lstream_rewind (stream);
1913 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1914 Detect coding system of the text in the region between START and END.
1915 Returned a list of possible coding systems ordered by priority.
1916 If only ASCII characters are found, it returns 'undecided or one of
1917 its subsidiary coding systems according to a detected end-of-line
1918 type. Optional arg BUFFER defaults to the current buffer.
1920 (start, end, buffer))
1922 Lisp_Object val = Qnil;
1923 struct buffer *buf = decode_buffer (buffer, 0);
1925 Lisp_Object instream, lb_instream;
1926 Lstream *istr, *lb_istr;
1927 struct detection_state decst;
1928 struct gcpro gcpro1, gcpro2;
1930 get_buffer_range_char (buf, start, end, &b, &e, 0);
1931 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1932 lb_istr = XLSTREAM (lb_instream);
1933 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1934 istr = XLSTREAM (instream);
1935 GCPRO2 (instream, lb_instream);
1937 decst.eol_type = EOL_AUTODETECT;
1941 unsigned char random_buffer[4096];
1942 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1946 if (detect_coding_type (&decst, random_buffer, nread, 0))
1950 if (decst.mask == ~0)
1951 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1959 decst.mask = postprocess_iso2022_mask (decst.mask);
1961 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1963 int sys = fcd->coding_category_by_priority[i];
1964 if (decst.mask & (1 << sys))
1966 Lisp_Object codesys = fcd->coding_category_system[sys];
1967 if (!NILP (codesys))
1968 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1969 val = Fcons (codesys, val);
1973 Lstream_close (istr);
1975 Lstream_delete (istr);
1976 Lstream_delete (lb_istr);
1981 /************************************************************************/
1982 /* Converting to internal Mule format ("decoding") */
1983 /************************************************************************/
1985 /* A decoding stream is a stream used for decoding text (i.e.
1986 converting from some external format to internal format).
1987 The decoding-stream object keeps track of the actual coding
1988 stream, the stream that is at the other end, and data that
1989 needs to be persistent across the lifetime of the stream. */
1991 /* Handle the EOL stuff related to just-read-in character C.
1992 EOL_TYPE is the EOL type of the coding stream.
1993 FLAGS is the current value of FLAGS in the coding stream, and may
1994 be modified by this macro. (The macro only looks at the
1995 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1996 bytes are to be written. You need to also define a local goto
1997 label "label_continue_loop" that is at the end of the main
1998 character-reading loop.
2000 If C is a CR character, then this macro handles it entirely and
2001 jumps to label_continue_loop. Otherwise, this macro does not add
2002 anything to DST, and continues normally. You should continue
2003 processing C normally after this macro. */
2005 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2009 if (eol_type == EOL_CR) \
2010 Dynarr_add (dst, '\n'); \
2011 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2012 Dynarr_add (dst, c); \
2014 flags |= CODING_STATE_CR; \
2015 goto label_continue_loop; \
2017 else if (flags & CODING_STATE_CR) \
2018 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2020 Dynarr_add (dst, '\r'); \
2021 flags &= ~CODING_STATE_CR; \
2025 /* C should be a binary character in the range 0 - 255; convert
2026 to internal format and add to Dynarr DST. */
2029 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2031 if (BYTE_ASCII_P (c)) \
2032 Dynarr_add (dst, c); \
2035 Dynarr_add (dst, (c >> 6) | 0xc0); \
2036 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2041 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2045 Dynarr_add (dst, c);
2047 else if ( c <= 0x7ff )
2049 Dynarr_add (dst, (c >> 6) | 0xc0);
2050 Dynarr_add (dst, (c & 0x3f) | 0x80);
2052 else if ( c <= 0xffff )
2054 Dynarr_add (dst, (c >> 12) | 0xe0);
2055 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2056 Dynarr_add (dst, (c & 0x3f) | 0x80);
2058 else if ( c <= 0x1fffff )
2060 Dynarr_add (dst, (c >> 18) | 0xf0);
2061 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2062 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2063 Dynarr_add (dst, (c & 0x3f) | 0x80);
2065 else if ( c <= 0x3ffffff )
2067 Dynarr_add (dst, (c >> 24) | 0xf8);
2068 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2069 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2070 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2071 Dynarr_add (dst, (c & 0x3f) | 0x80);
2075 Dynarr_add (dst, (c >> 30) | 0xfc);
2076 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2077 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2078 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2079 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2080 Dynarr_add (dst, (c & 0x3f) | 0x80);
2084 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2086 if (BYTE_ASCII_P (c)) \
2087 Dynarr_add (dst, c); \
2088 else if (BYTE_C1_P (c)) \
2090 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2091 Dynarr_add (dst, c + 0x20); \
2095 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2096 Dynarr_add (dst, c); \
2101 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2105 DECODE_ADD_BINARY_CHAR (ch, dst); \
2110 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2112 if (flags & CODING_STATE_END) \
2114 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2115 if (flags & CODING_STATE_CR) \
2116 Dynarr_add (dst, '\r'); \
2120 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2122 struct decoding_stream
2124 /* Coding system that governs the conversion. */
2125 Lisp_Coding_System *codesys;
2127 /* Stream that we read the encoded data from or
2128 write the decoded data to. */
2131 /* If we are reading, then we can return only a fixed amount of
2132 data, so if the conversion resulted in too much data, we store it
2133 here for retrieval the next time around. */
2134 unsigned_char_dynarr *runoff;
2136 /* FLAGS holds flags indicating the current state of the decoding.
2137 Some of these flags are dependent on the coding system. */
2140 /* CH holds a partially built-up character. Since we only deal
2141 with one- and two-byte characters at the moment, we only use
2142 this to store the first byte of a two-byte character. */
2145 /* EOL_TYPE specifies the type of end-of-line conversion that
2146 currently applies. We need to keep this separate from the
2147 EOL type stored in CODESYS because the latter might indicate
2148 automatic EOL-type detection while the former will always
2149 indicate a particular EOL type. */
2150 eol_type_t eol_type;
2152 /* Additional ISO2022 information. We define the structure above
2153 because it's also needed by the detection routines. */
2154 struct iso2022_decoder iso2022;
2156 /* Additional information (the state of the running CCL program)
2157 used by the CCL decoder. */
2158 struct ccl_program ccl;
2160 /* counter for UTF-8 or UCS-4 */
2161 unsigned char counter;
2163 struct detection_state decst;
2166 static ssize_t decoding_reader (Lstream *stream,
2167 unsigned char *data, size_t size);
2168 static ssize_t decoding_writer (Lstream *stream,
2169 CONST unsigned char *data, size_t size);
2170 static int decoding_rewinder (Lstream *stream);
2171 static int decoding_seekable_p (Lstream *stream);
2172 static int decoding_flusher (Lstream *stream);
2173 static int decoding_closer (Lstream *stream);
2175 static Lisp_Object decoding_marker (Lisp_Object stream);
2177 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2178 sizeof (struct decoding_stream));
2181 decoding_marker (Lisp_Object stream)
2183 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2184 Lisp_Object str_obj;
2186 /* We do not need to mark the coding systems or charsets stored
2187 within the stream because they are stored in a global list
2188 and automatically marked. */
2190 XSETLSTREAM (str_obj, str);
2191 mark_object (str_obj);
2192 if (str->imp->marker)
2193 return (str->imp->marker) (str_obj);
2198 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2199 so we read data from the other end, decode it, and store it into DATA. */
2202 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2204 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2205 unsigned char *orig_data = data;
2207 int error_occurred = 0;
2209 /* We need to interface to mule_decode(), which expects to take some
2210 amount of data and store the result into a Dynarr. We have
2211 mule_decode() store into str->runoff, and take data from there
2214 /* We loop until we have enough data, reading chunks from the other
2215 end and decoding it. */
2218 /* Take data from the runoff if we can. Make sure to take at
2219 most SIZE bytes, and delete the data from the runoff. */
2220 if (Dynarr_length (str->runoff) > 0)
2222 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2223 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2224 Dynarr_delete_many (str->runoff, 0, chunk);
2230 break; /* No more room for data */
2232 if (str->flags & CODING_STATE_END)
2233 /* This means that on the previous iteration, we hit the EOF on
2234 the other end. We loop once more so that mule_decode() can
2235 output any final stuff it may be holding, or any "go back
2236 to a sane state" escape sequences. (This latter makes sense
2237 during encoding.) */
2240 /* Exhausted the runoff, so get some more. DATA has at least
2241 SIZE bytes left of storage in it, so it's OK to read directly
2242 into it. (We'll be overwriting above, after we've decoded it
2243 into the runoff.) */
2244 read_size = Lstream_read (str->other_end, data, size);
2251 /* There might be some more end data produced in the translation.
2252 See the comment above. */
2253 str->flags |= CODING_STATE_END;
2254 mule_decode (stream, data, str->runoff, read_size);
2257 if (data - orig_data == 0)
2258 return error_occurred ? -1 : 0;
2260 return data - orig_data;
2264 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2266 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2269 /* Decode all our data into the runoff, and then attempt to write
2270 it all out to the other end. Remove whatever chunk we succeeded
2272 mule_decode (stream, data, str->runoff, size);
2273 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2274 Dynarr_length (str->runoff));
2276 Dynarr_delete_many (str->runoff, 0, retval);
2277 /* Do NOT return retval. The return value indicates how much
2278 of the incoming data was written, not how many bytes were
2284 reset_decoding_stream (struct decoding_stream *str)
2287 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2289 Lisp_Object coding_system;
2290 XSETCODING_SYSTEM (coding_system, str->codesys);
2291 reset_iso2022 (coding_system, &str->iso2022);
2293 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2295 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2299 str->flags = str->ch = 0;
2303 decoding_rewinder (Lstream *stream)
2305 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2306 reset_decoding_stream (str);
2307 Dynarr_reset (str->runoff);
2308 return Lstream_rewind (str->other_end);
2312 decoding_seekable_p (Lstream *stream)
2314 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2315 return Lstream_seekable_p (str->other_end);
2319 decoding_flusher (Lstream *stream)
2321 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2322 return Lstream_flush (str->other_end);
2326 decoding_closer (Lstream *stream)
2328 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2329 if (stream->flags & LSTREAM_FL_WRITE)
2331 str->flags |= CODING_STATE_END;
2332 decoding_writer (stream, 0, 0);
2334 Dynarr_free (str->runoff);
2336 #ifdef ENABLE_COMPOSITE_CHARS
2337 if (str->iso2022.composite_chars)
2338 Dynarr_free (str->iso2022.composite_chars);
2341 return Lstream_close (str->other_end);
2345 decoding_stream_coding_system (Lstream *stream)
2347 Lisp_Object coding_system;
2348 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2350 XSETCODING_SYSTEM (coding_system, str->codesys);
2351 return subsidiary_coding_system (coding_system, str->eol_type);
2355 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2357 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2358 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2360 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2361 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2362 reset_decoding_stream (str);
2365 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2366 stream for writing, no automatic code detection will be performed.
2367 The reason for this is that automatic code detection requires a
2368 seekable input. Things will also fail if you open a decoding
2369 stream for reading using a non-fully-specified coding system and
2370 a non-seekable input stream. */
2373 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2376 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2377 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2381 str->other_end = stream;
2382 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2383 str->eol_type = EOL_AUTODETECT;
2384 if (!strcmp (mode, "r")
2385 && Lstream_seekable_p (stream))
2386 /* We can determine the coding system now. */
2387 determine_real_coding_system (stream, &codesys, &str->eol_type);
2388 set_decoding_stream_coding_system (lstr, codesys);
2389 str->decst.eol_type = str->eol_type;
2390 str->decst.mask = ~0;
2391 XSETLSTREAM (obj, lstr);
2396 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2398 return make_decoding_stream_1 (stream, codesys, "r");
2402 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2404 return make_decoding_stream_1 (stream, codesys, "w");
2407 /* Note: the decode_coding_* functions all take the same
2408 arguments as mule_decode(), which is to say some SRC data of
2409 size N, which is to be stored into dynamic array DST.
2410 DECODING is the stream within which the decoding is
2411 taking place, but no data is actually read from or
2412 written to that stream; that is handled in decoding_reader()
2413 or decoding_writer(). This allows the same functions to
2414 be used for both reading and writing. */
2417 mule_decode (Lstream *decoding, CONST unsigned char *src,
2418 unsigned_char_dynarr *dst, unsigned int n)
2420 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2422 /* If necessary, do encoding-detection now. We do this when
2423 we're a writing stream or a non-seekable reading stream,
2424 meaning that we can't just process the whole input,
2425 rewind, and start over. */
2427 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2428 str->eol_type == EOL_AUTODETECT)
2430 Lisp_Object codesys;
2432 XSETCODING_SYSTEM (codesys, str->codesys);
2433 detect_coding_type (&str->decst, src, n,
2434 CODING_SYSTEM_TYPE (str->codesys) !=
2435 CODESYS_AUTODETECT);
2436 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2437 str->decst.mask != ~0)
2438 /* #### This is cheesy. What we really ought to do is
2439 buffer up a certain amount of data so as to get a
2440 less random result. */
2441 codesys = coding_system_from_mask (str->decst.mask);
2442 str->eol_type = str->decst.eol_type;
2443 if (XCODING_SYSTEM (codesys) != str->codesys)
2445 /* Preserve the CODING_STATE_END flag in case it was set.
2446 If we erase it, bad things might happen. */
2447 int was_end = str->flags & CODING_STATE_END;
2448 set_decoding_stream_coding_system (decoding, codesys);
2450 str->flags |= CODING_STATE_END;
2454 switch (CODING_SYSTEM_TYPE (str->codesys))
2457 case CODESYS_INTERNAL:
2458 Dynarr_add_many (dst, src, n);
2461 case CODESYS_AUTODETECT:
2462 /* If we got this far and still haven't decided on the coding
2463 system, then do no conversion. */
2464 case CODESYS_NO_CONVERSION:
2465 decode_coding_no_conversion (decoding, src, dst, n);
2468 case CODESYS_SHIFT_JIS:
2469 decode_coding_sjis (decoding, src, dst, n);
2472 decode_coding_big5 (decoding, src, dst, n);
2475 decode_coding_ucs4 (decoding, src, dst, n);
2478 decode_coding_utf8 (decoding, src, dst, n);
2481 str->ccl.last_block = str->flags & CODING_STATE_END;
2482 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2484 case CODESYS_ISO2022:
2485 decode_coding_iso2022 (decoding, src, dst, n);
2493 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2494 Decode the text between START and END which is encoded in CODING-SYSTEM.
2495 This is useful if you've read in encoded text from a file without decoding
2496 it (e.g. you read in a JIS-formatted file but used the `binary' or
2497 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2498 Return length of decoded text.
2499 BUFFER defaults to the current buffer if unspecified.
2501 (start, end, coding_system, buffer))
2504 struct buffer *buf = decode_buffer (buffer, 0);
2505 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2506 Lstream *istr, *ostr;
2507 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2509 get_buffer_range_char (buf, start, end, &b, &e, 0);
2511 barf_if_buffer_read_only (buf, b, e);
2513 coding_system = Fget_coding_system (coding_system);
2514 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2515 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2516 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2518 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2519 Fget_coding_system (Qbinary));
2520 istr = XLSTREAM (instream);
2521 ostr = XLSTREAM (outstream);
2522 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2524 /* The chain of streams looks like this:
2526 [BUFFER] <----- send through
2527 ------> [ENCODE AS BINARY]
2528 ------> [DECODE AS SPECIFIED]
2534 char tempbuf[1024]; /* some random amount */
2535 Bufpos newpos, even_newer_pos;
2536 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2537 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2541 newpos = lisp_buffer_stream_startpos (istr);
2542 Lstream_write (ostr, tempbuf, size_in_bytes);
2543 even_newer_pos = lisp_buffer_stream_startpos (istr);
2544 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2547 Lstream_close (istr);
2548 Lstream_close (ostr);
2550 Lstream_delete (istr);
2551 Lstream_delete (ostr);
2552 Lstream_delete (XLSTREAM (de_outstream));
2553 Lstream_delete (XLSTREAM (lb_outstream));
2558 /************************************************************************/
2559 /* Converting to an external encoding ("encoding") */
2560 /************************************************************************/
2562 /* An encoding stream is an output stream. When you create the
2563 stream, you specify the coding system that governs the encoding
2564 and another stream that the resulting encoded data is to be
2565 sent to, and then start sending data to it. */
2567 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2569 struct encoding_stream
2571 /* Coding system that governs the conversion. */
2572 Lisp_Coding_System *codesys;
2574 /* Stream that we read the encoded data from or
2575 write the decoded data to. */
2578 /* If we are reading, then we can return only a fixed amount of
2579 data, so if the conversion resulted in too much data, we store it
2580 here for retrieval the next time around. */
2581 unsigned_char_dynarr *runoff;
2583 /* FLAGS holds flags indicating the current state of the encoding.
2584 Some of these flags are dependent on the coding system. */
2587 /* CH holds a partially built-up character. Since we only deal
2588 with one- and two-byte characters at the moment, we only use
2589 this to store the first byte of a two-byte character. */
2592 /* Additional information used by the ISO2022 encoder. */
2595 /* CHARSET holds the character sets currently assigned to the G0
2596 through G3 registers. It is initialized from the array
2597 INITIAL_CHARSET in CODESYS. */
2598 Lisp_Object charset[4];
2600 /* Which registers are currently invoked into the left (GL) and
2601 right (GR) halves of the 8-bit encoding space? */
2602 int register_left, register_right;
2604 /* Whether we need to explicitly designate the charset in the
2605 G? register before using it. It is initialized from the
2606 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2607 unsigned char force_charset_on_output[4];
2609 /* Other state variables that need to be preserved across
2611 Lisp_Object current_charset;
2613 int current_char_boundary;
2616 /* Additional information (the state of the running CCL program)
2617 used by the CCL encoder. */
2618 struct ccl_program ccl;
2622 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2623 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2625 static int encoding_rewinder (Lstream *stream);
2626 static int encoding_seekable_p (Lstream *stream);
2627 static int encoding_flusher (Lstream *stream);
2628 static int encoding_closer (Lstream *stream);
2630 static Lisp_Object encoding_marker (Lisp_Object stream);
2632 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2633 sizeof (struct encoding_stream));
2636 encoding_marker (Lisp_Object stream)
2638 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2639 Lisp_Object str_obj;
2641 /* We do not need to mark the coding systems or charsets stored
2642 within the stream because they are stored in a global list
2643 and automatically marked. */
2645 XSETLSTREAM (str_obj, str);
2646 mark_object (str_obj);
2647 if (str->imp->marker)
2648 return (str->imp->marker) (str_obj);
2653 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2654 so we read data from the other end, encode it, and store it into DATA. */
2657 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2659 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2660 unsigned char *orig_data = data;
2662 int error_occurred = 0;
2664 /* We need to interface to mule_encode(), which expects to take some
2665 amount of data and store the result into a Dynarr. We have
2666 mule_encode() store into str->runoff, and take data from there
2669 /* We loop until we have enough data, reading chunks from the other
2670 end and encoding it. */
2673 /* Take data from the runoff if we can. Make sure to take at
2674 most SIZE bytes, and delete the data from the runoff. */
2675 if (Dynarr_length (str->runoff) > 0)
2677 int chunk = min ((int) size, Dynarr_length (str->runoff));
2678 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2679 Dynarr_delete_many (str->runoff, 0, chunk);
2685 break; /* No more room for data */
2687 if (str->flags & CODING_STATE_END)
2688 /* This means that on the previous iteration, we hit the EOF on
2689 the other end. We loop once more so that mule_encode() can
2690 output any final stuff it may be holding, or any "go back
2691 to a sane state" escape sequences. (This latter makes sense
2692 during encoding.) */
2695 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2696 left of storage in it, so it's OK to read directly into it.
2697 (We'll be overwriting above, after we've encoded it into the
2699 read_size = Lstream_read (str->other_end, data, size);
2706 /* There might be some more end data produced in the translation.
2707 See the comment above. */
2708 str->flags |= CODING_STATE_END;
2709 mule_encode (stream, data, str->runoff, read_size);
2712 if (data == orig_data)
2713 return error_occurred ? -1 : 0;
2715 return data - orig_data;
2719 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2721 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2724 /* Encode all our data into the runoff, and then attempt to write
2725 it all out to the other end. Remove whatever chunk we succeeded
2727 mule_encode (stream, data, str->runoff, size);
2728 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2729 Dynarr_length (str->runoff));
2731 Dynarr_delete_many (str->runoff, 0, retval);
2732 /* Do NOT return retval. The return value indicates how much
2733 of the incoming data was written, not how many bytes were
2739 reset_encoding_stream (struct encoding_stream *str)
2742 switch (CODING_SYSTEM_TYPE (str->codesys))
2744 case CODESYS_ISO2022:
2748 for (i = 0; i < 4; i++)
2750 str->iso2022.charset[i] =
2751 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2752 str->iso2022.force_charset_on_output[i] =
2753 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2755 str->iso2022.register_left = 0;
2756 str->iso2022.register_right = 1;
2757 str->iso2022.current_charset = Qnil;
2758 str->iso2022.current_half = 0;
2760 str->iso2022.current_char_boundary = 0;
2762 str->iso2022.current_char_boundary = 1;
2767 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2774 str->flags = str->ch = 0;
2778 encoding_rewinder (Lstream *stream)
2780 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2781 reset_encoding_stream (str);
2782 Dynarr_reset (str->runoff);
2783 return Lstream_rewind (str->other_end);
2787 encoding_seekable_p (Lstream *stream)
2789 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2790 return Lstream_seekable_p (str->other_end);
2794 encoding_flusher (Lstream *stream)
2796 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2797 return Lstream_flush (str->other_end);
2801 encoding_closer (Lstream *stream)
2803 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2804 if (stream->flags & LSTREAM_FL_WRITE)
2806 str->flags |= CODING_STATE_END;
2807 encoding_writer (stream, 0, 0);
2809 Dynarr_free (str->runoff);
2810 return Lstream_close (str->other_end);
2814 encoding_stream_coding_system (Lstream *stream)
2816 Lisp_Object coding_system;
2817 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2819 XSETCODING_SYSTEM (coding_system, str->codesys);
2820 return coding_system;
2824 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2826 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2827 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2829 reset_encoding_stream (str);
2833 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2836 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2837 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2841 str->runoff = Dynarr_new (unsigned_char);
2842 str->other_end = stream;
2843 set_encoding_stream_coding_system (lstr, codesys);
2844 XSETLSTREAM (obj, lstr);
2849 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2851 return make_encoding_stream_1 (stream, codesys, "r");
2855 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2857 return make_encoding_stream_1 (stream, codesys, "w");
2860 /* Convert N bytes of internally-formatted data stored in SRC to an
2861 external format, according to the encoding stream ENCODING.
2862 Store the encoded data into DST. */
2865 mule_encode (Lstream *encoding, CONST unsigned char *src,
2866 unsigned_char_dynarr *dst, unsigned int n)
2868 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2870 switch (CODING_SYSTEM_TYPE (str->codesys))
2873 case CODESYS_INTERNAL:
2874 Dynarr_add_many (dst, src, n);
2877 case CODESYS_AUTODETECT:
2878 /* If we got this far and still haven't decided on the coding
2879 system, then do no conversion. */
2880 case CODESYS_NO_CONVERSION:
2881 encode_coding_no_conversion (encoding, src, dst, n);
2884 case CODESYS_SHIFT_JIS:
2885 encode_coding_sjis (encoding, src, dst, n);
2888 encode_coding_big5 (encoding, src, dst, n);
2891 encode_coding_ucs4 (encoding, src, dst, n);
2894 encode_coding_utf8 (encoding, src, dst, n);
2897 str->ccl.last_block = str->flags & CODING_STATE_END;
2898 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2900 case CODESYS_ISO2022:
2901 encode_coding_iso2022 (encoding, src, dst, n);
2909 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2910 Encode the text between START and END using CODING-SYSTEM.
2911 This will, for example, convert Japanese characters into stuff such as
2912 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2913 text. BUFFER defaults to the current buffer if unspecified.
2915 (start, end, coding_system, buffer))
2918 struct buffer *buf = decode_buffer (buffer, 0);
2919 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2920 Lstream *istr, *ostr;
2921 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2923 get_buffer_range_char (buf, start, end, &b, &e, 0);
2925 barf_if_buffer_read_only (buf, b, e);
2927 coding_system = Fget_coding_system (coding_system);
2928 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2929 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2930 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2931 Fget_coding_system (Qbinary));
2932 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2934 istr = XLSTREAM (instream);
2935 ostr = XLSTREAM (outstream);
2936 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2937 /* The chain of streams looks like this:
2939 [BUFFER] <----- send through
2940 ------> [ENCODE AS SPECIFIED]
2941 ------> [DECODE AS BINARY]
2946 char tempbuf[1024]; /* some random amount */
2947 Bufpos newpos, even_newer_pos;
2948 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2949 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2953 newpos = lisp_buffer_stream_startpos (istr);
2954 Lstream_write (ostr, tempbuf, size_in_bytes);
2955 even_newer_pos = lisp_buffer_stream_startpos (istr);
2956 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2962 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2963 Lstream_close (istr);
2964 Lstream_close (ostr);
2966 Lstream_delete (istr);
2967 Lstream_delete (ostr);
2968 Lstream_delete (XLSTREAM (de_outstream));
2969 Lstream_delete (XLSTREAM (lb_outstream));
2970 return make_int (retlen);
2976 /************************************************************************/
2977 /* Shift-JIS methods */
2978 /************************************************************************/
2980 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2981 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2982 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2983 encoded by "position-code + 0x80". A character of JISX0208
2984 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2985 position-codes are divided and shifted so that it fit in the range
2988 --- CODE RANGE of Shift-JIS ---
2989 (character set) (range)
2991 JISX0201-Kana 0xA0 .. 0xDF
2992 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2993 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2994 -------------------------------
2998 /* Is this the first byte of a Shift-JIS two-byte char? */
3000 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3001 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3003 /* Is this the second byte of a Shift-JIS two-byte char? */
3005 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3006 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3008 #define BYTE_SJIS_KATAKANA_P(c) \
3009 ((c) >= 0xA1 && (c) <= 0xDF)
3012 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
3020 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3022 if (st->shift_jis.in_second_byte)
3024 st->shift_jis.in_second_byte = 0;
3028 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3029 st->shift_jis.in_second_byte = 1;
3031 return CODING_CATEGORY_SHIFT_JIS_MASK;
3034 /* Convert Shift-JIS data to internal format. */
3037 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
3038 unsigned_char_dynarr *dst, unsigned int n)
3041 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3042 unsigned int flags = str->flags;
3043 unsigned int ch = str->ch;
3044 eol_type_t eol_type = str->eol_type;
3052 /* Previous character was first byte of Shift-JIS Kanji char. */
3053 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3055 unsigned char e1, e2;
3057 DECODE_SJIS (ch, c, e1, e2);
3059 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3063 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3064 Dynarr_add (dst, e1);
3065 Dynarr_add (dst, e2);
3070 DECODE_ADD_BINARY_CHAR (ch, dst);
3071 DECODE_ADD_BINARY_CHAR (c, dst);
3077 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3078 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3080 else if (BYTE_SJIS_KATAKANA_P (c))
3083 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3086 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3087 Dynarr_add (dst, c);
3091 DECODE_ADD_BINARY_CHAR (c, dst);
3093 label_continue_loop:;
3096 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3102 /* Convert internally-formatted data to Shift-JIS. */
3105 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
3106 unsigned_char_dynarr *dst, unsigned int n)
3109 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3110 unsigned int flags = str->flags;
3111 unsigned int ch = str->ch;
3112 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3114 unsigned char char_boundary = str->iso2022.current_char_boundary;
3121 switch (char_boundary)
3129 else if ( c >= 0xf8 )
3134 else if ( c >= 0xf0 )
3139 else if ( c >= 0xe0 )
3144 else if ( c >= 0xc0 )
3154 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3155 Dynarr_add (dst, '\r');
3156 if (eol_type != EOL_CR)
3157 Dynarr_add (dst, c);
3160 Dynarr_add (dst, c);
3165 ch = ( ch << 6 ) | ( c & 0x3f );
3167 Lisp_Object charset;
3168 unsigned int c1, c2, s1, s2;
3170 BREAKUP_CHAR (ch, charset, c1, c2);
3171 if (EQ(charset, Vcharset_katakana_jisx0201))
3173 Dynarr_add (dst, c1 | 0x80);
3175 else if (EQ(charset, Vcharset_japanese_jisx0208))
3177 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3178 Dynarr_add (dst, s1);
3179 Dynarr_add (dst, s2);
3185 ch = ( ch << 6 ) | ( c & 0x3f );
3191 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3192 Dynarr_add (dst, '\r');
3193 if (eol_type != EOL_CR)
3194 Dynarr_add (dst, '\n');
3197 else if (BYTE_ASCII_P (c))
3199 Dynarr_add (dst, c);
3202 else if (BUFBYTE_LEADING_BYTE_P (c))
3203 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3204 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3205 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3208 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3210 Dynarr_add (dst, c);
3213 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3214 ch == LEADING_BYTE_JAPANESE_JISX0208)
3218 unsigned char j1, j2;
3219 ENCODE_SJIS (ch, c, j1, j2);
3220 Dynarr_add (dst, j1);
3221 Dynarr_add (dst, j2);
3231 str->iso2022.current_char_boundary = char_boundary;
3235 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3236 Decode a JISX0208 character of Shift-JIS coding-system.
3237 CODE is the character code in Shift-JIS as a cons of type bytes.
3238 Return the corresponding character.
3242 unsigned char c1, c2, s1, s2;
3245 CHECK_INT (XCAR (code));
3246 CHECK_INT (XCDR (code));
3247 s1 = XINT (XCAR (code));
3248 s2 = XINT (XCDR (code));
3249 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3250 BYTE_SJIS_TWO_BYTE_2_P (s2))
3252 DECODE_SJIS (s1, s2, c1, c2);
3253 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3254 c1 & 0x7F, c2 & 0x7F));
3260 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3261 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3262 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3266 Lisp_Object charset;
3269 CHECK_CHAR_COERCE_INT (ch);
3270 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3271 if (EQ (charset, Vcharset_japanese_jisx0208))
3273 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3274 return Fcons (make_int (s1), make_int (s2));
3281 /************************************************************************/
3283 /************************************************************************/
3285 /* BIG5 is a coding system encoding two character sets: ASCII and
3286 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3287 character set and is encoded in two-byte.
3289 --- CODE RANGE of BIG5 ---
3290 (character set) (range)
3292 Big5 (1st byte) 0xA1 .. 0xFE
3293 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3294 --------------------------
3296 Since the number of characters in Big5 is larger than maximum
3297 characters in Emacs' charset (96x96), it can't be handled as one
3298 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3299 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3300 contains frequently used characters and the latter contains less
3301 frequently used characters. */
3303 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3304 ((c) >= 0xA1 && (c) <= 0xFE)
3306 /* Is this the second byte of a Shift-JIS two-byte char? */
3308 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3309 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3311 /* Number of Big5 characters which have the same code in 1st byte. */
3313 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3315 /* Code conversion macros. These are macros because they are used in
3316 inner loops during code conversion.
3318 Note that temporary variables in macros introduce the classic
3319 dynamic-scoping problems with variable names. We use capital-
3320 lettered variables in the assumption that XEmacs does not use
3321 capital letters in variables except in a very formalized way
3324 /* Convert Big5 code (b1, b2) into its internal string representation
3327 /* There is a much simpler way to split the Big5 charset into two.
3328 For the moment I'm going to leave the algorithm as-is because it
3329 claims to separate out the most-used characters into a single
3330 charset, which perhaps will lead to optimizations in various
3333 The way the algorithm works is something like this:
3335 Big5 can be viewed as a 94x157 charset, where the row is
3336 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3337 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3338 the split between low and high column numbers is apparently
3339 meaningless; ascending rows produce less and less frequent chars.
3340 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3341 the first charset, and the upper half (0xC9 .. 0xFE) to the
3342 second. To do the conversion, we convert the character into
3343 a single number where 0 .. 156 is the first row, 157 .. 313
3344 is the second, etc. That way, the characters are ordered by
3345 decreasing frequency. Then we just chop the space in two
3346 and coerce the result into a 94x94 space.
3349 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3351 int B1 = b1, B2 = b2; \
3353 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3357 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3361 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3362 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3364 c1 = I / (0xFF - 0xA1) + 0xA1; \
3365 c2 = I % (0xFF - 0xA1) + 0xA1; \
3368 /* Convert the internal string representation of a Big5 character
3369 (lb, c1, c2) into Big5 code (b1, b2). */
3371 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3373 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3375 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3377 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3379 b1 = I / BIG5_SAME_ROW + 0xA1; \
3380 b2 = I % BIG5_SAME_ROW; \
3381 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3385 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3393 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3394 (c >= 0x80 && c <= 0xA0))
3396 if (st->big5.in_second_byte)
3398 st->big5.in_second_byte = 0;
3399 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3403 st->big5.in_second_byte = 1;
3405 return CODING_CATEGORY_BIG5_MASK;
3408 /* Convert Big5 data to internal format. */
3411 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3412 unsigned_char_dynarr *dst, unsigned int n)
3415 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3416 unsigned int flags = str->flags;
3417 unsigned int ch = str->ch;
3418 eol_type_t eol_type = str->eol_type;
3425 /* Previous character was first byte of Big5 char. */
3426 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3428 unsigned char b1, b2, b3;
3429 DECODE_BIG5 (ch, c, b1, b2, b3);
3430 Dynarr_add (dst, b1);
3431 Dynarr_add (dst, b2);
3432 Dynarr_add (dst, b3);
3436 DECODE_ADD_BINARY_CHAR (ch, dst);
3437 DECODE_ADD_BINARY_CHAR (c, dst);
3443 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3444 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3447 DECODE_ADD_BINARY_CHAR (c, dst);
3449 label_continue_loop:;
3452 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3458 /* Convert internally-formatted data to Big5. */
3461 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3462 unsigned_char_dynarr *dst, unsigned int n)
3466 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3467 unsigned int flags = str->flags;
3468 unsigned int ch = str->ch;
3469 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3476 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3477 Dynarr_add (dst, '\r');
3478 if (eol_type != EOL_CR)
3479 Dynarr_add (dst, '\n');
3481 else if (BYTE_ASCII_P (c))
3484 Dynarr_add (dst, c);
3486 else if (BUFBYTE_LEADING_BYTE_P (c))
3488 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3489 c == LEADING_BYTE_CHINESE_BIG5_2)
3491 /* A recognized leading byte. */
3493 continue; /* not done with this character. */
3495 /* otherwise just ignore this character. */
3497 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3498 ch == LEADING_BYTE_CHINESE_BIG5_2)
3500 /* Previous char was a recognized leading byte. */
3502 continue; /* not done with this character. */
3506 /* Encountering second byte of a Big5 character. */
3507 unsigned char b1, b2;
3509 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3510 Dynarr_add (dst, b1);
3511 Dynarr_add (dst, b2);
3523 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3524 Decode a Big5 character CODE of BIG5 coding-system.
3525 CODE is the character code in BIG5, a cons of two integers.
3526 Return the corresponding character.
3530 unsigned char c1, c2, b1, b2;
3533 CHECK_INT (XCAR (code));
3534 CHECK_INT (XCDR (code));
3535 b1 = XINT (XCAR (code));
3536 b2 = XINT (XCDR (code));
3537 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3538 BYTE_BIG5_TWO_BYTE_2_P (b2))
3540 Charset_ID leading_byte;
3541 Lisp_Object charset;
3542 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3543 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3544 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3550 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3551 Encode the Big5 character CH to BIG5 coding-system.
3552 Return the corresponding character code in Big5.
3556 Lisp_Object charset;
3559 CHECK_CHAR_COERCE_INT (ch);
3560 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3561 if (EQ (charset, Vcharset_chinese_big5_1) ||
3562 EQ (charset, Vcharset_chinese_big5_2))
3564 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3566 return Fcons (make_int (b1), make_int (b2));
3573 /************************************************************************/
3576 /* UCS-4 character codes are implemented as nonnegative integers. */
3578 /************************************************************************/
3581 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3582 Map UCS-4 code CODE to Mule character CHARACTER.
3584 Return T on success, NIL on failure.
3590 CHECK_CHAR (character);
3594 if (c < sizeof (fcd->ucs_to_mule_table))
3596 fcd->ucs_to_mule_table[c] = character;
3604 ucs_to_char (unsigned long code)
3606 if (code < sizeof (fcd->ucs_to_mule_table))
3608 return fcd->ucs_to_mule_table[code];
3610 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3615 c = code % (94 * 94);
3617 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3618 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3619 CHARSET_LEFT_TO_RIGHT),
3620 c / 94 + 33, c % 94 + 33));
3626 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3627 Return Mule character corresponding to UCS code CODE (a positive integer).
3631 CHECK_NATNUM (code);
3632 return ucs_to_char (XINT (code));
3635 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3636 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3640 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3641 Fset_char_ucs is more restrictive on index arg, but should
3642 check code arg in a char_table method. */
3643 CHECK_CHAR (character);
3644 CHECK_NATNUM (code);
3645 return Fput_char_table (character, code, mule_to_ucs_table);
3648 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3649 Return the UCS code (a positive integer) corresponding to CHARACTER.
3653 return Fget_char_table (character, mule_to_ucs_table);
3658 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3660 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3661 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3662 is not found, instead.
3663 #### do something more appropriate (use blob?)
3664 Danger, Will Robinson! Data loss. Should we signal user? */
3666 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3668 Lisp_Object chr = ucs_to_char (ch);
3672 Bufbyte work[MAX_EMCHAR_LEN];
3677 simple_set_charptr_emchar (work, ch) :
3678 non_ascii_set_charptr_emchar (work, ch);
3679 Dynarr_add_many (dst, work, len);
3683 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3684 Dynarr_add (dst, 34 + 128);
3685 Dynarr_add (dst, 46 + 128);
3691 static unsigned long
3692 mule_char_to_ucs4 (Lisp_Object charset,
3693 unsigned char h, unsigned char l)
3696 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3703 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3704 (XCHARSET_CHARS (charset) == 94) )
3706 unsigned char final = XCHARSET_FINAL (charset);
3708 if ( ('@' <= final) && (final < 0x7f) )
3710 return 0xe00000 + (final - '@') * 94 * 94
3711 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3725 encode_ucs4 (Lisp_Object charset,
3726 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3728 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3729 Dynarr_add (dst, code >> 24);
3730 Dynarr_add (dst, (code >> 16) & 255);
3731 Dynarr_add (dst, (code >> 8) & 255);
3732 Dynarr_add (dst, code & 255);
3737 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3743 switch (st->ucs4.in_byte)
3752 st->ucs4.in_byte = 0;
3758 return CODING_CATEGORY_UCS4_MASK;
3762 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3763 unsigned_char_dynarr *dst, unsigned int n)
3765 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3766 unsigned int flags = str->flags;
3767 unsigned int ch = str->ch;
3768 unsigned char counter = str->counter;
3772 unsigned char c = *src++;
3780 decode_ucs4 ( ( ch << 8 ) | c, dst);
3785 ch = ( ch << 8 ) | c;
3789 if (counter & CODING_STATE_END)
3790 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3794 str->counter = counter;
3798 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3799 unsigned_char_dynarr *dst, unsigned int n)
3802 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3803 unsigned int flags = str->flags;
3804 unsigned int ch = str->ch;
3805 unsigned char char_boundary = str->iso2022.current_char_boundary;
3806 Lisp_Object charset = str->iso2022.current_charset;
3808 #ifdef ENABLE_COMPOSITE_CHARS
3809 /* flags for handling composite chars. We do a little switcharoo
3810 on the source while we're outputting the composite char. */
3811 unsigned int saved_n = 0;
3812 CONST unsigned char *saved_src = NULL;
3813 int in_composite = 0;
3820 unsigned char c = *src++;
3822 if (BYTE_ASCII_P (c))
3823 { /* Processing ASCII character */
3825 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3828 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3829 { /* Processing Leading Byte */
3831 charset = CHARSET_BY_LEADING_BYTE (c);
3832 if (LEADING_BYTE_PREFIX_P(c))
3837 { /* Processing Non-ASCII character */
3839 if (EQ (charset, Vcharset_control_1))
3841 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3845 switch (XCHARSET_REP_BYTES (charset))
3848 encode_ucs4 (charset, c, 0, dst);
3851 if (XCHARSET_PRIVATE_P (charset))
3853 encode_ucs4 (charset, c, 0, dst);
3858 #ifdef ENABLE_COMPOSITE_CHARS
3859 if (EQ (charset, Vcharset_composite))
3863 /* #### Bother! We don't know how to
3865 Dynarr_add (dst, 0);
3866 Dynarr_add (dst, 0);
3867 Dynarr_add (dst, 0);
3868 Dynarr_add (dst, '~');
3872 Emchar emch = MAKE_CHAR (Vcharset_composite,
3873 ch & 0x7F, c & 0x7F);
3874 Lisp_Object lstr = composite_char_string (emch);
3878 src = XSTRING_DATA (lstr);
3879 n = XSTRING_LENGTH (lstr);
3883 #endif /* ENABLE_COMPOSITE_CHARS */
3885 encode_ucs4(charset, ch, c, dst);
3898 encode_ucs4 (charset, ch, c, dst);
3914 #ifdef ENABLE_COMPOSITE_CHARS
3920 goto back_to_square_n; /* Wheeeeeeeee ..... */
3922 #endif /* ENABLE_COMPOSITE_CHARS */
3926 str->iso2022.current_char_boundary = char_boundary;
3927 str->iso2022.current_charset = charset;
3929 /* Verbum caro factum est! */
3934 /************************************************************************/
3936 /************************************************************************/
3939 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3944 unsigned char c = *src++;
3945 switch (st->utf8.in_byte)
3948 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3951 st->utf8.in_byte = 5;
3953 st->utf8.in_byte = 4;
3955 st->utf8.in_byte = 3;
3957 st->utf8.in_byte = 2;
3959 st->utf8.in_byte = 1;
3964 if ((c & 0xc0) != 0x80)
3970 return CODING_CATEGORY_UTF8_MASK;
3974 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3975 unsigned_char_dynarr *dst, unsigned int n)
3977 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3978 unsigned int flags = str->flags;
3979 unsigned int ch = str->ch;
3980 eol_type_t eol_type = str->eol_type;
3981 unsigned char counter = str->counter;
3985 unsigned char c = *src++;
3994 else if ( c >= 0xf8 )
3999 else if ( c >= 0xf0 )
4004 else if ( c >= 0xe0 )
4009 else if ( c >= 0xc0 )
4016 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4017 decode_ucs4 (c, dst);
4021 ch = ( ch << 6 ) | ( c & 0x3f );
4022 decode_ucs4 (ch, dst);
4027 ch = ( ch << 6 ) | ( c & 0x3f );
4030 label_continue_loop:;
4033 if (flags & CODING_STATE_END)
4034 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4038 str->counter = counter;
4043 encode_utf8 (Lisp_Object charset,
4044 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
4046 unsigned long code = mule_char_to_ucs4 (charset, h, l);
4049 Dynarr_add (dst, code);
4051 else if ( code <= 0x7ff )
4053 Dynarr_add (dst, (code >> 6) | 0xc0);
4054 Dynarr_add (dst, (code & 0x3f) | 0x80);
4056 else if ( code <= 0xffff )
4058 Dynarr_add (dst, (code >> 12) | 0xe0);
4059 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4060 Dynarr_add (dst, (code & 0x3f) | 0x80);
4062 else if ( code <= 0x1fffff )
4064 Dynarr_add (dst, (code >> 18) | 0xf0);
4065 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4066 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4067 Dynarr_add (dst, (code & 0x3f) | 0x80);
4069 else if ( code <= 0x3ffffff )
4071 Dynarr_add (dst, (code >> 24) | 0xf8);
4072 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4073 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4074 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4075 Dynarr_add (dst, (code & 0x3f) | 0x80);
4079 Dynarr_add (dst, (code >> 30) | 0xfc);
4080 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
4081 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4082 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4083 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
4084 Dynarr_add (dst, (code & 0x3f) | 0x80);
4090 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
4091 unsigned_char_dynarr *dst, unsigned int n)
4093 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4094 unsigned int flags = str->flags;
4095 unsigned int ch = str->ch;
4096 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4097 unsigned char char_boundary = str->iso2022.current_char_boundary;
4102 unsigned char c = *src++;
4103 switch (char_boundary)
4108 Dynarr_add (dst, c);
4111 else if ( c >= 0xf8 )
4113 Dynarr_add (dst, c);
4116 else if ( c >= 0xf0 )
4118 Dynarr_add (dst, c);
4121 else if ( c >= 0xe0 )
4123 Dynarr_add (dst, c);
4126 else if ( c >= 0xc0 )
4128 Dynarr_add (dst, c);
4135 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4136 Dynarr_add (dst, '\r');
4137 if (eol_type != EOL_CR)
4138 Dynarr_add (dst, c);
4141 Dynarr_add (dst, c);
4146 Dynarr_add (dst, c);
4150 Dynarr_add (dst, c);
4154 #else /* not UTF2000 */
4155 Lisp_Object charset = str->iso2022.current_charset;
4157 #ifdef ENABLE_COMPOSITE_CHARS
4158 /* flags for handling composite chars. We do a little switcharoo
4159 on the source while we're outputting the composite char. */
4160 unsigned int saved_n = 0;
4161 CONST unsigned char *saved_src = NULL;
4162 int in_composite = 0;
4165 #endif /* ENABLE_COMPOSITE_CHARS */
4169 unsigned char c = *src++;
4171 if (BYTE_ASCII_P (c))
4172 { /* Processing ASCII character */
4176 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4177 Dynarr_add (dst, '\r');
4178 if (eol_type != EOL_CR)
4179 Dynarr_add (dst, c);
4182 encode_utf8 (Vcharset_ascii, c, 0, dst);
4185 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4186 { /* Processing Leading Byte */
4188 charset = CHARSET_BY_LEADING_BYTE (c);
4189 if (LEADING_BYTE_PREFIX_P(c))
4194 { /* Processing Non-ASCII character */
4196 if (EQ (charset, Vcharset_control_1))
4198 encode_utf8 (Vcharset_control_1, c, 0, dst);
4202 switch (XCHARSET_REP_BYTES (charset))
4205 encode_utf8 (charset, c, 0, dst);
4208 if (XCHARSET_PRIVATE_P (charset))
4210 encode_utf8 (charset, c, 0, dst);
4215 #ifdef ENABLE_COMPOSITE_CHARS
4216 if (EQ (charset, Vcharset_composite))
4220 /* #### Bother! We don't know how to
4222 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4226 Emchar emch = MAKE_CHAR (Vcharset_composite,
4227 ch & 0x7F, c & 0x7F);
4228 Lisp_Object lstr = composite_char_string (emch);
4232 src = XSTRING_DATA (lstr);
4233 n = XSTRING_LENGTH (lstr);
4237 #endif /* ENABLE_COMPOSITE_CHARS */
4239 encode_utf8 (charset, ch, c, dst);
4252 encode_utf8 (charset, ch, c, dst);
4268 #ifdef ENABLE_COMPOSITE_CHARS
4274 goto back_to_square_n; /* Wheeeeeeeee ..... */
4278 #endif /* not UTF2000 */
4281 str->iso2022.current_char_boundary = char_boundary;
4283 str->iso2022.current_charset = charset;
4286 /* Verbum caro factum est! */
4290 /************************************************************************/
4291 /* ISO2022 methods */
4292 /************************************************************************/
4294 /* The following note describes the coding system ISO2022 briefly.
4295 Since the intention of this note is to help understand the
4296 functions in this file, some parts are NOT ACCURATE or OVERLY
4297 SIMPLIFIED. For thorough understanding, please refer to the
4298 original document of ISO2022.
4300 ISO2022 provides many mechanisms to encode several character sets
4301 in 7-bit and 8-bit environments. For 7-bit environments, all text
4302 is encoded using bytes less than 128. This may make the encoded
4303 text a little bit longer, but the text passes more easily through
4304 several gateways, some of which strip off MSB (Most Signigant Bit).
4306 There are two kinds of character sets: control character set and
4307 graphic character set. The former contains control characters such
4308 as `newline' and `escape' to provide control functions (control
4309 functions are also provided by escape sequences). The latter
4310 contains graphic characters such as 'A' and '-'. Emacs recognizes
4311 two control character sets and many graphic character sets.
4313 Graphic character sets are classified into one of the following
4314 four classes, according to the number of bytes (DIMENSION) and
4315 number of characters in one dimension (CHARS) of the set:
4316 - DIMENSION1_CHARS94
4317 - DIMENSION1_CHARS96
4318 - DIMENSION2_CHARS94
4319 - DIMENSION2_CHARS96
4321 In addition, each character set is assigned an identification tag,
4322 unique for each set, called "final character" (denoted as <F>
4323 hereafter). The <F> of each character set is decided by ECMA(*)
4324 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4325 (0x30..0x3F are for private use only).
4327 Note (*): ECMA = European Computer Manufacturers Association
4329 Here are examples of graphic character set [NAME(<F>)]:
4330 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4331 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4332 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4333 o DIMENSION2_CHARS96 -- none for the moment
4335 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4336 C0 [0x00..0x1F] -- control character plane 0
4337 GL [0x20..0x7F] -- graphic character plane 0
4338 C1 [0x80..0x9F] -- control character plane 1
4339 GR [0xA0..0xFF] -- graphic character plane 1
4341 A control character set is directly designated and invoked to C0 or
4342 C1 by an escape sequence. The most common case is that:
4343 - ISO646's control character set is designated/invoked to C0, and
4344 - ISO6429's control character set is designated/invoked to C1,
4345 and usually these designations/invocations are omitted in encoded
4346 text. In a 7-bit environment, only C0 can be used, and a control
4347 character for C1 is encoded by an appropriate escape sequence to
4348 fit into the environment. All control characters for C1 are
4349 defined to have corresponding escape sequences.
4351 A graphic character set is at first designated to one of four
4352 graphic registers (G0 through G3), then these graphic registers are
4353 invoked to GL or GR. These designations and invocations can be
4354 done independently. The most common case is that G0 is invoked to
4355 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4356 these invocations and designations are omitted in encoded text.
4357 In a 7-bit environment, only GL can be used.
4359 When a graphic character set of CHARS94 is invoked to GL, codes
4360 0x20 and 0x7F of the GL area work as control characters SPACE and
4361 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4364 There are two ways of invocation: locking-shift and single-shift.
4365 With locking-shift, the invocation lasts until the next different
4366 invocation, whereas with single-shift, the invocation affects the
4367 following character only and doesn't affect the locking-shift
4368 state. Invocations are done by the following control characters or
4371 ----------------------------------------------------------------------
4372 abbrev function cntrl escape seq description
4373 ----------------------------------------------------------------------
4374 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4375 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4376 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4377 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4378 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4379 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4380 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4381 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4382 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4383 ----------------------------------------------------------------------
4384 (*) These are not used by any known coding system.
4386 Control characters for these functions are defined by macros
4387 ISO_CODE_XXX in `coding.h'.
4389 Designations are done by the following escape sequences:
4390 ----------------------------------------------------------------------
4391 escape sequence description
4392 ----------------------------------------------------------------------
4393 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4394 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4395 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4396 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4397 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4398 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4399 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4400 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4401 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4402 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4403 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4404 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4405 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4406 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4407 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4408 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4409 ----------------------------------------------------------------------
4411 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4412 of dimension 1, chars 94, and final character <F>, etc...
4414 Note (*): Although these designations are not allowed in ISO2022,
4415 Emacs accepts them on decoding, and produces them on encoding
4416 CHARS96 character sets in a coding system which is characterized as
4417 7-bit environment, non-locking-shift, and non-single-shift.
4419 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4420 '(' can be omitted. We refer to this as "short-form" hereafter.
4422 Now you may notice that there are a lot of ways for encoding the
4423 same multilingual text in ISO2022. Actually, there exist many
4424 coding systems such as Compound Text (used in X11's inter client
4425 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4426 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4427 localized platforms), and all of these are variants of ISO2022.
4429 In addition to the above, Emacs handles two more kinds of escape
4430 sequences: ISO6429's direction specification and Emacs' private
4431 sequence for specifying character composition.
4433 ISO6429's direction specification takes the following form:
4434 o CSI ']' -- end of the current direction
4435 o CSI '0' ']' -- end of the current direction
4436 o CSI '1' ']' -- start of left-to-right text
4437 o CSI '2' ']' -- start of right-to-left text
4438 The control character CSI (0x9B: control sequence introducer) is
4439 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4441 Character composition specification takes the following form:
4442 o ESC '0' -- start character composition
4443 o ESC '1' -- end character composition
4444 Since these are not standard escape sequences of any ISO standard,
4445 their use with these meanings is restricted to Emacs only. */
4448 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4452 for (i = 0; i < 4; i++)
4454 if (!NILP (coding_system))
4456 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4458 iso->charset[i] = Qt;
4459 iso->invalid_designated[i] = 0;
4461 iso->esc = ISO_ESC_NOTHING;
4462 iso->esc_bytes_index = 0;
4463 iso->register_left = 0;
4464 iso->register_right = 1;
4465 iso->switched_dir_and_no_valid_charset_yet = 0;
4466 iso->invalid_switch_dir = 0;
4467 iso->output_direction_sequence = 0;
4468 iso->output_literally = 0;
4469 #ifdef ENABLE_COMPOSITE_CHARS
4470 if (iso->composite_chars)
4471 Dynarr_reset (iso->composite_chars);
4476 fit_to_be_escape_quoted (unsigned char c)
4493 /* Parse one byte of an ISO2022 escape sequence.
4494 If the result is an invalid escape sequence, return 0 and
4495 do not change anything in STR. Otherwise, if the result is
4496 an incomplete escape sequence, update ISO2022.ESC and
4497 ISO2022.ESC_BYTES and return -1. Otherwise, update
4498 all the state variables (but not ISO2022.ESC_BYTES) and
4501 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4502 or invocation of an invalid character set and treat that as
4503 an unrecognized escape sequence. */
4506 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4507 unsigned char c, unsigned int *flags,
4508 int check_invalid_charsets)
4510 /* (1) If we're at the end of a designation sequence, CS is the
4511 charset being designated and REG is the register to designate
4514 (2) If we're at the end of a locking-shift sequence, REG is
4515 the register to invoke and HALF (0 == left, 1 == right) is
4516 the half to invoke it into.
4518 (3) If we're at the end of a single-shift sequence, REG is
4519 the register to invoke. */
4520 Lisp_Object cs = Qnil;
4523 /* NOTE: This code does goto's all over the fucking place.
4524 The reason for this is that we're basically implementing
4525 a state machine here, and hierarchical languages like C
4526 don't really provide a clean way of doing this. */
4528 if (! (*flags & CODING_STATE_ESCAPE))
4529 /* At beginning of escape sequence; we need to reset our
4530 escape-state variables. */
4531 iso->esc = ISO_ESC_NOTHING;
4533 iso->output_literally = 0;
4534 iso->output_direction_sequence = 0;
4538 case ISO_ESC_NOTHING:
4539 iso->esc_bytes_index = 0;
4542 case ISO_CODE_ESC: /* Start escape sequence */
4543 *flags |= CODING_STATE_ESCAPE;
4547 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4548 *flags |= CODING_STATE_ESCAPE;
4549 iso->esc = ISO_ESC_5_11;
4552 case ISO_CODE_SO: /* locking shift 1 */
4555 case ISO_CODE_SI: /* locking shift 0 */
4559 case ISO_CODE_SS2: /* single shift */
4562 case ISO_CODE_SS3: /* single shift */
4566 default: /* Other control characters */
4573 /**** single shift ****/
4575 case 'N': /* single shift 2 */
4578 case 'O': /* single shift 3 */
4582 /**** locking shift ****/
4584 case '~': /* locking shift 1 right */
4587 case 'n': /* locking shift 2 */
4590 case '}': /* locking shift 2 right */
4593 case 'o': /* locking shift 3 */
4596 case '|': /* locking shift 3 right */
4600 #ifdef ENABLE_COMPOSITE_CHARS
4601 /**** composite ****/
4604 iso->esc = ISO_ESC_START_COMPOSITE;
4605 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4606 CODING_STATE_COMPOSITE;
4610 iso->esc = ISO_ESC_END_COMPOSITE;
4611 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4612 ~CODING_STATE_COMPOSITE;
4614 #endif /* ENABLE_COMPOSITE_CHARS */
4616 /**** directionality ****/
4619 iso->esc = ISO_ESC_5_11;
4622 /**** designation ****/
4624 case '$': /* multibyte charset prefix */
4625 iso->esc = ISO_ESC_2_4;
4629 if (0x28 <= c && c <= 0x2F)
4631 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4635 /* This function is called with CODESYS equal to nil when
4636 doing coding-system detection. */
4638 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4639 && fit_to_be_escape_quoted (c))
4641 iso->esc = ISO_ESC_LITERAL;
4642 *flags &= CODING_STATE_ISO2022_LOCK;
4652 /**** directionality ****/
4654 case ISO_ESC_5_11: /* ISO6429 direction control */
4657 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4658 goto directionality;
4660 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4661 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4662 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4666 case ISO_ESC_5_11_0:
4669 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4670 goto directionality;
4674 case ISO_ESC_5_11_1:
4677 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4678 goto directionality;
4682 case ISO_ESC_5_11_2:
4685 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4686 goto directionality;
4691 iso->esc = ISO_ESC_DIRECTIONALITY;
4692 /* Various junk here to attempt to preserve the direction sequences
4693 literally in the text if they would otherwise be swallowed due
4694 to invalid designations that don't show up as actual charset
4695 changes in the text. */
4696 if (iso->invalid_switch_dir)
4698 /* We already inserted a direction switch literally into the
4699 text. We assume (#### this may not be right) that the
4700 next direction switch is the one going the other way,
4701 and we need to output that literally as well. */
4702 iso->output_literally = 1;
4703 iso->invalid_switch_dir = 0;
4709 /* If we are in the thrall of an invalid designation,
4710 then stick the directionality sequence literally into the
4711 output stream so it ends up in the original text again. */
4712 for (jj = 0; jj < 4; jj++)
4713 if (iso->invalid_designated[jj])
4717 iso->output_literally = 1;
4718 iso->invalid_switch_dir = 1;
4721 /* Indicate that we haven't yet seen a valid designation,
4722 so that if a switch-dir is directly followed by an
4723 invalid designation, both get inserted literally. */
4724 iso->switched_dir_and_no_valid_charset_yet = 1;
4729 /**** designation ****/
4732 if (0x28 <= c && c <= 0x2F)
4734 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4737 if (0x40 <= c && c <= 0x42)
4739 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4740 *flags & CODING_STATE_R2L ?
4741 CHARSET_RIGHT_TO_LEFT :
4742 CHARSET_LEFT_TO_RIGHT);
4752 if (c < '0' || c > '~')
4753 return 0; /* bad final byte */
4755 if (iso->esc >= ISO_ESC_2_8 &&
4756 iso->esc <= ISO_ESC_2_15)
4758 type = ((iso->esc >= ISO_ESC_2_12) ?
4759 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4760 reg = (iso->esc - ISO_ESC_2_8) & 3;
4762 else if (iso->esc >= ISO_ESC_2_4_8 &&
4763 iso->esc <= ISO_ESC_2_4_15)
4765 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4766 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4767 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4771 /* Can this ever be reached? -slb */
4775 cs = CHARSET_BY_ATTRIBUTES (type, c,
4776 *flags & CODING_STATE_R2L ?
4777 CHARSET_RIGHT_TO_LEFT :
4778 CHARSET_LEFT_TO_RIGHT);
4784 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4788 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4789 /* can't invoke something that ain't there. */
4791 iso->esc = ISO_ESC_SINGLE_SHIFT;
4792 *flags &= CODING_STATE_ISO2022_LOCK;
4794 *flags |= CODING_STATE_SS2;
4796 *flags |= CODING_STATE_SS3;
4800 if (check_invalid_charsets &&
4801 !CHARSETP (iso->charset[reg]))
4802 /* can't invoke something that ain't there. */
4805 iso->register_right = reg;
4807 iso->register_left = reg;
4808 *flags &= CODING_STATE_ISO2022_LOCK;
4809 iso->esc = ISO_ESC_LOCKING_SHIFT;
4813 if (NILP (cs) && check_invalid_charsets)
4815 iso->invalid_designated[reg] = 1;
4816 iso->charset[reg] = Vcharset_ascii;
4817 iso->esc = ISO_ESC_DESIGNATE;
4818 *flags &= CODING_STATE_ISO2022_LOCK;
4819 iso->output_literally = 1;
4820 if (iso->switched_dir_and_no_valid_charset_yet)
4822 /* We encountered a switch-direction followed by an
4823 invalid designation. Ensure that the switch-direction
4824 gets outputted; otherwise it will probably get eaten
4825 when the text is written out again. */
4826 iso->switched_dir_and_no_valid_charset_yet = 0;
4827 iso->output_direction_sequence = 1;
4828 /* And make sure that the switch-dir going the other
4829 way gets outputted, as well. */
4830 iso->invalid_switch_dir = 1;
4834 /* This function is called with CODESYS equal to nil when
4835 doing coding-system detection. */
4836 if (!NILP (codesys))
4838 charset_conversion_spec_dynarr *dyn =
4839 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4845 for (i = 0; i < Dynarr_length (dyn); i++)
4847 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4848 if (EQ (cs, spec->from_charset))
4849 cs = spec->to_charset;
4854 iso->charset[reg] = cs;
4855 iso->esc = ISO_ESC_DESIGNATE;
4856 *flags &= CODING_STATE_ISO2022_LOCK;
4857 if (iso->invalid_designated[reg])
4859 iso->invalid_designated[reg] = 0;
4860 iso->output_literally = 1;
4862 if (iso->switched_dir_and_no_valid_charset_yet)
4863 iso->switched_dir_and_no_valid_charset_yet = 0;
4868 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4873 /* #### There are serious deficiencies in the recognition mechanism
4874 here. This needs to be much smarter if it's going to cut it.
4875 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4876 it should be detected as Latin-1.
4877 All the ISO2022 stuff in this file should be synced up with the
4878 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4879 Perhaps we should wait till R2L works in FSF Emacs? */
4881 if (!st->iso2022.initted)
4883 reset_iso2022 (Qnil, &st->iso2022.iso);
4884 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4885 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4886 CODING_CATEGORY_ISO_8_1_MASK |
4887 CODING_CATEGORY_ISO_8_2_MASK |
4888 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4889 st->iso2022.flags = 0;
4890 st->iso2022.high_byte_count = 0;
4891 st->iso2022.saw_single_shift = 0;
4892 st->iso2022.initted = 1;
4895 mask = st->iso2022.mask;
4902 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4903 st->iso2022.high_byte_count++;
4907 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4909 if (st->iso2022.high_byte_count & 1)
4910 /* odd number of high bytes; assume not iso-8-2 */
4911 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4913 st->iso2022.high_byte_count = 0;
4914 st->iso2022.saw_single_shift = 0;
4916 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4918 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4919 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4920 { /* control chars */
4923 /* Allow and ignore control characters that you might
4924 reasonably see in a text file */
4929 case 8: /* backspace */
4930 case 11: /* vertical tab */
4931 case 12: /* form feed */
4932 case 26: /* MS-DOS C-z junk */
4933 case 31: /* '^_' -- for info */
4934 goto label_continue_loop;
4941 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4944 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4945 &st->iso2022.flags, 0))
4947 switch (st->iso2022.iso.esc)
4949 case ISO_ESC_DESIGNATE:
4950 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4951 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4953 case ISO_ESC_LOCKING_SHIFT:
4954 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4955 goto ran_out_of_chars;
4956 case ISO_ESC_SINGLE_SHIFT:
4957 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4958 st->iso2022.saw_single_shift = 1;
4967 goto ran_out_of_chars;
4970 label_continue_loop:;
4979 postprocess_iso2022_mask (int mask)
4981 /* #### kind of cheesy */
4982 /* If seven-bit ISO is allowed, then assume that the encoding is
4983 entirely seven-bit and turn off the eight-bit ones. */
4984 if (mask & CODING_CATEGORY_ISO_7_MASK)
4985 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4986 CODING_CATEGORY_ISO_8_1_MASK |
4987 CODING_CATEGORY_ISO_8_2_MASK);
4991 /* If FLAGS is a null pointer or specifies right-to-left motion,
4992 output a switch-dir-to-left-to-right sequence to DST.
4993 Also update FLAGS if it is not a null pointer.
4994 If INTERNAL_P is set, we are outputting in internal format and
4995 need to handle the CSI differently. */
4998 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4999 unsigned_char_dynarr *dst,
5000 unsigned int *flags,
5003 if (!flags || (*flags & CODING_STATE_R2L))
5005 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5007 Dynarr_add (dst, ISO_CODE_ESC);
5008 Dynarr_add (dst, '[');
5010 else if (internal_p)
5011 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5013 Dynarr_add (dst, ISO_CODE_CSI);
5014 Dynarr_add (dst, '0');
5015 Dynarr_add (dst, ']');
5017 *flags &= ~CODING_STATE_R2L;
5021 /* If FLAGS is a null pointer or specifies a direction different from
5022 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5023 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5024 sequence to DST. Also update FLAGS if it is not a null pointer.
5025 If INTERNAL_P is set, we are outputting in internal format and
5026 need to handle the CSI differently. */
5029 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5030 unsigned_char_dynarr *dst, unsigned int *flags,
5033 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5034 direction == CHARSET_LEFT_TO_RIGHT)
5035 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5036 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5037 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5038 direction == CHARSET_RIGHT_TO_LEFT)
5040 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5042 Dynarr_add (dst, ISO_CODE_ESC);
5043 Dynarr_add (dst, '[');
5045 else if (internal_p)
5046 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5048 Dynarr_add (dst, ISO_CODE_CSI);
5049 Dynarr_add (dst, '2');
5050 Dynarr_add (dst, ']');
5052 *flags |= CODING_STATE_R2L;
5056 /* Convert ISO2022-format data to internal format. */
5059 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
5060 unsigned_char_dynarr *dst, unsigned int n)
5062 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5063 unsigned int flags = str->flags;
5064 unsigned int ch = str->ch;
5065 eol_type_t eol_type = str->eol_type;
5066 #ifdef ENABLE_COMPOSITE_CHARS
5067 unsigned_char_dynarr *real_dst = dst;
5069 Lisp_Object coding_system;
5071 XSETCODING_SYSTEM (coding_system, str->codesys);
5073 #ifdef ENABLE_COMPOSITE_CHARS
5074 if (flags & CODING_STATE_COMPOSITE)
5075 dst = str->iso2022.composite_chars;
5076 #endif /* ENABLE_COMPOSITE_CHARS */
5080 unsigned char c = *src++;
5081 if (flags & CODING_STATE_ESCAPE)
5082 { /* Within ESC sequence */
5083 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5088 switch (str->iso2022.esc)
5090 #ifdef ENABLE_COMPOSITE_CHARS
5091 case ISO_ESC_START_COMPOSITE:
5092 if (str->iso2022.composite_chars)
5093 Dynarr_reset (str->iso2022.composite_chars);
5095 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5096 dst = str->iso2022.composite_chars;
5098 case ISO_ESC_END_COMPOSITE:
5100 Bufbyte comstr[MAX_EMCHAR_LEN];
5102 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5103 Dynarr_length (dst));
5105 len = set_charptr_emchar (comstr, emch);
5106 Dynarr_add_many (dst, comstr, len);
5109 #endif /* ENABLE_COMPOSITE_CHARS */
5111 case ISO_ESC_LITERAL:
5112 DECODE_ADD_BINARY_CHAR (c, dst);
5116 /* Everything else handled already */
5121 /* Attempted error recovery. */
5122 if (str->iso2022.output_direction_sequence)
5123 ensure_correct_direction (flags & CODING_STATE_R2L ?
5124 CHARSET_RIGHT_TO_LEFT :
5125 CHARSET_LEFT_TO_RIGHT,
5126 str->codesys, dst, 0, 1);
5127 /* More error recovery. */
5128 if (!retval || str->iso2022.output_literally)
5130 /* Output the (possibly invalid) sequence */
5132 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5133 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5134 flags &= CODING_STATE_ISO2022_LOCK;
5136 n++, src--;/* Repeat the loop with the same character. */
5139 /* No sense in reprocessing the final byte of the
5140 escape sequence; it could mess things up anyway.
5142 DECODE_ADD_BINARY_CHAR (c, dst);
5147 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5148 { /* Control characters */
5150 /***** Error-handling *****/
5152 /* If we were in the middle of a character, dump out the
5153 partial character. */
5154 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5156 /* If we just saw a single-shift character, dump it out.
5157 This may dump out the wrong sort of single-shift character,
5158 but least it will give an indication that something went
5160 if (flags & CODING_STATE_SS2)
5162 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5163 flags &= ~CODING_STATE_SS2;
5165 if (flags & CODING_STATE_SS3)
5167 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5168 flags &= ~CODING_STATE_SS3;
5171 /***** Now handle the control characters. *****/
5174 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5176 flags &= CODING_STATE_ISO2022_LOCK;
5178 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5179 DECODE_ADD_BINARY_CHAR (c, dst);
5182 { /* Graphic characters */
5183 Lisp_Object charset;
5189 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5191 /* Now determine the charset. */
5192 reg = ((flags & CODING_STATE_SS2) ? 2
5193 : (flags & CODING_STATE_SS3) ? 3
5194 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5195 : str->iso2022.register_left);
5196 charset = str->iso2022.charset[reg];
5198 /* Error checking: */
5199 if (! CHARSETP (charset)
5200 || str->iso2022.invalid_designated[reg]
5201 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5202 && XCHARSET_CHARS (charset) == 94))
5203 /* Mrmph. We are trying to invoke a register that has no
5204 or an invalid charset in it, or trying to add a character
5205 outside the range of the charset. Insert that char literally
5206 to preserve it for the output. */
5208 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5209 DECODE_ADD_BINARY_CHAR (c, dst);
5214 /* Things are probably hunky-dorey. */
5216 /* Fetch reverse charset, maybe. */
5217 if (((flags & CODING_STATE_R2L) &&
5218 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5220 (!(flags & CODING_STATE_R2L) &&
5221 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5223 Lisp_Object new_charset =
5224 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5225 if (!NILP (new_charset))
5226 charset = new_charset;
5230 if (XCHARSET_DIMENSION (charset) == 1)
5232 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5234 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5239 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5245 lb = XCHARSET_LEADING_BYTE (charset);
5246 switch (XCHARSET_REP_BYTES (charset))
5249 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5250 Dynarr_add (dst, c & 0x7F);
5253 case 2: /* one-byte official */
5254 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5255 Dynarr_add (dst, lb);
5256 Dynarr_add (dst, c | 0x80);
5259 case 3: /* one-byte private or two-byte official */
5260 if (XCHARSET_PRIVATE_P (charset))
5262 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5263 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5264 Dynarr_add (dst, lb);
5265 Dynarr_add (dst, c | 0x80);
5271 Dynarr_add (dst, lb);
5272 Dynarr_add (dst, ch | 0x80);
5273 Dynarr_add (dst, c | 0x80);
5281 default: /* two-byte private */
5284 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5285 Dynarr_add (dst, lb);
5286 Dynarr_add (dst, ch | 0x80);
5287 Dynarr_add (dst, c | 0x80);
5297 flags &= CODING_STATE_ISO2022_LOCK;
5300 label_continue_loop:;
5303 if (flags & CODING_STATE_END)
5304 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5311 /***** ISO2022 encoder *****/
5313 /* Designate CHARSET into register REG. */
5316 iso2022_designate (Lisp_Object charset, unsigned char reg,
5317 struct encoding_stream *str, unsigned_char_dynarr *dst)
5319 static CONST char inter94[] = "()*+";
5320 static CONST char inter96[] = ",-./";
5322 unsigned char final;
5323 Lisp_Object old_charset = str->iso2022.charset[reg];
5325 str->iso2022.charset[reg] = charset;
5326 if (!CHARSETP (charset))
5327 /* charset might be an initial nil or t. */
5329 type = XCHARSET_TYPE (charset);
5330 final = XCHARSET_FINAL (charset);
5331 if (!str->iso2022.force_charset_on_output[reg] &&
5332 CHARSETP (old_charset) &&
5333 XCHARSET_TYPE (old_charset) == type &&
5334 XCHARSET_FINAL (old_charset) == final)
5337 str->iso2022.force_charset_on_output[reg] = 0;
5340 charset_conversion_spec_dynarr *dyn =
5341 str->codesys->iso2022.output_conv;
5347 for (i = 0; i < Dynarr_length (dyn); i++)
5349 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5350 if (EQ (charset, spec->from_charset))
5351 charset = spec->to_charset;
5356 Dynarr_add (dst, ISO_CODE_ESC);
5359 case CHARSET_TYPE_94:
5360 Dynarr_add (dst, inter94[reg]);
5362 case CHARSET_TYPE_96:
5363 Dynarr_add (dst, inter96[reg]);
5365 case CHARSET_TYPE_94X94:
5366 Dynarr_add (dst, '$');
5368 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5371 Dynarr_add (dst, inter94[reg]);
5373 case CHARSET_TYPE_96X96:
5374 Dynarr_add (dst, '$');
5375 Dynarr_add (dst, inter96[reg]);
5378 Dynarr_add (dst, final);
5382 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5384 if (str->iso2022.register_left != 0)
5386 Dynarr_add (dst, ISO_CODE_SI);
5387 str->iso2022.register_left = 0;
5392 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5394 if (str->iso2022.register_left != 1)
5396 Dynarr_add (dst, ISO_CODE_SO);
5397 str->iso2022.register_left = 1;
5401 /* Convert internally-formatted data to ISO2022 format. */
5404 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5405 unsigned_char_dynarr *dst, unsigned int n)
5407 unsigned char charmask, c;
5408 unsigned char char_boundary;
5409 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5410 unsigned int flags = str->flags;
5411 Emchar ch = str->ch;
5412 Lisp_Coding_System *codesys = str->codesys;
5413 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5415 Lisp_Object charset;
5418 unsigned int byte1, byte2;
5421 #ifdef ENABLE_COMPOSITE_CHARS
5422 /* flags for handling composite chars. We do a little switcharoo
5423 on the source while we're outputting the composite char. */
5424 unsigned int saved_n = 0;
5425 CONST unsigned char *saved_src = NULL;
5426 int in_composite = 0;
5427 #endif /* ENABLE_COMPOSITE_CHARS */
5429 char_boundary = str->iso2022.current_char_boundary;
5430 charset = str->iso2022.current_charset;
5431 half = str->iso2022.current_half;
5433 #ifdef ENABLE_COMPOSITE_CHARS
5441 switch (char_boundary)
5449 else if ( c >= 0xf8 )
5454 else if ( c >= 0xf0 )
5459 else if ( c >= 0xe0 )
5464 else if ( c >= 0xc0 )
5473 restore_left_to_right_direction (codesys, dst, &flags, 0);
5475 /* Make sure G0 contains ASCII */
5476 if ((c > ' ' && c < ISO_CODE_DEL) ||
5477 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5479 ensure_normal_shift (str, dst);
5480 iso2022_designate (Vcharset_ascii, 0, str, dst);
5483 /* If necessary, restore everything to the default state
5486 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5488 restore_left_to_right_direction (codesys, dst, &flags, 0);
5490 ensure_normal_shift (str, dst);
5492 for (i = 0; i < 4; i++)
5494 Lisp_Object initial_charset =
5495 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5496 iso2022_designate (initial_charset, i, str, dst);
5501 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5502 Dynarr_add (dst, '\r');
5503 if (eol_type != EOL_CR)
5504 Dynarr_add (dst, c);
5508 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5509 && fit_to_be_escape_quoted (c))
5510 Dynarr_add (dst, ISO_CODE_ESC);
5511 Dynarr_add (dst, c);
5517 ch = ( ch << 6 ) | ( c & 0x3f );
5520 if ( (0x80 <= ch) && (ch <= 0x9f) )
5522 charmask = (half == 0 ? 0x00 : 0x80);
5524 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5525 && fit_to_be_escape_quoted (ch))
5526 Dynarr_add (dst, ISO_CODE_ESC);
5527 /* you asked for it ... */
5528 Dynarr_add (dst, ch);
5534 BREAKUP_CHAR (ch, charset, byte1, byte2);
5535 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5536 codesys, dst, &flags, 0);
5538 /* Now determine which register to use. */
5540 for (i = 0; i < 4; i++)
5542 if (EQ (charset, str->iso2022.charset[i]) ||
5544 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5553 if (XCHARSET_GRAPHIC (charset) != 0)
5555 if (!NILP (str->iso2022.charset[1]) &&
5556 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5557 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5559 else if (!NILP (str->iso2022.charset[2]))
5561 else if (!NILP (str->iso2022.charset[3]))
5570 iso2022_designate (charset, reg, str, dst);
5572 /* Now invoke that register. */
5576 ensure_normal_shift (str, dst);
5581 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5583 ensure_shift_out (str, dst);
5591 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5593 Dynarr_add (dst, ISO_CODE_ESC);
5594 Dynarr_add (dst, 'N');
5599 Dynarr_add (dst, ISO_CODE_SS2);
5605 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5607 Dynarr_add (dst, ISO_CODE_ESC);
5608 Dynarr_add (dst, 'O');
5613 Dynarr_add (dst, ISO_CODE_SS3);
5622 charmask = (half == 0 ? 0x00 : 0x80);
5624 switch (XCHARSET_DIMENSION (charset))
5627 Dynarr_add (dst, byte1 | charmask);
5630 Dynarr_add (dst, byte1 | charmask);
5631 Dynarr_add (dst, byte2 | charmask);
5640 ch = ( ch << 6 ) | ( c & 0x3f );
5644 #else /* not UTF2000 */
5650 if (BYTE_ASCII_P (c))
5651 { /* Processing ASCII character */
5654 restore_left_to_right_direction (codesys, dst, &flags, 0);
5656 /* Make sure G0 contains ASCII */
5657 if ((c > ' ' && c < ISO_CODE_DEL) ||
5658 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5660 ensure_normal_shift (str, dst);
5661 iso2022_designate (Vcharset_ascii, 0, str, dst);
5664 /* If necessary, restore everything to the default state
5667 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5669 restore_left_to_right_direction (codesys, dst, &flags, 0);
5671 ensure_normal_shift (str, dst);
5673 for (i = 0; i < 4; i++)
5675 Lisp_Object initial_charset =
5676 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5677 iso2022_designate (initial_charset, i, str, dst);
5682 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5683 Dynarr_add (dst, '\r');
5684 if (eol_type != EOL_CR)
5685 Dynarr_add (dst, c);
5689 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5690 && fit_to_be_escape_quoted (c))
5691 Dynarr_add (dst, ISO_CODE_ESC);
5692 Dynarr_add (dst, c);
5697 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5698 { /* Processing Leading Byte */
5700 charset = CHARSET_BY_LEADING_BYTE (c);
5701 if (LEADING_BYTE_PREFIX_P(c))
5703 else if (!EQ (charset, Vcharset_control_1)
5704 #ifdef ENABLE_COMPOSITE_CHARS
5705 && !EQ (charset, Vcharset_composite)
5711 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5712 codesys, dst, &flags, 0);
5714 /* Now determine which register to use. */
5716 for (i = 0; i < 4; i++)
5718 if (EQ (charset, str->iso2022.charset[i]) ||
5720 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5729 if (XCHARSET_GRAPHIC (charset) != 0)
5731 if (!NILP (str->iso2022.charset[1]) &&
5732 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5733 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5735 else if (!NILP (str->iso2022.charset[2]))
5737 else if (!NILP (str->iso2022.charset[3]))
5746 iso2022_designate (charset, reg, str, dst);
5748 /* Now invoke that register. */
5752 ensure_normal_shift (str, dst);
5757 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5759 ensure_shift_out (str, dst);
5767 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5769 Dynarr_add (dst, ISO_CODE_ESC);
5770 Dynarr_add (dst, 'N');
5775 Dynarr_add (dst, ISO_CODE_SS2);
5781 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5783 Dynarr_add (dst, ISO_CODE_ESC);
5784 Dynarr_add (dst, 'O');
5789 Dynarr_add (dst, ISO_CODE_SS3);
5801 { /* Processing Non-ASCII character */
5802 charmask = (half == 0 ? 0x7F : 0xFF);
5804 if (EQ (charset, Vcharset_control_1))
5806 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5807 && fit_to_be_escape_quoted (c))
5808 Dynarr_add (dst, ISO_CODE_ESC);
5809 /* you asked for it ... */
5810 Dynarr_add (dst, c - 0x20);
5814 switch (XCHARSET_REP_BYTES (charset))
5817 Dynarr_add (dst, c & charmask);
5820 if (XCHARSET_PRIVATE_P (charset))
5822 Dynarr_add (dst, c & charmask);
5827 #ifdef ENABLE_COMPOSITE_CHARS
5828 if (EQ (charset, Vcharset_composite))
5832 /* #### Bother! We don't know how to
5834 Dynarr_add (dst, '~');
5838 Emchar emch = MAKE_CHAR (Vcharset_composite,
5839 ch & 0x7F, c & 0x7F);
5840 Lisp_Object lstr = composite_char_string (emch);
5844 src = XSTRING_DATA (lstr);
5845 n = XSTRING_LENGTH (lstr);
5846 Dynarr_add (dst, ISO_CODE_ESC);
5847 Dynarr_add (dst, '0'); /* start composing */
5851 #endif /* ENABLE_COMPOSITE_CHARS */
5853 Dynarr_add (dst, ch & charmask);
5854 Dynarr_add (dst, c & charmask);
5867 Dynarr_add (dst, ch & charmask);
5868 Dynarr_add (dst, c & charmask);
5883 #endif /* not UTF2000 */
5885 #ifdef ENABLE_COMPOSITE_CHARS
5891 Dynarr_add (dst, ISO_CODE_ESC);
5892 Dynarr_add (dst, '1'); /* end composing */
5893 goto back_to_square_n; /* Wheeeeeeeee ..... */
5895 #endif /* ENABLE_COMPOSITE_CHARS */
5898 if ( (char_boundary == 0) && flags & CODING_STATE_END)
5900 if (char_boundary && flags & CODING_STATE_END)
5903 restore_left_to_right_direction (codesys, dst, &flags, 0);
5904 ensure_normal_shift (str, dst);
5905 for (i = 0; i < 4; i++)
5907 Lisp_Object initial_charset =
5908 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5909 iso2022_designate (initial_charset, i, str, dst);
5915 str->iso2022.current_char_boundary = char_boundary;
5916 str->iso2022.current_charset = charset;
5917 str->iso2022.current_half = half;
5919 /* Verbum caro factum est! */
5923 /************************************************************************/
5924 /* No-conversion methods */
5925 /************************************************************************/
5927 /* This is used when reading in "binary" files -- i.e. files that may
5928 contain all 256 possible byte values and that are not to be
5929 interpreted as being in any particular decoding. */
5931 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5932 unsigned_char_dynarr *dst, unsigned int n)
5935 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5936 unsigned int flags = str->flags;
5937 unsigned int ch = str->ch;
5938 eol_type_t eol_type = str->eol_type;
5944 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5945 DECODE_ADD_BINARY_CHAR (c, dst);
5946 label_continue_loop:;
5949 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5956 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5957 unsigned_char_dynarr *dst, unsigned int n)
5960 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5961 unsigned int flags = str->flags;
5962 unsigned int ch = str->ch;
5963 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5965 unsigned char char_boundary = str->iso2022.current_char_boundary;
5972 switch (char_boundary)
5980 else if ( c >= 0xf8 )
5985 else if ( c >= 0xf0 )
5990 else if ( c >= 0xe0 )
5995 else if ( c >= 0xc0 )
6006 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6007 Dynarr_add (dst, '\r');
6008 if (eol_type != EOL_CR)
6009 Dynarr_add (dst, c);
6012 Dynarr_add (dst, c);
6017 ch = ( ch << 6 ) | ( c & 0x3f );
6018 Dynarr_add (dst, ch & 0xff);
6022 ch = ( ch << 6 ) | ( c & 0x3f );
6025 #else /* not UTF2000 */
6028 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6029 Dynarr_add (dst, '\r');
6030 if (eol_type != EOL_CR)
6031 Dynarr_add (dst, '\n');
6034 else if (BYTE_ASCII_P (c))
6037 Dynarr_add (dst, c);
6039 else if (BUFBYTE_LEADING_BYTE_P (c))
6042 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6043 c == LEADING_BYTE_CONTROL_1)
6046 Dynarr_add (dst, '~'); /* untranslatable character */
6050 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6051 Dynarr_add (dst, c);
6052 else if (ch == LEADING_BYTE_CONTROL_1)
6055 Dynarr_add (dst, c - 0x20);
6057 /* else it should be the second or third byte of an
6058 untranslatable character, so ignore it */
6061 #endif /* not UTF2000 */
6067 str->iso2022.current_char_boundary = char_boundary;
6073 /************************************************************************/
6074 /* Initialization */
6075 /************************************************************************/
6078 syms_of_file_coding (void)
6080 deferror (&Qcoding_system_error, "coding-system-error",
6081 "Coding-system error", Qio_error);
6083 DEFSUBR (Fcoding_system_p);
6084 DEFSUBR (Ffind_coding_system);
6085 DEFSUBR (Fget_coding_system);
6086 DEFSUBR (Fcoding_system_list);
6087 DEFSUBR (Fcoding_system_name);
6088 DEFSUBR (Fmake_coding_system);
6089 DEFSUBR (Fcopy_coding_system);
6090 DEFSUBR (Fcoding_system_canonical_name_p);
6091 DEFSUBR (Fcoding_system_alias_p);
6092 DEFSUBR (Fcoding_system_aliasee);
6093 DEFSUBR (Fdefine_coding_system_alias);
6094 DEFSUBR (Fsubsidiary_coding_system);
6096 DEFSUBR (Fcoding_system_type);
6097 DEFSUBR (Fcoding_system_doc_string);
6099 DEFSUBR (Fcoding_system_charset);
6101 DEFSUBR (Fcoding_system_property);
6103 DEFSUBR (Fcoding_category_list);
6104 DEFSUBR (Fset_coding_priority_list);
6105 DEFSUBR (Fcoding_priority_list);
6106 DEFSUBR (Fset_coding_category_system);
6107 DEFSUBR (Fcoding_category_system);
6109 DEFSUBR (Fdetect_coding_region);
6110 DEFSUBR (Fdecode_coding_region);
6111 DEFSUBR (Fencode_coding_region);
6113 DEFSUBR (Fdecode_shift_jis_char);
6114 DEFSUBR (Fencode_shift_jis_char);
6115 DEFSUBR (Fdecode_big5_char);
6116 DEFSUBR (Fencode_big5_char);
6118 DEFSUBR (Fset_ucs_char);
6119 DEFSUBR (Fucs_char);
6120 DEFSUBR (Fset_char_ucs);
6121 DEFSUBR (Fchar_ucs);
6122 #endif /* not UTF2000 */
6124 defsymbol (&Qcoding_systemp, "coding-system-p");
6125 defsymbol (&Qno_conversion, "no-conversion");
6126 defsymbol (&Qraw_text, "raw-text");
6128 defsymbol (&Qbig5, "big5");
6129 defsymbol (&Qshift_jis, "shift-jis");
6130 defsymbol (&Qucs4, "ucs-4");
6131 defsymbol (&Qutf8, "utf-8");
6132 defsymbol (&Qccl, "ccl");
6133 defsymbol (&Qiso2022, "iso2022");
6135 defsymbol (&Qmnemonic, "mnemonic");
6136 defsymbol (&Qeol_type, "eol-type");
6137 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6138 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6140 defsymbol (&Qcr, "cr");
6141 defsymbol (&Qlf, "lf");
6142 defsymbol (&Qcrlf, "crlf");
6143 defsymbol (&Qeol_cr, "eol-cr");
6144 defsymbol (&Qeol_lf, "eol-lf");
6145 defsymbol (&Qeol_crlf, "eol-crlf");
6147 defsymbol (&Qcharset_g0, "charset-g0");
6148 defsymbol (&Qcharset_g1, "charset-g1");
6149 defsymbol (&Qcharset_g2, "charset-g2");
6150 defsymbol (&Qcharset_g3, "charset-g3");
6151 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6152 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6153 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6154 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6155 defsymbol (&Qno_iso6429, "no-iso6429");
6156 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6157 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6159 defsymbol (&Qshort, "short");
6160 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6161 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6162 defsymbol (&Qseven, "seven");
6163 defsymbol (&Qlock_shift, "lock-shift");
6164 defsymbol (&Qescape_quoted, "escape-quoted");
6166 defsymbol (&Qencode, "encode");
6167 defsymbol (&Qdecode, "decode");
6170 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6172 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6174 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6176 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6178 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6180 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6182 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6184 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6186 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6189 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6194 lstream_type_create_file_coding (void)
6196 LSTREAM_HAS_METHOD (decoding, reader);
6197 LSTREAM_HAS_METHOD (decoding, writer);
6198 LSTREAM_HAS_METHOD (decoding, rewinder);
6199 LSTREAM_HAS_METHOD (decoding, seekable_p);
6200 LSTREAM_HAS_METHOD (decoding, flusher);
6201 LSTREAM_HAS_METHOD (decoding, closer);
6202 LSTREAM_HAS_METHOD (decoding, marker);
6204 LSTREAM_HAS_METHOD (encoding, reader);
6205 LSTREAM_HAS_METHOD (encoding, writer);
6206 LSTREAM_HAS_METHOD (encoding, rewinder);
6207 LSTREAM_HAS_METHOD (encoding, seekable_p);
6208 LSTREAM_HAS_METHOD (encoding, flusher);
6209 LSTREAM_HAS_METHOD (encoding, closer);
6210 LSTREAM_HAS_METHOD (encoding, marker);
6214 vars_of_file_coding (void)
6218 fcd = xnew (struct file_coding_dump);
6219 dumpstruct (&fcd, &fcd_description);
6221 /* Initialize to something reasonable ... */
6222 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6224 fcd->coding_category_system[i] = Qnil;
6225 fcd->coding_category_by_priority[i] = i;
6228 Fprovide (intern ("file-coding"));
6230 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6231 Coding system used for TTY keyboard input.
6232 Not used under a windowing system.
6234 Vkeyboard_coding_system = Qnil;
6236 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6237 Coding system used for TTY display output.
6238 Not used under a windowing system.
6240 Vterminal_coding_system = Qnil;
6242 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6243 Overriding coding system used when reading from a file or process.
6244 You should bind this variable with `let', but do not set it globally.
6245 If this is non-nil, it specifies the coding system that will be used
6246 to decode input on read operations, such as from a file or process.
6247 It overrides `buffer-file-coding-system-for-read',
6248 `insert-file-contents-pre-hook', etc. Use those variables instead of
6249 this one for permanent changes to the environment. */ );
6250 Vcoding_system_for_read = Qnil;
6252 DEFVAR_LISP ("coding-system-for-write",
6253 &Vcoding_system_for_write /*
6254 Overriding coding system used when writing to a file or process.
6255 You should bind this variable with `let', but do not set it globally.
6256 If this is non-nil, it specifies the coding system that will be used
6257 to encode output for write operations, such as to a file or process.
6258 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6259 Use those variables instead of this one for permanent changes to the
6261 Vcoding_system_for_write = Qnil;
6263 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6264 Coding system used to convert pathnames when accessing files.
6266 Vfile_name_coding_system = Qnil;
6268 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6269 Non-nil means the buffer contents are regarded as multi-byte form
6270 of characters, not a binary code. This affects the display, file I/O,
6271 and behaviors of various editing commands.
6273 Setting this to nil does not do anything.
6275 enable_multibyte_characters = 1;
6279 complex_vars_of_file_coding (void)
6281 staticpro (&Vcoding_system_hash_table);
6282 Vcoding_system_hash_table =
6283 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6285 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6286 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6288 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6290 struct codesys_prop csp; \
6292 csp.prop_type = (Prop_Type); \
6293 Dynarr_add (the_codesys_prop_dynarr, csp); \
6296 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6297 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6298 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6299 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6300 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6301 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6302 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6304 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6305 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6306 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6307 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6308 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6309 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6310 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6311 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6312 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6313 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6314 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6315 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6316 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6317 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6318 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6319 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6320 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6322 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6323 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6325 /* Need to create this here or we're really screwed. */
6327 (Qraw_text, Qno_conversion,
6328 build_string ("Raw text, which means it converts only line-break-codes."),
6329 list2 (Qmnemonic, build_string ("Raw")));
6332 (Qbinary, Qno_conversion,
6333 build_string ("Binary, which means it does not convert anything."),
6334 list4 (Qeol_type, Qlf,
6335 Qmnemonic, build_string ("Binary")));
6340 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6341 list2 (Qmnemonic, build_string ("UTF8")));
6344 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6346 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6348 Fdefine_coding_system_alias (Qterminal, Qbinary);
6349 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6351 /* Need this for bootstrapping */
6352 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6353 Fget_coding_system (Qraw_text);
6356 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6357 = Fget_coding_system (Qutf8);
6360 #if defined(MULE) && !defined(UTF2000)
6364 for (i = 0; i < 65536; i++)
6365 fcd->ucs_to_mule_table[i] = Qnil;
6367 staticpro (&mule_to_ucs_table);
6368 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6369 #endif /* defined(MULE) && !defined(UTF2000) */