1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2011,
5 2012, 2013 MORIOKA Tomohiko
7 This file is part of XEmacs.
9 XEmacs is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
14 XEmacs is distributed in the hope that it will be useful, but WITHOUT
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with XEmacs; see the file COPYING. If not, write to
21 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 Boston, MA 02111-1307, USA. */
24 /* Synched up with: Mule 2.3. Not in FSF. */
26 /* Rewritten by Ben Wing <ben@xemacs.org>. */
27 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
44 #include "file-coding.h"
46 Lisp_Object Qcoding_system_error;
48 Lisp_Object Vkeyboard_coding_system;
49 Lisp_Object Vterminal_coding_system;
50 Lisp_Object Vcoding_system_for_read;
51 Lisp_Object Vcoding_system_for_write;
52 Lisp_Object Vfile_name_coding_system;
54 Lisp_Object Vcoded_charset_entity_reference_alist;
56 /* Table of symbols identifying each coding category. */
57 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
61 struct file_coding_dump {
62 /* Coding system currently associated with each coding category. */
63 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
65 /* Table of all coding categories in decreasing order of priority.
66 This describes a permutation of the possible coding categories. */
67 int coding_category_by_priority[CODING_CATEGORY_LAST];
69 #if defined(MULE) && !defined(UTF2000)
70 Lisp_Object ucs_to_mule_table[65536];
74 static const struct lrecord_description fcd_description_1[] = {
75 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
76 #if defined(MULE) && !defined(UTF2000)
77 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
82 static const struct struct_description fcd_description = {
83 sizeof (struct file_coding_dump),
87 Lisp_Object mule_to_ucs_table;
89 Lisp_Object Qcoding_systemp;
91 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
92 /* Qinternal in general.c */
94 Lisp_Object Qmnemonic, Qeol_type;
95 Lisp_Object Qcr, Qcrlf, Qlf;
96 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
97 Lisp_Object Qpost_read_conversion;
98 Lisp_Object Qpre_write_conversion;
101 Lisp_Object Qucs4, Qutf16, Qutf8;
102 Lisp_Object Qbig5, Qshift_jis;
103 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
104 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
105 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
106 Lisp_Object Qno_iso6429;
107 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
108 Lisp_Object Qescape_quoted;
109 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
112 Lisp_Object Qutf_8_mcs;
113 Lisp_Object Qdisable_composition, Qenable_decomposition;
114 Lisp_Object Qccs_priority_list;
115 Lisp_Object Quse_entity_reference;
116 Lisp_Object Qd, Qx, QX;
117 Lisp_Object Vdecomposition_feature_list;
119 Lisp_Object Qencode, Qdecode;
121 Lisp_Object Vcoding_system_hash_table;
123 int enable_multibyte_characters;
126 /* Additional information used by the ISO2022 decoder and detector. */
127 struct iso2022_decoder
129 /* CHARSET holds the character sets currently assigned to the G0
130 through G3 variables. It is initialized from the array
131 INITIAL_CHARSET in CODESYS. */
132 Lisp_Object charset[4];
134 /* Which registers are currently invoked into the left (GL) and
135 right (GR) halves of the 8-bit encoding space? */
136 int register_left, register_right;
138 /* ISO_ESC holds a value indicating part of an escape sequence
139 that has already been seen. */
140 enum iso_esc_flag esc;
142 /* This records the bytes we've seen so far in an escape sequence,
143 in case the sequence is invalid (we spit out the bytes unchanged). */
144 unsigned char esc_bytes[8];
146 /* Index for next byte to store in ISO escape sequence. */
149 #ifdef ENABLE_COMPOSITE_CHARS
150 /* Stuff seen so far when composing a string. */
151 unsigned_char_dynarr *composite_chars;
154 /* If we saw an invalid designation sequence for a particular
155 register, we flag it here and switch to ASCII. The next time we
156 see a valid designation for this register, we turn off the flag
157 and do the designation normally, but pretend the sequence was
158 invalid. The effect of all this is that (most of the time) the
159 escape sequences for both the switch to the unknown charset, and
160 the switch back to the known charset, get inserted literally into
161 the buffer and saved out as such. The hope is that we can
162 preserve the escape sequences so that the resulting written out
163 file makes sense. If we don't do any of this, the designation
164 to the invalid charset will be preserved but that switch back
165 to the known charset will probably get eaten because it was
166 the same charset that was already present in the register. */
167 unsigned char invalid_designated[4];
169 /* We try to do similar things as above for direction-switching
170 sequences. If we encountered a direction switch while an
171 invalid designation was present, or an invalid designation
172 just after a direction switch (i.e. no valid designation
173 encountered yet), we insert the direction-switch escape
174 sequence literally into the output stream, and later on
175 insert the corresponding direction-restoring escape sequence
177 unsigned int switched_dir_and_no_valid_charset_yet :1;
178 unsigned int invalid_switch_dir :1;
180 /* Tells the decoder to output the escape sequence literally
181 even though it was valid. Used in the games we play to
182 avoid lossage when we encounter invalid designations. */
183 unsigned int output_literally :1;
184 /* We encountered a direction switch followed by an invalid
185 designation. We didn't output the direction switch
186 literally because we didn't know about the invalid designation;
187 but we have to do so now. */
188 unsigned int output_direction_sequence :1;
191 EXFUN (Fcopy_coding_system, 2);
193 struct detection_state;
196 text_encode_generic (Lstream *encoding, const Bufbyte *src,
197 unsigned_char_dynarr *dst, Lstream_data_count n);
199 static int detect_coding_sjis (struct detection_state *st,
200 const Extbyte *src, Lstream_data_count n);
201 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
202 unsigned_char_dynarr *dst, Lstream_data_count n);
203 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
204 unsigned_char_dynarr *dst, unsigned int *flags);
205 void char_finish_shift_jis (struct encoding_stream *str,
206 unsigned_char_dynarr *dst, unsigned int *flags);
208 static int detect_coding_big5 (struct detection_state *st,
209 const Extbyte *src, Lstream_data_count n);
210 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
211 unsigned_char_dynarr *dst, Lstream_data_count n);
212 void char_encode_big5 (struct encoding_stream *str, Emchar c,
213 unsigned_char_dynarr *dst, unsigned int *flags);
214 void char_finish_big5 (struct encoding_stream *str,
215 unsigned_char_dynarr *dst, unsigned int *flags);
217 static int detect_coding_ucs4 (struct detection_state *st,
218 const Extbyte *src, Lstream_data_count n);
219 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
220 unsigned_char_dynarr *dst, Lstream_data_count n);
221 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
222 unsigned_char_dynarr *dst, unsigned int *flags);
223 void char_finish_ucs4 (struct encoding_stream *str,
224 unsigned_char_dynarr *dst, unsigned int *flags);
226 static int detect_coding_utf16 (struct detection_state *st,
227 const Extbyte *src, Lstream_data_count n);
228 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
229 unsigned_char_dynarr *dst, Lstream_data_count n);
230 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
231 unsigned_char_dynarr *dst, unsigned int *flags);
232 void char_finish_utf16 (struct encoding_stream *str,
233 unsigned_char_dynarr *dst, unsigned int *flags);
235 static int detect_coding_utf8 (struct detection_state *st,
236 const Extbyte *src, Lstream_data_count n);
237 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
238 unsigned_char_dynarr *dst, Lstream_data_count n);
239 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
240 unsigned_char_dynarr *dst, unsigned int *flags);
241 void char_finish_utf8 (struct encoding_stream *str,
242 unsigned_char_dynarr *dst, unsigned int *flags);
244 static int postprocess_iso2022_mask (int mask);
245 static void reset_iso2022 (Lisp_Object coding_system,
246 struct iso2022_decoder *iso);
247 static int detect_coding_iso2022 (struct detection_state *st,
248 const Extbyte *src, Lstream_data_count n);
249 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
250 unsigned_char_dynarr *dst, Lstream_data_count n);
251 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
252 unsigned_char_dynarr *dst, unsigned int *flags);
253 void char_finish_iso2022 (struct encoding_stream *str,
254 unsigned_char_dynarr *dst, unsigned int *flags);
256 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
257 unsigned_char_dynarr *dst, Lstream_data_count n);
258 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
259 unsigned_char_dynarr *dst, Lstream_data_count n);
260 static void mule_decode (Lstream *decoding, const Extbyte *src,
261 unsigned_char_dynarr *dst, Lstream_data_count n);
262 static void mule_encode (Lstream *encoding, const Bufbyte *src,
263 unsigned_char_dynarr *dst, Lstream_data_count n);
265 typedef struct codesys_prop codesys_prop;
274 Dynarr_declare (codesys_prop);
275 } codesys_prop_dynarr;
277 static const struct lrecord_description codesys_prop_description_1[] = {
278 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
282 static const struct struct_description codesys_prop_description = {
283 sizeof (codesys_prop),
284 codesys_prop_description_1
287 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
288 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
292 static const struct struct_description codesys_prop_dynarr_description = {
293 sizeof (codesys_prop_dynarr),
294 codesys_prop_dynarr_description_1
297 codesys_prop_dynarr *the_codesys_prop_dynarr;
299 enum codesys_prop_enum
302 CODESYS_PROP_ISO2022,
307 /************************************************************************/
308 /* Coding system functions */
309 /************************************************************************/
311 static Lisp_Object mark_coding_system (Lisp_Object);
312 static void print_coding_system (Lisp_Object, Lisp_Object, int);
313 static void finalize_coding_system (void *header, int for_disksave);
316 static const struct lrecord_description ccs_description_1[] = {
317 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
318 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
322 static const struct struct_description ccs_description = {
323 sizeof (charset_conversion_spec),
327 static const struct lrecord_description ccsd_description_1[] = {
328 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
332 static const struct struct_description ccsd_description = {
333 sizeof (charset_conversion_spec_dynarr),
338 static const struct lrecord_description coding_system_description[] = {
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
341 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
342 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
343 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
344 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
345 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
346 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
348 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
349 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
350 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
351 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
352 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
354 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
360 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
361 mark_coding_system, print_coding_system,
362 finalize_coding_system,
363 0, 0, coding_system_description,
367 mark_coding_system (Lisp_Object obj)
369 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
371 mark_object (CODING_SYSTEM_NAME (codesys));
372 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
373 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
374 mark_object (CODING_SYSTEM_EOL_LF (codesys));
375 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
376 mark_object (CODING_SYSTEM_EOL_CR (codesys));
378 switch (CODING_SYSTEM_TYPE (codesys))
382 case CODESYS_ISO2022:
383 for (i = 0; i < 4; i++)
384 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
385 if (codesys->iso2022.input_conv)
387 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
389 struct charset_conversion_spec *ccs =
390 Dynarr_atp (codesys->iso2022.input_conv, i);
391 mark_object (ccs->from_charset);
392 mark_object (ccs->to_charset);
395 if (codesys->iso2022.output_conv)
397 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
399 struct charset_conversion_spec *ccs =
400 Dynarr_atp (codesys->iso2022.output_conv, i);
401 mark_object (ccs->from_charset);
402 mark_object (ccs->to_charset);
409 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
410 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
415 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
416 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
423 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
425 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
427 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
431 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
434 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
436 error ("printing unreadable object #<coding_system 0x%x>",
439 write_c_string ("#<coding_system ", printcharfun);
440 print_internal (c->name, printcharfun, 1);
441 write_c_string (">", printcharfun);
445 finalize_coding_system (void *header, int for_disksave)
447 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
448 /* Since coding systems never go away, this function is not
449 necessary. But it would be necessary if we changed things
450 so that coding systems could go away. */
451 if (!for_disksave) /* see comment in lstream.c */
453 switch (CODING_SYSTEM_TYPE (c))
456 case CODESYS_ISO2022:
457 if (c->iso2022.input_conv)
459 Dynarr_free (c->iso2022.input_conv);
460 c->iso2022.input_conv = 0;
462 if (c->iso2022.output_conv)
464 Dynarr_free (c->iso2022.output_conv);
465 c->iso2022.output_conv = 0;
476 symbol_to_eol_type (Lisp_Object symbol)
478 CHECK_SYMBOL (symbol);
479 if (NILP (symbol)) return EOL_AUTODETECT;
480 if (EQ (symbol, Qlf)) return EOL_LF;
481 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
482 if (EQ (symbol, Qcr)) return EOL_CR;
484 signal_simple_error ("Unrecognized eol type", symbol);
485 return EOL_AUTODETECT; /* not reached */
489 eol_type_to_symbol (eol_type_t type)
494 case EOL_LF: return Qlf;
495 case EOL_CRLF: return Qcrlf;
496 case EOL_CR: return Qcr;
497 case EOL_AUTODETECT: return Qnil;
502 setup_eol_coding_systems (Lisp_Coding_System *codesys)
504 Lisp_Object codesys_obj;
505 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
506 char *codesys_name = (char *) alloca (len + 7);
508 char *codesys_mnemonic=0;
510 Lisp_Object codesys_name_sym, sub_codesys_obj;
514 XSETCODING_SYSTEM (codesys_obj, codesys);
516 memcpy (codesys_name,
517 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
519 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
521 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
522 codesys_mnemonic = (char *) alloca (mlen + 7);
523 memcpy (codesys_mnemonic,
524 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
527 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
528 strcpy (codesys_name + len, "-" op_sys); \
530 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
531 codesys_name_sym = intern (codesys_name); \
532 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
533 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
535 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
536 build_string (codesys_mnemonic); \
537 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
540 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
541 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
542 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
545 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
546 Return t if OBJECT is a coding system.
547 A coding system is an object that defines how text containing multiple
548 character sets is encoded into a stream of (typically 8-bit) bytes.
549 The coding system is used to decode the stream into a series of
550 characters (which may be from multiple charsets) when the text is read
551 from a file or process, and is used to encode the text back into the
552 same format when it is written out to a file or process.
554 For example, many ISO2022-compliant coding systems (such as Compound
555 Text, which is used for inter-client data under the X Window System)
556 use escape sequences to switch between different charsets -- Japanese
557 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
558 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
559 `make-coding-system' for more information.
561 Coding systems are normally identified using a symbol, and the
562 symbol is accepted in place of the actual coding system object whenever
563 a coding system is called for. (This is similar to how faces work.)
567 return CODING_SYSTEMP (object) ? Qt : Qnil;
570 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
571 Retrieve the coding system of the given name.
573 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
574 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
575 If there is no such coding system, nil is returned. Otherwise the
576 associated coding system object is returned.
578 (coding_system_or_name))
580 if (NILP (coding_system_or_name))
581 coding_system_or_name = Qbinary;
582 else if (CODING_SYSTEMP (coding_system_or_name))
583 return coding_system_or_name;
585 CHECK_SYMBOL (coding_system_or_name);
589 coding_system_or_name =
590 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
592 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
593 return coding_system_or_name;
597 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
598 Retrieve the coding system of the given name.
599 Same as `find-coding-system' except that if there is no such
600 coding system, an error is signaled instead of returning nil.
604 Lisp_Object coding_system = Ffind_coding_system (name);
606 if (NILP (coding_system))
607 signal_simple_error ("No such coding system", name);
608 return coding_system;
611 /* We store the coding systems in hash tables with the names as the key and the
612 actual coding system object as the value. Occasionally we need to use them
613 in a list format. These routines provide us with that. */
614 struct coding_system_list_closure
616 Lisp_Object *coding_system_list;
620 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
621 void *coding_system_list_closure)
623 /* This function can GC */
624 struct coding_system_list_closure *cscl =
625 (struct coding_system_list_closure *) coding_system_list_closure;
626 Lisp_Object *coding_system_list = cscl->coding_system_list;
628 *coding_system_list = Fcons (key, *coding_system_list);
632 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
633 Return a list of the names of all defined coding systems.
637 Lisp_Object coding_system_list = Qnil;
639 struct coding_system_list_closure coding_system_list_closure;
641 GCPRO1 (coding_system_list);
642 coding_system_list_closure.coding_system_list = &coding_system_list;
643 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
644 &coding_system_list_closure);
647 return coding_system_list;
650 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
651 Return the name of the given coding system.
655 coding_system = Fget_coding_system (coding_system);
656 return XCODING_SYSTEM_NAME (coding_system);
659 static Lisp_Coding_System *
660 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
662 Lisp_Coding_System *codesys =
663 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
665 zero_lcrecord (codesys);
666 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
667 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
668 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
669 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
670 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
671 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
672 CODING_SYSTEM_TYPE (codesys) = type;
673 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
676 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
678 if (type == CODESYS_ISO2022)
681 for (i = 0; i < 4; i++)
682 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
685 if (type == CODESYS_UTF8)
687 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
689 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
691 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
693 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
696 else if (type == CODESYS_BIG5)
698 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
700 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
701 = Vcharset_chinese_big5;
702 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
704 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
708 else if (type == CODESYS_CCL)
710 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
711 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
714 CODING_SYSTEM_NAME (codesys) = name;
720 /* Given a list of charset conversion specs as specified in a Lisp
721 program, parse it into STORE_HERE. */
724 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
725 Lisp_Object spec_list)
729 EXTERNAL_LIST_LOOP (rest, spec_list)
731 Lisp_Object car = XCAR (rest);
732 Lisp_Object from, to;
733 struct charset_conversion_spec spec;
735 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
736 signal_simple_error ("Invalid charset conversion spec", car);
737 from = Fget_charset (XCAR (car));
738 to = Fget_charset (XCAR (XCDR (car)));
739 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
740 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
741 signal_simple_error_2
742 ("Attempted conversion between different charset types",
744 spec.from_charset = from;
745 spec.to_charset = to;
747 Dynarr_add (store_here, spec);
751 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
752 specs, return the equivalent as the Lisp programmer would see it.
754 If LOAD_HERE is 0, return Qnil. */
757 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
764 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
766 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
767 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
770 return Fnreverse (result);
775 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
776 Register symbol NAME as a coding system.
778 TYPE describes the conversion method used and should be one of
781 Automatic conversion. XEmacs attempts to detect the coding system
784 No conversion. Use this for binary files and such. On output,
785 graphic characters that are not in ASCII or Latin-1 will be
786 replaced by a ?. (For a no-conversion-encoded buffer, these
787 characters will only be present if you explicitly insert them.)
789 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
791 ISO 10646 UCS-4 encoding.
793 ISO 10646 UTF-8 encoding.
795 Any ISO2022-compliant encoding. Among other things, this includes
796 JIS (the Japanese encoding commonly used for e-mail), EUC (the
797 standard Unix encoding for Japanese and other languages), and
798 Compound Text (the encoding used in X11). You can specify more
799 specific information about the conversion with the PROPS argument.
801 Big5 (the encoding commonly used for Taiwanese).
803 The conversion is performed using a user-written pseudo-code
804 program. CCL (Code Conversion Language) is the name of this
807 Write out or read in the raw contents of the memory representing
808 the buffer's text. This is primarily useful for debugging
809 purposes, and is only enabled when XEmacs has been compiled with
810 DEBUG_XEMACS defined (via the --debug configure option).
811 WARNING: Reading in a file using 'internal conversion can result
812 in an internal inconsistency in the memory representing a
813 buffer's text, which will produce unpredictable results and may
814 cause XEmacs to crash. Under normal circumstances you should
815 never use 'internal conversion.
817 DOC-STRING is a string describing the coding system.
819 PROPS is a property list, describing the specific nature of the
820 character set. Recognized properties are:
823 String to be displayed in the modeline when this coding system is
827 End-of-line conversion to be used. It should be one of
830 Automatically detect the end-of-line type (LF, CRLF,
831 or CR). Also generate subsidiary coding systems named
832 `NAME-unix', `NAME-dos', and `NAME-mac', that are
833 identical to this coding system but have an EOL-TYPE
834 value of 'lf, 'crlf, and 'cr, respectively.
836 The end of a line is marked externally using ASCII LF.
837 Since this is also the way that XEmacs represents an
838 end-of-line internally, specifying this option results
839 in no end-of-line conversion. This is the standard
840 format for Unix text files.
842 The end of a line is marked externally using ASCII
843 CRLF. This is the standard format for MS-DOS text
846 The end of a line is marked externally using ASCII CR.
847 This is the standard format for Macintosh text files.
849 Automatically detect the end-of-line type but do not
850 generate subsidiary coding systems. (This value is
851 converted to nil when stored internally, and
852 `coding-system-property' will return nil.)
855 If non-nil, composition for combining characters is disabled.
857 'enable-decomposition
858 If non-nil, decomposition for combining characters is enabled.
860 'use-entity-reference
861 If non-nil, SGML style entity-reference is used for non-system-characters.
863 'post-read-conversion
864 Function called after a file has been read in, to perform the
865 decoding. Called with two arguments, START and END, denoting
866 a region of the current buffer to be decoded.
868 'pre-write-conversion
869 Function called before a file is written out, to perform the
870 encoding. Called with two arguments, START and END, denoting
871 a region of the current buffer to be encoded.
874 The following additional properties are recognized if TYPE is 'iso2022:
880 The character set initially designated to the G0 - G3 registers.
881 The value should be one of
883 -- A charset object (designate that character set)
884 -- nil (do not ever use this register)
885 -- t (no character set is initially designated to
886 the register, but may be later on; this automatically
887 sets the corresponding `force-g*-on-output' property)
893 If non-nil, send an explicit designation sequence on output before
894 using the specified register.
897 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
898 "ESC $ B" on output in place of the full designation sequences
899 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
902 If non-nil, don't designate ASCII to G0 at each end of line on output.
903 Setting this to non-nil also suppresses other state-resetting that
904 normally happens at the end of a line.
907 If non-nil, don't designate ASCII to G0 before control chars on output.
910 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
914 If non-nil, use locking-shift (SO/SI) instead of single-shift
915 or designation by escape sequence.
918 If non-nil, don't use ISO6429's direction specification.
921 If non-nil, literal control characters that are the same as
922 the beginning of a recognized ISO2022 or ISO6429 escape sequence
923 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
924 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
925 so that they can be properly distinguished from an escape sequence.
926 (Note that doing this results in a non-portable encoding.) This
927 encoding flag is used for byte-compiled files. Note that ESC
928 is a good choice for a quoting character because there are no
929 escape sequences whose second byte is a character from the Control-0
930 or Control-1 character sets; this is explicitly disallowed by the
933 'input-charset-conversion
934 A list of conversion specifications, specifying conversion of
935 characters in one charset to another when decoding is performed.
936 Each specification is a list of two elements: the source charset,
937 and the destination charset.
939 'output-charset-conversion
940 A list of conversion specifications, specifying conversion of
941 characters in one charset to another when encoding is performed.
942 The form of each specification is the same as for
943 'input-charset-conversion.
946 The following additional properties are recognized (and required)
950 CCL program used for decoding (converting to internal format).
953 CCL program used for encoding (converting to external format).
955 (name, type, doc_string, props))
957 Lisp_Coding_System *codesys;
958 enum coding_system_type ty;
959 int need_to_setup_eol_systems = 1;
961 /* Convert type to constant */
962 if (NILP (type) || EQ (type, Qundecided))
963 { ty = CODESYS_AUTODETECT; }
965 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
966 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
967 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
968 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
969 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
970 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
971 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
973 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
975 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
978 signal_simple_error ("Invalid coding system type", type);
982 codesys = allocate_coding_system (ty, name);
984 if (NILP (doc_string))
985 doc_string = build_string ("");
987 CHECK_STRING (doc_string);
988 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
991 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
993 if (EQ (key, Qmnemonic))
996 CHECK_STRING (value);
997 CODING_SYSTEM_MNEMONIC (codesys) = value;
1000 else if (EQ (key, Qeol_type))
1002 need_to_setup_eol_systems = NILP (value);
1005 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1008 else if (EQ (key, Qpost_read_conversion))
1009 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1010 else if (EQ (key, Qpre_write_conversion))
1011 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1013 else if (EQ (key, Qdisable_composition))
1014 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1015 else if (EQ (key, Qenable_decomposition))
1016 CODING_SYSTEM_ENABLE_DECOMPOSITION (codesys) = !NILP (value);
1017 else if (EQ (key, Quse_entity_reference))
1018 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1021 else if (ty == CODESYS_ISO2022)
1023 #define FROB_INITIAL_CHARSET(charset_num) \
1024 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1025 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1027 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1028 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1029 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1030 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1032 #define FROB_FORCE_CHARSET(charset_num) \
1033 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1035 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1036 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1037 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1038 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1040 #define FROB_BOOLEAN_PROPERTY(prop) \
1041 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1043 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1044 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1045 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1046 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1047 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1048 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1049 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1051 else if (EQ (key, Qinput_charset_conversion))
1053 codesys->iso2022.input_conv =
1054 Dynarr_new (charset_conversion_spec);
1055 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1058 else if (EQ (key, Qoutput_charset_conversion))
1060 codesys->iso2022.output_conv =
1061 Dynarr_new (charset_conversion_spec);
1062 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1066 else if (EQ (key, Qccs_priority_list))
1068 codesys->ccs_priority_list = value;
1072 signal_simple_error ("Unrecognized property", key);
1075 else if (ty == CODESYS_UTF8)
1077 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1078 else if (EQ (key, Qcharset_g1))
1079 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1080 else if (EQ (key, Qcharset_g2))
1081 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1083 signal_simple_error ("Unrecognized property", key);
1085 else if (ty == CODESYS_BIG5)
1087 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1088 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1090 signal_simple_error ("Unrecognized property", key);
1093 else if (EQ (type, Qccl))
1096 struct ccl_program test_ccl;
1099 /* Check key first. */
1100 if (EQ (key, Qdecode))
1101 suffix = "-ccl-decode";
1102 else if (EQ (key, Qencode))
1103 suffix = "-ccl-encode";
1105 signal_simple_error ("Unrecognized property", key);
1107 /* If value is vector, register it as a ccl program
1108 associated with an newly created symbol for
1109 backward compatibility. */
1110 if (VECTORP (value))
1112 sym = Fintern (concat2 (Fsymbol_name (name),
1113 build_string (suffix)),
1115 Fregister_ccl_program (sym, value);
1119 CHECK_SYMBOL (value);
1122 /* check if the given ccl programs are valid. */
1123 if (setup_ccl_program (&test_ccl, sym) < 0)
1124 signal_simple_error ("Invalid CCL program", value);
1126 if (EQ (key, Qdecode))
1127 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1128 else if (EQ (key, Qencode))
1129 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1134 signal_simple_error ("Unrecognized property", key);
1138 if (need_to_setup_eol_systems)
1139 setup_eol_coding_systems (codesys);
1142 Lisp_Object codesys_obj;
1143 XSETCODING_SYSTEM (codesys_obj, codesys);
1144 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1149 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1150 Copy OLD-CODING-SYSTEM to NEW-NAME.
1151 If NEW-NAME does not name an existing coding system, a new one will
1154 (old_coding_system, new_name))
1156 Lisp_Object new_coding_system;
1157 old_coding_system = Fget_coding_system (old_coding_system);
1158 new_coding_system = Ffind_coding_system (new_name);
1159 if (NILP (new_coding_system))
1161 XSETCODING_SYSTEM (new_coding_system,
1162 allocate_coding_system
1163 (XCODING_SYSTEM_TYPE (old_coding_system),
1165 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1169 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1170 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1171 memcpy (((char *) to ) + sizeof (to->header),
1172 ((char *) from) + sizeof (from->header),
1173 sizeof (*from) - sizeof (from->header));
1174 to->name = new_name;
1176 return new_coding_system;
1179 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1180 Return t if OBJECT names a coding system, and is not a coding system alias.
1184 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1188 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1189 Return t if OBJECT is a coding system alias.
1190 All coding system aliases are created by `define-coding-system-alias'.
1194 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1198 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1199 Return the coding-system symbol for which symbol ALIAS is an alias.
1203 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1204 if (SYMBOLP (aliasee))
1207 signal_simple_error ("Symbol is not a coding system alias", alias);
1208 return Qnil; /* To keep the compiler happy */
1212 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1214 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1218 /* A maphash function, for removing dangling coding system aliases. */
1220 dangling_coding_system_alias_p (Lisp_Object alias,
1221 Lisp_Object aliasee,
1222 void *dangling_aliases)
1224 if (SYMBOLP (aliasee)
1225 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1227 (*(int *) dangling_aliases)++;
1234 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1235 Define symbol ALIAS as an alias for coding system ALIASEE.
1237 You can use this function to redefine an alias that has already been defined,
1238 but you cannot redefine a name which is the canonical name for a coding system.
1239 \(a canonical name of a coding system is what is returned when you call
1240 `coding-system-name' on a coding system).
1242 ALIASEE itself can be an alias, which allows you to define nested aliases.
1244 You are forbidden, however, from creating alias loops or `dangling' aliases.
1245 These will be detected, and an error will be signaled if you attempt to do so.
1247 If ALIASEE is nil, then ALIAS will simply be undefined.
1249 See also `coding-system-alias-p', `coding-system-aliasee',
1250 and `coding-system-canonical-name-p'.
1254 Lisp_Object real_coding_system, probe;
1256 CHECK_SYMBOL (alias);
1258 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1260 ("Symbol is the canonical name of a coding system and cannot be redefined",
1265 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1266 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1267 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1269 Fremhash (alias, Vcoding_system_hash_table);
1271 /* Undefine subsidiary aliases,
1272 presumably created by a previous call to this function */
1273 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1274 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1275 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1277 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1278 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1279 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1282 /* Undefine dangling coding system aliases. */
1284 int dangling_aliases;
1287 dangling_aliases = 0;
1288 elisp_map_remhash (dangling_coding_system_alias_p,
1289 Vcoding_system_hash_table,
1291 } while (dangling_aliases > 0);
1297 if (CODING_SYSTEMP (aliasee))
1298 aliasee = XCODING_SYSTEM_NAME (aliasee);
1300 /* Checks that aliasee names a coding-system */
1301 real_coding_system = Fget_coding_system (aliasee);
1303 /* Check for coding system alias loops */
1304 if (EQ (alias, aliasee))
1305 alias_loop: signal_simple_error_2
1306 ("Attempt to create a coding system alias loop", alias, aliasee);
1308 for (probe = aliasee;
1310 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1312 if (EQ (probe, alias))
1316 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1318 /* Set up aliases for subsidiaries.
1319 #### There must be a better way to handle subsidiary coding systems. */
1321 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1323 for (i = 0; i < countof (suffixes); i++)
1325 Lisp_Object alias_subsidiary =
1326 append_suffix_to_symbol (alias, suffixes[i]);
1327 Lisp_Object aliasee_subsidiary =
1328 append_suffix_to_symbol (aliasee, suffixes[i]);
1330 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1331 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1334 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1335 but it doesn't look intentional, so I'd rather return something
1336 meaningful or nothing at all. */
1341 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1343 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1344 Lisp_Object new_coding_system;
1346 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1347 return coding_system;
1351 case EOL_AUTODETECT: return coding_system;
1352 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1353 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1354 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1355 default: abort (); return Qnil;
1358 return NILP (new_coding_system) ? coding_system : new_coding_system;
1361 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1362 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1364 (coding_system, eol_type))
1366 coding_system = Fget_coding_system (coding_system);
1368 return subsidiary_coding_system (coding_system,
1369 symbol_to_eol_type (eol_type));
1373 /************************************************************************/
1374 /* Coding system accessors */
1375 /************************************************************************/
1377 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1378 Return the doc string for CODING-SYSTEM.
1382 coding_system = Fget_coding_system (coding_system);
1383 return XCODING_SYSTEM_DOC_STRING (coding_system);
1386 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1387 Return the type of CODING-SYSTEM.
1391 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1394 case CODESYS_AUTODETECT: return Qundecided;
1396 case CODESYS_SHIFT_JIS: return Qshift_jis;
1397 case CODESYS_ISO2022: return Qiso2022;
1398 case CODESYS_BIG5: return Qbig5;
1399 case CODESYS_UCS4: return Qucs4;
1400 case CODESYS_UTF16: return Qutf16;
1401 case CODESYS_UTF8: return Qutf8;
1402 case CODESYS_CCL: return Qccl;
1404 case CODESYS_NO_CONVERSION: return Qno_conversion;
1406 case CODESYS_INTERNAL: return Qinternal;
1413 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1416 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1418 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1421 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1422 Return initial charset of CODING-SYSTEM designated to GNUM.
1425 (coding_system, gnum))
1427 coding_system = Fget_coding_system (coding_system);
1430 return coding_system_charset (coding_system, XINT (gnum));
1434 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1435 Return the PROP property of CODING-SYSTEM.
1437 (coding_system, prop))
1440 enum coding_system_type type;
1442 coding_system = Fget_coding_system (coding_system);
1443 CHECK_SYMBOL (prop);
1444 type = XCODING_SYSTEM_TYPE (coding_system);
1446 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1447 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1450 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1452 case CODESYS_PROP_ALL_OK:
1455 case CODESYS_PROP_ISO2022:
1456 if (type != CODESYS_ISO2022)
1458 ("Property only valid in ISO2022 coding systems",
1462 case CODESYS_PROP_CCL:
1463 if (type != CODESYS_CCL)
1465 ("Property only valid in CCL coding systems",
1475 signal_simple_error ("Unrecognized property", prop);
1477 if (EQ (prop, Qname))
1478 return XCODING_SYSTEM_NAME (coding_system);
1479 else if (EQ (prop, Qtype))
1480 return Fcoding_system_type (coding_system);
1481 else if (EQ (prop, Qdoc_string))
1482 return XCODING_SYSTEM_DOC_STRING (coding_system);
1483 else if (EQ (prop, Qmnemonic))
1484 return XCODING_SYSTEM_MNEMONIC (coding_system);
1485 else if (EQ (prop, Qeol_type))
1486 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1487 else if (EQ (prop, Qeol_lf))
1488 return XCODING_SYSTEM_EOL_LF (coding_system);
1489 else if (EQ (prop, Qeol_crlf))
1490 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1491 else if (EQ (prop, Qeol_cr))
1492 return XCODING_SYSTEM_EOL_CR (coding_system);
1493 else if (EQ (prop, Qpost_read_conversion))
1494 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1495 else if (EQ (prop, Qpre_write_conversion))
1496 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1499 else if (EQ (prop, Qdisable_composition))
1500 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1501 else if (EQ (prop, Qenable_decomposition))
1502 return XCODING_SYSTEM_ENABLE_DECOMPOSITION (coding_system) ? Qt : Qnil;
1503 else if (EQ (prop, Quse_entity_reference))
1504 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1505 else if (EQ (prop, Qccs_priority_list))
1506 return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1508 else if (type == CODESYS_ISO2022)
1510 if (EQ (prop, Qcharset_g0))
1511 return coding_system_charset (coding_system, 0);
1512 else if (EQ (prop, Qcharset_g1))
1513 return coding_system_charset (coding_system, 1);
1514 else if (EQ (prop, Qcharset_g2))
1515 return coding_system_charset (coding_system, 2);
1516 else if (EQ (prop, Qcharset_g3))
1517 return coding_system_charset (coding_system, 3);
1519 #define FORCE_CHARSET(charset_num) \
1520 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1521 (coding_system, charset_num) ? Qt : Qnil)
1523 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1524 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1525 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1526 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1528 #define LISP_BOOLEAN(prop) \
1529 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1531 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1532 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1533 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1534 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1535 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1536 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1537 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1539 else if (EQ (prop, Qinput_charset_conversion))
1541 unparse_charset_conversion_specs
1542 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1543 else if (EQ (prop, Qoutput_charset_conversion))
1545 unparse_charset_conversion_specs
1546 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1550 else if (type == CODESYS_CCL)
1552 if (EQ (prop, Qdecode))
1553 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1554 else if (EQ (prop, Qencode))
1555 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1563 return Qnil; /* not reached */
1567 /************************************************************************/
1568 /* Coding category functions */
1569 /************************************************************************/
1572 decode_coding_category (Lisp_Object symbol)
1576 CHECK_SYMBOL (symbol);
1577 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1578 if (EQ (coding_category_symbol[i], symbol))
1581 signal_simple_error ("Unrecognized coding category", symbol);
1582 return 0; /* not reached */
1585 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1586 Return a list of all recognized coding categories.
1591 Lisp_Object list = Qnil;
1593 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1594 list = Fcons (coding_category_symbol[i], list);
1598 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1599 Change the priority order of the coding categories.
1600 LIST should be list of coding categories, in descending order of
1601 priority. Unspecified coding categories will be lower in priority
1602 than all specified ones, in the same relative order they were in
1607 int category_to_priority[CODING_CATEGORY_LAST];
1611 /* First generate a list that maps coding categories to priorities. */
1613 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1614 category_to_priority[i] = -1;
1616 /* Highest priority comes from the specified list. */
1618 EXTERNAL_LIST_LOOP (rest, list)
1620 int cat = decode_coding_category (XCAR (rest));
1622 if (category_to_priority[cat] >= 0)
1623 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1624 category_to_priority[cat] = i++;
1627 /* Now go through the existing categories by priority to retrieve
1628 the categories not yet specified and preserve their priority
1630 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1632 int cat = fcd->coding_category_by_priority[j];
1633 if (category_to_priority[cat] < 0)
1634 category_to_priority[cat] = i++;
1637 /* Now we need to construct the inverse of the mapping we just
1640 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1641 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1643 /* Phew! That was confusing. */
1647 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1648 Return a list of coding categories in descending order of priority.
1653 Lisp_Object list = Qnil;
1655 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1656 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1661 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1662 Change the coding system associated with a coding category.
1664 (coding_category, coding_system))
1666 int cat = decode_coding_category (coding_category);
1668 coding_system = Fget_coding_system (coding_system);
1669 fcd->coding_category_system[cat] = coding_system;
1673 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1674 Return the coding system associated with a coding category.
1678 int cat = decode_coding_category (coding_category);
1679 Lisp_Object sys = fcd->coding_category_system[cat];
1682 return XCODING_SYSTEM_NAME (sys);
1687 /************************************************************************/
1688 /* Detecting the encoding of data */
1689 /************************************************************************/
1691 struct detection_state
1693 eol_type_t eol_type;
1736 struct iso2022_decoder iso;
1738 int high_byte_count;
1739 unsigned int saw_single_shift:1;
1752 acceptable_control_char_p (int c)
1756 /* Allow and ignore control characters that you might
1757 reasonably see in a text file */
1762 case 8: /* backspace */
1763 case 11: /* vertical tab */
1764 case 12: /* form feed */
1765 case 26: /* MS-DOS C-z junk */
1766 case 31: /* '^_' -- for info */
1774 mask_has_at_most_one_bit_p (int mask)
1776 /* Perhaps the only thing useful you learn from intensive Microsoft
1777 technical interviews */
1778 return (mask & (mask - 1)) == 0;
1782 detect_eol_type (struct detection_state *st, const Extbyte *src,
1783 Lstream_data_count n)
1787 unsigned char c = *(unsigned char *)src++;
1790 if (st->eol.just_saw_cr)
1792 else if (st->eol.seen_anything)
1795 else if (st->eol.just_saw_cr)
1798 st->eol.just_saw_cr = 1;
1800 st->eol.just_saw_cr = 0;
1801 st->eol.seen_anything = 1;
1804 return EOL_AUTODETECT;
1807 /* Attempt to determine the encoding and EOL type of the given text.
1808 Before calling this function for the first type, you must initialize
1809 st->eol_type as appropriate and initialize st->mask to ~0.
1811 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1814 st->mask holds the determined coding category mask, or ~0 if only
1815 ASCII has been seen so far.
1819 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1820 is present in st->mask
1821 1 == definitive answers are here for both st->eol_type and st->mask
1825 detect_coding_type (struct detection_state *st, const Extbyte *src,
1826 Lstream_data_count n, int just_do_eol)
1828 if (st->eol_type == EOL_AUTODETECT)
1829 st->eol_type = detect_eol_type (st, src, n);
1832 return st->eol_type != EOL_AUTODETECT;
1834 if (!st->seen_non_ascii)
1836 for (; n; n--, src++)
1838 unsigned char c = *(unsigned char *) src;
1839 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1841 st->seen_non_ascii = 1;
1843 st->shift_jis.mask = ~0;
1846 st->utf16.mask = ~0;
1848 st->iso2022.mask = ~0;
1858 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1859 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1860 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1861 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1862 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1863 st->big5.mask = detect_coding_big5 (st, src, n);
1864 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1865 st->utf8.mask = detect_coding_utf8 (st, src, n);
1866 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1867 st->utf16.mask = detect_coding_utf16 (st, src, n);
1868 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1869 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1872 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1873 | st->utf8.mask | st->ucs4.mask;
1876 int retval = mask_has_at_most_one_bit_p (st->mask);
1877 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1878 return retval && st->eol_type != EOL_AUTODETECT;
1883 coding_system_from_mask (int mask)
1887 /* If the file was entirely or basically ASCII, use the
1888 default value of `buffer-file-coding-system'. */
1889 Lisp_Object retval =
1890 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1893 retval = Ffind_coding_system (retval);
1897 (Qbad_variable, Qwarning,
1898 "Invalid `default-buffer-file-coding-system', set to nil");
1899 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1903 retval = Fget_coding_system (Qraw_text);
1911 mask = postprocess_iso2022_mask (mask);
1913 /* Look through the coding categories by priority and find
1914 the first one that is allowed. */
1915 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1917 cat = fcd->coding_category_by_priority[i];
1918 if ((mask & (1 << cat)) &&
1919 !NILP (fcd->coding_category_system[cat]))
1923 return fcd->coding_category_system[cat];
1925 return Fget_coding_system (Qraw_text);
1929 /* Given a seekable read stream and potential coding system and EOL type
1930 as specified, do any autodetection that is called for. If the
1931 coding system and/or EOL type are not `autodetect', they will be left
1932 alone; but this function will never return an autodetect coding system
1935 This function does not automatically fetch subsidiary coding systems;
1936 that should be unnecessary with the explicit eol-type argument. */
1938 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1939 /* number of leading lines to check for a coding cookie */
1940 #define LINES_TO_CHECK 2
1943 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1944 eol_type_t *eol_type_in_out)
1946 struct detection_state decst;
1948 if (*eol_type_in_out == EOL_AUTODETECT)
1949 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1952 decst.eol_type = *eol_type_in_out;
1955 /* If autodetection is called for, do it now. */
1956 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1957 || *eol_type_in_out == EOL_AUTODETECT)
1960 Lisp_Object coding_system = Qnil;
1962 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1964 int lines_checked = 0;
1966 /* Look for initial "-*-"; mode line prefix */
1968 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1970 && lines_checked < LINES_TO_CHECK;
1972 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1974 Extbyte *local_vars_beg = p + 3;
1975 /* Look for final "-*-"; mode line suffix */
1976 for (p = local_vars_beg,
1977 scan_end = buf + nread - LENGTH ("-*-");
1979 && lines_checked < LINES_TO_CHECK;
1981 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1983 Extbyte *suffix = p;
1984 /* Look for "coding:" */
1985 for (p = local_vars_beg,
1986 scan_end = suffix - LENGTH ("coding:?");
1989 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1990 && (p == local_vars_beg
1991 || (*(p-1) == ' ' ||
1997 p += LENGTH ("coding:");
1998 while (*p == ' ' || *p == '\t') p++;
2000 /* Get coding system name */
2001 save = *suffix; *suffix = '\0';
2002 /* Characters valid in a MIME charset name (rfc 1521),
2003 and in a Lisp symbol name. */
2004 n = strspn ( (char *) p,
2005 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
2006 "abcdefghijklmnopqrstuvwxyz"
2012 save = p[n]; p[n] = '\0';
2014 Ffind_coding_system (intern ((char *) p));
2021 /* #### file must use standard EOLs or we miss 2d line */
2022 /* #### not to mention this is broken for UTF-16 DOS files */
2023 else if (*p == '\n' || *p == '\r')
2026 /* skip past multibyte (DOS) newline */
2027 if (*p == '\r' && *(p+1) == '\n') p++;
2031 /* #### file must use standard EOLs or we miss 2d line */
2032 /* #### not to mention this is broken for UTF-16 DOS files */
2033 else if (*p == '\n' || *p == '\r')
2036 /* skip past multibyte (DOS) newline */
2037 if (*p == '\r' && *(p+1) == '\n') p++;
2040 if (NILP (coding_system))
2043 if (detect_coding_type (&decst, buf, nread,
2044 XCODING_SYSTEM_TYPE (*codesys_in_out)
2045 != CODESYS_AUTODETECT))
2047 nread = Lstream_read (stream, buf, sizeof (buf));
2053 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2054 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2057 if (detect_coding_type (&decst, buf, nread, 1))
2059 nread = Lstream_read (stream, buf, sizeof (buf));
2065 *eol_type_in_out = decst.eol_type;
2066 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2068 if (NILP (coding_system))
2069 *codesys_in_out = coding_system_from_mask (decst.mask);
2071 *codesys_in_out = coding_system;
2075 /* If we absolutely can't determine the EOL type, just assume LF. */
2076 if (*eol_type_in_out == EOL_AUTODETECT)
2077 *eol_type_in_out = EOL_LF;
2079 Lstream_rewind (stream);
2082 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2083 Detect coding system of the text in the region between START and END.
2084 Return a list of possible coding systems ordered by priority.
2085 If only ASCII characters are found, return 'undecided or one of
2086 its subsidiary coding systems according to a detected end-of-line
2087 type. Optional arg BUFFER defaults to the current buffer.
2089 (start, end, buffer))
2091 Lisp_Object val = Qnil;
2092 struct buffer *buf = decode_buffer (buffer, 0);
2094 Lisp_Object instream, lb_instream;
2095 Lstream *istr, *lb_istr;
2096 struct detection_state decst;
2097 struct gcpro gcpro1, gcpro2;
2099 get_buffer_range_char (buf, start, end, &b, &e, 0);
2100 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2101 lb_istr = XLSTREAM (lb_instream);
2102 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2103 istr = XLSTREAM (instream);
2104 GCPRO2 (instream, lb_instream);
2106 decst.eol_type = EOL_AUTODETECT;
2110 Extbyte random_buffer[4096];
2111 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2115 if (detect_coding_type (&decst, random_buffer, nread, 0))
2119 if (decst.mask == ~0)
2120 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2128 decst.mask = postprocess_iso2022_mask (decst.mask);
2130 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2132 int sys = fcd->coding_category_by_priority[i];
2133 if (decst.mask & (1 << sys))
2135 Lisp_Object codesys = fcd->coding_category_system[sys];
2136 if (!NILP (codesys))
2137 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2138 val = Fcons (codesys, val);
2142 Lstream_close (istr);
2144 Lstream_delete (istr);
2145 Lstream_delete (lb_istr);
2150 /************************************************************************/
2151 /* Converting to internal Mule format ("decoding") */
2152 /************************************************************************/
2154 /* A decoding stream is a stream used for decoding text (i.e.
2155 converting from some external format to internal format).
2156 The decoding-stream object keeps track of the actual coding
2157 stream, the stream that is at the other end, and data that
2158 needs to be persistent across the lifetime of the stream. */
2160 /* Handle the EOL stuff related to just-read-in character C.
2161 EOL_TYPE is the EOL type of the coding stream.
2162 FLAGS is the current value of FLAGS in the coding stream, and may
2163 be modified by this macro. (The macro only looks at the
2164 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2165 bytes are to be written. You need to also define a local goto
2166 label "label_continue_loop" that is at the end of the main
2167 character-reading loop.
2169 If C is a CR character, then this macro handles it entirely and
2170 jumps to label_continue_loop. Otherwise, this macro does not add
2171 anything to DST, and continues normally. You should continue
2172 processing C normally after this macro. */
2174 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2178 if (eol_type == EOL_CR) \
2179 Dynarr_add (dst, '\n'); \
2180 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2181 Dynarr_add (dst, c); \
2183 flags |= CODING_STATE_CR; \
2184 goto label_continue_loop; \
2186 else if (flags & CODING_STATE_CR) \
2187 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2189 Dynarr_add (dst, '\r'); \
2190 flags &= ~CODING_STATE_CR; \
2194 /* C should be a binary character in the range 0 - 255; convert
2195 to internal format and add to Dynarr DST. */
2198 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2200 if (BYTE_ASCII_P (c)) \
2201 Dynarr_add (dst, c); \
2204 Dynarr_add (dst, (c >> 6) | 0xc0); \
2205 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2209 static void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2211 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2215 Dynarr_add (dst, c);
2217 else if ( c <= 0x7ff )
2219 Dynarr_add (dst, (c >> 6) | 0xc0);
2220 Dynarr_add (dst, (c & 0x3f) | 0x80);
2222 else if ( c <= 0xffff )
2224 Dynarr_add (dst, (c >> 12) | 0xe0);
2225 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2226 Dynarr_add (dst, (c & 0x3f) | 0x80);
2228 else if ( c <= 0x1fffff )
2230 Dynarr_add (dst, (c >> 18) | 0xf0);
2231 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2232 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2233 Dynarr_add (dst, (c & 0x3f) | 0x80);
2235 else if ( c <= 0x3ffffff )
2237 Dynarr_add (dst, (c >> 24) | 0xf8);
2238 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2239 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2240 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2241 Dynarr_add (dst, (c & 0x3f) | 0x80);
2245 Dynarr_add (dst, (c >> 30) | 0xfc);
2246 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2247 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2248 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2249 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2250 Dynarr_add (dst, (c & 0x3f) | 0x80);
2254 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2256 if (BYTE_ASCII_P (c)) \
2257 Dynarr_add (dst, c); \
2258 else if (BYTE_C1_P (c)) \
2260 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2261 Dynarr_add (dst, c + 0x20); \
2265 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2266 Dynarr_add (dst, c); \
2271 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2275 DECODE_ADD_BINARY_CHAR (ch, dst); \
2280 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2282 if (flags & CODING_STATE_END) \
2284 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2285 if (flags & CODING_STATE_CR) \
2286 Dynarr_add (dst, '\r'); \
2290 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2292 #define ER_BUF_SIZE 24
2294 struct decoding_stream
2296 /* Coding system that governs the conversion. */
2297 Lisp_Coding_System *codesys;
2299 /* Stream that we read the encoded data from or
2300 write the decoded data to. */
2303 /* If we are reading, then we can return only a fixed amount of
2304 data, so if the conversion resulted in too much data, we store it
2305 here for retrieval the next time around. */
2306 unsigned_char_dynarr *runoff;
2308 /* FLAGS holds flags indicating the current state of the decoding.
2309 Some of these flags are dependent on the coding system. */
2312 /* CPOS holds a partially built-up code-point of character. */
2315 /* EOL_TYPE specifies the type of end-of-line conversion that
2316 currently applies. We need to keep this separate from the
2317 EOL type stored in CODESYS because the latter might indicate
2318 automatic EOL-type detection while the former will always
2319 indicate a particular EOL type. */
2320 eol_type_t eol_type;
2322 /* Additional ISO2022 information. We define the structure above
2323 because it's also needed by the detection routines. */
2324 struct iso2022_decoder iso2022;
2326 /* Additional information (the state of the running CCL program)
2327 used by the CCL decoder. */
2328 struct ccl_program ccl;
2330 /* counter for UTF-8 or UCS-4 */
2331 unsigned char counter;
2335 unsigned char er_counter;
2336 unsigned char er_buf[ER_BUF_SIZE];
2338 unsigned combined_char_count;
2339 Emchar combined_chars[16];
2340 #ifdef HAVE_LIBCHISE
2341 COS_object combining_table;
2343 Lisp_Object combining_table;
2344 #endif /* HAVE_LIBCHISE */
2346 struct detection_state decst;
2349 static Lstream_data_count decoding_reader (Lstream *stream,
2350 unsigned char *data, Lstream_data_count size);
2351 static Lstream_data_count decoding_writer (Lstream *stream,
2352 const unsigned char *data, Lstream_data_count size);
2353 static int decoding_rewinder (Lstream *stream);
2354 static int decoding_seekable_p (Lstream *stream);
2355 static int decoding_flusher (Lstream *stream);
2356 static int decoding_closer (Lstream *stream);
2358 static Lisp_Object decoding_marker (Lisp_Object stream);
2360 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2361 sizeof (struct decoding_stream));
2364 decoding_marker (Lisp_Object stream)
2366 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2367 Lisp_Object str_obj;
2369 /* We do not need to mark the coding systems or charsets stored
2370 within the stream because they are stored in a global list
2371 and automatically marked. */
2373 XSETLSTREAM (str_obj, str);
2374 mark_object (str_obj);
2375 if (str->imp->marker)
2376 return (str->imp->marker) (str_obj);
2381 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2382 so we read data from the other end, decode it, and store it into DATA. */
2384 static Lstream_data_count
2385 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2387 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2388 unsigned char *orig_data = data;
2389 Lstream_data_count read_size;
2390 int error_occurred = 0;
2392 /* We need to interface to mule_decode(), which expects to take some
2393 amount of data and store the result into a Dynarr. We have
2394 mule_decode() store into str->runoff, and take data from there
2397 /* We loop until we have enough data, reading chunks from the other
2398 end and decoding it. */
2401 /* Take data from the runoff if we can. Make sure to take at
2402 most SIZE bytes, and delete the data from the runoff. */
2403 if (Dynarr_length (str->runoff) > 0)
2405 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2406 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2407 Dynarr_delete_many (str->runoff, 0, chunk);
2413 break; /* No more room for data */
2415 if (str->flags & CODING_STATE_END)
2416 /* This means that on the previous iteration, we hit the EOF on
2417 the other end. We loop once more so that mule_decode() can
2418 output any final stuff it may be holding, or any "go back
2419 to a sane state" escape sequences. (This latter makes sense
2420 during encoding.) */
2423 /* Exhausted the runoff, so get some more. DATA has at least
2424 SIZE bytes left of storage in it, so it's OK to read directly
2425 into it. (We'll be overwriting above, after we've decoded it
2426 into the runoff.) */
2427 read_size = Lstream_read (str->other_end, data, size);
2434 /* There might be some more end data produced in the translation.
2435 See the comment above. */
2436 str->flags |= CODING_STATE_END;
2437 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2440 if (data - orig_data == 0)
2441 return error_occurred ? -1 : 0;
2443 return data - orig_data;
2446 static Lstream_data_count
2447 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2449 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2450 Lstream_data_count retval;
2452 /* Decode all our data into the runoff, and then attempt to write
2453 it all out to the other end. Remove whatever chunk we succeeded
2455 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2456 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2457 Dynarr_length (str->runoff));
2459 Dynarr_delete_many (str->runoff, 0, retval);
2460 /* Do NOT return retval. The return value indicates how much
2461 of the incoming data was written, not how many bytes were
2467 reset_decoding_stream (struct decoding_stream *str)
2470 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2472 Lisp_Object coding_system;
2473 XSETCODING_SYSTEM (coding_system, str->codesys);
2474 reset_iso2022 (coding_system, &str->iso2022);
2476 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2478 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2484 str->er_counter = 0;
2485 str->combined_char_count = 0;
2486 #ifdef HAVE_LIBCHISE
2487 str->combining_table = COS_NIL;
2489 str->combining_table = Qnil;
2490 #endif /* HAVE_LIBCHISE */
2492 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2493 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2496 str->decst.eol_type = EOL_AUTODETECT;
2497 str->decst.mask = ~0;
2499 str->flags = str->cpos = 0;
2503 decoding_rewinder (Lstream *stream)
2505 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2506 reset_decoding_stream (str);
2507 Dynarr_reset (str->runoff);
2508 return Lstream_rewind (str->other_end);
2512 decoding_seekable_p (Lstream *stream)
2514 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2515 return Lstream_seekable_p (str->other_end);
2519 decoding_flusher (Lstream *stream)
2521 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2522 return Lstream_flush (str->other_end);
2526 decoding_closer (Lstream *stream)
2528 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2529 if (stream->flags & LSTREAM_FL_WRITE)
2531 str->flags |= CODING_STATE_END;
2532 decoding_writer (stream, 0, 0);
2534 Dynarr_free (str->runoff);
2536 #ifdef ENABLE_COMPOSITE_CHARS
2537 if (str->iso2022.composite_chars)
2538 Dynarr_free (str->iso2022.composite_chars);
2541 return Lstream_close (str->other_end);
2545 decoding_stream_coding_system (Lstream *stream)
2547 Lisp_Object coding_system;
2548 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2550 XSETCODING_SYSTEM (coding_system, str->codesys);
2551 return subsidiary_coding_system (coding_system, str->eol_type);
2555 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2557 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2558 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2560 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2561 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2562 reset_decoding_stream (str);
2565 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2566 stream for writing, no automatic code detection will be performed.
2567 The reason for this is that automatic code detection requires a
2568 seekable input. Things will also fail if you open a decoding
2569 stream for reading using a non-fully-specified coding system and
2570 a non-seekable input stream. */
2573 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2576 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2577 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2581 str->other_end = stream;
2582 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2583 str->eol_type = EOL_AUTODETECT;
2584 if (!strcmp (mode, "r")
2585 && Lstream_seekable_p (stream))
2586 /* We can determine the coding system now. */
2587 determine_real_coding_system (stream, &codesys, &str->eol_type);
2588 set_decoding_stream_coding_system (lstr, codesys);
2589 str->decst.eol_type = str->eol_type;
2590 str->decst.mask = ~0;
2591 XSETLSTREAM (obj, lstr);
2596 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2598 return make_decoding_stream_1 (stream, codesys, "r");
2602 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2604 return make_decoding_stream_1 (stream, codesys, "w");
2607 /* Note: the decode_coding_* functions all take the same
2608 arguments as mule_decode(), which is to say some SRC data of
2609 size N, which is to be stored into dynamic array DST.
2610 DECODING is the stream within which the decoding is
2611 taking place, but no data is actually read from or
2612 written to that stream; that is handled in decoding_reader()
2613 or decoding_writer(). This allows the same functions to
2614 be used for both reading and writing. */
2617 mule_decode (Lstream *decoding, const Extbyte *src,
2618 unsigned_char_dynarr *dst, Lstream_data_count n)
2620 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2622 /* If necessary, do encoding-detection now. We do this when
2623 we're a writing stream or a non-seekable reading stream,
2624 meaning that we can't just process the whole input,
2625 rewind, and start over. */
2627 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2628 str->eol_type == EOL_AUTODETECT)
2630 Lisp_Object codesys;
2632 XSETCODING_SYSTEM (codesys, str->codesys);
2633 detect_coding_type (&str->decst, src, n,
2634 CODING_SYSTEM_TYPE (str->codesys) !=
2635 CODESYS_AUTODETECT);
2636 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2637 str->decst.mask != ~0)
2638 /* #### This is cheesy. What we really ought to do is
2639 buffer up a certain amount of data so as to get a
2640 less random result. */
2641 codesys = coding_system_from_mask (str->decst.mask);
2642 str->eol_type = str->decst.eol_type;
2643 if (XCODING_SYSTEM (codesys) != str->codesys)
2645 /* Preserve the CODING_STATE_END flag in case it was set.
2646 If we erase it, bad things might happen. */
2647 int was_end = str->flags & CODING_STATE_END;
2648 set_decoding_stream_coding_system (decoding, codesys);
2650 str->flags |= CODING_STATE_END;
2654 switch (CODING_SYSTEM_TYPE (str->codesys))
2657 case CODESYS_INTERNAL:
2658 Dynarr_add_many (dst, src, n);
2661 case CODESYS_AUTODETECT:
2662 /* If we got this far and still haven't decided on the coding
2663 system, then do no conversion. */
2664 case CODESYS_NO_CONVERSION:
2665 decode_coding_no_conversion (decoding, src, dst, n);
2668 case CODESYS_SHIFT_JIS:
2669 decode_coding_sjis (decoding, src, dst, n);
2672 decode_coding_big5 (decoding, src, dst, n);
2675 decode_coding_ucs4 (decoding, src, dst, n);
2678 decode_coding_utf16 (decoding, src, dst, n);
2681 decode_coding_utf8 (decoding, src, dst, n);
2684 str->ccl.last_block = str->flags & CODING_STATE_END;
2685 /* When applying ccl program to stream, MUST NOT set NULL
2687 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2688 dst, n, 0, CCL_MODE_DECODING);
2690 case CODESYS_ISO2022:
2691 decode_coding_iso2022 (decoding, src, dst, n);
2699 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2700 Decode the text between START and END which is encoded in CODING-SYSTEM.
2701 This is useful if you've read in encoded text from a file without decoding
2702 it (e.g. you read in a JIS-formatted file but used the `binary' or
2703 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2704 Return length of decoded text.
2705 BUFFER defaults to the current buffer if unspecified.
2707 (start, end, coding_system, buffer))
2710 struct buffer *buf = decode_buffer (buffer, 0);
2711 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2712 Lstream *istr, *ostr;
2713 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2715 get_buffer_range_char (buf, start, end, &b, &e, 0);
2717 barf_if_buffer_read_only (buf, b, e);
2719 coding_system = Fget_coding_system (coding_system);
2720 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2721 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2722 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2724 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2725 Fget_coding_system (Qbinary));
2726 istr = XLSTREAM (instream);
2727 ostr = XLSTREAM (outstream);
2728 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2730 /* The chain of streams looks like this:
2732 [BUFFER] <----- send through
2733 ------> [ENCODE AS BINARY]
2734 ------> [DECODE AS SPECIFIED]
2740 char tempbuf[1024]; /* some random amount */
2741 Bufpos newpos, even_newer_pos;
2742 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2743 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2747 newpos = lisp_buffer_stream_startpos (istr);
2748 Lstream_write (ostr, tempbuf, size_in_bytes);
2749 even_newer_pos = lisp_buffer_stream_startpos (istr);
2750 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2753 Lstream_close (istr);
2754 Lstream_close (ostr);
2756 Lstream_delete (istr);
2757 Lstream_delete (ostr);
2758 Lstream_delete (XLSTREAM (de_outstream));
2759 Lstream_delete (XLSTREAM (lb_outstream));
2764 /************************************************************************/
2765 /* Converting to an external encoding ("encoding") */
2766 /************************************************************************/
2768 /* An encoding stream is an output stream. When you create the
2769 stream, you specify the coding system that governs the encoding
2770 and another stream that the resulting encoded data is to be
2771 sent to, and then start sending data to it. */
2773 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2775 struct encoding_stream
2777 /* Coding system that governs the conversion. */
2778 Lisp_Coding_System *codesys;
2780 /* Stream that we read the encoded data from or
2781 write the decoded data to. */
2784 /* If we are reading, then we can return only a fixed amount of
2785 data, so if the conversion resulted in too much data, we store it
2786 here for retrieval the next time around. */
2787 unsigned_char_dynarr *runoff;
2789 /* FLAGS holds flags indicating the current state of the encoding.
2790 Some of these flags are dependent on the coding system. */
2793 /* CH holds a partially built-up character. Since we only deal
2794 with one- and two-byte characters at the moment, we only use
2795 this to store the first byte of a two-byte character. */
2798 /* Additional information used by the ISO2022 encoder. */
2801 /* CHARSET holds the character sets currently assigned to the G0
2802 through G3 registers. It is initialized from the array
2803 INITIAL_CHARSET in CODESYS. */
2804 Lisp_Object charset[4];
2806 /* Which registers are currently invoked into the left (GL) and
2807 right (GR) halves of the 8-bit encoding space? */
2808 int register_left, register_right;
2810 /* Whether we need to explicitly designate the charset in the
2811 G? register before using it. It is initialized from the
2812 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2813 unsigned char force_charset_on_output[4];
2815 /* Other state variables that need to be preserved across
2817 Lisp_Object current_charset;
2819 int current_char_boundary;
2822 void (*encode_char) (struct encoding_stream *str, Emchar c,
2823 unsigned_char_dynarr *dst, unsigned int *flags);
2824 void (*finish) (struct encoding_stream *str,
2825 unsigned_char_dynarr *dst, unsigned int *flags);
2827 /* Additional information (the state of the running CCL program)
2828 used by the CCL encoder. */
2829 struct ccl_program ccl;
2833 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2834 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2835 Lstream_data_count size);
2836 static int encoding_rewinder (Lstream *stream);
2837 static int encoding_seekable_p (Lstream *stream);
2838 static int encoding_flusher (Lstream *stream);
2839 static int encoding_closer (Lstream *stream);
2841 static Lisp_Object encoding_marker (Lisp_Object stream);
2843 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2844 sizeof (struct encoding_stream));
2847 encoding_marker (Lisp_Object stream)
2849 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2850 Lisp_Object str_obj;
2852 /* We do not need to mark the coding systems or charsets stored
2853 within the stream because they are stored in a global list
2854 and automatically marked. */
2856 XSETLSTREAM (str_obj, str);
2857 mark_object (str_obj);
2858 if (str->imp->marker)
2859 return (str->imp->marker) (str_obj);
2864 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2865 so we read data from the other end, encode it, and store it into DATA. */
2867 static Lstream_data_count
2868 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2870 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2871 unsigned char *orig_data = data;
2872 Lstream_data_count read_size;
2873 int error_occurred = 0;
2875 /* We need to interface to mule_encode(), which expects to take some
2876 amount of data and store the result into a Dynarr. We have
2877 mule_encode() store into str->runoff, and take data from there
2880 /* We loop until we have enough data, reading chunks from the other
2881 end and encoding it. */
2884 /* Take data from the runoff if we can. Make sure to take at
2885 most SIZE bytes, and delete the data from the runoff. */
2886 if (Dynarr_length (str->runoff) > 0)
2888 int chunk = min ((int) size, Dynarr_length (str->runoff));
2889 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2890 Dynarr_delete_many (str->runoff, 0, chunk);
2896 break; /* No more room for data */
2898 if (str->flags & CODING_STATE_END)
2899 /* This means that on the previous iteration, we hit the EOF on
2900 the other end. We loop once more so that mule_encode() can
2901 output any final stuff it may be holding, or any "go back
2902 to a sane state" escape sequences. (This latter makes sense
2903 during encoding.) */
2906 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2907 left of storage in it, so it's OK to read directly into it.
2908 (We'll be overwriting above, after we've encoded it into the
2910 read_size = Lstream_read (str->other_end, data, size);
2917 /* There might be some more end data produced in the translation.
2918 See the comment above. */
2919 str->flags |= CODING_STATE_END;
2920 mule_encode (stream, data, str->runoff, read_size);
2923 if (data == orig_data)
2924 return error_occurred ? -1 : 0;
2926 return data - orig_data;
2929 static Lstream_data_count
2930 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2932 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2933 Lstream_data_count retval;
2935 /* Encode all our data into the runoff, and then attempt to write
2936 it all out to the other end. Remove whatever chunk we succeeded
2938 mule_encode (stream, data, str->runoff, size);
2939 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2940 Dynarr_length (str->runoff));
2942 Dynarr_delete_many (str->runoff, 0, retval);
2943 /* Do NOT return retval. The return value indicates how much
2944 of the incoming data was written, not how many bytes were
2950 reset_encoding_stream (struct encoding_stream *str)
2953 switch (CODING_SYSTEM_TYPE (str->codesys))
2955 case CODESYS_ISO2022:
2959 str->encode_char = &char_encode_iso2022;
2960 str->finish = &char_finish_iso2022;
2961 for (i = 0; i < 4; i++)
2963 str->iso2022.charset[i] =
2964 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2965 str->iso2022.force_charset_on_output[i] =
2966 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2968 str->iso2022.register_left = 0;
2969 str->iso2022.register_right = 1;
2970 str->iso2022.current_charset = Qnil;
2971 str->iso2022.current_half = 0;
2975 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2978 str->encode_char = &char_encode_utf8;
2979 str->finish = &char_finish_utf8;
2982 str->encode_char = &char_encode_utf16;
2983 str->finish = &char_finish_utf16;
2986 str->encode_char = &char_encode_ucs4;
2987 str->finish = &char_finish_ucs4;
2989 case CODESYS_SHIFT_JIS:
2990 str->encode_char = &char_encode_shift_jis;
2991 str->finish = &char_finish_shift_jis;
2994 str->encode_char = &char_encode_big5;
2995 str->finish = &char_finish_big5;
3001 str->iso2022.current_char_boundary = 0;
3002 str->flags = str->ch = 0;
3006 encoding_rewinder (Lstream *stream)
3008 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3009 reset_encoding_stream (str);
3010 Dynarr_reset (str->runoff);
3011 return Lstream_rewind (str->other_end);
3015 encoding_seekable_p (Lstream *stream)
3017 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3018 return Lstream_seekable_p (str->other_end);
3022 encoding_flusher (Lstream *stream)
3024 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3025 return Lstream_flush (str->other_end);
3029 encoding_closer (Lstream *stream)
3031 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3032 if (stream->flags & LSTREAM_FL_WRITE)
3034 str->flags |= CODING_STATE_END;
3035 encoding_writer (stream, 0, 0);
3037 Dynarr_free (str->runoff);
3038 return Lstream_close (str->other_end);
3042 encoding_stream_coding_system (Lstream *stream)
3044 Lisp_Object coding_system;
3045 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3047 XSETCODING_SYSTEM (coding_system, str->codesys);
3048 return coding_system;
3052 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3054 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3055 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3057 reset_encoding_stream (str);
3061 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3064 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3065 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3069 str->runoff = Dynarr_new (unsigned_char);
3070 str->other_end = stream;
3071 set_encoding_stream_coding_system (lstr, codesys);
3072 XSETLSTREAM (obj, lstr);
3077 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3079 return make_encoding_stream_1 (stream, codesys, "r");
3083 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3085 return make_encoding_stream_1 (stream, codesys, "w");
3088 /* Convert N bytes of internally-formatted data stored in SRC to an
3089 external format, according to the encoding stream ENCODING.
3090 Store the encoded data into DST. */
3093 mule_encode (Lstream *encoding, const Bufbyte *src,
3094 unsigned_char_dynarr *dst, Lstream_data_count n)
3096 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3098 switch (CODING_SYSTEM_TYPE (str->codesys))
3101 case CODESYS_INTERNAL:
3102 Dynarr_add_many (dst, src, n);
3105 case CODESYS_AUTODETECT:
3106 /* If we got this far and still haven't decided on the coding
3107 system, then do no conversion. */
3108 case CODESYS_NO_CONVERSION:
3109 encode_coding_no_conversion (encoding, src, dst, n);
3113 str->ccl.last_block = str->flags & CODING_STATE_END;
3114 /* When applying ccl program to stream, MUST NOT set NULL
3116 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3117 dst, n, 0, CCL_MODE_ENCODING);
3121 text_encode_generic (encoding, src, dst, n);
3125 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3126 Encode the text between START and END using CODING-SYSTEM.
3127 This will, for example, convert Japanese characters into stuff such as
3128 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3129 text. BUFFER defaults to the current buffer if unspecified.
3131 (start, end, coding_system, buffer))
3134 struct buffer *buf = decode_buffer (buffer, 0);
3135 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3136 Lstream *istr, *ostr;
3137 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3139 get_buffer_range_char (buf, start, end, &b, &e, 0);
3141 barf_if_buffer_read_only (buf, b, e);
3143 coding_system = Fget_coding_system (coding_system);
3144 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3145 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3146 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3147 Fget_coding_system (Qbinary));
3148 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3150 istr = XLSTREAM (instream);
3151 ostr = XLSTREAM (outstream);
3152 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3153 /* The chain of streams looks like this:
3155 [BUFFER] <----- send through
3156 ------> [ENCODE AS SPECIFIED]
3157 ------> [DECODE AS BINARY]
3162 char tempbuf[1024]; /* some random amount */
3163 Bufpos newpos, even_newer_pos;
3164 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3165 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3169 newpos = lisp_buffer_stream_startpos (istr);
3170 Lstream_write (ostr, tempbuf, size_in_bytes);
3171 even_newer_pos = lisp_buffer_stream_startpos (istr);
3172 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3178 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3179 Lstream_close (istr);
3180 Lstream_close (ostr);
3182 Lstream_delete (istr);
3183 Lstream_delete (ostr);
3184 Lstream_delete (XLSTREAM (de_outstream));
3185 Lstream_delete (XLSTREAM (lb_outstream));
3186 return make_int (retlen);
3193 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3194 unsigned_char_dynarr *dst, Lstream_data_count n)
3197 unsigned char char_boundary;
3198 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3199 unsigned int flags = str->flags;
3200 Emchar ch = str->ch;
3202 char_boundary = str->iso2022.current_char_boundary;
3208 if (char_boundary == 0)
3236 (*str->encode_char) (str, c, dst, &flags);
3238 else if (char_boundary == 1)
3240 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3246 ch = (ch << 6) | (c & 0x3f);
3251 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3253 (*str->finish) (str, dst, &flags);
3258 str->iso2022.current_char_boundary = char_boundary;
3263 /************************************************************************/
3264 /* entity reference */
3265 /************************************************************************/
3268 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3270 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3272 if ( str->er_counter > 0)
3274 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3275 str->er_counter = 0;
3279 EXFUN (Fregexp_quote, 1);
3281 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3282 unsigned_char_dynarr* dst);
3284 decode_add_er_char (struct decoding_stream *str, Emchar c,
3285 unsigned_char_dynarr* dst)
3287 if (str->er_counter == 0)
3289 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3292 str->er_buf[0] = '&';
3296 DECODE_ADD_UCS_CHAR (c, dst);
3300 Lisp_Object string = make_string (str->er_buf,
3307 Lisp_Object char_type;
3310 for ( rest = Vcoded_charset_entity_reference_alist;
3311 !NILP (rest); rest = Fcdr (rest) )
3317 char_type = XCDR (ccs);
3322 if (NILP (ccs = Ffind_charset (ccs)))
3331 pat = Fregexp_quote (pat);
3338 pat = concat3 (build_string ("^&"),
3339 pat, build_string ("\\([0-9]+\\)$"));
3342 else if (EQ (ret, Qx))
3344 pat = concat3 (build_string ("^&"),
3345 pat, build_string ("\\([0-9a-f]+\\)$"));
3348 else if (EQ (ret, QX))
3350 pat = concat3 (build_string ("^&"),
3351 pat, build_string ("\\([0-9A-F]+\\)$"));
3357 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3360 = XINT (Fstring_to_number
3361 (Fsubstring (string,
3362 Fmatch_beginning (make_int (1)),
3363 Fmatch_end (make_int (1))),
3367 ? DECODE_CHAR (ccs, code, 0)
3368 : decode_builtin_char (ccs, code);
3371 DECODE_ADD_UCS_CHAR (chr, dst);
3374 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3375 Dynarr_add (dst, ';');
3381 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3382 string, Qnil, Qnil)))
3385 = XUINT (Fstring_to_number
3386 (Fsubstring (string,
3387 Fmatch_beginning (make_int (1)),
3388 Fmatch_end (make_int (1))),
3391 DECODE_ADD_UCS_CHAR (code, dst);
3395 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3396 Dynarr_add (dst, ';');
3399 str->er_counter = 0;
3401 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3403 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3404 str->er_counter = 0;
3405 DECODE_ADD_UCS_CHAR (c, dst);
3408 str->er_buf[str->er_counter++] = c;
3411 void char_encode_as_entity_reference (Emchar ch, char* buf);
3413 char_encode_as_entity_reference (Emchar ch, char* buf)
3415 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3418 Lisp_Object char_type;
3419 int format_columns, idx;
3420 char format[ER_BUF_SIZE];
3422 while (!NILP (rest))
3428 char_type = XCDR (ccs);
3433 if (!NILP (ccs = Ffind_charset (ccs)))
3436 = charset_code_point (ccs, ch,
3438 CHAR_ALL : CHAR_ISOLATED_ONLY );
3440 if ( (code_point >= 0)
3441 && ( NILP (char_type)
3443 || ( charset_code_point (ccs, ch, CHAR_DEFINED_ONLY)
3447 || ( DECODE_CHAR (ccs, code_point, 0) != ch )
3456 if ( STRINGP (ret) &&
3457 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3460 strncpy (&format[1], XSTRING_DATA (ret), idx);
3470 format[idx++] = '%';
3471 format_columns = XINT (ret);
3472 if ( (2 <= format_columns) && (format_columns <= 8)
3473 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3475 format [idx++] = '0';
3476 format [idx++] = '0' + format_columns;
3485 format [idx++] = 'd';
3486 else if (EQ (ret, Qx))
3487 format [idx++] = 'x';
3488 else if (EQ (ret, QX))
3489 format [idx++] = 'X';
3492 format [idx++] = ';';
3495 sprintf (buf, format, code_point);
3502 sprintf (buf, "&MCS-%08X;", ch);
3506 /************************************************************************/
3507 /* character composition */
3508 /************************************************************************/
3509 extern Lisp_Object Qcomposition, Qrep_decomposition;
3512 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3514 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3518 for (i = 0; i < str->combined_char_count; i++)
3519 decode_add_er_char (str, str->combined_chars[i], dst);
3520 str->combined_char_count = 0;
3521 #ifdef HAVE_LIBCHISE
3522 str->combining_table = COS_NIL;
3524 str->combining_table = Qnil;
3525 #endif /* HAVE_LIBCHISE */
3528 extern CONCORD_DS concord_current_env;
3531 concord_setup_env_maybe ()
3533 if (concord_current_env == NULL)
3535 concord_open_env ("/usr/local/share/chise/1.0/db/");
3540 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3541 unsigned_char_dynarr* dst);
3543 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3544 Emchar character, unsigned_char_dynarr* dst)
3546 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3547 decode_add_er_char (str, character, dst);
3548 #ifdef HAVE_LIBCHISE
3549 else if (!cos_cons_p (str->combining_table))
3553 concord_setup_env_maybe ();
3554 ret = concord_object_get_feature_value (cos_make_char (character),
3557 if (!cos_cons_p (ret))
3558 decode_add_er_char (str, character, dst);
3561 //cos_retain_object (ret);
3562 str->combined_chars[0] = character;
3563 str->combined_char_count = 1;
3564 str->combining_table = ret;
3570 = cos_cdr (cos_assoc (cos_make_char (character),
3571 str->combining_table));
3573 //cos_release_object (str->combining_table);
3574 if (cos_char_p (ret))
3576 Emchar char2 = cos_char_id (ret);
3579 concord_setup_env_maybe ();
3580 ret2 = concord_object_get_feature_value (ret, COS_COMPOSITION);
3582 if (!cos_cons_p (ret2))
3584 decode_add_er_char (str, char2, dst);
3585 str->combined_char_count = 0;
3586 str->combining_table = COS_NIL;
3590 //cos_retain_object (ret2);
3591 str->combined_chars[0] = char2;
3592 str->combined_char_count = 1;
3593 str->combining_table = ret2;
3598 concord_setup_env_maybe ();
3599 ret = concord_object_get_feature_value (cos_make_char (character),
3602 COMPOSE_FLUSH_CHARS (str, dst);
3603 if (!cos_cons_p (ret))
3604 decode_add_er_char (str, character, dst);
3607 //cos_retain_object (ret);
3608 str->combined_chars[0] = character;
3609 str->combined_char_count = 1;
3610 str->combining_table = ret;
3615 else if (!CONSP (str->combining_table))
3618 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3622 decode_add_er_char (str, character, dst);
3625 str->combined_chars[0] = character;
3626 str->combined_char_count = 1;
3627 str->combining_table = ret;
3633 = Fcdr (Fassq (make_char (character), str->combining_table));
3637 Emchar char2 = XCHARVAL (ret);
3638 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3643 decode_add_er_char (str, char2, dst);
3644 str->combined_char_count = 0;
3645 str->combining_table = Qnil;
3649 str->combined_chars[0] = char2;
3650 str->combined_char_count = 1;
3651 str->combining_table = ret2;
3656 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3659 COMPOSE_FLUSH_CHARS (str, dst);
3661 decode_add_er_char (str, character, dst);
3664 str->combined_chars[0] = character;
3665 str->combined_char_count = 1;
3666 str->combining_table = ret;
3670 #endif /* HAVE_LIBCHISE */
3672 #else /* not UTF2000 */
3673 #define COMPOSE_FLUSH_CHARS(str, dst)
3674 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3675 #endif /* UTF2000 */
3678 /************************************************************************/
3679 /* Shift-JIS methods */
3680 /************************************************************************/
3682 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3683 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3684 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3685 encoded by "position-code + 0x80". A character of JISX0208
3686 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3687 position-codes are divided and shifted so that it fit in the range
3690 --- CODE RANGE of Shift-JIS ---
3691 (character set) (range)
3693 JISX0201-Kana 0xA0 .. 0xDF
3694 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3695 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3696 -------------------------------
3700 /* Is this the first byte of a Shift-JIS two-byte char? */
3702 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3703 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3705 /* Is this the second byte of a Shift-JIS two-byte char? */
3707 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3708 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3710 #define BYTE_SJIS_KATAKANA_P(c) \
3711 ((c) >= 0xA1 && (c) <= 0xDF)
3714 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3718 unsigned char c = *(unsigned char *)src++;
3719 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3721 if (st->shift_jis.in_second_byte)
3723 st->shift_jis.in_second_byte = 0;
3727 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3728 st->shift_jis.in_second_byte = 1;
3730 return CODING_CATEGORY_SHIFT_JIS_MASK;
3733 /* Convert Shift-JIS data to internal format. */
3736 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3737 unsigned_char_dynarr *dst, Lstream_data_count n)
3739 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3740 unsigned int flags = str->flags;
3741 unsigned int cpos = str->cpos;
3742 eol_type_t eol_type = str->eol_type;
3746 unsigned char c = *(unsigned char *)src++;
3750 /* Previous character was first byte of Shift-JIS Kanji char. */
3751 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3753 unsigned char e1, e2;
3755 DECODE_SJIS (cpos, c, e1, e2);
3757 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3761 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3762 Dynarr_add (dst, e1);
3763 Dynarr_add (dst, e2);
3768 DECODE_ADD_BINARY_CHAR (cpos, dst);
3769 DECODE_ADD_BINARY_CHAR (c, dst);
3775 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3776 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3778 else if (BYTE_SJIS_KATAKANA_P (c))
3781 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3784 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3785 Dynarr_add (dst, c);
3790 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3794 DECODE_ADD_BINARY_CHAR (c, dst);
3796 label_continue_loop:;
3799 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3805 /* Convert internal character representation to Shift_JIS. */
3808 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3809 unsigned_char_dynarr *dst, unsigned int *flags)
3811 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3815 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3816 Dynarr_add (dst, '\r');
3817 if (eol_type != EOL_CR)
3818 Dynarr_add (dst, ch);
3822 unsigned int s1, s2;
3824 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3826 if (code_point >= 0)
3827 Dynarr_add (dst, code_point);
3828 else if ((code_point
3829 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3832 ENCODE_SJIS ((code_point >> 8) | 0x80,
3833 (code_point & 0xFF) | 0x80, s1, s2);
3834 Dynarr_add (dst, s1);
3835 Dynarr_add (dst, s2);
3837 else if ((code_point
3838 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3840 Dynarr_add (dst, code_point | 0x80);
3841 else if ((code_point
3842 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3845 ENCODE_SJIS ((code_point >> 8) | 0x80,
3846 (code_point & 0xFF) | 0x80, s1, s2);
3847 Dynarr_add (dst, s1);
3848 Dynarr_add (dst, s2);
3850 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3852 Dynarr_add (dst, code_point);
3854 Dynarr_add (dst, '?');
3856 Lisp_Object charset;
3857 unsigned int c1, c2;
3859 BREAKUP_CHAR (ch, charset, c1, c2);
3861 if (EQ(charset, Vcharset_katakana_jisx0201))
3863 Dynarr_add (dst, c1 | 0x80);
3867 Dynarr_add (dst, c1);
3869 else if (EQ(charset, Vcharset_japanese_jisx0208))
3871 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3872 Dynarr_add (dst, s1);
3873 Dynarr_add (dst, s2);
3876 Dynarr_add (dst, '?');
3882 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3883 unsigned int *flags)
3887 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3888 Decode a JISX0208 character of Shift-JIS coding-system.
3889 CODE is the character code in Shift-JIS as a cons of type bytes.
3890 Return the corresponding character.
3894 unsigned char c1, c2, s1, s2;
3897 CHECK_INT (XCAR (code));
3898 CHECK_INT (XCDR (code));
3899 s1 = XINT (XCAR (code));
3900 s2 = XINT (XCDR (code));
3901 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3902 BYTE_SJIS_TWO_BYTE_2_P (s2))
3904 DECODE_SJIS (s1, s2, c1, c2);
3905 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3906 c1 & 0x7F, c2 & 0x7F));
3912 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3913 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3914 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3918 Lisp_Object charset;
3921 CHECK_CHAR_COERCE_INT (character);
3922 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3923 if (EQ (charset, Vcharset_japanese_jisx0208))
3925 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3926 return Fcons (make_int (s1), make_int (s2));
3933 /************************************************************************/
3935 /************************************************************************/
3937 /* BIG5 is a coding system encoding two character sets: ASCII and
3938 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3939 character set and is encoded in two-byte.
3941 --- CODE RANGE of BIG5 ---
3942 (character set) (range)
3944 Big5 (1st byte) 0xA1 .. 0xFE
3945 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3946 --------------------------
3948 Since the number of characters in Big5 is larger than maximum
3949 characters in Emacs' charset (96x96), it can't be handled as one
3950 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3951 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3952 contains frequently used characters and the latter contains less
3953 frequently used characters. */
3956 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3957 ((c) >= 0x81 && (c) <= 0xFE)
3959 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3960 ((c) >= 0xA1 && (c) <= 0xFE)
3963 /* Is this the second byte of a Shift-JIS two-byte char? */
3965 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3966 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3968 /* Number of Big5 characters which have the same code in 1st byte. */
3970 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3972 /* Code conversion macros. These are macros because they are used in
3973 inner loops during code conversion.
3975 Note that temporary variables in macros introduce the classic
3976 dynamic-scoping problems with variable names. We use capital-
3977 lettered variables in the assumption that XEmacs does not use
3978 capital letters in variables except in a very formalized way
3981 /* Convert Big5 code (b1, b2) into its internal string representation
3984 /* There is a much simpler way to split the Big5 charset into two.
3985 For the moment I'm going to leave the algorithm as-is because it
3986 claims to separate out the most-used characters into a single
3987 charset, which perhaps will lead to optimizations in various
3990 The way the algorithm works is something like this:
3992 Big5 can be viewed as a 94x157 charset, where the row is
3993 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3994 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3995 the split between low and high column numbers is apparently
3996 meaningless; ascending rows produce less and less frequent chars.
3997 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3998 the first charset, and the upper half (0xC9 .. 0xFE) to the
3999 second. To do the conversion, we convert the character into
4000 a single number where 0 .. 156 is the first row, 157 .. 313
4001 is the second, etc. That way, the characters are ordered by
4002 decreasing frequency. Then we just chop the space in two
4003 and coerce the result into a 94x94 space.
4006 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
4008 int B1 = b1, B2 = b2; \
4010 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
4014 lb = LEADING_BYTE_CHINESE_BIG5_1; \
4018 lb = LEADING_BYTE_CHINESE_BIG5_2; \
4019 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
4021 c1 = I / (0xFF - 0xA1) + 0xA1; \
4022 c2 = I % (0xFF - 0xA1) + 0xA1; \
4025 /* Convert the internal string representation of a Big5 character
4026 (lb, c1, c2) into Big5 code (b1, b2). */
4028 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
4030 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
4032 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
4034 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
4036 b1 = I / BIG5_SAME_ROW + 0xA1; \
4037 b2 = I % BIG5_SAME_ROW; \
4038 b2 += b2 < 0x3F ? 0x40 : 0x62; \
4042 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4046 unsigned char c = *(unsigned char *)src++;
4047 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
4049 || (c >= 0x80 && c <= 0xA0)
4053 if (st->big5.in_second_byte)
4055 st->big5.in_second_byte = 0;
4056 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
4066 st->big5.in_second_byte = 1;
4068 return CODING_CATEGORY_BIG5_MASK;
4071 /* Convert Big5 data to internal format. */
4074 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
4075 unsigned_char_dynarr *dst, Lstream_data_count n)
4077 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4078 unsigned int flags = str->flags;
4079 unsigned int cpos = str->cpos;
4080 eol_type_t eol_type = str->eol_type;
4083 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4084 (decoding)->codesys, 1);
4089 unsigned char c = *(unsigned char *)src++;
4092 /* Previous character was first byte of Big5 char. */
4093 if (BYTE_BIG5_TWO_BYTE_2_P (c))
4096 int code_point = (cpos << 8) | c;
4097 Emchar char_id = decode_defined_char (ccs, code_point, 0);
4101 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
4102 DECODE_ADD_UCS_CHAR (char_id, dst);
4104 unsigned char b1, b2, b3;
4105 DECODE_BIG5 (cpos, c, b1, b2, b3);
4106 Dynarr_add (dst, b1);
4107 Dynarr_add (dst, b2);
4108 Dynarr_add (dst, b3);
4113 DECODE_ADD_BINARY_CHAR (cpos, dst);
4114 DECODE_ADD_BINARY_CHAR (c, dst);
4120 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4121 if (BYTE_BIG5_TWO_BYTE_1_P (c))
4123 decode_flush_er_chars (str, dst);
4128 decode_flush_er_chars (str, dst);
4129 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4130 DECODE_ADD_BINARY_CHAR (c, dst);
4134 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4135 decode_add_er_char (str, c, dst);
4138 label_continue_loop:;
4141 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4142 if (flags & CODING_STATE_END)
4144 decode_flush_er_chars (str, dst);
4145 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4146 if (flags & CODING_STATE_CR)
4147 Dynarr_add (dst, '\r');
4154 /* Convert internally-formatted data to Big5. */
4157 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4158 unsigned_char_dynarr *dst, unsigned int *flags)
4160 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4164 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4165 Dynarr_add (dst, '\r');
4166 if (eol_type != EOL_CR)
4167 Dynarr_add (dst, ch);
4174 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4176 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4177 Dynarr_add (dst, code_point);
4178 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4180 Dynarr_add (dst, code_point >> 8);
4181 Dynarr_add (dst, code_point & 0xFF);
4183 else if ((code_point
4184 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4186 Dynarr_add (dst, code_point >> 8);
4187 Dynarr_add (dst, code_point & 0xFF);
4189 else if ((code_point
4190 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4193 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4194 + ((code_point & 0xFF) - 33);
4195 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4196 unsigned char b2 = I % BIG5_SAME_ROW;
4198 b2 += b2 < 0x3F ? 0x40 : 0x62;
4199 Dynarr_add (dst, b1);
4200 Dynarr_add (dst, b2);
4202 else if ((code_point
4203 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4206 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4207 + ((code_point & 0xFF) - 33);
4208 unsigned char b1, b2;
4210 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4211 b1 = I / BIG5_SAME_ROW + 0xA1;
4212 b2 = I % BIG5_SAME_ROW;
4213 b2 += b2 < 0x3F ? 0x40 : 0x62;
4214 Dynarr_add (dst, b1);
4215 Dynarr_add (dst, b2);
4217 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4221 char_encode_as_entity_reference (ch, buf);
4222 Dynarr_add_many (dst, buf, strlen (buf));
4225 Dynarr_add (dst, '?');
4232 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4233 unsigned int *flags)
4238 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4239 Decode a Big5 character CODE of BIG5 coding-system.
4240 CODE is the character code in BIG5, a cons of two integers.
4241 Return the corresponding character.
4245 unsigned char c1, c2, b1, b2;
4248 CHECK_INT (XCAR (code));
4249 CHECK_INT (XCDR (code));
4250 b1 = XINT (XCAR (code));
4251 b2 = XINT (XCDR (code));
4252 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4253 BYTE_BIG5_TWO_BYTE_2_P (b2))
4255 Charset_ID leading_byte;
4256 Lisp_Object charset;
4257 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4258 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4259 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4265 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4266 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4267 Return the corresponding character code in Big5.
4271 Lisp_Object charset;
4274 CHECK_CHAR_COERCE_INT (character);
4275 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4276 if (EQ (charset, Vcharset_chinese_big5_1) ||
4277 EQ (charset, Vcharset_chinese_big5_2))
4279 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4281 return Fcons (make_int (b1), make_int (b2));
4288 /************************************************************************/
4290 /************************************************************************/
4293 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4297 unsigned char c = *(unsigned char *)src++;
4298 switch (st->ucs4.in_byte)
4307 st->ucs4.in_byte = 0;
4313 return CODING_CATEGORY_UCS4_MASK;
4317 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4318 unsigned_char_dynarr *dst, Lstream_data_count n)
4320 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4321 unsigned int flags = str->flags;
4322 unsigned int cpos = str->cpos;
4323 unsigned char counter = str->counter;
4327 unsigned char c = *(unsigned char *)src++;
4335 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4340 cpos = ( cpos << 8 ) | c;
4344 if (counter & CODING_STATE_END)
4345 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4349 str->counter = counter;
4353 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4354 unsigned_char_dynarr *dst, unsigned int *flags)
4356 Dynarr_add (dst, ch >> 24);
4357 Dynarr_add (dst, ch >> 16);
4358 Dynarr_add (dst, ch >> 8);
4359 Dynarr_add (dst, ch );
4363 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4364 unsigned int *flags)
4369 /************************************************************************/
4370 /* UTF-16 methods */
4371 /************************************************************************/
4374 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4376 return CODING_CATEGORY_UTF16_MASK;
4380 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4381 unsigned_char_dynarr *dst, Lstream_data_count n)
4383 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4384 unsigned int flags = str->flags;
4385 unsigned int cpos = str->cpos;
4386 unsigned char counter = str->counter & 3;
4387 unsigned char byte_order = str->counter >> 2;
4388 eol_type_t eol_type = str->eol_type;
4392 unsigned char c = *(unsigned char *)src++;
4398 else if (counter == 1)
4402 if (byte_order == 0)
4403 code = (c << 8) | cpos;
4405 code = (cpos << 8) | c;
4408 code = ((code & 0xFF) << 8) | (code >> 8);
4409 if ( byte_order == 0 )
4414 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4425 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4426 DECODE_ADD_UCS_CHAR (code, dst);
4430 else if (counter == 2)
4432 cpos = (cpos << 8) | c;
4440 ? (c << 8) | (cpos & 0xFF)
4441 : ((cpos & 0xFF) << 8) | c;
4443 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4448 label_continue_loop:;
4450 if (counter & CODING_STATE_END)
4451 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4455 str->counter = (byte_order << 2) | counter;
4459 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4460 unsigned_char_dynarr *dst, unsigned int *flags)
4464 Dynarr_add (dst, ch);
4465 Dynarr_add (dst, ch >> 8);
4469 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4470 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4472 Dynarr_add (dst, y);
4473 Dynarr_add (dst, y >> 8);
4474 Dynarr_add (dst, z);
4475 Dynarr_add (dst, z >> 8);
4480 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4481 unsigned int *flags)
4486 /************************************************************************/
4488 /************************************************************************/
4491 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4495 unsigned char c = *(unsigned char *)src++;
4496 switch (st->utf8.in_byte)
4499 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4502 st->utf8.in_byte = 5;
4504 st->utf8.in_byte = 4;
4506 st->utf8.in_byte = 3;
4508 st->utf8.in_byte = 2;
4510 st->utf8.in_byte = 1;
4515 if ((c & 0xc0) != 0x80)
4521 return CODING_CATEGORY_UTF8_MASK;
4525 decode_output_utf8_partial_char (unsigned char counter,
4527 unsigned_char_dynarr *dst)
4530 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4531 else if (counter == 4)
4533 if (cpos < (1 << 6))
4534 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4537 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4538 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4541 else if (counter == 3)
4543 if (cpos < (1 << 6))
4544 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4545 else if (cpos < (1 << 12))
4547 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4548 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4552 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4553 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4554 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4557 else if (counter == 2)
4559 if (cpos < (1 << 6))
4560 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4561 else if (cpos < (1 << 12))
4563 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4564 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4566 else if (cpos < (1 << 18))
4568 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4569 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4570 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4574 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4575 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4576 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4577 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4582 if (cpos < (1 << 6))
4583 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4584 else if (cpos < (1 << 12))
4586 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4587 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4589 else if (cpos < (1 << 18))
4591 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4592 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4593 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4595 else if (cpos < (1 << 24))
4597 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4598 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4599 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4600 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4604 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4605 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4606 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4607 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4608 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4614 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4615 unsigned_char_dynarr *dst, Lstream_data_count n)
4617 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4618 unsigned int flags = str->flags;
4619 unsigned int cpos = str->cpos;
4620 eol_type_t eol_type = str->eol_type;
4621 unsigned char counter = str->counter;
4623 int bom_flag = str->bom_flag;
4625 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4626 (decoding)->codesys, 0);
4631 unsigned char c = *(unsigned char *)src++;
4636 COMPOSE_FLUSH_CHARS (str, dst);
4637 decode_flush_er_chars (str, dst);
4638 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4640 if ( bom_flag == 0 )
4643 DECODE_ADD_UCS_CHAR (c, dst);
4645 else if ( c < 0xC0 )
4647 if ( bom_flag == 0 )
4650 /* decode_add_er_char (str, c, dst); */
4651 COMPOSE_ADD_CHAR (str, c, dst);
4655 /* decode_flush_er_chars (str, dst); */
4661 else if ( c < 0xF0 )
4666 else if ( c < 0xF8 )
4671 else if ( c < 0xFC )
4683 else if ( (c & 0xC0) == 0x80 )
4685 cpos = ( cpos << 6 ) | ( c & 0x3f );
4690 if ( bom_flag == 0 )
4692 if ( cpos == 0xFEFF )
4703 char_id = decode_defined_char (ccs, cpos, 0);
4710 COMPOSE_ADD_CHAR (str, char_id, dst);
4720 COMPOSE_FLUSH_CHARS (str, dst);
4721 decode_flush_er_chars (str, dst);
4722 decode_output_utf8_partial_char (counter, cpos, dst);
4723 DECODE_ADD_BINARY_CHAR (c, dst);
4727 label_continue_loop:;
4730 if (flags & CODING_STATE_END)
4732 COMPOSE_FLUSH_CHARS (str, dst);
4733 decode_flush_er_chars (str, dst);
4736 decode_output_utf8_partial_char (counter, cpos, dst);
4743 str->counter = counter;
4745 str->bom_flag = bom_flag;
4750 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4751 unsigned_char_dynarr *dst, unsigned int *flags)
4753 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4757 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4758 Dynarr_add (dst, '\r');
4759 if (eol_type != EOL_CR)
4760 Dynarr_add (dst, ch);
4762 else if (ch <= 0x7f)
4764 Dynarr_add (dst, ch);
4769 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4770 int code_point = charset_code_point (ucs_ccs, ch, 0);
4772 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4774 Lisp_Object map, ret;
4776 if ( CODING_SYSTEM_ENABLE_DECOMPOSITION (str->codesys) )
4778 Lisp_Object rest = Vdecomposition_feature_list;
4779 Lisp_Object decomp_f;
4780 Lisp_Object seq = Qnil;
4781 struct gcpro gcpro1;
4783 while ( CONSP (rest) )
4785 decomp_f = XCAR (rest);
4787 seq = Fchar_feature (make_char (ch), decomp_f, Qnil,
4797 Lisp_Object base = Fcar (seq);
4800 if ( CHARP (base) && CONSP (seq) )
4802 Lisp_Object comb = Fcar (seq);
4806 char_encode_utf8 (str, XCHAR (base), dst, flags);
4807 char_encode_utf8 (str, XCHAR (comb), dst, flags);
4814 map = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4816 && INTP (ret = Fchar_feature (make_char (ch),
4819 code_point = XINT (ret);
4820 else if ( !NILP (map =
4821 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4823 && INTP (ret = Fchar_feature (make_char (ch),
4826 code_point = XINT (ret);
4827 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4831 char_encode_as_entity_reference (ch, buf);
4832 Dynarr_add_many (dst, buf, strlen (buf));
4838 if (code_point <= 0x7ff)
4840 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4841 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4843 else if (code_point <= 0xffff)
4845 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4846 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4847 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4849 else if (code_point <= 0x1fffff)
4851 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4852 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4853 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4854 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4856 else if (code_point <= 0x3ffffff)
4858 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4859 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4860 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4861 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4862 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4866 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4867 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4868 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4869 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4870 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4871 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4877 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4878 unsigned int *flags)
4883 /************************************************************************/
4884 /* ISO2022 methods */
4885 /************************************************************************/
4887 /* The following note describes the coding system ISO2022 briefly.
4888 Since the intention of this note is to help understand the
4889 functions in this file, some parts are NOT ACCURATE or OVERLY
4890 SIMPLIFIED. For thorough understanding, please refer to the
4891 original document of ISO2022.
4893 ISO2022 provides many mechanisms to encode several character sets
4894 in 7-bit and 8-bit environments. For 7-bit environments, all text
4895 is encoded using bytes less than 128. This may make the encoded
4896 text a little bit longer, but the text passes more easily through
4897 several gateways, some of which strip off MSB (Most Signigant Bit).
4899 There are two kinds of character sets: control character set and
4900 graphic character set. The former contains control characters such
4901 as `newline' and `escape' to provide control functions (control
4902 functions are also provided by escape sequences). The latter
4903 contains graphic characters such as 'A' and '-'. Emacs recognizes
4904 two control character sets and many graphic character sets.
4906 Graphic character sets are classified into one of the following
4907 four classes, according to the number of bytes (DIMENSION) and
4908 number of characters in one dimension (CHARS) of the set:
4909 - DIMENSION1_CHARS94
4910 - DIMENSION1_CHARS96
4911 - DIMENSION2_CHARS94
4912 - DIMENSION2_CHARS96
4914 In addition, each character set is assigned an identification tag,
4915 unique for each set, called "final character" (denoted as <F>
4916 hereafter). The <F> of each character set is decided by ECMA(*)
4917 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4918 (0x30..0x3F are for private use only).
4920 Note (*): ECMA = European Computer Manufacturers Association
4922 Here are examples of graphic character set [NAME(<F>)]:
4923 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4924 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4925 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4926 o DIMENSION2_CHARS96 -- none for the moment
4928 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4929 C0 [0x00..0x1F] -- control character plane 0
4930 GL [0x20..0x7F] -- graphic character plane 0
4931 C1 [0x80..0x9F] -- control character plane 1
4932 GR [0xA0..0xFF] -- graphic character plane 1
4934 A control character set is directly designated and invoked to C0 or
4935 C1 by an escape sequence. The most common case is that:
4936 - ISO646's control character set is designated/invoked to C0, and
4937 - ISO6429's control character set is designated/invoked to C1,
4938 and usually these designations/invocations are omitted in encoded
4939 text. In a 7-bit environment, only C0 can be used, and a control
4940 character for C1 is encoded by an appropriate escape sequence to
4941 fit into the environment. All control characters for C1 are
4942 defined to have corresponding escape sequences.
4944 A graphic character set is at first designated to one of four
4945 graphic registers (G0 through G3), then these graphic registers are
4946 invoked to GL or GR. These designations and invocations can be
4947 done independently. The most common case is that G0 is invoked to
4948 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4949 these invocations and designations are omitted in encoded text.
4950 In a 7-bit environment, only GL can be used.
4952 When a graphic character set of CHARS94 is invoked to GL, codes
4953 0x20 and 0x7F of the GL area work as control characters SPACE and
4954 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4957 There are two ways of invocation: locking-shift and single-shift.
4958 With locking-shift, the invocation lasts until the next different
4959 invocation, whereas with single-shift, the invocation affects the
4960 following character only and doesn't affect the locking-shift
4961 state. Invocations are done by the following control characters or
4964 ----------------------------------------------------------------------
4965 abbrev function cntrl escape seq description
4966 ----------------------------------------------------------------------
4967 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4968 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4969 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4970 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4971 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4972 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4973 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4974 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4975 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4976 ----------------------------------------------------------------------
4977 (*) These are not used by any known coding system.
4979 Control characters for these functions are defined by macros
4980 ISO_CODE_XXX in `coding.h'.
4982 Designations are done by the following escape sequences:
4983 ----------------------------------------------------------------------
4984 escape sequence description
4985 ----------------------------------------------------------------------
4986 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4987 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4988 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4989 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4990 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4991 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4992 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4993 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4994 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4995 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4996 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4997 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4998 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4999 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
5000 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
5001 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
5002 ----------------------------------------------------------------------
5004 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
5005 of dimension 1, chars 94, and final character <F>, etc...
5007 Note (*): Although these designations are not allowed in ISO2022,
5008 Emacs accepts them on decoding, and produces them on encoding
5009 CHARS96 character sets in a coding system which is characterized as
5010 7-bit environment, non-locking-shift, and non-single-shift.
5012 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
5013 '(' can be omitted. We refer to this as "short-form" hereafter.
5015 Now you may notice that there are a lot of ways for encoding the
5016 same multilingual text in ISO2022. Actually, there exist many
5017 coding systems such as Compound Text (used in X11's inter client
5018 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
5019 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
5020 localized platforms), and all of these are variants of ISO2022.
5022 In addition to the above, Emacs handles two more kinds of escape
5023 sequences: ISO6429's direction specification and Emacs' private
5024 sequence for specifying character composition.
5026 ISO6429's direction specification takes the following form:
5027 o CSI ']' -- end of the current direction
5028 o CSI '0' ']' -- end of the current direction
5029 o CSI '1' ']' -- start of left-to-right text
5030 o CSI '2' ']' -- start of right-to-left text
5031 The control character CSI (0x9B: control sequence introducer) is
5032 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
5034 Character composition specification takes the following form:
5035 o ESC '0' -- start character composition
5036 o ESC '1' -- end character composition
5037 Since these are not standard escape sequences of any ISO standard,
5038 their use with these meanings is restricted to Emacs only. */
5041 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
5045 for (i = 0; i < 4; i++)
5047 if (!NILP (coding_system))
5049 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
5051 iso->charset[i] = Qt;
5052 iso->invalid_designated[i] = 0;
5054 iso->esc = ISO_ESC_NOTHING;
5055 iso->esc_bytes_index = 0;
5056 iso->register_left = 0;
5057 iso->register_right = 1;
5058 iso->switched_dir_and_no_valid_charset_yet = 0;
5059 iso->invalid_switch_dir = 0;
5060 iso->output_direction_sequence = 0;
5061 iso->output_literally = 0;
5062 #ifdef ENABLE_COMPOSITE_CHARS
5063 if (iso->composite_chars)
5064 Dynarr_reset (iso->composite_chars);
5069 fit_to_be_escape_quoted (unsigned char c)
5086 /* Parse one byte of an ISO2022 escape sequence.
5087 If the result is an invalid escape sequence, return 0 and
5088 do not change anything in STR. Otherwise, if the result is
5089 an incomplete escape sequence, update ISO2022.ESC and
5090 ISO2022.ESC_BYTES and return -1. Otherwise, update
5091 all the state variables (but not ISO2022.ESC_BYTES) and
5094 If CHECK_INVALID_CHARSETS is non-zero, check for designation
5095 or invocation of an invalid character set and treat that as
5096 an unrecognized escape sequence. */
5099 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
5100 unsigned char c, unsigned int *flags,
5101 int check_invalid_charsets)
5103 /* (1) If we're at the end of a designation sequence, CS is the
5104 charset being designated and REG is the register to designate
5107 (2) If we're at the end of a locking-shift sequence, REG is
5108 the register to invoke and HALF (0 == left, 1 == right) is
5109 the half to invoke it into.
5111 (3) If we're at the end of a single-shift sequence, REG is
5112 the register to invoke. */
5113 Lisp_Object cs = Qnil;
5116 /* NOTE: This code does goto's all over the fucking place.
5117 The reason for this is that we're basically implementing
5118 a state machine here, and hierarchical languages like C
5119 don't really provide a clean way of doing this. */
5121 if (! (*flags & CODING_STATE_ESCAPE))
5122 /* At beginning of escape sequence; we need to reset our
5123 escape-state variables. */
5124 iso->esc = ISO_ESC_NOTHING;
5126 iso->output_literally = 0;
5127 iso->output_direction_sequence = 0;
5131 case ISO_ESC_NOTHING:
5132 iso->esc_bytes_index = 0;
5135 case ISO_CODE_ESC: /* Start escape sequence */
5136 *flags |= CODING_STATE_ESCAPE;
5140 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
5141 *flags |= CODING_STATE_ESCAPE;
5142 iso->esc = ISO_ESC_5_11;
5145 case ISO_CODE_SO: /* locking shift 1 */
5148 case ISO_CODE_SI: /* locking shift 0 */
5152 case ISO_CODE_SS2: /* single shift */
5155 case ISO_CODE_SS3: /* single shift */
5159 default: /* Other control characters */
5166 /**** single shift ****/
5168 case 'N': /* single shift 2 */
5171 case 'O': /* single shift 3 */
5175 /**** locking shift ****/
5177 case '~': /* locking shift 1 right */
5180 case 'n': /* locking shift 2 */
5183 case '}': /* locking shift 2 right */
5186 case 'o': /* locking shift 3 */
5189 case '|': /* locking shift 3 right */
5193 #ifdef ENABLE_COMPOSITE_CHARS
5194 /**** composite ****/
5197 iso->esc = ISO_ESC_START_COMPOSITE;
5198 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5199 CODING_STATE_COMPOSITE;
5203 iso->esc = ISO_ESC_END_COMPOSITE;
5204 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5205 ~CODING_STATE_COMPOSITE;
5207 #endif /* ENABLE_COMPOSITE_CHARS */
5209 /**** directionality ****/
5212 iso->esc = ISO_ESC_5_11;
5215 /**** designation ****/
5217 case '$': /* multibyte charset prefix */
5218 iso->esc = ISO_ESC_2_4;
5222 if (0x28 <= c && c <= 0x2F)
5224 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5228 /* This function is called with CODESYS equal to nil when
5229 doing coding-system detection. */
5231 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5232 && fit_to_be_escape_quoted (c))
5234 iso->esc = ISO_ESC_LITERAL;
5235 *flags &= CODING_STATE_ISO2022_LOCK;
5245 /**** directionality ****/
5247 case ISO_ESC_5_11: /* ISO6429 direction control */
5250 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5251 goto directionality;
5253 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5254 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5255 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5259 case ISO_ESC_5_11_0:
5262 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5263 goto directionality;
5267 case ISO_ESC_5_11_1:
5270 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5271 goto directionality;
5275 case ISO_ESC_5_11_2:
5278 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5279 goto directionality;
5284 iso->esc = ISO_ESC_DIRECTIONALITY;
5285 /* Various junk here to attempt to preserve the direction sequences
5286 literally in the text if they would otherwise be swallowed due
5287 to invalid designations that don't show up as actual charset
5288 changes in the text. */
5289 if (iso->invalid_switch_dir)
5291 /* We already inserted a direction switch literally into the
5292 text. We assume (#### this may not be right) that the
5293 next direction switch is the one going the other way,
5294 and we need to output that literally as well. */
5295 iso->output_literally = 1;
5296 iso->invalid_switch_dir = 0;
5302 /* If we are in the thrall of an invalid designation,
5303 then stick the directionality sequence literally into the
5304 output stream so it ends up in the original text again. */
5305 for (jj = 0; jj < 4; jj++)
5306 if (iso->invalid_designated[jj])
5310 iso->output_literally = 1;
5311 iso->invalid_switch_dir = 1;
5314 /* Indicate that we haven't yet seen a valid designation,
5315 so that if a switch-dir is directly followed by an
5316 invalid designation, both get inserted literally. */
5317 iso->switched_dir_and_no_valid_charset_yet = 1;
5322 /**** designation ****/
5325 if (0x28 <= c && c <= 0x2F)
5327 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5330 if (0x40 <= c && c <= 0x42)
5333 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5334 *flags & CODING_STATE_R2L ?
5335 CHARSET_RIGHT_TO_LEFT :
5336 CHARSET_LEFT_TO_RIGHT);
5347 if (c < '0' || c > '~')
5348 return 0; /* bad final byte */
5350 if (iso->esc >= ISO_ESC_2_8 &&
5351 iso->esc <= ISO_ESC_2_15)
5353 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5354 single = 1; /* single-byte */
5355 reg = (iso->esc - ISO_ESC_2_8) & 3;
5357 else if (iso->esc >= ISO_ESC_2_4_8 &&
5358 iso->esc <= ISO_ESC_2_4_15)
5360 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5361 single = -1; /* multi-byte */
5362 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5366 /* Can this ever be reached? -slb */
5370 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5371 *flags & CODING_STATE_R2L ?
5372 CHARSET_RIGHT_TO_LEFT :
5373 CHARSET_LEFT_TO_RIGHT);
5379 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5383 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5384 /* can't invoke something that ain't there. */
5386 iso->esc = ISO_ESC_SINGLE_SHIFT;
5387 *flags &= CODING_STATE_ISO2022_LOCK;
5389 *flags |= CODING_STATE_SS2;
5391 *flags |= CODING_STATE_SS3;
5395 if (check_invalid_charsets &&
5396 !CHARSETP (iso->charset[reg]))
5397 /* can't invoke something that ain't there. */
5400 iso->register_right = reg;
5402 iso->register_left = reg;
5403 *flags &= CODING_STATE_ISO2022_LOCK;
5404 iso->esc = ISO_ESC_LOCKING_SHIFT;
5408 if (NILP (cs) && check_invalid_charsets)
5410 iso->invalid_designated[reg] = 1;
5411 iso->charset[reg] = Vcharset_ascii;
5412 iso->esc = ISO_ESC_DESIGNATE;
5413 *flags &= CODING_STATE_ISO2022_LOCK;
5414 iso->output_literally = 1;
5415 if (iso->switched_dir_and_no_valid_charset_yet)
5417 /* We encountered a switch-direction followed by an
5418 invalid designation. Ensure that the switch-direction
5419 gets outputted; otherwise it will probably get eaten
5420 when the text is written out again. */
5421 iso->switched_dir_and_no_valid_charset_yet = 0;
5422 iso->output_direction_sequence = 1;
5423 /* And make sure that the switch-dir going the other
5424 way gets outputted, as well. */
5425 iso->invalid_switch_dir = 1;
5429 /* This function is called with CODESYS equal to nil when
5430 doing coding-system detection. */
5431 if (!NILP (codesys))
5433 charset_conversion_spec_dynarr *dyn =
5434 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5440 for (i = 0; i < Dynarr_length (dyn); i++)
5442 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5443 if (EQ (cs, spec->from_charset))
5444 cs = spec->to_charset;
5449 iso->charset[reg] = cs;
5450 iso->esc = ISO_ESC_DESIGNATE;
5451 *flags &= CODING_STATE_ISO2022_LOCK;
5452 if (iso->invalid_designated[reg])
5454 iso->invalid_designated[reg] = 0;
5455 iso->output_literally = 1;
5457 if (iso->switched_dir_and_no_valid_charset_yet)
5458 iso->switched_dir_and_no_valid_charset_yet = 0;
5463 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5467 /* #### There are serious deficiencies in the recognition mechanism
5468 here. This needs to be much smarter if it's going to cut it.
5469 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5470 it should be detected as Latin-1.
5471 All the ISO2022 stuff in this file should be synced up with the
5472 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5473 Perhaps we should wait till R2L works in FSF Emacs? */
5475 if (!st->iso2022.initted)
5477 reset_iso2022 (Qnil, &st->iso2022.iso);
5478 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5479 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5480 CODING_CATEGORY_ISO_8_1_MASK |
5481 CODING_CATEGORY_ISO_8_2_MASK |
5482 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5483 st->iso2022.flags = 0;
5484 st->iso2022.high_byte_count = 0;
5485 st->iso2022.saw_single_shift = 0;
5486 st->iso2022.initted = 1;
5489 mask = st->iso2022.mask;
5493 unsigned char c = *(unsigned char *)src++;
5496 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5497 st->iso2022.high_byte_count++;
5501 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5503 if (st->iso2022.high_byte_count & 1)
5504 /* odd number of high bytes; assume not iso-8-2 */
5505 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5507 st->iso2022.high_byte_count = 0;
5508 st->iso2022.saw_single_shift = 0;
5510 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5512 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5513 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5514 { /* control chars */
5517 /* Allow and ignore control characters that you might
5518 reasonably see in a text file */
5523 case 8: /* backspace */
5524 case 11: /* vertical tab */
5525 case 12: /* form feed */
5526 case 26: /* MS-DOS C-z junk */
5527 case 31: /* '^_' -- for info */
5528 goto label_continue_loop;
5535 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5538 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5539 &st->iso2022.flags, 0))
5541 switch (st->iso2022.iso.esc)
5543 case ISO_ESC_DESIGNATE:
5544 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5545 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5547 case ISO_ESC_LOCKING_SHIFT:
5548 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5549 goto ran_out_of_chars;
5550 case ISO_ESC_SINGLE_SHIFT:
5551 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5552 st->iso2022.saw_single_shift = 1;
5561 goto ran_out_of_chars;
5564 label_continue_loop:;
5573 postprocess_iso2022_mask (int mask)
5575 /* #### kind of cheesy */
5576 /* If seven-bit ISO is allowed, then assume that the encoding is
5577 entirely seven-bit and turn off the eight-bit ones. */
5578 if (mask & CODING_CATEGORY_ISO_7_MASK)
5579 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5580 CODING_CATEGORY_ISO_8_1_MASK |
5581 CODING_CATEGORY_ISO_8_2_MASK);
5585 /* If FLAGS is a null pointer or specifies right-to-left motion,
5586 output a switch-dir-to-left-to-right sequence to DST.
5587 Also update FLAGS if it is not a null pointer.
5588 If INTERNAL_P is set, we are outputting in internal format and
5589 need to handle the CSI differently. */
5592 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5593 unsigned_char_dynarr *dst,
5594 unsigned int *flags,
5597 if (!flags || (*flags & CODING_STATE_R2L))
5599 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5601 Dynarr_add (dst, ISO_CODE_ESC);
5602 Dynarr_add (dst, '[');
5604 else if (internal_p)
5605 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5607 Dynarr_add (dst, ISO_CODE_CSI);
5608 Dynarr_add (dst, '0');
5609 Dynarr_add (dst, ']');
5611 *flags &= ~CODING_STATE_R2L;
5615 /* If FLAGS is a null pointer or specifies a direction different from
5616 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5617 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5618 sequence to DST. Also update FLAGS if it is not a null pointer.
5619 If INTERNAL_P is set, we are outputting in internal format and
5620 need to handle the CSI differently. */
5623 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5624 unsigned_char_dynarr *dst, unsigned int *flags,
5627 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5628 direction == CHARSET_LEFT_TO_RIGHT)
5629 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5630 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5631 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5632 direction == CHARSET_RIGHT_TO_LEFT)
5634 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5636 Dynarr_add (dst, ISO_CODE_ESC);
5637 Dynarr_add (dst, '[');
5639 else if (internal_p)
5640 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5642 Dynarr_add (dst, ISO_CODE_CSI);
5643 Dynarr_add (dst, '2');
5644 Dynarr_add (dst, ']');
5646 *flags |= CODING_STATE_R2L;
5650 /* Convert ISO2022-format data to internal format. */
5653 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5654 unsigned_char_dynarr *dst, Lstream_data_count n)
5656 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5657 unsigned int flags = str->flags;
5658 unsigned int cpos = str->cpos;
5659 unsigned char counter = str->counter;
5660 eol_type_t eol_type = str->eol_type;
5661 #ifdef ENABLE_COMPOSITE_CHARS
5662 unsigned_char_dynarr *real_dst = dst;
5664 Lisp_Object coding_system;
5666 XSETCODING_SYSTEM (coding_system, str->codesys);
5668 #ifdef ENABLE_COMPOSITE_CHARS
5669 if (flags & CODING_STATE_COMPOSITE)
5670 dst = str->iso2022.composite_chars;
5671 #endif /* ENABLE_COMPOSITE_CHARS */
5675 unsigned char c = *(unsigned char *)src++;
5676 if (flags & CODING_STATE_ESCAPE)
5677 { /* Within ESC sequence */
5678 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5683 switch (str->iso2022.esc)
5685 #ifdef ENABLE_COMPOSITE_CHARS
5686 case ISO_ESC_START_COMPOSITE:
5687 if (str->iso2022.composite_chars)
5688 Dynarr_reset (str->iso2022.composite_chars);
5690 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5691 dst = str->iso2022.composite_chars;
5693 case ISO_ESC_END_COMPOSITE:
5695 Bufbyte comstr[MAX_EMCHAR_LEN];
5697 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5698 Dynarr_length (dst));
5700 len = set_charptr_emchar (comstr, emch);
5701 Dynarr_add_many (dst, comstr, len);
5704 #endif /* ENABLE_COMPOSITE_CHARS */
5706 case ISO_ESC_LITERAL:
5707 COMPOSE_FLUSH_CHARS (str, dst);
5708 decode_flush_er_chars (str, dst);
5709 DECODE_ADD_BINARY_CHAR (c, dst);
5713 /* Everything else handled already */
5718 /* Attempted error recovery. */
5719 if (str->iso2022.output_direction_sequence)
5720 ensure_correct_direction (flags & CODING_STATE_R2L ?
5721 CHARSET_RIGHT_TO_LEFT :
5722 CHARSET_LEFT_TO_RIGHT,
5723 str->codesys, dst, 0, 1);
5724 /* More error recovery. */
5725 if (!retval || str->iso2022.output_literally)
5727 /* Output the (possibly invalid) sequence */
5729 COMPOSE_FLUSH_CHARS (str, dst);
5730 decode_flush_er_chars (str, dst);
5731 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5732 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5733 flags &= CODING_STATE_ISO2022_LOCK;
5735 n++, src--;/* Repeat the loop with the same character. */
5738 /* No sense in reprocessing the final byte of the
5739 escape sequence; it could mess things up anyway.
5741 COMPOSE_FLUSH_CHARS (str, dst);
5742 decode_flush_er_chars (str, dst);
5743 DECODE_ADD_BINARY_CHAR (c, dst);
5749 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5750 { /* Control characters */
5752 /***** Error-handling *****/
5754 /* If we were in the middle of a character, dump out the
5755 partial character. */
5758 COMPOSE_FLUSH_CHARS (str, dst);
5759 decode_flush_er_chars (str, dst);
5763 DECODE_ADD_BINARY_CHAR
5764 ((unsigned char)(cpos >> (counter * 8)), dst);
5769 /* If we just saw a single-shift character, dump it out.
5770 This may dump out the wrong sort of single-shift character,
5771 but least it will give an indication that something went
5773 if (flags & CODING_STATE_SS2)
5775 COMPOSE_FLUSH_CHARS (str, dst);
5776 decode_flush_er_chars (str, dst);
5777 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5778 flags &= ~CODING_STATE_SS2;
5780 if (flags & CODING_STATE_SS3)
5782 COMPOSE_FLUSH_CHARS (str, dst);
5783 decode_flush_er_chars (str, dst);
5784 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5785 flags &= ~CODING_STATE_SS3;
5788 /***** Now handle the control characters. *****/
5794 COMPOSE_FLUSH_CHARS (str, dst);
5795 decode_flush_er_chars (str, dst);
5796 if (eol_type == EOL_CR)
5797 Dynarr_add (dst, '\n');
5798 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5799 Dynarr_add (dst, c);
5801 flags |= CODING_STATE_CR;
5802 goto label_continue_loop;
5804 else if (flags & CODING_STATE_CR)
5805 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5807 Dynarr_add (dst, '\r');
5808 flags &= ~CODING_STATE_CR;
5811 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5814 flags &= CODING_STATE_ISO2022_LOCK;
5816 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5818 COMPOSE_FLUSH_CHARS (str, dst);
5819 decode_flush_er_chars (str, dst);
5820 DECODE_ADD_BINARY_CHAR (c, dst);
5824 { /* Graphic characters */
5825 Lisp_Object charset;
5834 COMPOSE_FLUSH_CHARS (str, dst);
5835 decode_flush_er_chars (str, dst);
5836 if (eol_type == EOL_CR)
5837 Dynarr_add (dst, '\n');
5838 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5839 Dynarr_add (dst, c);
5841 flags |= CODING_STATE_CR;
5842 goto label_continue_loop;
5844 else if (flags & CODING_STATE_CR)
5845 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5847 Dynarr_add (dst, '\r');
5848 flags &= ~CODING_STATE_CR;
5851 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5854 /* Now determine the charset. */
5855 reg = ((flags & CODING_STATE_SS2) ? 2
5856 : (flags & CODING_STATE_SS3) ? 3
5857 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5858 : str->iso2022.register_left);
5859 charset = str->iso2022.charset[reg];
5861 /* Error checking: */
5862 if (! CHARSETP (charset)
5863 || str->iso2022.invalid_designated[reg]
5864 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5865 && XCHARSET_CHARS (charset) == 94))
5866 /* Mrmph. We are trying to invoke a register that has no
5867 or an invalid charset in it, or trying to add a character
5868 outside the range of the charset. Insert that char literally
5869 to preserve it for the output. */
5871 COMPOSE_FLUSH_CHARS (str, dst);
5872 decode_flush_er_chars (str, dst);
5876 DECODE_ADD_BINARY_CHAR
5877 ((unsigned char)(cpos >> (counter * 8)), dst);
5880 DECODE_ADD_BINARY_CHAR (c, dst);
5885 /* Things are probably hunky-dorey. */
5887 /* Fetch reverse charset, maybe. */
5888 if (((flags & CODING_STATE_R2L) &&
5889 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5891 (!(flags & CODING_STATE_R2L) &&
5892 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5894 Lisp_Object new_charset =
5895 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5896 if (!NILP (new_charset))
5897 charset = new_charset;
5902 if (XCHARSET_DIMENSION (charset) == counter)
5904 COMPOSE_ADD_CHAR (str,
5905 DECODE_CHAR (charset,
5906 ((cpos & 0x7F7F7F) << 8)
5913 cpos = (cpos << 8) | c;
5915 lb = XCHARSET_LEADING_BYTE (charset);
5916 switch (XCHARSET_REP_BYTES (charset))
5919 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5920 Dynarr_add (dst, c & 0x7F);
5923 case 2: /* one-byte official */
5924 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5925 Dynarr_add (dst, lb);
5926 Dynarr_add (dst, c | 0x80);
5929 case 3: /* one-byte private or two-byte official */
5930 if (XCHARSET_PRIVATE_P (charset))
5932 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5933 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5934 Dynarr_add (dst, lb);
5935 Dynarr_add (dst, c | 0x80);
5941 Dynarr_add (dst, lb);
5942 Dynarr_add (dst, ch | 0x80);
5943 Dynarr_add (dst, c | 0x80);
5951 default: /* two-byte private */
5954 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5955 Dynarr_add (dst, lb);
5956 Dynarr_add (dst, ch | 0x80);
5957 Dynarr_add (dst, c | 0x80);
5967 flags &= CODING_STATE_ISO2022_LOCK;
5970 label_continue_loop:;
5973 if (flags & CODING_STATE_END)
5975 COMPOSE_FLUSH_CHARS (str, dst);
5976 decode_flush_er_chars (str, dst);
5977 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5981 str->counter = counter;
5985 /***** ISO2022 encoder *****/
5987 /* Designate CHARSET into register REG. */
5990 iso2022_designate (Lisp_Object charset, unsigned char reg,
5991 struct encoding_stream *str, unsigned_char_dynarr *dst)
5993 static const char inter94[] = "()*+";
5994 static const char inter96[] = ",-./";
5995 unsigned short chars;
5996 unsigned char dimension;
5997 unsigned char final;
5998 Lisp_Object old_charset = str->iso2022.charset[reg];
6000 str->iso2022.charset[reg] = charset;
6001 if (!CHARSETP (charset))
6002 /* charset might be an initial nil or t. */
6004 chars = XCHARSET_CHARS (charset);
6005 dimension = XCHARSET_DIMENSION (charset);
6006 final = XCHARSET_FINAL (charset);
6007 if (!str->iso2022.force_charset_on_output[reg] &&
6008 CHARSETP (old_charset) &&
6009 XCHARSET_CHARS (old_charset) == chars &&
6010 XCHARSET_DIMENSION (old_charset) == dimension &&
6011 XCHARSET_FINAL (old_charset) == final)
6014 str->iso2022.force_charset_on_output[reg] = 0;
6017 charset_conversion_spec_dynarr *dyn =
6018 str->codesys->iso2022.output_conv;
6024 for (i = 0; i < Dynarr_length (dyn); i++)
6026 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
6027 if (EQ (charset, spec->from_charset))
6028 charset = spec->to_charset;
6033 Dynarr_add (dst, ISO_CODE_ESC);
6038 Dynarr_add (dst, inter94[reg]);
6041 Dynarr_add (dst, '$');
6043 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
6046 Dynarr_add (dst, inter94[reg]);
6051 Dynarr_add (dst, inter96[reg]);
6054 Dynarr_add (dst, '$');
6055 Dynarr_add (dst, inter96[reg]);
6059 Dynarr_add (dst, final);
6063 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
6065 if (str->iso2022.register_left != 0)
6067 Dynarr_add (dst, ISO_CODE_SI);
6068 str->iso2022.register_left = 0;
6073 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
6075 if (str->iso2022.register_left != 1)
6077 Dynarr_add (dst, ISO_CODE_SO);
6078 str->iso2022.register_left = 1;
6083 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
6084 unsigned_char_dynarr *dst, unsigned int *flags)
6086 unsigned char charmask;
6087 Lisp_Coding_System* codesys = str->codesys;
6088 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6090 Lisp_Object charset = str->iso2022.current_charset;
6091 int half = str->iso2022.current_half;
6092 int code_point = -1;
6096 restore_left_to_right_direction (codesys, dst, flags, 0);
6098 /* Make sure G0 contains ASCII */
6099 if ((ch > ' ' && ch < ISO_CODE_DEL)
6100 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
6102 ensure_normal_shift (str, dst);
6103 iso2022_designate (Vcharset_ascii, 0, str, dst);
6106 /* If necessary, restore everything to the default state
6108 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
6110 restore_left_to_right_direction (codesys, dst, flags, 0);
6112 ensure_normal_shift (str, dst);
6114 for (i = 0; i < 4; i++)
6116 Lisp_Object initial_charset =
6117 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6118 iso2022_designate (initial_charset, i, str, dst);
6123 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6124 Dynarr_add (dst, '\r');
6125 if (eol_type != EOL_CR)
6126 Dynarr_add (dst, ch);
6130 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6131 && fit_to_be_escape_quoted (ch))
6132 Dynarr_add (dst, ISO_CODE_ESC);
6133 Dynarr_add (dst, ch);
6136 else if ( (0x80 <= ch) && (ch <= 0x9f) )
6138 charmask = (half == 0 ? 0x00 : 0x80);
6140 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6141 && fit_to_be_escape_quoted (ch))
6142 Dynarr_add (dst, ISO_CODE_ESC);
6143 /* you asked for it ... */
6144 Dynarr_add (dst, ch);
6150 /* Now determine which register to use. */
6152 for (i = 0; i < 4; i++)
6154 if ((CHARSETP (charset = str->iso2022.charset[i])
6155 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
6159 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
6160 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
6168 Lisp_Object original_default_coded_charset_priority_list
6169 = Vdefault_coded_charset_priority_list;
6170 Vdefault_coded_charset_priority_list
6171 = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
6172 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6174 code_point = ENCODE_CHAR (ch, charset);
6175 if (XCHARSET_FINAL (charset))
6177 Vdefault_coded_charset_priority_list
6178 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6179 Vdefault_coded_charset_priority_list));
6181 Vdefault_coded_charset_priority_list
6182 = original_default_coded_charset_priority_list;
6183 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6185 code_point = ENCODE_CHAR (ch, charset);
6186 if (XCHARSET_FINAL (charset))
6188 Vdefault_coded_charset_priority_list
6189 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6190 Vdefault_coded_charset_priority_list));
6192 code_point = ENCODE_CHAR (ch, charset);
6193 if (!XCHARSET_FINAL (charset))
6195 charset = Vcharset_ascii;
6199 Vdefault_coded_charset_priority_list
6200 = original_default_coded_charset_priority_list;
6202 ensure_correct_direction (XCHARSET_DIRECTION (charset),
6203 codesys, dst, flags, 0);
6207 if (XCHARSET_GRAPHIC (charset) != 0)
6209 if (!NILP (str->iso2022.charset[1]) &&
6210 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6211 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6213 else if (!NILP (str->iso2022.charset[2]))
6215 else if (!NILP (str->iso2022.charset[3]))
6224 iso2022_designate (charset, reg, str, dst);
6226 /* Now invoke that register. */
6230 ensure_normal_shift (str, dst);
6234 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6236 ensure_shift_out (str, dst);
6243 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6245 Dynarr_add (dst, ISO_CODE_ESC);
6246 Dynarr_add (dst, 'N');
6251 Dynarr_add (dst, ISO_CODE_SS2);
6256 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6258 Dynarr_add (dst, ISO_CODE_ESC);
6259 Dynarr_add (dst, 'O');
6264 Dynarr_add (dst, ISO_CODE_SS3);
6272 charmask = (half == 0 ? 0x00 : 0x80);
6274 switch (XCHARSET_DIMENSION (charset))
6277 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6280 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6281 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6284 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6285 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6286 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6289 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6290 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6291 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6292 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6298 str->iso2022.current_charset = charset;
6299 str->iso2022.current_half = half;
6303 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6304 unsigned int *flags)
6306 Lisp_Coding_System* codesys = str->codesys;
6309 restore_left_to_right_direction (codesys, dst, flags, 0);
6310 ensure_normal_shift (str, dst);
6311 for (i = 0; i < 4; i++)
6313 Lisp_Object initial_charset
6314 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6315 iso2022_designate (initial_charset, i, str, dst);
6320 /************************************************************************/
6321 /* No-conversion methods */
6322 /************************************************************************/
6324 /* This is used when reading in "binary" files -- i.e. files that may
6325 contain all 256 possible byte values and that are not to be
6326 interpreted as being in any particular decoding. */
6328 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6329 unsigned_char_dynarr *dst, Lstream_data_count n)
6331 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6332 unsigned int flags = str->flags;
6333 unsigned int cpos = str->cpos;
6334 eol_type_t eol_type = str->eol_type;
6338 unsigned char c = *(unsigned char *)src++;
6340 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6341 DECODE_ADD_BINARY_CHAR (c, dst);
6342 label_continue_loop:;
6345 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6352 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6353 unsigned_char_dynarr *dst, Lstream_data_count n)
6356 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6357 unsigned int flags = str->flags;
6358 unsigned int ch = str->ch;
6359 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6361 unsigned char char_boundary = str->iso2022.current_char_boundary;
6368 if (char_boundary == 0)
6374 else if ( c >= 0xf8 )
6379 else if ( c >= 0xf0 )
6384 else if ( c >= 0xe0 )
6389 else if ( c >= 0xc0 )
6399 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6400 Dynarr_add (dst, '\r');
6401 if (eol_type != EOL_CR)
6402 Dynarr_add (dst, c);
6405 Dynarr_add (dst, c);
6408 else if (char_boundary == 1)
6410 ch = ( ch << 6 ) | ( c & 0x3f );
6411 Dynarr_add (dst, ch & 0xff);
6416 ch = ( ch << 6 ) | ( c & 0x3f );
6419 #else /* not UTF2000 */
6422 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6423 Dynarr_add (dst, '\r');
6424 if (eol_type != EOL_CR)
6425 Dynarr_add (dst, '\n');
6428 else if (BYTE_ASCII_P (c))
6431 Dynarr_add (dst, c);
6433 else if (BUFBYTE_LEADING_BYTE_P (c))
6436 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6437 c == LEADING_BYTE_CONTROL_1)
6440 Dynarr_add (dst, '~'); /* untranslatable character */
6444 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6445 Dynarr_add (dst, c);
6446 else if (ch == LEADING_BYTE_CONTROL_1)
6449 Dynarr_add (dst, c - 0x20);
6451 /* else it should be the second or third byte of an
6452 untranslatable character, so ignore it */
6455 #endif /* not UTF2000 */
6461 str->iso2022.current_char_boundary = char_boundary;
6467 /************************************************************************/
6468 /* Initialization */
6469 /************************************************************************/
6472 syms_of_file_coding (void)
6474 INIT_LRECORD_IMPLEMENTATION (coding_system);
6476 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6478 DEFSUBR (Fcoding_system_p);
6479 DEFSUBR (Ffind_coding_system);
6480 DEFSUBR (Fget_coding_system);
6481 DEFSUBR (Fcoding_system_list);
6482 DEFSUBR (Fcoding_system_name);
6483 DEFSUBR (Fmake_coding_system);
6484 DEFSUBR (Fcopy_coding_system);
6485 DEFSUBR (Fcoding_system_canonical_name_p);
6486 DEFSUBR (Fcoding_system_alias_p);
6487 DEFSUBR (Fcoding_system_aliasee);
6488 DEFSUBR (Fdefine_coding_system_alias);
6489 DEFSUBR (Fsubsidiary_coding_system);
6491 DEFSUBR (Fcoding_system_type);
6492 DEFSUBR (Fcoding_system_doc_string);
6494 DEFSUBR (Fcoding_system_charset);
6496 DEFSUBR (Fcoding_system_property);
6498 DEFSUBR (Fcoding_category_list);
6499 DEFSUBR (Fset_coding_priority_list);
6500 DEFSUBR (Fcoding_priority_list);
6501 DEFSUBR (Fset_coding_category_system);
6502 DEFSUBR (Fcoding_category_system);
6504 DEFSUBR (Fdetect_coding_region);
6505 DEFSUBR (Fdecode_coding_region);
6506 DEFSUBR (Fencode_coding_region);
6508 DEFSUBR (Fdecode_shift_jis_char);
6509 DEFSUBR (Fencode_shift_jis_char);
6510 DEFSUBR (Fdecode_big5_char);
6511 DEFSUBR (Fencode_big5_char);
6513 defsymbol (&Qcoding_systemp, "coding-system-p");
6514 defsymbol (&Qno_conversion, "no-conversion");
6515 defsymbol (&Qraw_text, "raw-text");
6517 defsymbol (&Qbig5, "big5");
6518 defsymbol (&Qshift_jis, "shift-jis");
6519 defsymbol (&Qucs4, "ucs-4");
6520 defsymbol (&Qutf8, "utf-8");
6521 defsymbol (&Qutf16, "utf-16");
6522 defsymbol (&Qccl, "ccl");
6523 defsymbol (&Qiso2022, "iso2022");
6525 defsymbol (&Qmnemonic, "mnemonic");
6526 defsymbol (&Qeol_type, "eol-type");
6527 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6528 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6530 defsymbol (&Qcr, "cr");
6531 defsymbol (&Qlf, "lf");
6532 defsymbol (&Qcrlf, "crlf");
6533 defsymbol (&Qeol_cr, "eol-cr");
6534 defsymbol (&Qeol_lf, "eol-lf");
6535 defsymbol (&Qeol_crlf, "eol-crlf");
6537 defsymbol (&Qcharset_g0, "charset-g0");
6538 defsymbol (&Qcharset_g1, "charset-g1");
6539 defsymbol (&Qcharset_g2, "charset-g2");
6540 defsymbol (&Qcharset_g3, "charset-g3");
6541 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6542 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6543 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6544 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6545 defsymbol (&Qno_iso6429, "no-iso6429");
6546 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6547 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6549 defsymbol (&Qshort, "short");
6550 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6551 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6552 defsymbol (&Qseven, "seven");
6553 defsymbol (&Qlock_shift, "lock-shift");
6554 defsymbol (&Qescape_quoted, "escape-quoted");
6557 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6558 defsymbol (&Qdisable_composition, "disable-composition");
6559 defsymbol (&Qenable_decomposition, "enable-decomposition");
6560 defsymbol (&Qccs_priority_list, "ccs-priority-list");
6561 defsymbol (&Quse_entity_reference, "use-entity-reference");
6562 defsymbol (&Qd, "d");
6563 defsymbol (&Qx, "x");
6564 defsymbol (&QX, "X");
6566 defsymbol (&Qencode, "encode");
6567 defsymbol (&Qdecode, "decode");
6570 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6572 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6574 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6576 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6578 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6580 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6582 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6584 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6586 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6588 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6591 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6596 lstream_type_create_file_coding (void)
6598 LSTREAM_HAS_METHOD (decoding, reader);
6599 LSTREAM_HAS_METHOD (decoding, writer);
6600 LSTREAM_HAS_METHOD (decoding, rewinder);
6601 LSTREAM_HAS_METHOD (decoding, seekable_p);
6602 LSTREAM_HAS_METHOD (decoding, flusher);
6603 LSTREAM_HAS_METHOD (decoding, closer);
6604 LSTREAM_HAS_METHOD (decoding, marker);
6606 LSTREAM_HAS_METHOD (encoding, reader);
6607 LSTREAM_HAS_METHOD (encoding, writer);
6608 LSTREAM_HAS_METHOD (encoding, rewinder);
6609 LSTREAM_HAS_METHOD (encoding, seekable_p);
6610 LSTREAM_HAS_METHOD (encoding, flusher);
6611 LSTREAM_HAS_METHOD (encoding, closer);
6612 LSTREAM_HAS_METHOD (encoding, marker);
6616 vars_of_file_coding (void)
6620 fcd = xnew (struct file_coding_dump);
6621 dump_add_root_struct_ptr (&fcd, &fcd_description);
6623 /* Initialize to something reasonable ... */
6624 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6626 fcd->coding_category_system[i] = Qnil;
6627 fcd->coding_category_by_priority[i] = i;
6630 Fprovide (intern ("file-coding"));
6632 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6633 Coding system used for TTY keyboard input.
6634 Not used under a windowing system.
6636 Vkeyboard_coding_system = Qnil;
6638 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6639 Coding system used for TTY display output.
6640 Not used under a windowing system.
6642 Vterminal_coding_system = Qnil;
6644 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6645 Overriding coding system used when reading from a file or process.
6646 You should bind this variable with `let', but do not set it globally.
6647 If this is non-nil, it specifies the coding system that will be used
6648 to decode input on read operations, such as from a file or process.
6649 It overrides `buffer-file-coding-system-for-read',
6650 `insert-file-contents-pre-hook', etc. Use those variables instead of
6651 this one for permanent changes to the environment. */ );
6652 Vcoding_system_for_read = Qnil;
6654 DEFVAR_LISP ("coding-system-for-write",
6655 &Vcoding_system_for_write /*
6656 Overriding coding system used when writing to a file or process.
6657 You should bind this variable with `let', but do not set it globally.
6658 If this is non-nil, it specifies the coding system that will be used
6659 to encode output for write operations, such as to a file or process.
6660 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6661 Use those variables instead of this one for permanent changes to the
6663 Vcoding_system_for_write = Qnil;
6665 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6666 Coding system used to convert pathnames when accessing files.
6668 Vfile_name_coding_system = Qnil;
6670 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6671 &Vcoded_charset_entity_reference_alist /*
6672 Alist of coded-charset vs corresponding entity-reference.
6673 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6674 CCS is coded-charset.
6675 CODE-COLUMNS is columns of code-point of entity-reference.
6676 CODE-TYPE is format type of code-point of entity-reference.
6677 `d' means decimal value and `x' means hexadecimal value.
6679 Vcoded_charset_entity_reference_alist = Qnil;
6681 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6682 Non-nil means the buffer contents are regarded as multi-byte form
6683 of characters, not a binary code. This affects the display, file I/O,
6684 and behaviors of various editing commands.
6686 Setting this to nil does not do anything.
6688 enable_multibyte_characters = 1;
6691 DEFVAR_LISP ("decomposition-feature-list",
6692 &Vdecomposition_feature_list /*
6693 List of `=decomposition@FOO' feature to encode characters as IVS.
6695 Vdecomposition_feature_list = Qnil;
6700 complex_vars_of_file_coding (void)
6702 staticpro (&Vcoding_system_hash_table);
6703 Vcoding_system_hash_table =
6704 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6706 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6707 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6709 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6711 struct codesys_prop csp; \
6713 csp.prop_type = (Prop_Type); \
6714 Dynarr_add (the_codesys_prop_dynarr, csp); \
6717 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6718 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6719 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6720 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6721 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6722 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6723 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6725 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6726 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6727 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6728 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6729 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6730 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6731 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6732 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6733 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6734 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6735 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6736 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6737 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6738 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6739 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6740 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6741 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6743 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6746 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6747 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6749 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6750 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qenable_decomposition);
6751 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6754 /* Need to create this here or we're really screwed. */
6756 (Qraw_text, Qno_conversion,
6757 build_string ("Raw text, which means it converts only line-break-codes."),
6758 list2 (Qmnemonic, build_string ("Raw")));
6761 (Qbinary, Qno_conversion,
6762 build_string ("Binary, which means it does not convert anything."),
6763 list4 (Qeol_type, Qlf,
6764 Qmnemonic, build_string ("Binary")));
6770 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6771 list2 (Qmnemonic, build_string ("MTF8")));
6774 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6776 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6778 Fdefine_coding_system_alias (Qterminal, Qbinary);
6779 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6781 /* Need this for bootstrapping */
6782 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6783 Fget_coding_system (Qraw_text);
6786 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6787 = Fget_coding_system (Qutf_8_mcs);
6790 #if defined(MULE) && !defined(UTF2000)
6794 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6795 fcd->ucs_to_mule_table[i] = Qnil;
6797 staticpro (&mule_to_ucs_table);
6798 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6799 #endif /* defined(MULE) && !defined(UTF2000) */