1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2011,
5 2012, 2013 MORIOKA Tomohiko
7 This file is part of XEmacs.
9 XEmacs is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
14 XEmacs is distributed in the hope that it will be useful, but WITHOUT
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with XEmacs; see the file COPYING. If not, write to
21 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 Boston, MA 02111-1307, USA. */
24 /* Synched up with: Mule 2.3. Not in FSF. */
26 /* Rewritten by Ben Wing <ben@xemacs.org>. */
27 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
44 #include "file-coding.h"
46 Lisp_Object Qcoding_system_error;
48 Lisp_Object Vkeyboard_coding_system;
49 Lisp_Object Vterminal_coding_system;
50 Lisp_Object Vcoding_system_for_read;
51 Lisp_Object Vcoding_system_for_write;
52 Lisp_Object Vfile_name_coding_system;
54 Lisp_Object Vcoded_charset_entity_reference_alist;
56 /* Table of symbols identifying each coding category. */
57 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
61 struct file_coding_dump {
62 /* Coding system currently associated with each coding category. */
63 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
65 /* Table of all coding categories in decreasing order of priority.
66 This describes a permutation of the possible coding categories. */
67 int coding_category_by_priority[CODING_CATEGORY_LAST];
69 #if defined(MULE) && !defined(UTF2000)
70 Lisp_Object ucs_to_mule_table[65536];
74 static const struct lrecord_description fcd_description_1[] = {
75 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
76 #if defined(MULE) && !defined(UTF2000)
77 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
82 static const struct struct_description fcd_description = {
83 sizeof (struct file_coding_dump),
87 Lisp_Object mule_to_ucs_table;
89 Lisp_Object Qcoding_systemp;
91 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
92 /* Qinternal in general.c */
94 Lisp_Object Qmnemonic, Qeol_type;
95 Lisp_Object Qcr, Qcrlf, Qlf;
96 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
97 Lisp_Object Qpost_read_conversion;
98 Lisp_Object Qpre_write_conversion;
101 Lisp_Object Qucs4, Qutf16, Qutf8;
102 Lisp_Object Qbig5, Qshift_jis;
103 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
104 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
105 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
106 Lisp_Object Qno_iso6429;
107 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
108 Lisp_Object Qescape_quoted;
109 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
112 Lisp_Object Qutf_8_mcs;
113 Lisp_Object Qdisable_composition, Qenable_decomposition;
114 Lisp_Object Qccs_priority_list;
115 Lisp_Object Quse_entity_reference;
116 Lisp_Object Qd, Qx, QX;
117 Lisp_Object Vdecomposition_feature_list;
119 Lisp_Object Qencode, Qdecode;
121 Lisp_Object Vcoding_system_hash_table;
123 int enable_multibyte_characters;
126 /* Additional information used by the ISO2022 decoder and detector. */
127 struct iso2022_decoder
129 /* CHARSET holds the character sets currently assigned to the G0
130 through G3 variables. It is initialized from the array
131 INITIAL_CHARSET in CODESYS. */
132 Lisp_Object charset[4];
134 /* Which registers are currently invoked into the left (GL) and
135 right (GR) halves of the 8-bit encoding space? */
136 int register_left, register_right;
138 /* ISO_ESC holds a value indicating part of an escape sequence
139 that has already been seen. */
140 enum iso_esc_flag esc;
142 /* This records the bytes we've seen so far in an escape sequence,
143 in case the sequence is invalid (we spit out the bytes unchanged). */
144 unsigned char esc_bytes[8];
146 /* Index for next byte to store in ISO escape sequence. */
149 #ifdef ENABLE_COMPOSITE_CHARS
150 /* Stuff seen so far when composing a string. */
151 unsigned_char_dynarr *composite_chars;
154 /* If we saw an invalid designation sequence for a particular
155 register, we flag it here and switch to ASCII. The next time we
156 see a valid designation for this register, we turn off the flag
157 and do the designation normally, but pretend the sequence was
158 invalid. The effect of all this is that (most of the time) the
159 escape sequences for both the switch to the unknown charset, and
160 the switch back to the known charset, get inserted literally into
161 the buffer and saved out as such. The hope is that we can
162 preserve the escape sequences so that the resulting written out
163 file makes sense. If we don't do any of this, the designation
164 to the invalid charset will be preserved but that switch back
165 to the known charset will probably get eaten because it was
166 the same charset that was already present in the register. */
167 unsigned char invalid_designated[4];
169 /* We try to do similar things as above for direction-switching
170 sequences. If we encountered a direction switch while an
171 invalid designation was present, or an invalid designation
172 just after a direction switch (i.e. no valid designation
173 encountered yet), we insert the direction-switch escape
174 sequence literally into the output stream, and later on
175 insert the corresponding direction-restoring escape sequence
177 unsigned int switched_dir_and_no_valid_charset_yet :1;
178 unsigned int invalid_switch_dir :1;
180 /* Tells the decoder to output the escape sequence literally
181 even though it was valid. Used in the games we play to
182 avoid lossage when we encounter invalid designations. */
183 unsigned int output_literally :1;
184 /* We encountered a direction switch followed by an invalid
185 designation. We didn't output the direction switch
186 literally because we didn't know about the invalid designation;
187 but we have to do so now. */
188 unsigned int output_direction_sequence :1;
191 EXFUN (Fcopy_coding_system, 2);
193 struct detection_state;
196 text_encode_generic (Lstream *encoding, const Bufbyte *src,
197 unsigned_char_dynarr *dst, Lstream_data_count n);
199 static int detect_coding_sjis (struct detection_state *st,
200 const Extbyte *src, Lstream_data_count n);
201 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
202 unsigned_char_dynarr *dst, Lstream_data_count n);
203 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
204 unsigned_char_dynarr *dst, unsigned int *flags);
205 void char_finish_shift_jis (struct encoding_stream *str,
206 unsigned_char_dynarr *dst, unsigned int *flags);
208 static int detect_coding_big5 (struct detection_state *st,
209 const Extbyte *src, Lstream_data_count n);
210 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
211 unsigned_char_dynarr *dst, Lstream_data_count n);
212 void char_encode_big5 (struct encoding_stream *str, Emchar c,
213 unsigned_char_dynarr *dst, unsigned int *flags);
214 void char_finish_big5 (struct encoding_stream *str,
215 unsigned_char_dynarr *dst, unsigned int *flags);
217 static int detect_coding_ucs4 (struct detection_state *st,
218 const Extbyte *src, Lstream_data_count n);
219 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
220 unsigned_char_dynarr *dst, Lstream_data_count n);
221 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
222 unsigned_char_dynarr *dst, unsigned int *flags);
223 void char_finish_ucs4 (struct encoding_stream *str,
224 unsigned_char_dynarr *dst, unsigned int *flags);
226 static int detect_coding_utf16 (struct detection_state *st,
227 const Extbyte *src, Lstream_data_count n);
228 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
229 unsigned_char_dynarr *dst, Lstream_data_count n);
230 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
231 unsigned_char_dynarr *dst, unsigned int *flags);
232 void char_finish_utf16 (struct encoding_stream *str,
233 unsigned_char_dynarr *dst, unsigned int *flags);
235 static int detect_coding_utf8 (struct detection_state *st,
236 const Extbyte *src, Lstream_data_count n);
237 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
238 unsigned_char_dynarr *dst, Lstream_data_count n);
239 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
240 unsigned_char_dynarr *dst, unsigned int *flags);
241 void char_finish_utf8 (struct encoding_stream *str,
242 unsigned_char_dynarr *dst, unsigned int *flags);
244 static int postprocess_iso2022_mask (int mask);
245 static void reset_iso2022 (Lisp_Object coding_system,
246 struct iso2022_decoder *iso);
247 static int detect_coding_iso2022 (struct detection_state *st,
248 const Extbyte *src, Lstream_data_count n);
249 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
250 unsigned_char_dynarr *dst, Lstream_data_count n);
251 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
252 unsigned_char_dynarr *dst, unsigned int *flags);
253 void char_finish_iso2022 (struct encoding_stream *str,
254 unsigned_char_dynarr *dst, unsigned int *flags);
256 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
257 unsigned_char_dynarr *dst, Lstream_data_count n);
258 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
259 unsigned_char_dynarr *dst, Lstream_data_count n);
260 static void mule_decode (Lstream *decoding, const Extbyte *src,
261 unsigned_char_dynarr *dst, Lstream_data_count n);
262 static void mule_encode (Lstream *encoding, const Bufbyte *src,
263 unsigned_char_dynarr *dst, Lstream_data_count n);
265 typedef struct codesys_prop codesys_prop;
274 Dynarr_declare (codesys_prop);
275 } codesys_prop_dynarr;
277 static const struct lrecord_description codesys_prop_description_1[] = {
278 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
282 static const struct struct_description codesys_prop_description = {
283 sizeof (codesys_prop),
284 codesys_prop_description_1
287 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
288 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
292 static const struct struct_description codesys_prop_dynarr_description = {
293 sizeof (codesys_prop_dynarr),
294 codesys_prop_dynarr_description_1
297 codesys_prop_dynarr *the_codesys_prop_dynarr;
299 enum codesys_prop_enum
302 CODESYS_PROP_ISO2022,
307 /************************************************************************/
308 /* Coding system functions */
309 /************************************************************************/
311 static Lisp_Object mark_coding_system (Lisp_Object);
312 static void print_coding_system (Lisp_Object, Lisp_Object, int);
313 static void finalize_coding_system (void *header, int for_disksave);
316 static const struct lrecord_description ccs_description_1[] = {
317 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
318 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
322 static const struct struct_description ccs_description = {
323 sizeof (charset_conversion_spec),
327 static const struct lrecord_description ccsd_description_1[] = {
328 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
332 static const struct struct_description ccsd_description = {
333 sizeof (charset_conversion_spec_dynarr),
338 static const struct lrecord_description coding_system_description[] = {
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
341 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
342 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
343 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
344 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
345 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
346 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
348 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
349 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
350 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
351 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
352 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
354 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
360 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
361 mark_coding_system, print_coding_system,
362 finalize_coding_system,
363 0, 0, coding_system_description,
367 mark_coding_system (Lisp_Object obj)
369 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
371 mark_object (CODING_SYSTEM_NAME (codesys));
372 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
373 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
374 mark_object (CODING_SYSTEM_EOL_LF (codesys));
375 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
376 mark_object (CODING_SYSTEM_EOL_CR (codesys));
378 switch (CODING_SYSTEM_TYPE (codesys))
382 case CODESYS_ISO2022:
383 for (i = 0; i < 4; i++)
384 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
385 if (codesys->iso2022.input_conv)
387 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
389 struct charset_conversion_spec *ccs =
390 Dynarr_atp (codesys->iso2022.input_conv, i);
391 mark_object (ccs->from_charset);
392 mark_object (ccs->to_charset);
395 if (codesys->iso2022.output_conv)
397 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
399 struct charset_conversion_spec *ccs =
400 Dynarr_atp (codesys->iso2022.output_conv, i);
401 mark_object (ccs->from_charset);
402 mark_object (ccs->to_charset);
409 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
410 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
415 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
416 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
423 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
425 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
427 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
431 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
434 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
436 error ("printing unreadable object #<coding_system 0x%x>",
439 write_c_string ("#<coding_system ", printcharfun);
440 print_internal (c->name, printcharfun, 1);
441 write_c_string (">", printcharfun);
445 finalize_coding_system (void *header, int for_disksave)
447 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
448 /* Since coding systems never go away, this function is not
449 necessary. But it would be necessary if we changed things
450 so that coding systems could go away. */
451 if (!for_disksave) /* see comment in lstream.c */
453 switch (CODING_SYSTEM_TYPE (c))
456 case CODESYS_ISO2022:
457 if (c->iso2022.input_conv)
459 Dynarr_free (c->iso2022.input_conv);
460 c->iso2022.input_conv = 0;
462 if (c->iso2022.output_conv)
464 Dynarr_free (c->iso2022.output_conv);
465 c->iso2022.output_conv = 0;
476 symbol_to_eol_type (Lisp_Object symbol)
478 CHECK_SYMBOL (symbol);
479 if (NILP (symbol)) return EOL_AUTODETECT;
480 if (EQ (symbol, Qlf)) return EOL_LF;
481 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
482 if (EQ (symbol, Qcr)) return EOL_CR;
484 signal_simple_error ("Unrecognized eol type", symbol);
485 return EOL_AUTODETECT; /* not reached */
489 eol_type_to_symbol (eol_type_t type)
494 case EOL_LF: return Qlf;
495 case EOL_CRLF: return Qcrlf;
496 case EOL_CR: return Qcr;
497 case EOL_AUTODETECT: return Qnil;
502 setup_eol_coding_systems (Lisp_Coding_System *codesys)
504 Lisp_Object codesys_obj;
505 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
506 char *codesys_name = (char *) alloca (len + 7);
508 char *codesys_mnemonic=0;
510 Lisp_Object codesys_name_sym, sub_codesys_obj;
514 XSETCODING_SYSTEM (codesys_obj, codesys);
516 memcpy (codesys_name,
517 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
519 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
521 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
522 codesys_mnemonic = (char *) alloca (mlen + 7);
523 memcpy (codesys_mnemonic,
524 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
527 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
528 strcpy (codesys_name + len, "-" op_sys); \
530 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
531 codesys_name_sym = intern (codesys_name); \
532 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
533 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
535 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
536 build_string (codesys_mnemonic); \
537 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
540 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
541 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
542 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
545 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
546 Return t if OBJECT is a coding system.
547 A coding system is an object that defines how text containing multiple
548 character sets is encoded into a stream of (typically 8-bit) bytes.
549 The coding system is used to decode the stream into a series of
550 characters (which may be from multiple charsets) when the text is read
551 from a file or process, and is used to encode the text back into the
552 same format when it is written out to a file or process.
554 For example, many ISO2022-compliant coding systems (such as Compound
555 Text, which is used for inter-client data under the X Window System)
556 use escape sequences to switch between different charsets -- Japanese
557 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
558 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
559 `make-coding-system' for more information.
561 Coding systems are normally identified using a symbol, and the
562 symbol is accepted in place of the actual coding system object whenever
563 a coding system is called for. (This is similar to how faces work.)
567 return CODING_SYSTEMP (object) ? Qt : Qnil;
570 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
571 Retrieve the coding system of the given name.
573 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
574 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
575 If there is no such coding system, nil is returned. Otherwise the
576 associated coding system object is returned.
578 (coding_system_or_name))
580 if (NILP (coding_system_or_name))
581 coding_system_or_name = Qbinary;
582 else if (CODING_SYSTEMP (coding_system_or_name))
583 return coding_system_or_name;
585 CHECK_SYMBOL (coding_system_or_name);
589 coding_system_or_name =
590 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
592 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
593 return coding_system_or_name;
597 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
598 Retrieve the coding system of the given name.
599 Same as `find-coding-system' except that if there is no such
600 coding system, an error is signaled instead of returning nil.
604 Lisp_Object coding_system = Ffind_coding_system (name);
606 if (NILP (coding_system))
607 signal_simple_error ("No such coding system", name);
608 return coding_system;
611 /* We store the coding systems in hash tables with the names as the key and the
612 actual coding system object as the value. Occasionally we need to use them
613 in a list format. These routines provide us with that. */
614 struct coding_system_list_closure
616 Lisp_Object *coding_system_list;
620 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
621 void *coding_system_list_closure)
623 /* This function can GC */
624 struct coding_system_list_closure *cscl =
625 (struct coding_system_list_closure *) coding_system_list_closure;
626 Lisp_Object *coding_system_list = cscl->coding_system_list;
628 *coding_system_list = Fcons (key, *coding_system_list);
632 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
633 Return a list of the names of all defined coding systems.
637 Lisp_Object coding_system_list = Qnil;
639 struct coding_system_list_closure coding_system_list_closure;
641 GCPRO1 (coding_system_list);
642 coding_system_list_closure.coding_system_list = &coding_system_list;
643 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
644 &coding_system_list_closure);
647 return coding_system_list;
650 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
651 Return the name of the given coding system.
655 coding_system = Fget_coding_system (coding_system);
656 return XCODING_SYSTEM_NAME (coding_system);
659 static Lisp_Coding_System *
660 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
662 Lisp_Coding_System *codesys =
663 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
665 zero_lcrecord (codesys);
666 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
667 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
668 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
669 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
670 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
671 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
672 CODING_SYSTEM_TYPE (codesys) = type;
673 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
676 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
678 if (type == CODESYS_ISO2022)
681 for (i = 0; i < 4; i++)
682 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
685 if (type == CODESYS_UTF8)
687 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
689 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
691 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
693 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
696 else if (type == CODESYS_BIG5)
698 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
700 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
701 = Vcharset_chinese_big5;
702 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
704 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
708 else if (type == CODESYS_CCL)
710 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
711 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
714 CODING_SYSTEM_NAME (codesys) = name;
720 /* Given a list of charset conversion specs as specified in a Lisp
721 program, parse it into STORE_HERE. */
724 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
725 Lisp_Object spec_list)
729 EXTERNAL_LIST_LOOP (rest, spec_list)
731 Lisp_Object car = XCAR (rest);
732 Lisp_Object from, to;
733 struct charset_conversion_spec spec;
735 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
736 signal_simple_error ("Invalid charset conversion spec", car);
737 from = Fget_charset (XCAR (car));
738 to = Fget_charset (XCAR (XCDR (car)));
739 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
740 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
741 signal_simple_error_2
742 ("Attempted conversion between different charset types",
744 spec.from_charset = from;
745 spec.to_charset = to;
747 Dynarr_add (store_here, spec);
751 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
752 specs, return the equivalent as the Lisp programmer would see it.
754 If LOAD_HERE is 0, return Qnil. */
757 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
764 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
766 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
767 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
770 return Fnreverse (result);
775 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
776 Register symbol NAME as a coding system.
778 TYPE describes the conversion method used and should be one of
781 Automatic conversion. XEmacs attempts to detect the coding system
784 No conversion. Use this for binary files and such. On output,
785 graphic characters that are not in ASCII or Latin-1 will be
786 replaced by a ?. (For a no-conversion-encoded buffer, these
787 characters will only be present if you explicitly insert them.)
789 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
791 ISO 10646 UCS-4 encoding.
793 ISO 10646 UTF-8 encoding.
795 Any ISO2022-compliant encoding. Among other things, this includes
796 JIS (the Japanese encoding commonly used for e-mail), EUC (the
797 standard Unix encoding for Japanese and other languages), and
798 Compound Text (the encoding used in X11). You can specify more
799 specific information about the conversion with the PROPS argument.
801 Big5 (the encoding commonly used for Taiwanese).
803 The conversion is performed using a user-written pseudo-code
804 program. CCL (Code Conversion Language) is the name of this
807 Write out or read in the raw contents of the memory representing
808 the buffer's text. This is primarily useful for debugging
809 purposes, and is only enabled when XEmacs has been compiled with
810 DEBUG_XEMACS defined (via the --debug configure option).
811 WARNING: Reading in a file using 'internal conversion can result
812 in an internal inconsistency in the memory representing a
813 buffer's text, which will produce unpredictable results and may
814 cause XEmacs to crash. Under normal circumstances you should
815 never use 'internal conversion.
817 DOC-STRING is a string describing the coding system.
819 PROPS is a property list, describing the specific nature of the
820 character set. Recognized properties are:
823 String to be displayed in the modeline when this coding system is
827 End-of-line conversion to be used. It should be one of
830 Automatically detect the end-of-line type (LF, CRLF,
831 or CR). Also generate subsidiary coding systems named
832 `NAME-unix', `NAME-dos', and `NAME-mac', that are
833 identical to this coding system but have an EOL-TYPE
834 value of 'lf, 'crlf, and 'cr, respectively.
836 The end of a line is marked externally using ASCII LF.
837 Since this is also the way that XEmacs represents an
838 end-of-line internally, specifying this option results
839 in no end-of-line conversion. This is the standard
840 format for Unix text files.
842 The end of a line is marked externally using ASCII
843 CRLF. This is the standard format for MS-DOS text
846 The end of a line is marked externally using ASCII CR.
847 This is the standard format for Macintosh text files.
849 Automatically detect the end-of-line type but do not
850 generate subsidiary coding systems. (This value is
851 converted to nil when stored internally, and
852 `coding-system-property' will return nil.)
855 If non-nil, composition for combining characters is disabled.
857 'enable-decomposition
858 If non-nil, decomposition for combining characters is enabled.
860 'use-entity-reference
861 If non-nil, SGML style entity-reference is used for non-system-characters.
863 'post-read-conversion
864 Function called after a file has been read in, to perform the
865 decoding. Called with two arguments, START and END, denoting
866 a region of the current buffer to be decoded.
868 'pre-write-conversion
869 Function called before a file is written out, to perform the
870 encoding. Called with two arguments, START and END, denoting
871 a region of the current buffer to be encoded.
874 The following additional properties are recognized if TYPE is 'iso2022:
880 The character set initially designated to the G0 - G3 registers.
881 The value should be one of
883 -- A charset object (designate that character set)
884 -- nil (do not ever use this register)
885 -- t (no character set is initially designated to
886 the register, but may be later on; this automatically
887 sets the corresponding `force-g*-on-output' property)
893 If non-nil, send an explicit designation sequence on output before
894 using the specified register.
897 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
898 "ESC $ B" on output in place of the full designation sequences
899 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
902 If non-nil, don't designate ASCII to G0 at each end of line on output.
903 Setting this to non-nil also suppresses other state-resetting that
904 normally happens at the end of a line.
907 If non-nil, don't designate ASCII to G0 before control chars on output.
910 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
914 If non-nil, use locking-shift (SO/SI) instead of single-shift
915 or designation by escape sequence.
918 If non-nil, don't use ISO6429's direction specification.
921 If non-nil, literal control characters that are the same as
922 the beginning of a recognized ISO2022 or ISO6429 escape sequence
923 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
924 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
925 so that they can be properly distinguished from an escape sequence.
926 (Note that doing this results in a non-portable encoding.) This
927 encoding flag is used for byte-compiled files. Note that ESC
928 is a good choice for a quoting character because there are no
929 escape sequences whose second byte is a character from the Control-0
930 or Control-1 character sets; this is explicitly disallowed by the
933 'input-charset-conversion
934 A list of conversion specifications, specifying conversion of
935 characters in one charset to another when decoding is performed.
936 Each specification is a list of two elements: the source charset,
937 and the destination charset.
939 'output-charset-conversion
940 A list of conversion specifications, specifying conversion of
941 characters in one charset to another when encoding is performed.
942 The form of each specification is the same as for
943 'input-charset-conversion.
946 The following additional properties are recognized (and required)
950 CCL program used for decoding (converting to internal format).
953 CCL program used for encoding (converting to external format).
955 (name, type, doc_string, props))
957 Lisp_Coding_System *codesys;
958 enum coding_system_type ty;
959 int need_to_setup_eol_systems = 1;
961 /* Convert type to constant */
962 if (NILP (type) || EQ (type, Qundecided))
963 { ty = CODESYS_AUTODETECT; }
965 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
966 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
967 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
968 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
969 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
970 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
971 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
973 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
975 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
978 signal_simple_error ("Invalid coding system type", type);
982 codesys = allocate_coding_system (ty, name);
984 if (NILP (doc_string))
985 doc_string = build_string ("");
987 CHECK_STRING (doc_string);
988 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
991 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
993 if (EQ (key, Qmnemonic))
996 CHECK_STRING (value);
997 CODING_SYSTEM_MNEMONIC (codesys) = value;
1000 else if (EQ (key, Qeol_type))
1002 need_to_setup_eol_systems = NILP (value);
1005 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1008 else if (EQ (key, Qpost_read_conversion))
1009 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1010 else if (EQ (key, Qpre_write_conversion))
1011 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1013 else if (EQ (key, Qdisable_composition))
1014 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1015 else if (EQ (key, Qenable_decomposition))
1016 CODING_SYSTEM_ENABLE_DECOMPOSITION (codesys) = !NILP (value);
1017 else if (EQ (key, Quse_entity_reference))
1018 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1021 else if (ty == CODESYS_ISO2022)
1023 #define FROB_INITIAL_CHARSET(charset_num) \
1024 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1025 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1027 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1028 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1029 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1030 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1032 #define FROB_FORCE_CHARSET(charset_num) \
1033 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1035 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1036 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1037 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1038 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1040 #define FROB_BOOLEAN_PROPERTY(prop) \
1041 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1043 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1044 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1045 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1046 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1047 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1048 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1049 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1051 else if (EQ (key, Qinput_charset_conversion))
1053 codesys->iso2022.input_conv =
1054 Dynarr_new (charset_conversion_spec);
1055 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1058 else if (EQ (key, Qoutput_charset_conversion))
1060 codesys->iso2022.output_conv =
1061 Dynarr_new (charset_conversion_spec);
1062 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1066 else if (EQ (key, Qccs_priority_list))
1068 codesys->ccs_priority_list = value;
1072 signal_simple_error ("Unrecognized property", key);
1075 else if (ty == CODESYS_UTF8)
1077 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1078 else if (EQ (key, Qcharset_g1))
1079 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1080 else if (EQ (key, Qcharset_g2))
1081 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1083 signal_simple_error ("Unrecognized property", key);
1085 else if (ty == CODESYS_BIG5)
1087 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1088 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1090 signal_simple_error ("Unrecognized property", key);
1093 else if (EQ (type, Qccl))
1096 struct ccl_program test_ccl;
1099 /* Check key first. */
1100 if (EQ (key, Qdecode))
1101 suffix = "-ccl-decode";
1102 else if (EQ (key, Qencode))
1103 suffix = "-ccl-encode";
1105 signal_simple_error ("Unrecognized property", key);
1107 /* If value is vector, register it as a ccl program
1108 associated with an newly created symbol for
1109 backward compatibility. */
1110 if (VECTORP (value))
1112 sym = Fintern (concat2 (Fsymbol_name (name),
1113 build_string (suffix)),
1115 Fregister_ccl_program (sym, value);
1119 CHECK_SYMBOL (value);
1122 /* check if the given ccl programs are valid. */
1123 if (setup_ccl_program (&test_ccl, sym) < 0)
1124 signal_simple_error ("Invalid CCL program", value);
1126 if (EQ (key, Qdecode))
1127 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1128 else if (EQ (key, Qencode))
1129 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1134 signal_simple_error ("Unrecognized property", key);
1138 if (need_to_setup_eol_systems)
1139 setup_eol_coding_systems (codesys);
1142 Lisp_Object codesys_obj;
1143 XSETCODING_SYSTEM (codesys_obj, codesys);
1144 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1149 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1150 Copy OLD-CODING-SYSTEM to NEW-NAME.
1151 If NEW-NAME does not name an existing coding system, a new one will
1154 (old_coding_system, new_name))
1156 Lisp_Object new_coding_system;
1157 old_coding_system = Fget_coding_system (old_coding_system);
1158 new_coding_system = Ffind_coding_system (new_name);
1159 if (NILP (new_coding_system))
1161 XSETCODING_SYSTEM (new_coding_system,
1162 allocate_coding_system
1163 (XCODING_SYSTEM_TYPE (old_coding_system),
1165 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1169 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1170 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1171 memcpy (((char *) to ) + sizeof (to->header),
1172 ((char *) from) + sizeof (from->header),
1173 sizeof (*from) - sizeof (from->header));
1174 to->name = new_name;
1176 return new_coding_system;
1179 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1180 Return t if OBJECT names a coding system, and is not a coding system alias.
1184 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1188 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1189 Return t if OBJECT is a coding system alias.
1190 All coding system aliases are created by `define-coding-system-alias'.
1194 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1198 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1199 Return the coding-system symbol for which symbol ALIAS is an alias.
1203 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1204 if (SYMBOLP (aliasee))
1207 signal_simple_error ("Symbol is not a coding system alias", alias);
1208 return Qnil; /* To keep the compiler happy */
1212 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1214 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1218 /* A maphash function, for removing dangling coding system aliases. */
1220 dangling_coding_system_alias_p (Lisp_Object alias,
1221 Lisp_Object aliasee,
1222 void *dangling_aliases)
1224 if (SYMBOLP (aliasee)
1225 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1227 (*(int *) dangling_aliases)++;
1234 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1235 Define symbol ALIAS as an alias for coding system ALIASEE.
1237 You can use this function to redefine an alias that has already been defined,
1238 but you cannot redefine a name which is the canonical name for a coding system.
1239 \(a canonical name of a coding system is what is returned when you call
1240 `coding-system-name' on a coding system).
1242 ALIASEE itself can be an alias, which allows you to define nested aliases.
1244 You are forbidden, however, from creating alias loops or `dangling' aliases.
1245 These will be detected, and an error will be signaled if you attempt to do so.
1247 If ALIASEE is nil, then ALIAS will simply be undefined.
1249 See also `coding-system-alias-p', `coding-system-aliasee',
1250 and `coding-system-canonical-name-p'.
1254 Lisp_Object real_coding_system, probe;
1256 CHECK_SYMBOL (alias);
1258 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1260 ("Symbol is the canonical name of a coding system and cannot be redefined",
1265 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1266 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1267 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1269 Fremhash (alias, Vcoding_system_hash_table);
1271 /* Undefine subsidiary aliases,
1272 presumably created by a previous call to this function */
1273 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1274 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1275 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1277 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1278 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1279 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1282 /* Undefine dangling coding system aliases. */
1284 int dangling_aliases;
1287 dangling_aliases = 0;
1288 elisp_map_remhash (dangling_coding_system_alias_p,
1289 Vcoding_system_hash_table,
1291 } while (dangling_aliases > 0);
1297 if (CODING_SYSTEMP (aliasee))
1298 aliasee = XCODING_SYSTEM_NAME (aliasee);
1300 /* Checks that aliasee names a coding-system */
1301 real_coding_system = Fget_coding_system (aliasee);
1303 /* Check for coding system alias loops */
1304 if (EQ (alias, aliasee))
1305 alias_loop: signal_simple_error_2
1306 ("Attempt to create a coding system alias loop", alias, aliasee);
1308 for (probe = aliasee;
1310 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1312 if (EQ (probe, alias))
1316 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1318 /* Set up aliases for subsidiaries.
1319 #### There must be a better way to handle subsidiary coding systems. */
1321 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1323 for (i = 0; i < countof (suffixes); i++)
1325 Lisp_Object alias_subsidiary =
1326 append_suffix_to_symbol (alias, suffixes[i]);
1327 Lisp_Object aliasee_subsidiary =
1328 append_suffix_to_symbol (aliasee, suffixes[i]);
1330 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1331 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1334 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1335 but it doesn't look intentional, so I'd rather return something
1336 meaningful or nothing at all. */
1341 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1343 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1344 Lisp_Object new_coding_system;
1346 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1347 return coding_system;
1351 case EOL_AUTODETECT: return coding_system;
1352 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1353 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1354 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1355 default: abort (); return Qnil;
1358 return NILP (new_coding_system) ? coding_system : new_coding_system;
1361 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1362 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1364 (coding_system, eol_type))
1366 coding_system = Fget_coding_system (coding_system);
1368 return subsidiary_coding_system (coding_system,
1369 symbol_to_eol_type (eol_type));
1373 /************************************************************************/
1374 /* Coding system accessors */
1375 /************************************************************************/
1377 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1378 Return the doc string for CODING-SYSTEM.
1382 coding_system = Fget_coding_system (coding_system);
1383 return XCODING_SYSTEM_DOC_STRING (coding_system);
1386 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1387 Return the type of CODING-SYSTEM.
1391 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1394 case CODESYS_AUTODETECT: return Qundecided;
1396 case CODESYS_SHIFT_JIS: return Qshift_jis;
1397 case CODESYS_ISO2022: return Qiso2022;
1398 case CODESYS_BIG5: return Qbig5;
1399 case CODESYS_UCS4: return Qucs4;
1400 case CODESYS_UTF16: return Qutf16;
1401 case CODESYS_UTF8: return Qutf8;
1402 case CODESYS_CCL: return Qccl;
1404 case CODESYS_NO_CONVERSION: return Qno_conversion;
1406 case CODESYS_INTERNAL: return Qinternal;
1413 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1416 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1418 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1421 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1422 Return initial charset of CODING-SYSTEM designated to GNUM.
1425 (coding_system, gnum))
1427 coding_system = Fget_coding_system (coding_system);
1430 return coding_system_charset (coding_system, XINT (gnum));
1434 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1435 Return the PROP property of CODING-SYSTEM.
1437 (coding_system, prop))
1440 enum coding_system_type type;
1442 coding_system = Fget_coding_system (coding_system);
1443 CHECK_SYMBOL (prop);
1444 type = XCODING_SYSTEM_TYPE (coding_system);
1446 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1447 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1450 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1452 case CODESYS_PROP_ALL_OK:
1455 case CODESYS_PROP_ISO2022:
1456 if (type != CODESYS_ISO2022)
1458 ("Property only valid in ISO2022 coding systems",
1462 case CODESYS_PROP_CCL:
1463 if (type != CODESYS_CCL)
1465 ("Property only valid in CCL coding systems",
1475 signal_simple_error ("Unrecognized property", prop);
1477 if (EQ (prop, Qname))
1478 return XCODING_SYSTEM_NAME (coding_system);
1479 else if (EQ (prop, Qtype))
1480 return Fcoding_system_type (coding_system);
1481 else if (EQ (prop, Qdoc_string))
1482 return XCODING_SYSTEM_DOC_STRING (coding_system);
1483 else if (EQ (prop, Qmnemonic))
1484 return XCODING_SYSTEM_MNEMONIC (coding_system);
1485 else if (EQ (prop, Qeol_type))
1486 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1487 else if (EQ (prop, Qeol_lf))
1488 return XCODING_SYSTEM_EOL_LF (coding_system);
1489 else if (EQ (prop, Qeol_crlf))
1490 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1491 else if (EQ (prop, Qeol_cr))
1492 return XCODING_SYSTEM_EOL_CR (coding_system);
1493 else if (EQ (prop, Qpost_read_conversion))
1494 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1495 else if (EQ (prop, Qpre_write_conversion))
1496 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1499 else if (EQ (prop, Qdisable_composition))
1500 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1501 else if (EQ (prop, Qenable_decomposition))
1502 return XCODING_SYSTEM_ENABLE_DECOMPOSITION (coding_system) ? Qt : Qnil;
1503 else if (EQ (prop, Quse_entity_reference))
1504 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1505 else if (EQ (prop, Qccs_priority_list))
1506 return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1508 else if (type == CODESYS_ISO2022)
1510 if (EQ (prop, Qcharset_g0))
1511 return coding_system_charset (coding_system, 0);
1512 else if (EQ (prop, Qcharset_g1))
1513 return coding_system_charset (coding_system, 1);
1514 else if (EQ (prop, Qcharset_g2))
1515 return coding_system_charset (coding_system, 2);
1516 else if (EQ (prop, Qcharset_g3))
1517 return coding_system_charset (coding_system, 3);
1519 #define FORCE_CHARSET(charset_num) \
1520 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1521 (coding_system, charset_num) ? Qt : Qnil)
1523 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1524 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1525 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1526 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1528 #define LISP_BOOLEAN(prop) \
1529 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1531 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1532 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1533 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1534 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1535 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1536 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1537 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1539 else if (EQ (prop, Qinput_charset_conversion))
1541 unparse_charset_conversion_specs
1542 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1543 else if (EQ (prop, Qoutput_charset_conversion))
1545 unparse_charset_conversion_specs
1546 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1550 else if (type == CODESYS_CCL)
1552 if (EQ (prop, Qdecode))
1553 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1554 else if (EQ (prop, Qencode))
1555 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1563 return Qnil; /* not reached */
1567 /************************************************************************/
1568 /* Coding category functions */
1569 /************************************************************************/
1572 decode_coding_category (Lisp_Object symbol)
1576 CHECK_SYMBOL (symbol);
1577 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1578 if (EQ (coding_category_symbol[i], symbol))
1581 signal_simple_error ("Unrecognized coding category", symbol);
1582 return 0; /* not reached */
1585 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1586 Return a list of all recognized coding categories.
1591 Lisp_Object list = Qnil;
1593 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1594 list = Fcons (coding_category_symbol[i], list);
1598 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1599 Change the priority order of the coding categories.
1600 LIST should be list of coding categories, in descending order of
1601 priority. Unspecified coding categories will be lower in priority
1602 than all specified ones, in the same relative order they were in
1607 int category_to_priority[CODING_CATEGORY_LAST];
1611 /* First generate a list that maps coding categories to priorities. */
1613 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1614 category_to_priority[i] = -1;
1616 /* Highest priority comes from the specified list. */
1618 EXTERNAL_LIST_LOOP (rest, list)
1620 int cat = decode_coding_category (XCAR (rest));
1622 if (category_to_priority[cat] >= 0)
1623 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1624 category_to_priority[cat] = i++;
1627 /* Now go through the existing categories by priority to retrieve
1628 the categories not yet specified and preserve their priority
1630 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1632 int cat = fcd->coding_category_by_priority[j];
1633 if (category_to_priority[cat] < 0)
1634 category_to_priority[cat] = i++;
1637 /* Now we need to construct the inverse of the mapping we just
1640 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1641 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1643 /* Phew! That was confusing. */
1647 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1648 Return a list of coding categories in descending order of priority.
1653 Lisp_Object list = Qnil;
1655 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1656 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1661 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1662 Change the coding system associated with a coding category.
1664 (coding_category, coding_system))
1666 int cat = decode_coding_category (coding_category);
1668 coding_system = Fget_coding_system (coding_system);
1669 fcd->coding_category_system[cat] = coding_system;
1673 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1674 Return the coding system associated with a coding category.
1678 int cat = decode_coding_category (coding_category);
1679 Lisp_Object sys = fcd->coding_category_system[cat];
1682 return XCODING_SYSTEM_NAME (sys);
1687 /************************************************************************/
1688 /* Detecting the encoding of data */
1689 /************************************************************************/
1691 struct detection_state
1693 eol_type_t eol_type;
1736 struct iso2022_decoder iso;
1738 int high_byte_count;
1739 unsigned int saw_single_shift:1;
1752 acceptable_control_char_p (int c)
1756 /* Allow and ignore control characters that you might
1757 reasonably see in a text file */
1762 case 8: /* backspace */
1763 case 11: /* vertical tab */
1764 case 12: /* form feed */
1765 case 26: /* MS-DOS C-z junk */
1766 case 31: /* '^_' -- for info */
1774 mask_has_at_most_one_bit_p (int mask)
1776 /* Perhaps the only thing useful you learn from intensive Microsoft
1777 technical interviews */
1778 return (mask & (mask - 1)) == 0;
1782 detect_eol_type (struct detection_state *st, const Extbyte *src,
1783 Lstream_data_count n)
1787 unsigned char c = *(unsigned char *)src++;
1790 if (st->eol.just_saw_cr)
1792 else if (st->eol.seen_anything)
1795 else if (st->eol.just_saw_cr)
1798 st->eol.just_saw_cr = 1;
1800 st->eol.just_saw_cr = 0;
1801 st->eol.seen_anything = 1;
1804 return EOL_AUTODETECT;
1807 /* Attempt to determine the encoding and EOL type of the given text.
1808 Before calling this function for the first type, you must initialize
1809 st->eol_type as appropriate and initialize st->mask to ~0.
1811 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1814 st->mask holds the determined coding category mask, or ~0 if only
1815 ASCII has been seen so far.
1819 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1820 is present in st->mask
1821 1 == definitive answers are here for both st->eol_type and st->mask
1825 detect_coding_type (struct detection_state *st, const Extbyte *src,
1826 Lstream_data_count n, int just_do_eol)
1828 if (st->eol_type == EOL_AUTODETECT)
1829 st->eol_type = detect_eol_type (st, src, n);
1832 return st->eol_type != EOL_AUTODETECT;
1834 if (!st->seen_non_ascii)
1836 for (; n; n--, src++)
1838 unsigned char c = *(unsigned char *) src;
1839 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1841 st->seen_non_ascii = 1;
1843 st->shift_jis.mask = ~0;
1846 st->utf16.mask = ~0;
1848 st->iso2022.mask = ~0;
1858 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1859 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1860 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1861 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1862 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1863 st->big5.mask = detect_coding_big5 (st, src, n);
1864 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1865 st->utf8.mask = detect_coding_utf8 (st, src, n);
1866 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1867 st->utf16.mask = detect_coding_utf16 (st, src, n);
1868 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1869 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1872 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1873 | st->utf8.mask | st->ucs4.mask;
1876 int retval = mask_has_at_most_one_bit_p (st->mask);
1877 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1878 return retval && st->eol_type != EOL_AUTODETECT;
1883 coding_system_from_mask (int mask)
1887 /* If the file was entirely or basically ASCII, use the
1888 default value of `buffer-file-coding-system'. */
1889 Lisp_Object retval =
1890 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1893 retval = Ffind_coding_system (retval);
1897 (Qbad_variable, Qwarning,
1898 "Invalid `default-buffer-file-coding-system', set to nil");
1899 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1903 retval = Fget_coding_system (Qraw_text);
1911 mask = postprocess_iso2022_mask (mask);
1913 /* Look through the coding categories by priority and find
1914 the first one that is allowed. */
1915 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1917 cat = fcd->coding_category_by_priority[i];
1918 if ((mask & (1 << cat)) &&
1919 !NILP (fcd->coding_category_system[cat]))
1923 return fcd->coding_category_system[cat];
1925 return Fget_coding_system (Qraw_text);
1929 /* Given a seekable read stream and potential coding system and EOL type
1930 as specified, do any autodetection that is called for. If the
1931 coding system and/or EOL type are not `autodetect', they will be left
1932 alone; but this function will never return an autodetect coding system
1935 This function does not automatically fetch subsidiary coding systems;
1936 that should be unnecessary with the explicit eol-type argument. */
1938 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1939 /* number of leading lines to check for a coding cookie */
1940 #define LINES_TO_CHECK 2
1943 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1944 eol_type_t *eol_type_in_out)
1946 struct detection_state decst;
1948 if (*eol_type_in_out == EOL_AUTODETECT)
1949 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1952 decst.eol_type = *eol_type_in_out;
1955 /* If autodetection is called for, do it now. */
1956 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1957 || *eol_type_in_out == EOL_AUTODETECT)
1960 Lisp_Object coding_system = Qnil;
1962 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1964 int lines_checked = 0;
1966 /* Look for initial "-*-"; mode line prefix */
1968 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1970 && lines_checked < LINES_TO_CHECK;
1972 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1974 Extbyte *local_vars_beg = p + 3;
1975 /* Look for final "-*-"; mode line suffix */
1976 for (p = local_vars_beg,
1977 scan_end = buf + nread - LENGTH ("-*-");
1979 && lines_checked < LINES_TO_CHECK;
1981 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1983 Extbyte *suffix = p;
1984 /* Look for "coding:" */
1985 for (p = local_vars_beg,
1986 scan_end = suffix - LENGTH ("coding:?");
1989 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1990 && (p == local_vars_beg
1991 || (*(p-1) == ' ' ||
1997 p += LENGTH ("coding:");
1998 while (*p == ' ' || *p == '\t') p++;
2000 /* Get coding system name */
2001 save = *suffix; *suffix = '\0';
2002 /* Characters valid in a MIME charset name (rfc 1521),
2003 and in a Lisp symbol name. */
2004 n = strspn ( (char *) p,
2005 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
2006 "abcdefghijklmnopqrstuvwxyz"
2012 save = p[n]; p[n] = '\0';
2014 Ffind_coding_system (intern ((char *) p));
2021 /* #### file must use standard EOLs or we miss 2d line */
2022 /* #### not to mention this is broken for UTF-16 DOS files */
2023 else if (*p == '\n' || *p == '\r')
2026 /* skip past multibyte (DOS) newline */
2027 if (*p == '\r' && *(p+1) == '\n') p++;
2031 /* #### file must use standard EOLs or we miss 2d line */
2032 /* #### not to mention this is broken for UTF-16 DOS files */
2033 else if (*p == '\n' || *p == '\r')
2036 /* skip past multibyte (DOS) newline */
2037 if (*p == '\r' && *(p+1) == '\n') p++;
2040 if (NILP (coding_system))
2043 if (detect_coding_type (&decst, buf, nread,
2044 XCODING_SYSTEM_TYPE (*codesys_in_out)
2045 != CODESYS_AUTODETECT))
2047 nread = Lstream_read (stream, buf, sizeof (buf));
2053 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2054 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2057 if (detect_coding_type (&decst, buf, nread, 1))
2059 nread = Lstream_read (stream, buf, sizeof (buf));
2065 *eol_type_in_out = decst.eol_type;
2066 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2068 if (NILP (coding_system))
2069 *codesys_in_out = coding_system_from_mask (decst.mask);
2071 *codesys_in_out = coding_system;
2075 /* If we absolutely can't determine the EOL type, just assume LF. */
2076 if (*eol_type_in_out == EOL_AUTODETECT)
2077 *eol_type_in_out = EOL_LF;
2079 Lstream_rewind (stream);
2082 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2083 Detect coding system of the text in the region between START and END.
2084 Return a list of possible coding systems ordered by priority.
2085 If only ASCII characters are found, return 'undecided or one of
2086 its subsidiary coding systems according to a detected end-of-line
2087 type. Optional arg BUFFER defaults to the current buffer.
2089 (start, end, buffer))
2091 Lisp_Object val = Qnil;
2092 struct buffer *buf = decode_buffer (buffer, 0);
2094 Lisp_Object instream, lb_instream;
2095 Lstream *istr, *lb_istr;
2096 struct detection_state decst;
2097 struct gcpro gcpro1, gcpro2;
2099 get_buffer_range_char (buf, start, end, &b, &e, 0);
2100 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2101 lb_istr = XLSTREAM (lb_instream);
2102 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2103 istr = XLSTREAM (instream);
2104 GCPRO2 (instream, lb_instream);
2106 decst.eol_type = EOL_AUTODETECT;
2110 Extbyte random_buffer[4096];
2111 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2115 if (detect_coding_type (&decst, random_buffer, nread, 0))
2119 if (decst.mask == ~0)
2120 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2128 decst.mask = postprocess_iso2022_mask (decst.mask);
2130 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2132 int sys = fcd->coding_category_by_priority[i];
2133 if (decst.mask & (1 << sys))
2135 Lisp_Object codesys = fcd->coding_category_system[sys];
2136 if (!NILP (codesys))
2137 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2138 val = Fcons (codesys, val);
2142 Lstream_close (istr);
2144 Lstream_delete (istr);
2145 Lstream_delete (lb_istr);
2150 /************************************************************************/
2151 /* Converting to internal Mule format ("decoding") */
2152 /************************************************************************/
2154 /* A decoding stream is a stream used for decoding text (i.e.
2155 converting from some external format to internal format).
2156 The decoding-stream object keeps track of the actual coding
2157 stream, the stream that is at the other end, and data that
2158 needs to be persistent across the lifetime of the stream. */
2160 /* Handle the EOL stuff related to just-read-in character C.
2161 EOL_TYPE is the EOL type of the coding stream.
2162 FLAGS is the current value of FLAGS in the coding stream, and may
2163 be modified by this macro. (The macro only looks at the
2164 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2165 bytes are to be written. You need to also define a local goto
2166 label "label_continue_loop" that is at the end of the main
2167 character-reading loop.
2169 If C is a CR character, then this macro handles it entirely and
2170 jumps to label_continue_loop. Otherwise, this macro does not add
2171 anything to DST, and continues normally. You should continue
2172 processing C normally after this macro. */
2174 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2178 if (eol_type == EOL_CR) \
2179 Dynarr_add (dst, '\n'); \
2180 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2181 Dynarr_add (dst, c); \
2183 flags |= CODING_STATE_CR; \
2184 goto label_continue_loop; \
2186 else if (flags & CODING_STATE_CR) \
2187 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2189 Dynarr_add (dst, '\r'); \
2190 flags &= ~CODING_STATE_CR; \
2194 /* C should be a binary character in the range 0 - 255; convert
2195 to internal format and add to Dynarr DST. */
2198 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2200 if (BYTE_ASCII_P (c)) \
2201 Dynarr_add (dst, c); \
2204 Dynarr_add (dst, (c >> 6) | 0xc0); \
2205 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2209 static void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2211 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2215 Dynarr_add (dst, c);
2217 else if ( c <= 0x7ff )
2219 Dynarr_add (dst, (c >> 6) | 0xc0);
2220 Dynarr_add (dst, (c & 0x3f) | 0x80);
2222 else if ( c <= 0xffff )
2224 Dynarr_add (dst, (c >> 12) | 0xe0);
2225 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2226 Dynarr_add (dst, (c & 0x3f) | 0x80);
2228 else if ( c <= 0x1fffff )
2230 Dynarr_add (dst, (c >> 18) | 0xf0);
2231 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2232 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2233 Dynarr_add (dst, (c & 0x3f) | 0x80);
2235 else if ( c <= 0x3ffffff )
2237 Dynarr_add (dst, (c >> 24) | 0xf8);
2238 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2239 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2240 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2241 Dynarr_add (dst, (c & 0x3f) | 0x80);
2245 Dynarr_add (dst, (c >> 30) | 0xfc);
2246 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2247 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2248 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2249 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2250 Dynarr_add (dst, (c & 0x3f) | 0x80);
2254 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2256 if (BYTE_ASCII_P (c)) \
2257 Dynarr_add (dst, c); \
2258 else if (BYTE_C1_P (c)) \
2260 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2261 Dynarr_add (dst, c + 0x20); \
2265 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2266 Dynarr_add (dst, c); \
2271 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2275 DECODE_ADD_BINARY_CHAR (ch, dst); \
2280 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2282 if (flags & CODING_STATE_END) \
2284 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2285 if (flags & CODING_STATE_CR) \
2286 Dynarr_add (dst, '\r'); \
2290 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2292 #define ER_BUF_SIZE 24
2294 struct decoding_stream
2296 /* Coding system that governs the conversion. */
2297 Lisp_Coding_System *codesys;
2299 /* Stream that we read the encoded data from or
2300 write the decoded data to. */
2303 /* If we are reading, then we can return only a fixed amount of
2304 data, so if the conversion resulted in too much data, we store it
2305 here for retrieval the next time around. */
2306 unsigned_char_dynarr *runoff;
2308 /* FLAGS holds flags indicating the current state of the decoding.
2309 Some of these flags are dependent on the coding system. */
2312 /* CPOS holds a partially built-up code-point of character. */
2315 /* EOL_TYPE specifies the type of end-of-line conversion that
2316 currently applies. We need to keep this separate from the
2317 EOL type stored in CODESYS because the latter might indicate
2318 automatic EOL-type detection while the former will always
2319 indicate a particular EOL type. */
2320 eol_type_t eol_type;
2322 /* Additional ISO2022 information. We define the structure above
2323 because it's also needed by the detection routines. */
2324 struct iso2022_decoder iso2022;
2326 /* Additional information (the state of the running CCL program)
2327 used by the CCL decoder. */
2328 struct ccl_program ccl;
2330 /* counter for UTF-8 or UCS-4 */
2331 unsigned char counter;
2335 unsigned char er_counter;
2336 unsigned char er_buf[ER_BUF_SIZE];
2338 unsigned combined_char_count;
2339 Emchar combined_chars[16];
2340 #ifdef HAVE_LIBCHISE
2341 COS_object combining_table;
2343 Lisp_Object combining_table;
2344 #endif /* HAVE_LIBCHISE */
2346 struct detection_state decst;
2349 static Lstream_data_count decoding_reader (Lstream *stream,
2350 unsigned char *data, Lstream_data_count size);
2351 static Lstream_data_count decoding_writer (Lstream *stream,
2352 const unsigned char *data, Lstream_data_count size);
2353 static int decoding_rewinder (Lstream *stream);
2354 static int decoding_seekable_p (Lstream *stream);
2355 static int decoding_flusher (Lstream *stream);
2356 static int decoding_closer (Lstream *stream);
2358 static Lisp_Object decoding_marker (Lisp_Object stream);
2360 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2361 sizeof (struct decoding_stream));
2364 decoding_marker (Lisp_Object stream)
2366 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2367 Lisp_Object str_obj;
2369 /* We do not need to mark the coding systems or charsets stored
2370 within the stream because they are stored in a global list
2371 and automatically marked. */
2373 XSETLSTREAM (str_obj, str);
2374 mark_object (str_obj);
2375 if (str->imp->marker)
2376 return (str->imp->marker) (str_obj);
2381 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2382 so we read data from the other end, decode it, and store it into DATA. */
2384 static Lstream_data_count
2385 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2387 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2388 unsigned char *orig_data = data;
2389 Lstream_data_count read_size;
2390 int error_occurred = 0;
2392 /* We need to interface to mule_decode(), which expects to take some
2393 amount of data and store the result into a Dynarr. We have
2394 mule_decode() store into str->runoff, and take data from there
2397 /* We loop until we have enough data, reading chunks from the other
2398 end and decoding it. */
2401 /* Take data from the runoff if we can. Make sure to take at
2402 most SIZE bytes, and delete the data from the runoff. */
2403 if (Dynarr_length (str->runoff) > 0)
2405 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2406 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2407 Dynarr_delete_many (str->runoff, 0, chunk);
2413 break; /* No more room for data */
2415 if (str->flags & CODING_STATE_END)
2416 /* This means that on the previous iteration, we hit the EOF on
2417 the other end. We loop once more so that mule_decode() can
2418 output any final stuff it may be holding, or any "go back
2419 to a sane state" escape sequences. (This latter makes sense
2420 during encoding.) */
2423 /* Exhausted the runoff, so get some more. DATA has at least
2424 SIZE bytes left of storage in it, so it's OK to read directly
2425 into it. (We'll be overwriting above, after we've decoded it
2426 into the runoff.) */
2427 read_size = Lstream_read (str->other_end, data, size);
2434 /* There might be some more end data produced in the translation.
2435 See the comment above. */
2436 str->flags |= CODING_STATE_END;
2437 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2440 if (data - orig_data == 0)
2441 return error_occurred ? -1 : 0;
2443 return data - orig_data;
2446 static Lstream_data_count
2447 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2449 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2450 Lstream_data_count retval;
2452 /* Decode all our data into the runoff, and then attempt to write
2453 it all out to the other end. Remove whatever chunk we succeeded
2455 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2456 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2457 Dynarr_length (str->runoff));
2459 Dynarr_delete_many (str->runoff, 0, retval);
2460 /* Do NOT return retval. The return value indicates how much
2461 of the incoming data was written, not how many bytes were
2467 reset_decoding_stream (struct decoding_stream *str)
2470 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2472 Lisp_Object coding_system;
2473 XSETCODING_SYSTEM (coding_system, str->codesys);
2474 reset_iso2022 (coding_system, &str->iso2022);
2476 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2478 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2484 str->er_counter = 0;
2485 str->combined_char_count = 0;
2486 #ifdef HAVE_LIBCHISE
2487 str->combining_table = COS_NIL;
2489 str->combining_table = Qnil;
2490 #endif /* HAVE_LIBCHISE */
2492 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2493 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2496 str->decst.eol_type = EOL_AUTODETECT;
2497 str->decst.mask = ~0;
2499 str->flags = str->cpos = 0;
2503 decoding_rewinder (Lstream *stream)
2505 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2506 reset_decoding_stream (str);
2507 Dynarr_reset (str->runoff);
2508 return Lstream_rewind (str->other_end);
2512 decoding_seekable_p (Lstream *stream)
2514 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2515 return Lstream_seekable_p (str->other_end);
2519 decoding_flusher (Lstream *stream)
2521 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2522 return Lstream_flush (str->other_end);
2526 decoding_closer (Lstream *stream)
2528 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2529 if (stream->flags & LSTREAM_FL_WRITE)
2531 str->flags |= CODING_STATE_END;
2532 decoding_writer (stream, 0, 0);
2534 Dynarr_free (str->runoff);
2536 #ifdef ENABLE_COMPOSITE_CHARS
2537 if (str->iso2022.composite_chars)
2538 Dynarr_free (str->iso2022.composite_chars);
2541 return Lstream_close (str->other_end);
2545 decoding_stream_coding_system (Lstream *stream)
2547 Lisp_Object coding_system;
2548 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2550 XSETCODING_SYSTEM (coding_system, str->codesys);
2551 return subsidiary_coding_system (coding_system, str->eol_type);
2555 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2557 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2558 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2560 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2561 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2562 reset_decoding_stream (str);
2565 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2566 stream for writing, no automatic code detection will be performed.
2567 The reason for this is that automatic code detection requires a
2568 seekable input. Things will also fail if you open a decoding
2569 stream for reading using a non-fully-specified coding system and
2570 a non-seekable input stream. */
2573 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2576 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2577 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2581 str->other_end = stream;
2582 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2583 str->eol_type = EOL_AUTODETECT;
2584 if (!strcmp (mode, "r")
2585 && Lstream_seekable_p (stream))
2586 /* We can determine the coding system now. */
2587 determine_real_coding_system (stream, &codesys, &str->eol_type);
2588 set_decoding_stream_coding_system (lstr, codesys);
2589 str->decst.eol_type = str->eol_type;
2590 str->decst.mask = ~0;
2591 XSETLSTREAM (obj, lstr);
2596 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2598 return make_decoding_stream_1 (stream, codesys, "r");
2602 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2604 return make_decoding_stream_1 (stream, codesys, "w");
2607 /* Note: the decode_coding_* functions all take the same
2608 arguments as mule_decode(), which is to say some SRC data of
2609 size N, which is to be stored into dynamic array DST.
2610 DECODING is the stream within which the decoding is
2611 taking place, but no data is actually read from or
2612 written to that stream; that is handled in decoding_reader()
2613 or decoding_writer(). This allows the same functions to
2614 be used for both reading and writing. */
2617 mule_decode (Lstream *decoding, const Extbyte *src,
2618 unsigned_char_dynarr *dst, Lstream_data_count n)
2620 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2622 /* If necessary, do encoding-detection now. We do this when
2623 we're a writing stream or a non-seekable reading stream,
2624 meaning that we can't just process the whole input,
2625 rewind, and start over. */
2627 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2628 str->eol_type == EOL_AUTODETECT)
2630 Lisp_Object codesys;
2632 XSETCODING_SYSTEM (codesys, str->codesys);
2633 detect_coding_type (&str->decst, src, n,
2634 CODING_SYSTEM_TYPE (str->codesys) !=
2635 CODESYS_AUTODETECT);
2636 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2637 str->decst.mask != ~0)
2638 /* #### This is cheesy. What we really ought to do is
2639 buffer up a certain amount of data so as to get a
2640 less random result. */
2641 codesys = coding_system_from_mask (str->decst.mask);
2642 str->eol_type = str->decst.eol_type;
2643 if (XCODING_SYSTEM (codesys) != str->codesys)
2645 /* Preserve the CODING_STATE_END flag in case it was set.
2646 If we erase it, bad things might happen. */
2647 int was_end = str->flags & CODING_STATE_END;
2648 set_decoding_stream_coding_system (decoding, codesys);
2650 str->flags |= CODING_STATE_END;
2654 switch (CODING_SYSTEM_TYPE (str->codesys))
2657 case CODESYS_INTERNAL:
2658 Dynarr_add_many (dst, src, n);
2661 case CODESYS_AUTODETECT:
2662 /* If we got this far and still haven't decided on the coding
2663 system, then do no conversion. */
2664 case CODESYS_NO_CONVERSION:
2665 decode_coding_no_conversion (decoding, src, dst, n);
2668 case CODESYS_SHIFT_JIS:
2669 decode_coding_sjis (decoding, src, dst, n);
2672 decode_coding_big5 (decoding, src, dst, n);
2675 decode_coding_ucs4 (decoding, src, dst, n);
2678 decode_coding_utf16 (decoding, src, dst, n);
2681 decode_coding_utf8 (decoding, src, dst, n);
2684 str->ccl.last_block = str->flags & CODING_STATE_END;
2685 /* When applying ccl program to stream, MUST NOT set NULL
2687 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2688 dst, n, 0, CCL_MODE_DECODING);
2690 case CODESYS_ISO2022:
2691 decode_coding_iso2022 (decoding, src, dst, n);
2699 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2700 Decode the text between START and END which is encoded in CODING-SYSTEM.
2701 This is useful if you've read in encoded text from a file without decoding
2702 it (e.g. you read in a JIS-formatted file but used the `binary' or
2703 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2704 Return length of decoded text.
2705 BUFFER defaults to the current buffer if unspecified.
2707 (start, end, coding_system, buffer))
2710 struct buffer *buf = decode_buffer (buffer, 0);
2711 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2712 Lstream *istr, *ostr;
2713 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2715 get_buffer_range_char (buf, start, end, &b, &e, 0);
2717 barf_if_buffer_read_only (buf, b, e);
2719 coding_system = Fget_coding_system (coding_system);
2720 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2721 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2722 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2724 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2725 Fget_coding_system (Qbinary));
2726 istr = XLSTREAM (instream);
2727 ostr = XLSTREAM (outstream);
2728 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2730 /* The chain of streams looks like this:
2732 [BUFFER] <----- send through
2733 ------> [ENCODE AS BINARY]
2734 ------> [DECODE AS SPECIFIED]
2740 char tempbuf[1024]; /* some random amount */
2741 Bufpos newpos, even_newer_pos;
2742 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2743 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2747 newpos = lisp_buffer_stream_startpos (istr);
2748 Lstream_write (ostr, tempbuf, size_in_bytes);
2749 even_newer_pos = lisp_buffer_stream_startpos (istr);
2750 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2753 Lstream_close (istr);
2754 Lstream_close (ostr);
2756 Lstream_delete (istr);
2757 Lstream_delete (ostr);
2758 Lstream_delete (XLSTREAM (de_outstream));
2759 Lstream_delete (XLSTREAM (lb_outstream));
2764 /************************************************************************/
2765 /* Converting to an external encoding ("encoding") */
2766 /************************************************************************/
2768 /* An encoding stream is an output stream. When you create the
2769 stream, you specify the coding system that governs the encoding
2770 and another stream that the resulting encoded data is to be
2771 sent to, and then start sending data to it. */
2773 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2775 struct encoding_stream
2777 /* Coding system that governs the conversion. */
2778 Lisp_Coding_System *codesys;
2780 /* Stream that we read the encoded data from or
2781 write the decoded data to. */
2784 /* If we are reading, then we can return only a fixed amount of
2785 data, so if the conversion resulted in too much data, we store it
2786 here for retrieval the next time around. */
2787 unsigned_char_dynarr *runoff;
2789 /* FLAGS holds flags indicating the current state of the encoding.
2790 Some of these flags are dependent on the coding system. */
2793 /* CH holds a partially built-up character. Since we only deal
2794 with one- and two-byte characters at the moment, we only use
2795 this to store the first byte of a two-byte character. */
2798 /* Additional information used by the ISO2022 encoder. */
2801 /* CHARSET holds the character sets currently assigned to the G0
2802 through G3 registers. It is initialized from the array
2803 INITIAL_CHARSET in CODESYS. */
2804 Lisp_Object charset[4];
2806 /* Which registers are currently invoked into the left (GL) and
2807 right (GR) halves of the 8-bit encoding space? */
2808 int register_left, register_right;
2810 /* Whether we need to explicitly designate the charset in the
2811 G? register before using it. It is initialized from the
2812 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2813 unsigned char force_charset_on_output[4];
2815 /* Other state variables that need to be preserved across
2817 Lisp_Object current_charset;
2819 int current_char_boundary;
2822 void (*encode_char) (struct encoding_stream *str, Emchar c,
2823 unsigned_char_dynarr *dst, unsigned int *flags);
2824 void (*finish) (struct encoding_stream *str,
2825 unsigned_char_dynarr *dst, unsigned int *flags);
2827 /* Additional information (the state of the running CCL program)
2828 used by the CCL encoder. */
2829 struct ccl_program ccl;
2833 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2834 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2835 Lstream_data_count size);
2836 static int encoding_rewinder (Lstream *stream);
2837 static int encoding_seekable_p (Lstream *stream);
2838 static int encoding_flusher (Lstream *stream);
2839 static int encoding_closer (Lstream *stream);
2841 static Lisp_Object encoding_marker (Lisp_Object stream);
2843 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2844 sizeof (struct encoding_stream));
2847 encoding_marker (Lisp_Object stream)
2849 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2850 Lisp_Object str_obj;
2852 /* We do not need to mark the coding systems or charsets stored
2853 within the stream because they are stored in a global list
2854 and automatically marked. */
2856 XSETLSTREAM (str_obj, str);
2857 mark_object (str_obj);
2858 if (str->imp->marker)
2859 return (str->imp->marker) (str_obj);
2864 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2865 so we read data from the other end, encode it, and store it into DATA. */
2867 static Lstream_data_count
2868 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2870 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2871 unsigned char *orig_data = data;
2872 Lstream_data_count read_size;
2873 int error_occurred = 0;
2875 /* We need to interface to mule_encode(), which expects to take some
2876 amount of data and store the result into a Dynarr. We have
2877 mule_encode() store into str->runoff, and take data from there
2880 /* We loop until we have enough data, reading chunks from the other
2881 end and encoding it. */
2884 /* Take data from the runoff if we can. Make sure to take at
2885 most SIZE bytes, and delete the data from the runoff. */
2886 if (Dynarr_length (str->runoff) > 0)
2888 int chunk = min ((int) size, Dynarr_length (str->runoff));
2889 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2890 Dynarr_delete_many (str->runoff, 0, chunk);
2896 break; /* No more room for data */
2898 if (str->flags & CODING_STATE_END)
2899 /* This means that on the previous iteration, we hit the EOF on
2900 the other end. We loop once more so that mule_encode() can
2901 output any final stuff it may be holding, or any "go back
2902 to a sane state" escape sequences. (This latter makes sense
2903 during encoding.) */
2906 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2907 left of storage in it, so it's OK to read directly into it.
2908 (We'll be overwriting above, after we've encoded it into the
2910 read_size = Lstream_read (str->other_end, data, size);
2917 /* There might be some more end data produced in the translation.
2918 See the comment above. */
2919 str->flags |= CODING_STATE_END;
2920 mule_encode (stream, data, str->runoff, read_size);
2923 if (data == orig_data)
2924 return error_occurred ? -1 : 0;
2926 return data - orig_data;
2929 static Lstream_data_count
2930 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2932 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2933 Lstream_data_count retval;
2935 /* Encode all our data into the runoff, and then attempt to write
2936 it all out to the other end. Remove whatever chunk we succeeded
2938 mule_encode (stream, data, str->runoff, size);
2939 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2940 Dynarr_length (str->runoff));
2942 Dynarr_delete_many (str->runoff, 0, retval);
2943 /* Do NOT return retval. The return value indicates how much
2944 of the incoming data was written, not how many bytes were
2950 reset_encoding_stream (struct encoding_stream *str)
2953 switch (CODING_SYSTEM_TYPE (str->codesys))
2955 case CODESYS_ISO2022:
2959 str->encode_char = &char_encode_iso2022;
2960 str->finish = &char_finish_iso2022;
2961 for (i = 0; i < 4; i++)
2963 str->iso2022.charset[i] =
2964 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2965 str->iso2022.force_charset_on_output[i] =
2966 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2968 str->iso2022.register_left = 0;
2969 str->iso2022.register_right = 1;
2970 str->iso2022.current_charset = Qnil;
2971 str->iso2022.current_half = 0;
2975 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2978 str->encode_char = &char_encode_utf8;
2979 str->finish = &char_finish_utf8;
2982 str->encode_char = &char_encode_utf16;
2983 str->finish = &char_finish_utf16;
2986 str->encode_char = &char_encode_ucs4;
2987 str->finish = &char_finish_ucs4;
2989 case CODESYS_SHIFT_JIS:
2990 str->encode_char = &char_encode_shift_jis;
2991 str->finish = &char_finish_shift_jis;
2994 str->encode_char = &char_encode_big5;
2995 str->finish = &char_finish_big5;
3001 str->iso2022.current_char_boundary = 0;
3002 str->flags = str->ch = 0;
3006 encoding_rewinder (Lstream *stream)
3008 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3009 reset_encoding_stream (str);
3010 Dynarr_reset (str->runoff);
3011 return Lstream_rewind (str->other_end);
3015 encoding_seekable_p (Lstream *stream)
3017 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3018 return Lstream_seekable_p (str->other_end);
3022 encoding_flusher (Lstream *stream)
3024 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3025 return Lstream_flush (str->other_end);
3029 encoding_closer (Lstream *stream)
3031 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3032 if (stream->flags & LSTREAM_FL_WRITE)
3034 str->flags |= CODING_STATE_END;
3035 encoding_writer (stream, 0, 0);
3037 Dynarr_free (str->runoff);
3038 return Lstream_close (str->other_end);
3042 encoding_stream_coding_system (Lstream *stream)
3044 Lisp_Object coding_system;
3045 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3047 XSETCODING_SYSTEM (coding_system, str->codesys);
3048 return coding_system;
3052 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3054 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3055 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3057 reset_encoding_stream (str);
3061 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3064 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3065 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3069 str->runoff = Dynarr_new (unsigned_char);
3070 str->other_end = stream;
3071 set_encoding_stream_coding_system (lstr, codesys);
3072 XSETLSTREAM (obj, lstr);
3077 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3079 return make_encoding_stream_1 (stream, codesys, "r");
3083 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3085 return make_encoding_stream_1 (stream, codesys, "w");
3088 /* Convert N bytes of internally-formatted data stored in SRC to an
3089 external format, according to the encoding stream ENCODING.
3090 Store the encoded data into DST. */
3093 mule_encode (Lstream *encoding, const Bufbyte *src,
3094 unsigned_char_dynarr *dst, Lstream_data_count n)
3096 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3098 switch (CODING_SYSTEM_TYPE (str->codesys))
3101 case CODESYS_INTERNAL:
3102 Dynarr_add_many (dst, src, n);
3105 case CODESYS_AUTODETECT:
3106 /* If we got this far and still haven't decided on the coding
3107 system, then do no conversion. */
3108 case CODESYS_NO_CONVERSION:
3109 encode_coding_no_conversion (encoding, src, dst, n);
3113 str->ccl.last_block = str->flags & CODING_STATE_END;
3114 /* When applying ccl program to stream, MUST NOT set NULL
3116 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3117 dst, n, 0, CCL_MODE_ENCODING);
3121 text_encode_generic (encoding, src, dst, n);
3125 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3126 Encode the text between START and END using CODING-SYSTEM.
3127 This will, for example, convert Japanese characters into stuff such as
3128 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3129 text. BUFFER defaults to the current buffer if unspecified.
3131 (start, end, coding_system, buffer))
3134 struct buffer *buf = decode_buffer (buffer, 0);
3135 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3136 Lstream *istr, *ostr;
3137 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3139 get_buffer_range_char (buf, start, end, &b, &e, 0);
3141 barf_if_buffer_read_only (buf, b, e);
3143 coding_system = Fget_coding_system (coding_system);
3144 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3145 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3146 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3147 Fget_coding_system (Qbinary));
3148 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3150 istr = XLSTREAM (instream);
3151 ostr = XLSTREAM (outstream);
3152 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3153 /* The chain of streams looks like this:
3155 [BUFFER] <----- send through
3156 ------> [ENCODE AS SPECIFIED]
3157 ------> [DECODE AS BINARY]
3162 char tempbuf[1024]; /* some random amount */
3163 Bufpos newpos, even_newer_pos;
3164 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3165 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3169 newpos = lisp_buffer_stream_startpos (istr);
3170 Lstream_write (ostr, tempbuf, size_in_bytes);
3171 even_newer_pos = lisp_buffer_stream_startpos (istr);
3172 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3178 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3179 Lstream_close (istr);
3180 Lstream_close (ostr);
3182 Lstream_delete (istr);
3183 Lstream_delete (ostr);
3184 Lstream_delete (XLSTREAM (de_outstream));
3185 Lstream_delete (XLSTREAM (lb_outstream));
3186 return make_int (retlen);
3193 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3194 unsigned_char_dynarr *dst, Lstream_data_count n)
3197 unsigned char char_boundary;
3198 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3199 unsigned int flags = str->flags;
3200 Emchar ch = str->ch;
3202 char_boundary = str->iso2022.current_char_boundary;
3208 if (char_boundary == 0)
3236 (*str->encode_char) (str, c, dst, &flags);
3238 else if (char_boundary == 1)
3240 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3246 ch = (ch << 6) | (c & 0x3f);
3251 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3253 (*str->finish) (str, dst, &flags);
3258 str->iso2022.current_char_boundary = char_boundary;
3263 /************************************************************************/
3264 /* entity reference */
3265 /************************************************************************/
3268 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3270 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3272 if ( str->er_counter > 0)
3274 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3275 str->er_counter = 0;
3279 EXFUN (Fregexp_quote, 1);
3281 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3282 unsigned_char_dynarr* dst);
3284 decode_add_er_char (struct decoding_stream *str, Emchar c,
3285 unsigned_char_dynarr* dst)
3287 if (str->er_counter == 0)
3289 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3292 str->er_buf[0] = '&';
3296 DECODE_ADD_UCS_CHAR (c, dst);
3300 Lisp_Object string = make_string (str->er_buf,
3307 Lisp_Object char_type;
3310 for ( rest = Vcoded_charset_entity_reference_alist;
3311 !NILP (rest); rest = Fcdr (rest) )
3317 char_type = XCDR (ccs);
3322 if (NILP (ccs = Ffind_charset (ccs)))
3331 pat = Fregexp_quote (pat);
3338 pat = concat3 (build_string ("^&"),
3339 pat, build_string ("\\([0-9]+\\)$"));
3342 else if (EQ (ret, Qx))
3344 pat = concat3 (build_string ("^&"),
3345 pat, build_string ("\\([0-9a-f]+\\)$"));
3348 else if (EQ (ret, QX))
3350 pat = concat3 (build_string ("^&"),
3351 pat, build_string ("\\([0-9A-F]+\\)$"));
3357 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3360 = XINT (Fstring_to_number
3361 (Fsubstring (string,
3362 Fmatch_beginning (make_int (1)),
3363 Fmatch_end (make_int (1))),
3367 ? DECODE_CHAR (ccs, code, 0)
3368 : decode_builtin_char (ccs, code);
3371 DECODE_ADD_UCS_CHAR (chr, dst);
3374 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3375 Dynarr_add (dst, ';');
3381 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3382 string, Qnil, Qnil)))
3385 = XUINT (Fstring_to_number
3386 (Fsubstring (string,
3387 Fmatch_beginning (make_int (1)),
3388 Fmatch_end (make_int (1))),
3391 DECODE_ADD_UCS_CHAR (code, dst);
3395 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3396 Dynarr_add (dst, ';');
3399 str->er_counter = 0;
3401 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3403 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3404 str->er_counter = 0;
3405 DECODE_ADD_UCS_CHAR (c, dst);
3408 str->er_buf[str->er_counter++] = c;
3411 void char_encode_as_entity_reference (Emchar ch, char* buf);
3413 char_encode_as_entity_reference (Emchar ch, char* buf)
3415 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3418 Lisp_Object char_type;
3419 int format_columns, idx;
3420 char format[ER_BUF_SIZE];
3422 while (!NILP (rest))
3428 char_type = XCDR (ccs);
3433 if (!NILP (ccs = Ffind_charset (ccs)))
3436 = charset_code_point (ccs, ch,
3438 CHAR_ALL : CHAR_ISOLATED_ONLY );
3440 if ( (code_point >= 0)
3441 && ( NILP (char_type)
3443 || ( charset_code_point (ccs, ch, CHAR_DEFINED_ONLY)
3447 || ( DECODE_CHAR (ccs, code_point, 0) != ch )
3456 if ( STRINGP (ret) &&
3457 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3460 strncpy (&format[1], XSTRING_DATA (ret), idx);
3470 format[idx++] = '%';
3471 format_columns = XINT (ret);
3472 if ( (2 <= format_columns) && (format_columns <= 8)
3473 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3475 format [idx++] = '0';
3476 format [idx++] = '0' + format_columns;
3485 format [idx++] = 'd';
3486 else if (EQ (ret, Qx))
3487 format [idx++] = 'x';
3488 else if (EQ (ret, QX))
3489 format [idx++] = 'X';
3492 format [idx++] = ';';
3495 sprintf (buf, format, code_point);
3502 sprintf (buf, "&MCS-%08X;", ch);
3506 /************************************************************************/
3507 /* character composition */
3508 /************************************************************************/
3509 extern Lisp_Object Qcomposition, Qrep_decomposition;
3512 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3514 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3518 for (i = 0; i < str->combined_char_count; i++)
3519 decode_add_er_char (str, str->combined_chars[i], dst);
3520 str->combined_char_count = 0;
3521 #ifdef HAVE_LIBCHISE
3522 str->combining_table = COS_NIL;
3524 str->combining_table = Qnil;
3525 #endif /* HAVE_LIBCHISE */
3528 extern CONCORD_DS concord_current_env;
3532 concord_setup_env_maybe ()
3534 if (concord_current_env == NULL)
3536 concord_open_env ("/usr/local/share/chise/1.0/db/");
3542 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3543 unsigned_char_dynarr* dst);
3545 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3546 Emchar character, unsigned_char_dynarr* dst)
3548 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3549 decode_add_er_char (str, character, dst);
3550 #ifdef HAVE_LIBCHISE
3551 else if (!cos_cons_p (str->combining_table))
3555 /* concord_setup_env_maybe (); */
3556 open_chise_data_source_maybe ();
3557 ret = concord_object_get_attribute (cos_make_char (character),
3560 if (!cos_cons_p (ret))
3561 decode_add_er_char (str, character, dst);
3564 str->combined_chars[0] = character;
3565 str->combined_char_count = 1;
3566 str->combining_table = ret;
3572 = cos_cdr (cos_assoc (cos_make_char (character),
3573 str->combining_table));
3575 if (cos_char_p (ret))
3577 Emchar char2 = cos_char_id (ret);
3580 /* concord_setup_env_maybe (); */
3581 open_chise_data_source_maybe ();
3582 ret2 = concord_object_get_attribute (ret, COS_COMPOSITION);
3584 if (!cos_cons_p (ret2))
3586 decode_add_er_char (str, char2, dst);
3587 str->combined_char_count = 0;
3588 str->combining_table = COS_NIL;
3592 str->combined_chars[0] = char2;
3593 str->combined_char_count = 1;
3594 str->combining_table = ret2;
3599 /* concord_setup_env_maybe (); */
3600 open_chise_data_source_maybe ();
3601 ret = concord_object_get_attribute (cos_make_char (character),
3604 COMPOSE_FLUSH_CHARS (str, dst);
3605 if (!cos_cons_p (ret))
3606 decode_add_er_char (str, character, dst);
3609 str->combined_chars[0] = character;
3610 str->combined_char_count = 1;
3611 str->combining_table = ret;
3616 else if (!CONSP (str->combining_table))
3619 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3623 decode_add_er_char (str, character, dst);
3626 str->combined_chars[0] = character;
3627 str->combined_char_count = 1;
3628 str->combining_table = ret;
3634 = Fcdr (Fassq (make_char (character), str->combining_table));
3638 Emchar char2 = XCHARVAL (ret);
3639 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3644 decode_add_er_char (str, char2, dst);
3645 str->combined_char_count = 0;
3646 str->combining_table = Qnil;
3650 str->combined_chars[0] = char2;
3651 str->combined_char_count = 1;
3652 str->combining_table = ret2;
3657 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3660 COMPOSE_FLUSH_CHARS (str, dst);
3662 decode_add_er_char (str, character, dst);
3665 str->combined_chars[0] = character;
3666 str->combined_char_count = 1;
3667 str->combining_table = ret;
3671 #endif /* HAVE_LIBCHISE */
3673 #else /* not UTF2000 */
3674 #define COMPOSE_FLUSH_CHARS(str, dst)
3675 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3676 #endif /* UTF2000 */
3679 /************************************************************************/
3680 /* Shift-JIS methods */
3681 /************************************************************************/
3683 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3684 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3685 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3686 encoded by "position-code + 0x80". A character of JISX0208
3687 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3688 position-codes are divided and shifted so that it fit in the range
3691 --- CODE RANGE of Shift-JIS ---
3692 (character set) (range)
3694 JISX0201-Kana 0xA0 .. 0xDF
3695 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3696 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3697 -------------------------------
3701 /* Is this the first byte of a Shift-JIS two-byte char? */
3703 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3704 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3706 /* Is this the second byte of a Shift-JIS two-byte char? */
3708 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3709 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3711 #define BYTE_SJIS_KATAKANA_P(c) \
3712 ((c) >= 0xA1 && (c) <= 0xDF)
3715 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3719 unsigned char c = *(unsigned char *)src++;
3720 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3722 if (st->shift_jis.in_second_byte)
3724 st->shift_jis.in_second_byte = 0;
3728 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3729 st->shift_jis.in_second_byte = 1;
3731 return CODING_CATEGORY_SHIFT_JIS_MASK;
3734 /* Convert Shift-JIS data to internal format. */
3737 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3738 unsigned_char_dynarr *dst, Lstream_data_count n)
3740 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3741 unsigned int flags = str->flags;
3742 unsigned int cpos = str->cpos;
3743 eol_type_t eol_type = str->eol_type;
3747 unsigned char c = *(unsigned char *)src++;
3751 /* Previous character was first byte of Shift-JIS Kanji char. */
3752 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3754 unsigned char e1, e2;
3756 DECODE_SJIS (cpos, c, e1, e2);
3758 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3762 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3763 Dynarr_add (dst, e1);
3764 Dynarr_add (dst, e2);
3769 DECODE_ADD_BINARY_CHAR (cpos, dst);
3770 DECODE_ADD_BINARY_CHAR (c, dst);
3776 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3777 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3779 else if (BYTE_SJIS_KATAKANA_P (c))
3782 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3785 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3786 Dynarr_add (dst, c);
3791 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3795 DECODE_ADD_BINARY_CHAR (c, dst);
3797 label_continue_loop:;
3800 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3806 /* Convert internal character representation to Shift_JIS. */
3809 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3810 unsigned_char_dynarr *dst, unsigned int *flags)
3812 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3816 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3817 Dynarr_add (dst, '\r');
3818 if (eol_type != EOL_CR)
3819 Dynarr_add (dst, ch);
3823 unsigned int s1, s2;
3825 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3827 if (code_point >= 0)
3828 Dynarr_add (dst, code_point);
3829 else if ((code_point
3830 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3833 ENCODE_SJIS ((code_point >> 8) | 0x80,
3834 (code_point & 0xFF) | 0x80, s1, s2);
3835 Dynarr_add (dst, s1);
3836 Dynarr_add (dst, s2);
3838 else if ((code_point
3839 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3841 Dynarr_add (dst, code_point | 0x80);
3842 else if ((code_point
3843 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3846 ENCODE_SJIS ((code_point >> 8) | 0x80,
3847 (code_point & 0xFF) | 0x80, s1, s2);
3848 Dynarr_add (dst, s1);
3849 Dynarr_add (dst, s2);
3851 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3853 Dynarr_add (dst, code_point);
3855 Dynarr_add (dst, '?');
3857 Lisp_Object charset;
3858 unsigned int c1, c2;
3860 BREAKUP_CHAR (ch, charset, c1, c2);
3862 if (EQ(charset, Vcharset_katakana_jisx0201))
3864 Dynarr_add (dst, c1 | 0x80);
3868 Dynarr_add (dst, c1);
3870 else if (EQ(charset, Vcharset_japanese_jisx0208))
3872 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3873 Dynarr_add (dst, s1);
3874 Dynarr_add (dst, s2);
3877 Dynarr_add (dst, '?');
3883 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3884 unsigned int *flags)
3888 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3889 Decode a JISX0208 character of Shift-JIS coding-system.
3890 CODE is the character code in Shift-JIS as a cons of type bytes.
3891 Return the corresponding character.
3895 unsigned char c1, c2, s1, s2;
3898 CHECK_INT (XCAR (code));
3899 CHECK_INT (XCDR (code));
3900 s1 = XINT (XCAR (code));
3901 s2 = XINT (XCDR (code));
3902 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3903 BYTE_SJIS_TWO_BYTE_2_P (s2))
3905 DECODE_SJIS (s1, s2, c1, c2);
3906 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3907 c1 & 0x7F, c2 & 0x7F));
3913 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3914 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3915 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3919 Lisp_Object charset;
3922 CHECK_CHAR_COERCE_INT (character);
3923 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3924 if (EQ (charset, Vcharset_japanese_jisx0208))
3926 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3927 return Fcons (make_int (s1), make_int (s2));
3934 /************************************************************************/
3936 /************************************************************************/
3938 /* BIG5 is a coding system encoding two character sets: ASCII and
3939 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3940 character set and is encoded in two-byte.
3942 --- CODE RANGE of BIG5 ---
3943 (character set) (range)
3945 Big5 (1st byte) 0xA1 .. 0xFE
3946 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3947 --------------------------
3949 Since the number of characters in Big5 is larger than maximum
3950 characters in Emacs' charset (96x96), it can't be handled as one
3951 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3952 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3953 contains frequently used characters and the latter contains less
3954 frequently used characters. */
3957 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3958 ((c) >= 0x81 && (c) <= 0xFE)
3960 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3961 ((c) >= 0xA1 && (c) <= 0xFE)
3964 /* Is this the second byte of a Shift-JIS two-byte char? */
3966 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3967 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3969 /* Number of Big5 characters which have the same code in 1st byte. */
3971 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3973 /* Code conversion macros. These are macros because they are used in
3974 inner loops during code conversion.
3976 Note that temporary variables in macros introduce the classic
3977 dynamic-scoping problems with variable names. We use capital-
3978 lettered variables in the assumption that XEmacs does not use
3979 capital letters in variables except in a very formalized way
3982 /* Convert Big5 code (b1, b2) into its internal string representation
3985 /* There is a much simpler way to split the Big5 charset into two.
3986 For the moment I'm going to leave the algorithm as-is because it
3987 claims to separate out the most-used characters into a single
3988 charset, which perhaps will lead to optimizations in various
3991 The way the algorithm works is something like this:
3993 Big5 can be viewed as a 94x157 charset, where the row is
3994 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3995 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3996 the split between low and high column numbers is apparently
3997 meaningless; ascending rows produce less and less frequent chars.
3998 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3999 the first charset, and the upper half (0xC9 .. 0xFE) to the
4000 second. To do the conversion, we convert the character into
4001 a single number where 0 .. 156 is the first row, 157 .. 313
4002 is the second, etc. That way, the characters are ordered by
4003 decreasing frequency. Then we just chop the space in two
4004 and coerce the result into a 94x94 space.
4007 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
4009 int B1 = b1, B2 = b2; \
4011 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
4015 lb = LEADING_BYTE_CHINESE_BIG5_1; \
4019 lb = LEADING_BYTE_CHINESE_BIG5_2; \
4020 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
4022 c1 = I / (0xFF - 0xA1) + 0xA1; \
4023 c2 = I % (0xFF - 0xA1) + 0xA1; \
4026 /* Convert the internal string representation of a Big5 character
4027 (lb, c1, c2) into Big5 code (b1, b2). */
4029 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
4031 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
4033 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
4035 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
4037 b1 = I / BIG5_SAME_ROW + 0xA1; \
4038 b2 = I % BIG5_SAME_ROW; \
4039 b2 += b2 < 0x3F ? 0x40 : 0x62; \
4043 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4047 unsigned char c = *(unsigned char *)src++;
4048 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
4050 || (c >= 0x80 && c <= 0xA0)
4054 if (st->big5.in_second_byte)
4056 st->big5.in_second_byte = 0;
4057 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
4067 st->big5.in_second_byte = 1;
4069 return CODING_CATEGORY_BIG5_MASK;
4072 /* Convert Big5 data to internal format. */
4075 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
4076 unsigned_char_dynarr *dst, Lstream_data_count n)
4078 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4079 unsigned int flags = str->flags;
4080 unsigned int cpos = str->cpos;
4081 eol_type_t eol_type = str->eol_type;
4084 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4085 (decoding)->codesys, 1);
4090 unsigned char c = *(unsigned char *)src++;
4093 /* Previous character was first byte of Big5 char. */
4094 if (BYTE_BIG5_TWO_BYTE_2_P (c))
4097 int code_point = (cpos << 8) | c;
4098 Emchar char_id = decode_defined_char (ccs, code_point, 0);
4102 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
4103 DECODE_ADD_UCS_CHAR (char_id, dst);
4105 unsigned char b1, b2, b3;
4106 DECODE_BIG5 (cpos, c, b1, b2, b3);
4107 Dynarr_add (dst, b1);
4108 Dynarr_add (dst, b2);
4109 Dynarr_add (dst, b3);
4114 DECODE_ADD_BINARY_CHAR (cpos, dst);
4115 DECODE_ADD_BINARY_CHAR (c, dst);
4121 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4122 if (BYTE_BIG5_TWO_BYTE_1_P (c))
4124 decode_flush_er_chars (str, dst);
4129 decode_flush_er_chars (str, dst);
4130 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4131 DECODE_ADD_BINARY_CHAR (c, dst);
4135 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4136 decode_add_er_char (str, c, dst);
4139 label_continue_loop:;
4142 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4143 if (flags & CODING_STATE_END)
4145 decode_flush_er_chars (str, dst);
4146 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4147 if (flags & CODING_STATE_CR)
4148 Dynarr_add (dst, '\r');
4155 /* Convert internally-formatted data to Big5. */
4158 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4159 unsigned_char_dynarr *dst, unsigned int *flags)
4161 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4165 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4166 Dynarr_add (dst, '\r');
4167 if (eol_type != EOL_CR)
4168 Dynarr_add (dst, ch);
4175 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4177 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4178 Dynarr_add (dst, code_point);
4179 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4181 Dynarr_add (dst, code_point >> 8);
4182 Dynarr_add (dst, code_point & 0xFF);
4184 else if ((code_point
4185 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4187 Dynarr_add (dst, code_point >> 8);
4188 Dynarr_add (dst, code_point & 0xFF);
4190 else if ((code_point
4191 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4194 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4195 + ((code_point & 0xFF) - 33);
4196 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4197 unsigned char b2 = I % BIG5_SAME_ROW;
4199 b2 += b2 < 0x3F ? 0x40 : 0x62;
4200 Dynarr_add (dst, b1);
4201 Dynarr_add (dst, b2);
4203 else if ((code_point
4204 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4207 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4208 + ((code_point & 0xFF) - 33);
4209 unsigned char b1, b2;
4211 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4212 b1 = I / BIG5_SAME_ROW + 0xA1;
4213 b2 = I % BIG5_SAME_ROW;
4214 b2 += b2 < 0x3F ? 0x40 : 0x62;
4215 Dynarr_add (dst, b1);
4216 Dynarr_add (dst, b2);
4218 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4222 char_encode_as_entity_reference (ch, buf);
4223 Dynarr_add_many (dst, buf, strlen (buf));
4226 Dynarr_add (dst, '?');
4233 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4234 unsigned int *flags)
4239 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4240 Decode a Big5 character CODE of BIG5 coding-system.
4241 CODE is the character code in BIG5, a cons of two integers.
4242 Return the corresponding character.
4246 unsigned char c1, c2, b1, b2;
4249 CHECK_INT (XCAR (code));
4250 CHECK_INT (XCDR (code));
4251 b1 = XINT (XCAR (code));
4252 b2 = XINT (XCDR (code));
4253 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4254 BYTE_BIG5_TWO_BYTE_2_P (b2))
4256 Charset_ID leading_byte;
4257 Lisp_Object charset;
4258 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4259 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4260 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4266 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4267 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4268 Return the corresponding character code in Big5.
4272 Lisp_Object charset;
4275 CHECK_CHAR_COERCE_INT (character);
4276 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4277 if (EQ (charset, Vcharset_chinese_big5_1) ||
4278 EQ (charset, Vcharset_chinese_big5_2))
4280 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4282 return Fcons (make_int (b1), make_int (b2));
4289 /************************************************************************/
4291 /************************************************************************/
4294 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4298 unsigned char c = *(unsigned char *)src++;
4299 switch (st->ucs4.in_byte)
4308 st->ucs4.in_byte = 0;
4314 return CODING_CATEGORY_UCS4_MASK;
4318 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4319 unsigned_char_dynarr *dst, Lstream_data_count n)
4321 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4322 unsigned int flags = str->flags;
4323 unsigned int cpos = str->cpos;
4324 unsigned char counter = str->counter;
4328 unsigned char c = *(unsigned char *)src++;
4336 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4341 cpos = ( cpos << 8 ) | c;
4345 if (counter & CODING_STATE_END)
4346 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4350 str->counter = counter;
4354 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4355 unsigned_char_dynarr *dst, unsigned int *flags)
4357 Dynarr_add (dst, ch >> 24);
4358 Dynarr_add (dst, ch >> 16);
4359 Dynarr_add (dst, ch >> 8);
4360 Dynarr_add (dst, ch );
4364 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4365 unsigned int *flags)
4370 /************************************************************************/
4371 /* UTF-16 methods */
4372 /************************************************************************/
4375 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4377 return CODING_CATEGORY_UTF16_MASK;
4381 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4382 unsigned_char_dynarr *dst, Lstream_data_count n)
4384 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4385 unsigned int flags = str->flags;
4386 unsigned int cpos = str->cpos;
4387 unsigned char counter = str->counter & 3;
4388 unsigned char byte_order = str->counter >> 2;
4389 eol_type_t eol_type = str->eol_type;
4393 unsigned char c = *(unsigned char *)src++;
4399 else if (counter == 1)
4403 if (byte_order == 0)
4404 code = (c << 8) | cpos;
4406 code = (cpos << 8) | c;
4409 code = ((code & 0xFF) << 8) | (code >> 8);
4410 if ( byte_order == 0 )
4415 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4426 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4427 DECODE_ADD_UCS_CHAR (code, dst);
4431 else if (counter == 2)
4433 cpos = (cpos << 8) | c;
4441 ? (c << 8) | (cpos & 0xFF)
4442 : ((cpos & 0xFF) << 8) | c;
4444 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4449 label_continue_loop:;
4451 if (counter & CODING_STATE_END)
4452 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4456 str->counter = (byte_order << 2) | counter;
4460 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4461 unsigned_char_dynarr *dst, unsigned int *flags)
4465 Dynarr_add (dst, ch);
4466 Dynarr_add (dst, ch >> 8);
4470 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4471 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4473 Dynarr_add (dst, y);
4474 Dynarr_add (dst, y >> 8);
4475 Dynarr_add (dst, z);
4476 Dynarr_add (dst, z >> 8);
4481 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4482 unsigned int *flags)
4487 /************************************************************************/
4489 /************************************************************************/
4492 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4496 unsigned char c = *(unsigned char *)src++;
4497 switch (st->utf8.in_byte)
4500 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4503 st->utf8.in_byte = 5;
4505 st->utf8.in_byte = 4;
4507 st->utf8.in_byte = 3;
4509 st->utf8.in_byte = 2;
4511 st->utf8.in_byte = 1;
4516 if ((c & 0xc0) != 0x80)
4522 return CODING_CATEGORY_UTF8_MASK;
4526 decode_output_utf8_partial_char (unsigned char counter,
4528 unsigned_char_dynarr *dst)
4531 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4532 else if (counter == 4)
4534 if (cpos < (1 << 6))
4535 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4538 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4539 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4542 else if (counter == 3)
4544 if (cpos < (1 << 6))
4545 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4546 else if (cpos < (1 << 12))
4548 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4549 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4553 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4554 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4555 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4558 else if (counter == 2)
4560 if (cpos < (1 << 6))
4561 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4562 else if (cpos < (1 << 12))
4564 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4565 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4567 else if (cpos < (1 << 18))
4569 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4570 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4571 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4575 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4576 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4577 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4578 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4583 if (cpos < (1 << 6))
4584 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4585 else if (cpos < (1 << 12))
4587 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4588 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4590 else if (cpos < (1 << 18))
4592 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4593 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4594 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4596 else if (cpos < (1 << 24))
4598 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4599 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4600 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4601 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4605 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4606 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4607 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4608 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4609 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4615 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4616 unsigned_char_dynarr *dst, Lstream_data_count n)
4618 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4619 unsigned int flags = str->flags;
4620 unsigned int cpos = str->cpos;
4621 eol_type_t eol_type = str->eol_type;
4622 unsigned char counter = str->counter;
4624 int bom_flag = str->bom_flag;
4626 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4627 (decoding)->codesys, 0);
4632 unsigned char c = *(unsigned char *)src++;
4637 COMPOSE_FLUSH_CHARS (str, dst);
4638 decode_flush_er_chars (str, dst);
4639 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4641 if ( bom_flag == 0 )
4644 DECODE_ADD_UCS_CHAR (c, dst);
4646 else if ( c < 0xC0 )
4648 if ( bom_flag == 0 )
4651 /* decode_add_er_char (str, c, dst); */
4652 COMPOSE_ADD_CHAR (str, c, dst);
4656 /* decode_flush_er_chars (str, dst); */
4662 else if ( c < 0xF0 )
4667 else if ( c < 0xF8 )
4672 else if ( c < 0xFC )
4684 else if ( (c & 0xC0) == 0x80 )
4686 cpos = ( cpos << 6 ) | ( c & 0x3f );
4691 if ( bom_flag == 0 )
4693 if ( cpos == 0xFEFF )
4704 char_id = decode_defined_char (ccs, cpos, 0);
4711 COMPOSE_ADD_CHAR (str, char_id, dst);
4721 COMPOSE_FLUSH_CHARS (str, dst);
4722 decode_flush_er_chars (str, dst);
4723 decode_output_utf8_partial_char (counter, cpos, dst);
4724 DECODE_ADD_BINARY_CHAR (c, dst);
4728 label_continue_loop:;
4731 if (flags & CODING_STATE_END)
4733 COMPOSE_FLUSH_CHARS (str, dst);
4734 decode_flush_er_chars (str, dst);
4737 decode_output_utf8_partial_char (counter, cpos, dst);
4744 str->counter = counter;
4746 str->bom_flag = bom_flag;
4751 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4752 unsigned_char_dynarr *dst, unsigned int *flags)
4754 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4758 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4759 Dynarr_add (dst, '\r');
4760 if (eol_type != EOL_CR)
4761 Dynarr_add (dst, ch);
4763 else if (ch <= 0x7f)
4765 Dynarr_add (dst, ch);
4770 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4771 int code_point = charset_code_point (ucs_ccs, ch, 0);
4773 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4775 Lisp_Object map, ret;
4777 if ( CODING_SYSTEM_ENABLE_DECOMPOSITION (str->codesys) )
4779 Lisp_Object rest = Vdecomposition_feature_list;
4780 Lisp_Object decomp_f;
4781 Lisp_Object seq = Qnil;
4782 struct gcpro gcpro1;
4784 while ( CONSP (rest) )
4786 decomp_f = XCAR (rest);
4788 seq = Fchar_feature (make_char (ch), decomp_f, Qnil,
4798 Lisp_Object base = Fcar (seq);
4801 if ( CHARP (base) && CONSP (seq) )
4803 Lisp_Object comb = Fcar (seq);
4807 char_encode_utf8 (str, XCHAR (base), dst, flags);
4808 char_encode_utf8 (str, XCHAR (comb), dst, flags);
4815 map = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4817 && INTP (ret = Fchar_feature (make_char (ch),
4820 code_point = XINT (ret);
4821 else if ( !NILP (map =
4822 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4824 && INTP (ret = Fchar_feature (make_char (ch),
4827 code_point = XINT (ret);
4828 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4832 char_encode_as_entity_reference (ch, buf);
4833 Dynarr_add_many (dst, buf, strlen (buf));
4839 if (code_point <= 0x7ff)
4841 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4842 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4844 else if (code_point <= 0xffff)
4846 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4847 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4848 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4850 else if (code_point <= 0x1fffff)
4852 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4853 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4854 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4855 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4857 else if (code_point <= 0x3ffffff)
4859 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4860 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4861 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4862 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4863 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4867 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4868 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4869 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4870 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4871 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4872 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4878 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4879 unsigned int *flags)
4884 /************************************************************************/
4885 /* ISO2022 methods */
4886 /************************************************************************/
4888 /* The following note describes the coding system ISO2022 briefly.
4889 Since the intention of this note is to help understand the
4890 functions in this file, some parts are NOT ACCURATE or OVERLY
4891 SIMPLIFIED. For thorough understanding, please refer to the
4892 original document of ISO2022.
4894 ISO2022 provides many mechanisms to encode several character sets
4895 in 7-bit and 8-bit environments. For 7-bit environments, all text
4896 is encoded using bytes less than 128. This may make the encoded
4897 text a little bit longer, but the text passes more easily through
4898 several gateways, some of which strip off MSB (Most Signigant Bit).
4900 There are two kinds of character sets: control character set and
4901 graphic character set. The former contains control characters such
4902 as `newline' and `escape' to provide control functions (control
4903 functions are also provided by escape sequences). The latter
4904 contains graphic characters such as 'A' and '-'. Emacs recognizes
4905 two control character sets and many graphic character sets.
4907 Graphic character sets are classified into one of the following
4908 four classes, according to the number of bytes (DIMENSION) and
4909 number of characters in one dimension (CHARS) of the set:
4910 - DIMENSION1_CHARS94
4911 - DIMENSION1_CHARS96
4912 - DIMENSION2_CHARS94
4913 - DIMENSION2_CHARS96
4915 In addition, each character set is assigned an identification tag,
4916 unique for each set, called "final character" (denoted as <F>
4917 hereafter). The <F> of each character set is decided by ECMA(*)
4918 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4919 (0x30..0x3F are for private use only).
4921 Note (*): ECMA = European Computer Manufacturers Association
4923 Here are examples of graphic character set [NAME(<F>)]:
4924 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4925 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4926 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4927 o DIMENSION2_CHARS96 -- none for the moment
4929 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4930 C0 [0x00..0x1F] -- control character plane 0
4931 GL [0x20..0x7F] -- graphic character plane 0
4932 C1 [0x80..0x9F] -- control character plane 1
4933 GR [0xA0..0xFF] -- graphic character plane 1
4935 A control character set is directly designated and invoked to C0 or
4936 C1 by an escape sequence. The most common case is that:
4937 - ISO646's control character set is designated/invoked to C0, and
4938 - ISO6429's control character set is designated/invoked to C1,
4939 and usually these designations/invocations are omitted in encoded
4940 text. In a 7-bit environment, only C0 can be used, and a control
4941 character for C1 is encoded by an appropriate escape sequence to
4942 fit into the environment. All control characters for C1 are
4943 defined to have corresponding escape sequences.
4945 A graphic character set is at first designated to one of four
4946 graphic registers (G0 through G3), then these graphic registers are
4947 invoked to GL or GR. These designations and invocations can be
4948 done independently. The most common case is that G0 is invoked to
4949 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4950 these invocations and designations are omitted in encoded text.
4951 In a 7-bit environment, only GL can be used.
4953 When a graphic character set of CHARS94 is invoked to GL, codes
4954 0x20 and 0x7F of the GL area work as control characters SPACE and
4955 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4958 There are two ways of invocation: locking-shift and single-shift.
4959 With locking-shift, the invocation lasts until the next different
4960 invocation, whereas with single-shift, the invocation affects the
4961 following character only and doesn't affect the locking-shift
4962 state. Invocations are done by the following control characters or
4965 ----------------------------------------------------------------------
4966 abbrev function cntrl escape seq description
4967 ----------------------------------------------------------------------
4968 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4969 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4970 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4971 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4972 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4973 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4974 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4975 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4976 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4977 ----------------------------------------------------------------------
4978 (*) These are not used by any known coding system.
4980 Control characters for these functions are defined by macros
4981 ISO_CODE_XXX in `coding.h'.
4983 Designations are done by the following escape sequences:
4984 ----------------------------------------------------------------------
4985 escape sequence description
4986 ----------------------------------------------------------------------
4987 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4988 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4989 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4990 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4991 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4992 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4993 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4994 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4995 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4996 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4997 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4998 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4999 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
5000 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
5001 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
5002 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
5003 ----------------------------------------------------------------------
5005 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
5006 of dimension 1, chars 94, and final character <F>, etc...
5008 Note (*): Although these designations are not allowed in ISO2022,
5009 Emacs accepts them on decoding, and produces them on encoding
5010 CHARS96 character sets in a coding system which is characterized as
5011 7-bit environment, non-locking-shift, and non-single-shift.
5013 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
5014 '(' can be omitted. We refer to this as "short-form" hereafter.
5016 Now you may notice that there are a lot of ways for encoding the
5017 same multilingual text in ISO2022. Actually, there exist many
5018 coding systems such as Compound Text (used in X11's inter client
5019 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
5020 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
5021 localized platforms), and all of these are variants of ISO2022.
5023 In addition to the above, Emacs handles two more kinds of escape
5024 sequences: ISO6429's direction specification and Emacs' private
5025 sequence for specifying character composition.
5027 ISO6429's direction specification takes the following form:
5028 o CSI ']' -- end of the current direction
5029 o CSI '0' ']' -- end of the current direction
5030 o CSI '1' ']' -- start of left-to-right text
5031 o CSI '2' ']' -- start of right-to-left text
5032 The control character CSI (0x9B: control sequence introducer) is
5033 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
5035 Character composition specification takes the following form:
5036 o ESC '0' -- start character composition
5037 o ESC '1' -- end character composition
5038 Since these are not standard escape sequences of any ISO standard,
5039 their use with these meanings is restricted to Emacs only. */
5042 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
5046 for (i = 0; i < 4; i++)
5048 if (!NILP (coding_system))
5050 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
5052 iso->charset[i] = Qt;
5053 iso->invalid_designated[i] = 0;
5055 iso->esc = ISO_ESC_NOTHING;
5056 iso->esc_bytes_index = 0;
5057 iso->register_left = 0;
5058 iso->register_right = 1;
5059 iso->switched_dir_and_no_valid_charset_yet = 0;
5060 iso->invalid_switch_dir = 0;
5061 iso->output_direction_sequence = 0;
5062 iso->output_literally = 0;
5063 #ifdef ENABLE_COMPOSITE_CHARS
5064 if (iso->composite_chars)
5065 Dynarr_reset (iso->composite_chars);
5070 fit_to_be_escape_quoted (unsigned char c)
5087 /* Parse one byte of an ISO2022 escape sequence.
5088 If the result is an invalid escape sequence, return 0 and
5089 do not change anything in STR. Otherwise, if the result is
5090 an incomplete escape sequence, update ISO2022.ESC and
5091 ISO2022.ESC_BYTES and return -1. Otherwise, update
5092 all the state variables (but not ISO2022.ESC_BYTES) and
5095 If CHECK_INVALID_CHARSETS is non-zero, check for designation
5096 or invocation of an invalid character set and treat that as
5097 an unrecognized escape sequence. */
5100 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
5101 unsigned char c, unsigned int *flags,
5102 int check_invalid_charsets)
5104 /* (1) If we're at the end of a designation sequence, CS is the
5105 charset being designated and REG is the register to designate
5108 (2) If we're at the end of a locking-shift sequence, REG is
5109 the register to invoke and HALF (0 == left, 1 == right) is
5110 the half to invoke it into.
5112 (3) If we're at the end of a single-shift sequence, REG is
5113 the register to invoke. */
5114 Lisp_Object cs = Qnil;
5117 /* NOTE: This code does goto's all over the fucking place.
5118 The reason for this is that we're basically implementing
5119 a state machine here, and hierarchical languages like C
5120 don't really provide a clean way of doing this. */
5122 if (! (*flags & CODING_STATE_ESCAPE))
5123 /* At beginning of escape sequence; we need to reset our
5124 escape-state variables. */
5125 iso->esc = ISO_ESC_NOTHING;
5127 iso->output_literally = 0;
5128 iso->output_direction_sequence = 0;
5132 case ISO_ESC_NOTHING:
5133 iso->esc_bytes_index = 0;
5136 case ISO_CODE_ESC: /* Start escape sequence */
5137 *flags |= CODING_STATE_ESCAPE;
5141 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
5142 *flags |= CODING_STATE_ESCAPE;
5143 iso->esc = ISO_ESC_5_11;
5146 case ISO_CODE_SO: /* locking shift 1 */
5149 case ISO_CODE_SI: /* locking shift 0 */
5153 case ISO_CODE_SS2: /* single shift */
5156 case ISO_CODE_SS3: /* single shift */
5160 default: /* Other control characters */
5167 /**** single shift ****/
5169 case 'N': /* single shift 2 */
5172 case 'O': /* single shift 3 */
5176 /**** locking shift ****/
5178 case '~': /* locking shift 1 right */
5181 case 'n': /* locking shift 2 */
5184 case '}': /* locking shift 2 right */
5187 case 'o': /* locking shift 3 */
5190 case '|': /* locking shift 3 right */
5194 #ifdef ENABLE_COMPOSITE_CHARS
5195 /**** composite ****/
5198 iso->esc = ISO_ESC_START_COMPOSITE;
5199 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5200 CODING_STATE_COMPOSITE;
5204 iso->esc = ISO_ESC_END_COMPOSITE;
5205 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5206 ~CODING_STATE_COMPOSITE;
5208 #endif /* ENABLE_COMPOSITE_CHARS */
5210 /**** directionality ****/
5213 iso->esc = ISO_ESC_5_11;
5216 /**** designation ****/
5218 case '$': /* multibyte charset prefix */
5219 iso->esc = ISO_ESC_2_4;
5223 if (0x28 <= c && c <= 0x2F)
5225 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5229 /* This function is called with CODESYS equal to nil when
5230 doing coding-system detection. */
5232 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5233 && fit_to_be_escape_quoted (c))
5235 iso->esc = ISO_ESC_LITERAL;
5236 *flags &= CODING_STATE_ISO2022_LOCK;
5246 /**** directionality ****/
5248 case ISO_ESC_5_11: /* ISO6429 direction control */
5251 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5252 goto directionality;
5254 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5255 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5256 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5260 case ISO_ESC_5_11_0:
5263 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5264 goto directionality;
5268 case ISO_ESC_5_11_1:
5271 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5272 goto directionality;
5276 case ISO_ESC_5_11_2:
5279 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5280 goto directionality;
5285 iso->esc = ISO_ESC_DIRECTIONALITY;
5286 /* Various junk here to attempt to preserve the direction sequences
5287 literally in the text if they would otherwise be swallowed due
5288 to invalid designations that don't show up as actual charset
5289 changes in the text. */
5290 if (iso->invalid_switch_dir)
5292 /* We already inserted a direction switch literally into the
5293 text. We assume (#### this may not be right) that the
5294 next direction switch is the one going the other way,
5295 and we need to output that literally as well. */
5296 iso->output_literally = 1;
5297 iso->invalid_switch_dir = 0;
5303 /* If we are in the thrall of an invalid designation,
5304 then stick the directionality sequence literally into the
5305 output stream so it ends up in the original text again. */
5306 for (jj = 0; jj < 4; jj++)
5307 if (iso->invalid_designated[jj])
5311 iso->output_literally = 1;
5312 iso->invalid_switch_dir = 1;
5315 /* Indicate that we haven't yet seen a valid designation,
5316 so that if a switch-dir is directly followed by an
5317 invalid designation, both get inserted literally. */
5318 iso->switched_dir_and_no_valid_charset_yet = 1;
5323 /**** designation ****/
5326 if (0x28 <= c && c <= 0x2F)
5328 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5331 if (0x40 <= c && c <= 0x42)
5334 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5335 *flags & CODING_STATE_R2L ?
5336 CHARSET_RIGHT_TO_LEFT :
5337 CHARSET_LEFT_TO_RIGHT);
5348 if (c < '0' || c > '~')
5349 return 0; /* bad final byte */
5351 if (iso->esc >= ISO_ESC_2_8 &&
5352 iso->esc <= ISO_ESC_2_15)
5354 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5355 single = 1; /* single-byte */
5356 reg = (iso->esc - ISO_ESC_2_8) & 3;
5358 else if (iso->esc >= ISO_ESC_2_4_8 &&
5359 iso->esc <= ISO_ESC_2_4_15)
5361 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5362 single = -1; /* multi-byte */
5363 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5367 /* Can this ever be reached? -slb */
5371 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5372 *flags & CODING_STATE_R2L ?
5373 CHARSET_RIGHT_TO_LEFT :
5374 CHARSET_LEFT_TO_RIGHT);
5380 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5384 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5385 /* can't invoke something that ain't there. */
5387 iso->esc = ISO_ESC_SINGLE_SHIFT;
5388 *flags &= CODING_STATE_ISO2022_LOCK;
5390 *flags |= CODING_STATE_SS2;
5392 *flags |= CODING_STATE_SS3;
5396 if (check_invalid_charsets &&
5397 !CHARSETP (iso->charset[reg]))
5398 /* can't invoke something that ain't there. */
5401 iso->register_right = reg;
5403 iso->register_left = reg;
5404 *flags &= CODING_STATE_ISO2022_LOCK;
5405 iso->esc = ISO_ESC_LOCKING_SHIFT;
5409 if (NILP (cs) && check_invalid_charsets)
5411 iso->invalid_designated[reg] = 1;
5412 iso->charset[reg] = Vcharset_ascii;
5413 iso->esc = ISO_ESC_DESIGNATE;
5414 *flags &= CODING_STATE_ISO2022_LOCK;
5415 iso->output_literally = 1;
5416 if (iso->switched_dir_and_no_valid_charset_yet)
5418 /* We encountered a switch-direction followed by an
5419 invalid designation. Ensure that the switch-direction
5420 gets outputted; otherwise it will probably get eaten
5421 when the text is written out again. */
5422 iso->switched_dir_and_no_valid_charset_yet = 0;
5423 iso->output_direction_sequence = 1;
5424 /* And make sure that the switch-dir going the other
5425 way gets outputted, as well. */
5426 iso->invalid_switch_dir = 1;
5430 /* This function is called with CODESYS equal to nil when
5431 doing coding-system detection. */
5432 if (!NILP (codesys))
5434 charset_conversion_spec_dynarr *dyn =
5435 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5441 for (i = 0; i < Dynarr_length (dyn); i++)
5443 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5444 if (EQ (cs, spec->from_charset))
5445 cs = spec->to_charset;
5450 iso->charset[reg] = cs;
5451 iso->esc = ISO_ESC_DESIGNATE;
5452 *flags &= CODING_STATE_ISO2022_LOCK;
5453 if (iso->invalid_designated[reg])
5455 iso->invalid_designated[reg] = 0;
5456 iso->output_literally = 1;
5458 if (iso->switched_dir_and_no_valid_charset_yet)
5459 iso->switched_dir_and_no_valid_charset_yet = 0;
5464 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5468 /* #### There are serious deficiencies in the recognition mechanism
5469 here. This needs to be much smarter if it's going to cut it.
5470 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5471 it should be detected as Latin-1.
5472 All the ISO2022 stuff in this file should be synced up with the
5473 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5474 Perhaps we should wait till R2L works in FSF Emacs? */
5476 if (!st->iso2022.initted)
5478 reset_iso2022 (Qnil, &st->iso2022.iso);
5479 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5480 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5481 CODING_CATEGORY_ISO_8_1_MASK |
5482 CODING_CATEGORY_ISO_8_2_MASK |
5483 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5484 st->iso2022.flags = 0;
5485 st->iso2022.high_byte_count = 0;
5486 st->iso2022.saw_single_shift = 0;
5487 st->iso2022.initted = 1;
5490 mask = st->iso2022.mask;
5494 unsigned char c = *(unsigned char *)src++;
5497 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5498 st->iso2022.high_byte_count++;
5502 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5504 if (st->iso2022.high_byte_count & 1)
5505 /* odd number of high bytes; assume not iso-8-2 */
5506 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5508 st->iso2022.high_byte_count = 0;
5509 st->iso2022.saw_single_shift = 0;
5511 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5513 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5514 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5515 { /* control chars */
5518 /* Allow and ignore control characters that you might
5519 reasonably see in a text file */
5524 case 8: /* backspace */
5525 case 11: /* vertical tab */
5526 case 12: /* form feed */
5527 case 26: /* MS-DOS C-z junk */
5528 case 31: /* '^_' -- for info */
5529 goto label_continue_loop;
5536 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5539 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5540 &st->iso2022.flags, 0))
5542 switch (st->iso2022.iso.esc)
5544 case ISO_ESC_DESIGNATE:
5545 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5546 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5548 case ISO_ESC_LOCKING_SHIFT:
5549 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5550 goto ran_out_of_chars;
5551 case ISO_ESC_SINGLE_SHIFT:
5552 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5553 st->iso2022.saw_single_shift = 1;
5562 goto ran_out_of_chars;
5565 label_continue_loop:;
5574 postprocess_iso2022_mask (int mask)
5576 /* #### kind of cheesy */
5577 /* If seven-bit ISO is allowed, then assume that the encoding is
5578 entirely seven-bit and turn off the eight-bit ones. */
5579 if (mask & CODING_CATEGORY_ISO_7_MASK)
5580 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5581 CODING_CATEGORY_ISO_8_1_MASK |
5582 CODING_CATEGORY_ISO_8_2_MASK);
5586 /* If FLAGS is a null pointer or specifies right-to-left motion,
5587 output a switch-dir-to-left-to-right sequence to DST.
5588 Also update FLAGS if it is not a null pointer.
5589 If INTERNAL_P is set, we are outputting in internal format and
5590 need to handle the CSI differently. */
5593 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5594 unsigned_char_dynarr *dst,
5595 unsigned int *flags,
5598 if (!flags || (*flags & CODING_STATE_R2L))
5600 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5602 Dynarr_add (dst, ISO_CODE_ESC);
5603 Dynarr_add (dst, '[');
5605 else if (internal_p)
5606 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5608 Dynarr_add (dst, ISO_CODE_CSI);
5609 Dynarr_add (dst, '0');
5610 Dynarr_add (dst, ']');
5612 *flags &= ~CODING_STATE_R2L;
5616 /* If FLAGS is a null pointer or specifies a direction different from
5617 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5618 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5619 sequence to DST. Also update FLAGS if it is not a null pointer.
5620 If INTERNAL_P is set, we are outputting in internal format and
5621 need to handle the CSI differently. */
5624 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5625 unsigned_char_dynarr *dst, unsigned int *flags,
5628 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5629 direction == CHARSET_LEFT_TO_RIGHT)
5630 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5631 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5632 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5633 direction == CHARSET_RIGHT_TO_LEFT)
5635 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5637 Dynarr_add (dst, ISO_CODE_ESC);
5638 Dynarr_add (dst, '[');
5640 else if (internal_p)
5641 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5643 Dynarr_add (dst, ISO_CODE_CSI);
5644 Dynarr_add (dst, '2');
5645 Dynarr_add (dst, ']');
5647 *flags |= CODING_STATE_R2L;
5651 /* Convert ISO2022-format data to internal format. */
5654 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5655 unsigned_char_dynarr *dst, Lstream_data_count n)
5657 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5658 unsigned int flags = str->flags;
5659 unsigned int cpos = str->cpos;
5660 unsigned char counter = str->counter;
5661 eol_type_t eol_type = str->eol_type;
5662 #ifdef ENABLE_COMPOSITE_CHARS
5663 unsigned_char_dynarr *real_dst = dst;
5665 Lisp_Object coding_system;
5667 XSETCODING_SYSTEM (coding_system, str->codesys);
5669 #ifdef ENABLE_COMPOSITE_CHARS
5670 if (flags & CODING_STATE_COMPOSITE)
5671 dst = str->iso2022.composite_chars;
5672 #endif /* ENABLE_COMPOSITE_CHARS */
5676 unsigned char c = *(unsigned char *)src++;
5677 if (flags & CODING_STATE_ESCAPE)
5678 { /* Within ESC sequence */
5679 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5684 switch (str->iso2022.esc)
5686 #ifdef ENABLE_COMPOSITE_CHARS
5687 case ISO_ESC_START_COMPOSITE:
5688 if (str->iso2022.composite_chars)
5689 Dynarr_reset (str->iso2022.composite_chars);
5691 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5692 dst = str->iso2022.composite_chars;
5694 case ISO_ESC_END_COMPOSITE:
5696 Bufbyte comstr[MAX_EMCHAR_LEN];
5698 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5699 Dynarr_length (dst));
5701 len = set_charptr_emchar (comstr, emch);
5702 Dynarr_add_many (dst, comstr, len);
5705 #endif /* ENABLE_COMPOSITE_CHARS */
5707 case ISO_ESC_LITERAL:
5708 COMPOSE_FLUSH_CHARS (str, dst);
5709 decode_flush_er_chars (str, dst);
5710 DECODE_ADD_BINARY_CHAR (c, dst);
5714 /* Everything else handled already */
5719 /* Attempted error recovery. */
5720 if (str->iso2022.output_direction_sequence)
5721 ensure_correct_direction (flags & CODING_STATE_R2L ?
5722 CHARSET_RIGHT_TO_LEFT :
5723 CHARSET_LEFT_TO_RIGHT,
5724 str->codesys, dst, 0, 1);
5725 /* More error recovery. */
5726 if (!retval || str->iso2022.output_literally)
5728 /* Output the (possibly invalid) sequence */
5730 COMPOSE_FLUSH_CHARS (str, dst);
5731 decode_flush_er_chars (str, dst);
5732 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5733 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5734 flags &= CODING_STATE_ISO2022_LOCK;
5736 n++, src--;/* Repeat the loop with the same character. */
5739 /* No sense in reprocessing the final byte of the
5740 escape sequence; it could mess things up anyway.
5742 COMPOSE_FLUSH_CHARS (str, dst);
5743 decode_flush_er_chars (str, dst);
5744 DECODE_ADD_BINARY_CHAR (c, dst);
5750 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5751 { /* Control characters */
5753 /***** Error-handling *****/
5755 /* If we were in the middle of a character, dump out the
5756 partial character. */
5759 COMPOSE_FLUSH_CHARS (str, dst);
5760 decode_flush_er_chars (str, dst);
5764 DECODE_ADD_BINARY_CHAR
5765 ((unsigned char)(cpos >> (counter * 8)), dst);
5770 /* If we just saw a single-shift character, dump it out.
5771 This may dump out the wrong sort of single-shift character,
5772 but least it will give an indication that something went
5774 if (flags & CODING_STATE_SS2)
5776 COMPOSE_FLUSH_CHARS (str, dst);
5777 decode_flush_er_chars (str, dst);
5778 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5779 flags &= ~CODING_STATE_SS2;
5781 if (flags & CODING_STATE_SS3)
5783 COMPOSE_FLUSH_CHARS (str, dst);
5784 decode_flush_er_chars (str, dst);
5785 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5786 flags &= ~CODING_STATE_SS3;
5789 /***** Now handle the control characters. *****/
5795 COMPOSE_FLUSH_CHARS (str, dst);
5796 decode_flush_er_chars (str, dst);
5797 if (eol_type == EOL_CR)
5798 Dynarr_add (dst, '\n');
5799 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5800 Dynarr_add (dst, c);
5802 flags |= CODING_STATE_CR;
5803 goto label_continue_loop;
5805 else if (flags & CODING_STATE_CR)
5806 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5808 Dynarr_add (dst, '\r');
5809 flags &= ~CODING_STATE_CR;
5812 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5815 flags &= CODING_STATE_ISO2022_LOCK;
5817 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5819 COMPOSE_FLUSH_CHARS (str, dst);
5820 decode_flush_er_chars (str, dst);
5821 DECODE_ADD_BINARY_CHAR (c, dst);
5825 { /* Graphic characters */
5826 Lisp_Object charset;
5835 COMPOSE_FLUSH_CHARS (str, dst);
5836 decode_flush_er_chars (str, dst);
5837 if (eol_type == EOL_CR)
5838 Dynarr_add (dst, '\n');
5839 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5840 Dynarr_add (dst, c);
5842 flags |= CODING_STATE_CR;
5843 goto label_continue_loop;
5845 else if (flags & CODING_STATE_CR)
5846 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5848 Dynarr_add (dst, '\r');
5849 flags &= ~CODING_STATE_CR;
5852 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5855 /* Now determine the charset. */
5856 reg = ((flags & CODING_STATE_SS2) ? 2
5857 : (flags & CODING_STATE_SS3) ? 3
5858 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5859 : str->iso2022.register_left);
5860 charset = str->iso2022.charset[reg];
5862 /* Error checking: */
5863 if (! CHARSETP (charset)
5864 || str->iso2022.invalid_designated[reg]
5865 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5866 && XCHARSET_CHARS (charset) == 94))
5867 /* Mrmph. We are trying to invoke a register that has no
5868 or an invalid charset in it, or trying to add a character
5869 outside the range of the charset. Insert that char literally
5870 to preserve it for the output. */
5872 COMPOSE_FLUSH_CHARS (str, dst);
5873 decode_flush_er_chars (str, dst);
5877 DECODE_ADD_BINARY_CHAR
5878 ((unsigned char)(cpos >> (counter * 8)), dst);
5881 DECODE_ADD_BINARY_CHAR (c, dst);
5886 /* Things are probably hunky-dorey. */
5888 /* Fetch reverse charset, maybe. */
5889 if (((flags & CODING_STATE_R2L) &&
5890 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5892 (!(flags & CODING_STATE_R2L) &&
5893 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5895 Lisp_Object new_charset =
5896 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5897 if (!NILP (new_charset))
5898 charset = new_charset;
5903 if (XCHARSET_DIMENSION (charset) == counter)
5905 COMPOSE_ADD_CHAR (str,
5906 DECODE_CHAR (charset,
5907 ((cpos & 0x7F7F7F) << 8)
5914 cpos = (cpos << 8) | c;
5916 lb = XCHARSET_LEADING_BYTE (charset);
5917 switch (XCHARSET_REP_BYTES (charset))
5920 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5921 Dynarr_add (dst, c & 0x7F);
5924 case 2: /* one-byte official */
5925 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5926 Dynarr_add (dst, lb);
5927 Dynarr_add (dst, c | 0x80);
5930 case 3: /* one-byte private or two-byte official */
5931 if (XCHARSET_PRIVATE_P (charset))
5933 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5934 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5935 Dynarr_add (dst, lb);
5936 Dynarr_add (dst, c | 0x80);
5942 Dynarr_add (dst, lb);
5943 Dynarr_add (dst, ch | 0x80);
5944 Dynarr_add (dst, c | 0x80);
5952 default: /* two-byte private */
5955 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5956 Dynarr_add (dst, lb);
5957 Dynarr_add (dst, ch | 0x80);
5958 Dynarr_add (dst, c | 0x80);
5968 flags &= CODING_STATE_ISO2022_LOCK;
5971 label_continue_loop:;
5974 if (flags & CODING_STATE_END)
5976 COMPOSE_FLUSH_CHARS (str, dst);
5977 decode_flush_er_chars (str, dst);
5978 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5982 str->counter = counter;
5986 /***** ISO2022 encoder *****/
5988 /* Designate CHARSET into register REG. */
5991 iso2022_designate (Lisp_Object charset, unsigned char reg,
5992 struct encoding_stream *str, unsigned_char_dynarr *dst)
5994 static const char inter94[] = "()*+";
5995 static const char inter96[] = ",-./";
5996 unsigned short chars;
5997 unsigned char dimension;
5998 unsigned char final;
5999 Lisp_Object old_charset = str->iso2022.charset[reg];
6001 str->iso2022.charset[reg] = charset;
6002 if (!CHARSETP (charset))
6003 /* charset might be an initial nil or t. */
6005 chars = XCHARSET_CHARS (charset);
6006 dimension = XCHARSET_DIMENSION (charset);
6007 final = XCHARSET_FINAL (charset);
6008 if (!str->iso2022.force_charset_on_output[reg] &&
6009 CHARSETP (old_charset) &&
6010 XCHARSET_CHARS (old_charset) == chars &&
6011 XCHARSET_DIMENSION (old_charset) == dimension &&
6012 XCHARSET_FINAL (old_charset) == final)
6015 str->iso2022.force_charset_on_output[reg] = 0;
6018 charset_conversion_spec_dynarr *dyn =
6019 str->codesys->iso2022.output_conv;
6025 for (i = 0; i < Dynarr_length (dyn); i++)
6027 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
6028 if (EQ (charset, spec->from_charset))
6029 charset = spec->to_charset;
6034 Dynarr_add (dst, ISO_CODE_ESC);
6039 Dynarr_add (dst, inter94[reg]);
6042 Dynarr_add (dst, '$');
6044 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
6047 Dynarr_add (dst, inter94[reg]);
6052 Dynarr_add (dst, inter96[reg]);
6055 Dynarr_add (dst, '$');
6056 Dynarr_add (dst, inter96[reg]);
6060 Dynarr_add (dst, final);
6064 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
6066 if (str->iso2022.register_left != 0)
6068 Dynarr_add (dst, ISO_CODE_SI);
6069 str->iso2022.register_left = 0;
6074 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
6076 if (str->iso2022.register_left != 1)
6078 Dynarr_add (dst, ISO_CODE_SO);
6079 str->iso2022.register_left = 1;
6084 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
6085 unsigned_char_dynarr *dst, unsigned int *flags)
6087 unsigned char charmask;
6088 Lisp_Coding_System* codesys = str->codesys;
6089 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6091 Lisp_Object charset = str->iso2022.current_charset;
6092 int half = str->iso2022.current_half;
6093 int code_point = -1;
6097 restore_left_to_right_direction (codesys, dst, flags, 0);
6099 /* Make sure G0 contains ASCII */
6100 if ((ch > ' ' && ch < ISO_CODE_DEL)
6101 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
6103 ensure_normal_shift (str, dst);
6104 iso2022_designate (Vcharset_ascii, 0, str, dst);
6107 /* If necessary, restore everything to the default state
6109 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
6111 restore_left_to_right_direction (codesys, dst, flags, 0);
6113 ensure_normal_shift (str, dst);
6115 for (i = 0; i < 4; i++)
6117 Lisp_Object initial_charset =
6118 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6119 iso2022_designate (initial_charset, i, str, dst);
6124 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6125 Dynarr_add (dst, '\r');
6126 if (eol_type != EOL_CR)
6127 Dynarr_add (dst, ch);
6131 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6132 && fit_to_be_escape_quoted (ch))
6133 Dynarr_add (dst, ISO_CODE_ESC);
6134 Dynarr_add (dst, ch);
6137 else if ( (0x80 <= ch) && (ch <= 0x9f) )
6139 charmask = (half == 0 ? 0x00 : 0x80);
6141 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6142 && fit_to_be_escape_quoted (ch))
6143 Dynarr_add (dst, ISO_CODE_ESC);
6144 /* you asked for it ... */
6145 Dynarr_add (dst, ch);
6151 /* Now determine which register to use. */
6153 for (i = 0; i < 4; i++)
6155 if ((CHARSETP (charset = str->iso2022.charset[i])
6156 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
6160 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
6161 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
6169 Lisp_Object original_default_coded_charset_priority_list
6170 = Vdefault_coded_charset_priority_list;
6171 Vdefault_coded_charset_priority_list
6172 = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
6173 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6175 code_point = ENCODE_CHAR (ch, charset);
6176 if (XCHARSET_FINAL (charset))
6178 Vdefault_coded_charset_priority_list
6179 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6180 Vdefault_coded_charset_priority_list));
6182 Vdefault_coded_charset_priority_list
6183 = original_default_coded_charset_priority_list;
6184 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6186 code_point = ENCODE_CHAR (ch, charset);
6187 if (XCHARSET_FINAL (charset))
6189 Vdefault_coded_charset_priority_list
6190 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6191 Vdefault_coded_charset_priority_list));
6193 code_point = ENCODE_CHAR (ch, charset);
6194 if (!XCHARSET_FINAL (charset))
6196 charset = Vcharset_ascii;
6200 Vdefault_coded_charset_priority_list
6201 = original_default_coded_charset_priority_list;
6203 ensure_correct_direction (XCHARSET_DIRECTION (charset),
6204 codesys, dst, flags, 0);
6208 if (XCHARSET_GRAPHIC (charset) != 0)
6210 if (!NILP (str->iso2022.charset[1]) &&
6211 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6212 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6214 else if (!NILP (str->iso2022.charset[2]))
6216 else if (!NILP (str->iso2022.charset[3]))
6225 iso2022_designate (charset, reg, str, dst);
6227 /* Now invoke that register. */
6231 ensure_normal_shift (str, dst);
6235 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6237 ensure_shift_out (str, dst);
6244 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6246 Dynarr_add (dst, ISO_CODE_ESC);
6247 Dynarr_add (dst, 'N');
6252 Dynarr_add (dst, ISO_CODE_SS2);
6257 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6259 Dynarr_add (dst, ISO_CODE_ESC);
6260 Dynarr_add (dst, 'O');
6265 Dynarr_add (dst, ISO_CODE_SS3);
6273 charmask = (half == 0 ? 0x00 : 0x80);
6275 switch (XCHARSET_DIMENSION (charset))
6278 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6281 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6282 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6285 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6286 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6287 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6290 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6291 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6292 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6293 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6299 str->iso2022.current_charset = charset;
6300 str->iso2022.current_half = half;
6304 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6305 unsigned int *flags)
6307 Lisp_Coding_System* codesys = str->codesys;
6310 restore_left_to_right_direction (codesys, dst, flags, 0);
6311 ensure_normal_shift (str, dst);
6312 for (i = 0; i < 4; i++)
6314 Lisp_Object initial_charset
6315 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6316 iso2022_designate (initial_charset, i, str, dst);
6321 /************************************************************************/
6322 /* No-conversion methods */
6323 /************************************************************************/
6325 /* This is used when reading in "binary" files -- i.e. files that may
6326 contain all 256 possible byte values and that are not to be
6327 interpreted as being in any particular decoding. */
6329 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6330 unsigned_char_dynarr *dst, Lstream_data_count n)
6332 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6333 unsigned int flags = str->flags;
6334 unsigned int cpos = str->cpos;
6335 eol_type_t eol_type = str->eol_type;
6339 unsigned char c = *(unsigned char *)src++;
6341 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6342 DECODE_ADD_BINARY_CHAR (c, dst);
6343 label_continue_loop:;
6346 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6353 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6354 unsigned_char_dynarr *dst, Lstream_data_count n)
6357 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6358 unsigned int flags = str->flags;
6359 unsigned int ch = str->ch;
6360 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6362 unsigned char char_boundary = str->iso2022.current_char_boundary;
6369 if (char_boundary == 0)
6375 else if ( c >= 0xf8 )
6380 else if ( c >= 0xf0 )
6385 else if ( c >= 0xe0 )
6390 else if ( c >= 0xc0 )
6400 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6401 Dynarr_add (dst, '\r');
6402 if (eol_type != EOL_CR)
6403 Dynarr_add (dst, c);
6406 Dynarr_add (dst, c);
6409 else if (char_boundary == 1)
6411 ch = ( ch << 6 ) | ( c & 0x3f );
6412 Dynarr_add (dst, ch & 0xff);
6417 ch = ( ch << 6 ) | ( c & 0x3f );
6420 #else /* not UTF2000 */
6423 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6424 Dynarr_add (dst, '\r');
6425 if (eol_type != EOL_CR)
6426 Dynarr_add (dst, '\n');
6429 else if (BYTE_ASCII_P (c))
6432 Dynarr_add (dst, c);
6434 else if (BUFBYTE_LEADING_BYTE_P (c))
6437 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6438 c == LEADING_BYTE_CONTROL_1)
6441 Dynarr_add (dst, '~'); /* untranslatable character */
6445 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6446 Dynarr_add (dst, c);
6447 else if (ch == LEADING_BYTE_CONTROL_1)
6450 Dynarr_add (dst, c - 0x20);
6452 /* else it should be the second or third byte of an
6453 untranslatable character, so ignore it */
6456 #endif /* not UTF2000 */
6462 str->iso2022.current_char_boundary = char_boundary;
6468 /************************************************************************/
6469 /* Initialization */
6470 /************************************************************************/
6473 syms_of_file_coding (void)
6475 INIT_LRECORD_IMPLEMENTATION (coding_system);
6477 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6479 DEFSUBR (Fcoding_system_p);
6480 DEFSUBR (Ffind_coding_system);
6481 DEFSUBR (Fget_coding_system);
6482 DEFSUBR (Fcoding_system_list);
6483 DEFSUBR (Fcoding_system_name);
6484 DEFSUBR (Fmake_coding_system);
6485 DEFSUBR (Fcopy_coding_system);
6486 DEFSUBR (Fcoding_system_canonical_name_p);
6487 DEFSUBR (Fcoding_system_alias_p);
6488 DEFSUBR (Fcoding_system_aliasee);
6489 DEFSUBR (Fdefine_coding_system_alias);
6490 DEFSUBR (Fsubsidiary_coding_system);
6492 DEFSUBR (Fcoding_system_type);
6493 DEFSUBR (Fcoding_system_doc_string);
6495 DEFSUBR (Fcoding_system_charset);
6497 DEFSUBR (Fcoding_system_property);
6499 DEFSUBR (Fcoding_category_list);
6500 DEFSUBR (Fset_coding_priority_list);
6501 DEFSUBR (Fcoding_priority_list);
6502 DEFSUBR (Fset_coding_category_system);
6503 DEFSUBR (Fcoding_category_system);
6505 DEFSUBR (Fdetect_coding_region);
6506 DEFSUBR (Fdecode_coding_region);
6507 DEFSUBR (Fencode_coding_region);
6509 DEFSUBR (Fdecode_shift_jis_char);
6510 DEFSUBR (Fencode_shift_jis_char);
6511 DEFSUBR (Fdecode_big5_char);
6512 DEFSUBR (Fencode_big5_char);
6514 defsymbol (&Qcoding_systemp, "coding-system-p");
6515 defsymbol (&Qno_conversion, "no-conversion");
6516 defsymbol (&Qraw_text, "raw-text");
6518 defsymbol (&Qbig5, "big5");
6519 defsymbol (&Qshift_jis, "shift-jis");
6520 defsymbol (&Qucs4, "ucs-4");
6521 defsymbol (&Qutf8, "utf-8");
6522 defsymbol (&Qutf16, "utf-16");
6523 defsymbol (&Qccl, "ccl");
6524 defsymbol (&Qiso2022, "iso2022");
6526 defsymbol (&Qmnemonic, "mnemonic");
6527 defsymbol (&Qeol_type, "eol-type");
6528 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6529 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6531 defsymbol (&Qcr, "cr");
6532 defsymbol (&Qlf, "lf");
6533 defsymbol (&Qcrlf, "crlf");
6534 defsymbol (&Qeol_cr, "eol-cr");
6535 defsymbol (&Qeol_lf, "eol-lf");
6536 defsymbol (&Qeol_crlf, "eol-crlf");
6538 defsymbol (&Qcharset_g0, "charset-g0");
6539 defsymbol (&Qcharset_g1, "charset-g1");
6540 defsymbol (&Qcharset_g2, "charset-g2");
6541 defsymbol (&Qcharset_g3, "charset-g3");
6542 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6543 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6544 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6545 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6546 defsymbol (&Qno_iso6429, "no-iso6429");
6547 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6548 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6550 defsymbol (&Qshort, "short");
6551 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6552 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6553 defsymbol (&Qseven, "seven");
6554 defsymbol (&Qlock_shift, "lock-shift");
6555 defsymbol (&Qescape_quoted, "escape-quoted");
6558 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6559 defsymbol (&Qdisable_composition, "disable-composition");
6560 defsymbol (&Qenable_decomposition, "enable-decomposition");
6561 defsymbol (&Qccs_priority_list, "ccs-priority-list");
6562 defsymbol (&Quse_entity_reference, "use-entity-reference");
6563 defsymbol (&Qd, "d");
6564 defsymbol (&Qx, "x");
6565 defsymbol (&QX, "X");
6567 defsymbol (&Qencode, "encode");
6568 defsymbol (&Qdecode, "decode");
6571 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6573 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6575 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6577 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6579 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6581 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6583 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6585 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6587 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6589 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6592 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6597 lstream_type_create_file_coding (void)
6599 LSTREAM_HAS_METHOD (decoding, reader);
6600 LSTREAM_HAS_METHOD (decoding, writer);
6601 LSTREAM_HAS_METHOD (decoding, rewinder);
6602 LSTREAM_HAS_METHOD (decoding, seekable_p);
6603 LSTREAM_HAS_METHOD (decoding, flusher);
6604 LSTREAM_HAS_METHOD (decoding, closer);
6605 LSTREAM_HAS_METHOD (decoding, marker);
6607 LSTREAM_HAS_METHOD (encoding, reader);
6608 LSTREAM_HAS_METHOD (encoding, writer);
6609 LSTREAM_HAS_METHOD (encoding, rewinder);
6610 LSTREAM_HAS_METHOD (encoding, seekable_p);
6611 LSTREAM_HAS_METHOD (encoding, flusher);
6612 LSTREAM_HAS_METHOD (encoding, closer);
6613 LSTREAM_HAS_METHOD (encoding, marker);
6617 vars_of_file_coding (void)
6621 fcd = xnew (struct file_coding_dump);
6622 dump_add_root_struct_ptr (&fcd, &fcd_description);
6624 /* Initialize to something reasonable ... */
6625 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6627 fcd->coding_category_system[i] = Qnil;
6628 fcd->coding_category_by_priority[i] = i;
6631 Fprovide (intern ("file-coding"));
6633 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6634 Coding system used for TTY keyboard input.
6635 Not used under a windowing system.
6637 Vkeyboard_coding_system = Qnil;
6639 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6640 Coding system used for TTY display output.
6641 Not used under a windowing system.
6643 Vterminal_coding_system = Qnil;
6645 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6646 Overriding coding system used when reading from a file or process.
6647 You should bind this variable with `let', but do not set it globally.
6648 If this is non-nil, it specifies the coding system that will be used
6649 to decode input on read operations, such as from a file or process.
6650 It overrides `buffer-file-coding-system-for-read',
6651 `insert-file-contents-pre-hook', etc. Use those variables instead of
6652 this one for permanent changes to the environment. */ );
6653 Vcoding_system_for_read = Qnil;
6655 DEFVAR_LISP ("coding-system-for-write",
6656 &Vcoding_system_for_write /*
6657 Overriding coding system used when writing to a file or process.
6658 You should bind this variable with `let', but do not set it globally.
6659 If this is non-nil, it specifies the coding system that will be used
6660 to encode output for write operations, such as to a file or process.
6661 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6662 Use those variables instead of this one for permanent changes to the
6664 Vcoding_system_for_write = Qnil;
6666 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6667 Coding system used to convert pathnames when accessing files.
6669 Vfile_name_coding_system = Qnil;
6671 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6672 &Vcoded_charset_entity_reference_alist /*
6673 Alist of coded-charset vs corresponding entity-reference.
6674 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6675 CCS is coded-charset.
6676 CODE-COLUMNS is columns of code-point of entity-reference.
6677 CODE-TYPE is format type of code-point of entity-reference.
6678 `d' means decimal value and `x' means hexadecimal value.
6680 Vcoded_charset_entity_reference_alist = Qnil;
6682 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6683 Non-nil means the buffer contents are regarded as multi-byte form
6684 of characters, not a binary code. This affects the display, file I/O,
6685 and behaviors of various editing commands.
6687 Setting this to nil does not do anything.
6689 enable_multibyte_characters = 1;
6692 DEFVAR_LISP ("decomposition-feature-list",
6693 &Vdecomposition_feature_list /*
6694 List of `=decomposition@FOO' feature to encode characters as IVS.
6696 Vdecomposition_feature_list = Qnil;
6701 complex_vars_of_file_coding (void)
6703 staticpro (&Vcoding_system_hash_table);
6704 Vcoding_system_hash_table =
6705 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6707 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6708 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6710 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6712 struct codesys_prop csp; \
6714 csp.prop_type = (Prop_Type); \
6715 Dynarr_add (the_codesys_prop_dynarr, csp); \
6718 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6719 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6720 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6721 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6722 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6723 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6724 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6726 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6727 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6728 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6729 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6730 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6731 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6732 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6733 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6734 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6735 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6736 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6737 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6738 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6739 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6740 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6741 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6742 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6744 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6747 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6748 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6750 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6751 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qenable_decomposition);
6752 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6755 /* Need to create this here or we're really screwed. */
6757 (Qraw_text, Qno_conversion,
6758 build_string ("Raw text, which means it converts only line-break-codes."),
6759 list2 (Qmnemonic, build_string ("Raw")));
6762 (Qbinary, Qno_conversion,
6763 build_string ("Binary, which means it does not convert anything."),
6764 list4 (Qeol_type, Qlf,
6765 Qmnemonic, build_string ("Binary")));
6771 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6772 list2 (Qmnemonic, build_string ("MTF8")));
6775 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6777 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6779 Fdefine_coding_system_alias (Qterminal, Qbinary);
6780 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6782 /* Need this for bootstrapping */
6783 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6784 Fget_coding_system (Qraw_text);
6787 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6788 = Fget_coding_system (Qutf_8_mcs);
6791 #if defined(MULE) && !defined(UTF2000)
6795 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6796 fcd->ucs_to_mule_table[i] = Qnil;
6798 staticpro (&mule_to_ucs_table);
6799 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6800 #endif /* defined(MULE) && !defined(UTF2000) */