1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2011,
5 2012, 2013, 2016 MORIOKA Tomohiko
7 This file is part of XEmacs.
9 XEmacs is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
14 XEmacs is distributed in the hope that it will be useful, but WITHOUT
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with XEmacs; see the file COPYING. If not, write to
21 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 Boston, MA 02111-1307, USA. */
24 /* Synched up with: Mule 2.3. Not in FSF. */
26 /* Rewritten by Ben Wing <ben@xemacs.org>. */
27 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
41 #ifdef USE_CONCORD_OBJECT_SYSTEM_TO_COMPOSE
44 #include "file-coding.h"
46 Lisp_Object Qcoding_system_error;
48 Lisp_Object Vkeyboard_coding_system;
49 Lisp_Object Vterminal_coding_system;
50 Lisp_Object Vcoding_system_for_read;
51 Lisp_Object Vcoding_system_for_write;
52 Lisp_Object Vfile_name_coding_system;
54 Lisp_Object Vcoded_charset_entity_reference_alist;
56 /* Table of symbols identifying each coding category. */
57 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
61 struct file_coding_dump {
62 /* Coding system currently associated with each coding category. */
63 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
65 /* Table of all coding categories in decreasing order of priority.
66 This describes a permutation of the possible coding categories. */
67 int coding_category_by_priority[CODING_CATEGORY_LAST];
69 #if defined(MULE) && !defined(UTF2000)
70 Lisp_Object ucs_to_mule_table[65536];
74 static const struct lrecord_description fcd_description_1[] = {
75 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
76 #if defined(MULE) && !defined(UTF2000)
77 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
82 static const struct struct_description fcd_description = {
83 sizeof (struct file_coding_dump),
87 Lisp_Object mule_to_ucs_table;
89 Lisp_Object Qcoding_systemp;
91 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
92 /* Qinternal in general.c */
94 Lisp_Object Qmnemonic, Qeol_type;
95 Lisp_Object Qcr, Qcrlf, Qlf;
96 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
97 Lisp_Object Qpost_read_conversion;
98 Lisp_Object Qpre_write_conversion;
101 Lisp_Object Qucs4, Qutf16, Qutf8;
102 Lisp_Object Qbig5, Qshift_jis;
103 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
104 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
105 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
106 Lisp_Object Qno_iso6429;
107 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
108 Lisp_Object Qescape_quoted;
109 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
112 Lisp_Object Qutf_8_mcs;
113 Lisp_Object Qdisable_composition, Qenable_decomposition;
114 Lisp_Object Qccs_priority_list;
115 Lisp_Object Quse_entity_reference;
116 Lisp_Object Qd, Qx, QX;
117 Lisp_Object Vdecomposition_feature_list;
119 Lisp_Object Qencode, Qdecode;
121 Lisp_Object Vcoding_system_hash_table;
123 int enable_multibyte_characters;
126 /* Additional information used by the ISO2022 decoder and detector. */
127 struct iso2022_decoder
129 /* CHARSET holds the character sets currently assigned to the G0
130 through G3 variables. It is initialized from the array
131 INITIAL_CHARSET in CODESYS. */
132 Lisp_Object charset[4];
134 /* Which registers are currently invoked into the left (GL) and
135 right (GR) halves of the 8-bit encoding space? */
136 int register_left, register_right;
138 /* ISO_ESC holds a value indicating part of an escape sequence
139 that has already been seen. */
140 enum iso_esc_flag esc;
142 /* This records the bytes we've seen so far in an escape sequence,
143 in case the sequence is invalid (we spit out the bytes unchanged). */
144 unsigned char esc_bytes[8];
146 /* Index for next byte to store in ISO escape sequence. */
149 #ifdef ENABLE_COMPOSITE_CHARS
150 /* Stuff seen so far when composing a string. */
151 unsigned_char_dynarr *composite_chars;
154 /* If we saw an invalid designation sequence for a particular
155 register, we flag it here and switch to ASCII. The next time we
156 see a valid designation for this register, we turn off the flag
157 and do the designation normally, but pretend the sequence was
158 invalid. The effect of all this is that (most of the time) the
159 escape sequences for both the switch to the unknown charset, and
160 the switch back to the known charset, get inserted literally into
161 the buffer and saved out as such. The hope is that we can
162 preserve the escape sequences so that the resulting written out
163 file makes sense. If we don't do any of this, the designation
164 to the invalid charset will be preserved but that switch back
165 to the known charset will probably get eaten because it was
166 the same charset that was already present in the register. */
167 unsigned char invalid_designated[4];
169 /* We try to do similar things as above for direction-switching
170 sequences. If we encountered a direction switch while an
171 invalid designation was present, or an invalid designation
172 just after a direction switch (i.e. no valid designation
173 encountered yet), we insert the direction-switch escape
174 sequence literally into the output stream, and later on
175 insert the corresponding direction-restoring escape sequence
177 unsigned int switched_dir_and_no_valid_charset_yet :1;
178 unsigned int invalid_switch_dir :1;
180 /* Tells the decoder to output the escape sequence literally
181 even though it was valid. Used in the games we play to
182 avoid lossage when we encounter invalid designations. */
183 unsigned int output_literally :1;
184 /* We encountered a direction switch followed by an invalid
185 designation. We didn't output the direction switch
186 literally because we didn't know about the invalid designation;
187 but we have to do so now. */
188 unsigned int output_direction_sequence :1;
191 EXFUN (Fcopy_coding_system, 2);
193 struct detection_state;
196 text_encode_generic (Lstream *encoding, const Bufbyte *src,
197 unsigned_char_dynarr *dst, Lstream_data_count n);
199 static int detect_coding_sjis (struct detection_state *st,
200 const Extbyte *src, Lstream_data_count n);
201 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
202 unsigned_char_dynarr *dst, Lstream_data_count n);
203 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
204 unsigned_char_dynarr *dst, unsigned int *flags);
205 void char_finish_shift_jis (struct encoding_stream *str,
206 unsigned_char_dynarr *dst, unsigned int *flags);
208 static int detect_coding_big5 (struct detection_state *st,
209 const Extbyte *src, Lstream_data_count n);
210 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
211 unsigned_char_dynarr *dst, Lstream_data_count n);
212 void char_encode_big5 (struct encoding_stream *str, Emchar c,
213 unsigned_char_dynarr *dst, unsigned int *flags);
214 void char_finish_big5 (struct encoding_stream *str,
215 unsigned_char_dynarr *dst, unsigned int *flags);
217 static int detect_coding_ucs4 (struct detection_state *st,
218 const Extbyte *src, Lstream_data_count n);
219 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
220 unsigned_char_dynarr *dst, Lstream_data_count n);
221 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
222 unsigned_char_dynarr *dst, unsigned int *flags);
223 void char_finish_ucs4 (struct encoding_stream *str,
224 unsigned_char_dynarr *dst, unsigned int *flags);
226 static int detect_coding_utf16 (struct detection_state *st,
227 const Extbyte *src, Lstream_data_count n);
228 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
229 unsigned_char_dynarr *dst, Lstream_data_count n);
230 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
231 unsigned_char_dynarr *dst, unsigned int *flags);
232 void char_finish_utf16 (struct encoding_stream *str,
233 unsigned_char_dynarr *dst, unsigned int *flags);
235 static int detect_coding_utf8 (struct detection_state *st,
236 const Extbyte *src, Lstream_data_count n);
237 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
238 unsigned_char_dynarr *dst, Lstream_data_count n);
239 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
240 unsigned_char_dynarr *dst, unsigned int *flags);
241 void char_finish_utf8 (struct encoding_stream *str,
242 unsigned_char_dynarr *dst, unsigned int *flags);
244 static int postprocess_iso2022_mask (int mask);
245 static void reset_iso2022 (Lisp_Object coding_system,
246 struct iso2022_decoder *iso);
247 static int detect_coding_iso2022 (struct detection_state *st,
248 const Extbyte *src, Lstream_data_count n);
249 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
250 unsigned_char_dynarr *dst, Lstream_data_count n);
251 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
252 unsigned_char_dynarr *dst, unsigned int *flags);
253 void char_finish_iso2022 (struct encoding_stream *str,
254 unsigned_char_dynarr *dst, unsigned int *flags);
256 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
257 unsigned_char_dynarr *dst, Lstream_data_count n);
258 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
259 unsigned_char_dynarr *dst, Lstream_data_count n);
260 static void mule_decode (Lstream *decoding, const Extbyte *src,
261 unsigned_char_dynarr *dst, Lstream_data_count n);
262 static void mule_encode (Lstream *encoding, const Bufbyte *src,
263 unsigned_char_dynarr *dst, Lstream_data_count n);
265 typedef struct codesys_prop codesys_prop;
274 Dynarr_declare (codesys_prop);
275 } codesys_prop_dynarr;
277 static const struct lrecord_description codesys_prop_description_1[] = {
278 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
282 static const struct struct_description codesys_prop_description = {
283 sizeof (codesys_prop),
284 codesys_prop_description_1
287 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
288 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
292 static const struct struct_description codesys_prop_dynarr_description = {
293 sizeof (codesys_prop_dynarr),
294 codesys_prop_dynarr_description_1
297 codesys_prop_dynarr *the_codesys_prop_dynarr;
299 enum codesys_prop_enum
302 CODESYS_PROP_ISO2022,
307 /************************************************************************/
308 /* Coding system functions */
309 /************************************************************************/
311 static Lisp_Object mark_coding_system (Lisp_Object);
312 static void print_coding_system (Lisp_Object, Lisp_Object, int);
313 static void finalize_coding_system (void *header, int for_disksave);
316 static const struct lrecord_description ccs_description_1[] = {
317 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
318 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
322 static const struct struct_description ccs_description = {
323 sizeof (charset_conversion_spec),
327 static const struct lrecord_description ccsd_description_1[] = {
328 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
332 static const struct struct_description ccsd_description = {
333 sizeof (charset_conversion_spec_dynarr),
338 static const struct lrecord_description coding_system_description[] = {
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
341 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
342 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
343 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
344 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
345 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
346 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
348 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
349 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
350 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
351 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
352 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
354 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
360 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
361 mark_coding_system, print_coding_system,
362 finalize_coding_system,
363 0, 0, coding_system_description,
367 mark_coding_system (Lisp_Object obj)
369 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
371 mark_object (CODING_SYSTEM_NAME (codesys));
372 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
373 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
374 mark_object (CODING_SYSTEM_EOL_LF (codesys));
375 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
376 mark_object (CODING_SYSTEM_EOL_CR (codesys));
378 switch (CODING_SYSTEM_TYPE (codesys))
382 case CODESYS_ISO2022:
383 for (i = 0; i < 4; i++)
384 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
385 if (codesys->iso2022.input_conv)
387 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
389 struct charset_conversion_spec *ccs =
390 Dynarr_atp (codesys->iso2022.input_conv, i);
391 mark_object (ccs->from_charset);
392 mark_object (ccs->to_charset);
395 if (codesys->iso2022.output_conv)
397 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
399 struct charset_conversion_spec *ccs =
400 Dynarr_atp (codesys->iso2022.output_conv, i);
401 mark_object (ccs->from_charset);
402 mark_object (ccs->to_charset);
409 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
410 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
415 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
416 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
423 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
425 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
427 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
431 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
434 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
436 error ("printing unreadable object #<coding_system 0x%x>",
439 write_c_string ("#<coding_system ", printcharfun);
440 print_internal (c->name, printcharfun, 1);
441 write_c_string (">", printcharfun);
445 finalize_coding_system (void *header, int for_disksave)
447 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
448 /* Since coding systems never go away, this function is not
449 necessary. But it would be necessary if we changed things
450 so that coding systems could go away. */
451 if (!for_disksave) /* see comment in lstream.c */
453 switch (CODING_SYSTEM_TYPE (c))
456 case CODESYS_ISO2022:
457 if (c->iso2022.input_conv)
459 Dynarr_free (c->iso2022.input_conv);
460 c->iso2022.input_conv = 0;
462 if (c->iso2022.output_conv)
464 Dynarr_free (c->iso2022.output_conv);
465 c->iso2022.output_conv = 0;
476 symbol_to_eol_type (Lisp_Object symbol)
478 CHECK_SYMBOL (symbol);
479 if (NILP (symbol)) return EOL_AUTODETECT;
480 if (EQ (symbol, Qlf)) return EOL_LF;
481 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
482 if (EQ (symbol, Qcr)) return EOL_CR;
484 signal_simple_error ("Unrecognized eol type", symbol);
485 return EOL_AUTODETECT; /* not reached */
489 eol_type_to_symbol (eol_type_t type)
494 case EOL_LF: return Qlf;
495 case EOL_CRLF: return Qcrlf;
496 case EOL_CR: return Qcr;
497 case EOL_AUTODETECT: return Qnil;
502 setup_eol_coding_systems (Lisp_Coding_System *codesys)
504 Lisp_Object codesys_obj;
505 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
506 char *codesys_name = (char *) alloca (len + 7);
508 char *codesys_mnemonic=0;
510 Lisp_Object codesys_name_sym, sub_codesys_obj;
514 XSETCODING_SYSTEM (codesys_obj, codesys);
516 memcpy (codesys_name,
517 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
519 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
521 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
522 codesys_mnemonic = (char *) alloca (mlen + 7);
523 memcpy (codesys_mnemonic,
524 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
527 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
528 strcpy (codesys_name + len, "-" op_sys); \
530 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
531 codesys_name_sym = intern (codesys_name); \
532 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
533 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
535 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
536 build_string (codesys_mnemonic); \
537 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
540 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
541 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
542 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
545 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
546 Return t if OBJECT is a coding system.
547 A coding system is an object that defines how text containing multiple
548 character sets is encoded into a stream of (typically 8-bit) bytes.
549 The coding system is used to decode the stream into a series of
550 characters (which may be from multiple charsets) when the text is read
551 from a file or process, and is used to encode the text back into the
552 same format when it is written out to a file or process.
554 For example, many ISO2022-compliant coding systems (such as Compound
555 Text, which is used for inter-client data under the X Window System)
556 use escape sequences to switch between different charsets -- Japanese
557 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
558 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
559 `make-coding-system' for more information.
561 Coding systems are normally identified using a symbol, and the
562 symbol is accepted in place of the actual coding system object whenever
563 a coding system is called for. (This is similar to how faces work.)
567 return CODING_SYSTEMP (object) ? Qt : Qnil;
570 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
571 Retrieve the coding system of the given name.
573 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
574 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
575 If there is no such coding system, nil is returned. Otherwise the
576 associated coding system object is returned.
578 (coding_system_or_name))
580 if (NILP (coding_system_or_name))
581 coding_system_or_name = Qbinary;
582 else if (CODING_SYSTEMP (coding_system_or_name))
583 return coding_system_or_name;
585 CHECK_SYMBOL (coding_system_or_name);
589 coding_system_or_name =
590 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
592 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
593 return coding_system_or_name;
597 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
598 Retrieve the coding system of the given name.
599 Same as `find-coding-system' except that if there is no such
600 coding system, an error is signaled instead of returning nil.
604 Lisp_Object coding_system = Ffind_coding_system (name);
606 if (NILP (coding_system))
607 signal_simple_error ("No such coding system", name);
608 return coding_system;
611 /* We store the coding systems in hash tables with the names as the key and the
612 actual coding system object as the value. Occasionally we need to use them
613 in a list format. These routines provide us with that. */
614 struct coding_system_list_closure
616 Lisp_Object *coding_system_list;
620 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
621 void *coding_system_list_closure)
623 /* This function can GC */
624 struct coding_system_list_closure *cscl =
625 (struct coding_system_list_closure *) coding_system_list_closure;
626 Lisp_Object *coding_system_list = cscl->coding_system_list;
628 *coding_system_list = Fcons (key, *coding_system_list);
632 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
633 Return a list of the names of all defined coding systems.
637 Lisp_Object coding_system_list = Qnil;
639 struct coding_system_list_closure coding_system_list_closure;
641 GCPRO1 (coding_system_list);
642 coding_system_list_closure.coding_system_list = &coding_system_list;
643 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
644 &coding_system_list_closure);
647 return coding_system_list;
650 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
651 Return the name of the given coding system.
655 coding_system = Fget_coding_system (coding_system);
656 return XCODING_SYSTEM_NAME (coding_system);
659 static Lisp_Coding_System *
660 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
662 Lisp_Coding_System *codesys =
663 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
665 zero_lcrecord (codesys);
666 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
667 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
668 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
669 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
670 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
671 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
672 CODING_SYSTEM_TYPE (codesys) = type;
673 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
676 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
678 if (type == CODESYS_ISO2022)
681 for (i = 0; i < 4; i++)
682 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
685 if (type == CODESYS_UTF8)
687 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
689 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
691 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
693 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
696 else if (type == CODESYS_BIG5)
698 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
700 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
701 = Vcharset_chinese_big5;
702 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
704 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
708 else if (type == CODESYS_CCL)
710 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
711 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
714 CODING_SYSTEM_NAME (codesys) = name;
720 /* Given a list of charset conversion specs as specified in a Lisp
721 program, parse it into STORE_HERE. */
724 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
725 Lisp_Object spec_list)
729 EXTERNAL_LIST_LOOP (rest, spec_list)
731 Lisp_Object car = XCAR (rest);
732 Lisp_Object from, to;
733 struct charset_conversion_spec spec;
735 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
736 signal_simple_error ("Invalid charset conversion spec", car);
737 from = Fget_charset (XCAR (car));
738 to = Fget_charset (XCAR (XCDR (car)));
739 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
740 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
741 signal_simple_error_2
742 ("Attempted conversion between different charset types",
744 spec.from_charset = from;
745 spec.to_charset = to;
747 Dynarr_add (store_here, spec);
751 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
752 specs, return the equivalent as the Lisp programmer would see it.
754 If LOAD_HERE is 0, return Qnil. */
757 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
764 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
766 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
767 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
770 return Fnreverse (result);
775 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
776 Register symbol NAME as a coding system.
778 TYPE describes the conversion method used and should be one of
781 Automatic conversion. XEmacs attempts to detect the coding system
784 No conversion. Use this for binary files and such. On output,
785 graphic characters that are not in ASCII or Latin-1 will be
786 replaced by a ?. (For a no-conversion-encoded buffer, these
787 characters will only be present if you explicitly insert them.)
789 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
791 ISO 10646 UCS-4 encoding.
793 ISO 10646 UTF-8 encoding.
795 Any ISO2022-compliant encoding. Among other things, this includes
796 JIS (the Japanese encoding commonly used for e-mail), EUC (the
797 standard Unix encoding for Japanese and other languages), and
798 Compound Text (the encoding used in X11). You can specify more
799 specific information about the conversion with the PROPS argument.
801 Big5 (the encoding commonly used for Taiwanese).
803 The conversion is performed using a user-written pseudo-code
804 program. CCL (Code Conversion Language) is the name of this
807 Write out or read in the raw contents of the memory representing
808 the buffer's text. This is primarily useful for debugging
809 purposes, and is only enabled when XEmacs has been compiled with
810 DEBUG_XEMACS defined (via the --debug configure option).
811 WARNING: Reading in a file using 'internal conversion can result
812 in an internal inconsistency in the memory representing a
813 buffer's text, which will produce unpredictable results and may
814 cause XEmacs to crash. Under normal circumstances you should
815 never use 'internal conversion.
817 DOC-STRING is a string describing the coding system.
819 PROPS is a property list, describing the specific nature of the
820 character set. Recognized properties are:
823 String to be displayed in the modeline when this coding system is
827 End-of-line conversion to be used. It should be one of
830 Automatically detect the end-of-line type (LF, CRLF,
831 or CR). Also generate subsidiary coding systems named
832 `NAME-unix', `NAME-dos', and `NAME-mac', that are
833 identical to this coding system but have an EOL-TYPE
834 value of 'lf, 'crlf, and 'cr, respectively.
836 The end of a line is marked externally using ASCII LF.
837 Since this is also the way that XEmacs represents an
838 end-of-line internally, specifying this option results
839 in no end-of-line conversion. This is the standard
840 format for Unix text files.
842 The end of a line is marked externally using ASCII
843 CRLF. This is the standard format for MS-DOS text
846 The end of a line is marked externally using ASCII CR.
847 This is the standard format for Macintosh text files.
849 Automatically detect the end-of-line type but do not
850 generate subsidiary coding systems. (This value is
851 converted to nil when stored internally, and
852 `coding-system-property' will return nil.)
855 If non-nil, composition for combining characters is disabled.
857 'enable-decomposition
858 If non-nil, decomposition for combining characters is enabled.
860 'use-entity-reference
861 If non-nil, SGML style entity-reference is used for non-system-characters.
863 'post-read-conversion
864 Function called after a file has been read in, to perform the
865 decoding. Called with two arguments, START and END, denoting
866 a region of the current buffer to be decoded.
868 'pre-write-conversion
869 Function called before a file is written out, to perform the
870 encoding. Called with two arguments, START and END, denoting
871 a region of the current buffer to be encoded.
874 The following additional properties are recognized if TYPE is 'iso2022:
880 The character set initially designated to the G0 - G3 registers.
881 The value should be one of
883 -- A charset object (designate that character set)
884 -- nil (do not ever use this register)
885 -- t (no character set is initially designated to
886 the register, but may be later on; this automatically
887 sets the corresponding `force-g*-on-output' property)
893 If non-nil, send an explicit designation sequence on output before
894 using the specified register.
897 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
898 "ESC $ B" on output in place of the full designation sequences
899 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
902 If non-nil, don't designate ASCII to G0 at each end of line on output.
903 Setting this to non-nil also suppresses other state-resetting that
904 normally happens at the end of a line.
907 If non-nil, don't designate ASCII to G0 before control chars on output.
910 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
914 If non-nil, use locking-shift (SO/SI) instead of single-shift
915 or designation by escape sequence.
918 If non-nil, don't use ISO6429's direction specification.
921 If non-nil, literal control characters that are the same as
922 the beginning of a recognized ISO2022 or ISO6429 escape sequence
923 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
924 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
925 so that they can be properly distinguished from an escape sequence.
926 (Note that doing this results in a non-portable encoding.) This
927 encoding flag is used for byte-compiled files. Note that ESC
928 is a good choice for a quoting character because there are no
929 escape sequences whose second byte is a character from the Control-0
930 or Control-1 character sets; this is explicitly disallowed by the
933 'input-charset-conversion
934 A list of conversion specifications, specifying conversion of
935 characters in one charset to another when decoding is performed.
936 Each specification is a list of two elements: the source charset,
937 and the destination charset.
939 'output-charset-conversion
940 A list of conversion specifications, specifying conversion of
941 characters in one charset to another when encoding is performed.
942 The form of each specification is the same as for
943 'input-charset-conversion.
946 The following additional properties are recognized (and required)
950 CCL program used for decoding (converting to internal format).
953 CCL program used for encoding (converting to external format).
955 (name, type, doc_string, props))
957 Lisp_Coding_System *codesys;
958 enum coding_system_type ty;
959 int need_to_setup_eol_systems = 1;
961 /* Convert type to constant */
962 if (NILP (type) || EQ (type, Qundecided))
963 { ty = CODESYS_AUTODETECT; }
965 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
966 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
967 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
968 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
969 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
970 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
971 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
973 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
975 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
978 signal_simple_error ("Invalid coding system type", type);
982 codesys = allocate_coding_system (ty, name);
984 if (NILP (doc_string))
985 doc_string = build_string ("");
987 CHECK_STRING (doc_string);
988 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
991 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
993 if (EQ (key, Qmnemonic))
996 CHECK_STRING (value);
997 CODING_SYSTEM_MNEMONIC (codesys) = value;
1000 else if (EQ (key, Qeol_type))
1002 need_to_setup_eol_systems = NILP (value);
1005 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1008 else if (EQ (key, Qpost_read_conversion))
1009 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1010 else if (EQ (key, Qpre_write_conversion))
1011 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1013 else if (EQ (key, Qdisable_composition))
1014 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1015 else if (EQ (key, Qenable_decomposition))
1016 CODING_SYSTEM_ENABLE_DECOMPOSITION (codesys) = !NILP (value);
1017 else if (EQ (key, Quse_entity_reference))
1018 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1021 else if (ty == CODESYS_ISO2022)
1023 #define FROB_INITIAL_CHARSET(charset_num) \
1024 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1025 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1027 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1028 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1029 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1030 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1032 #define FROB_FORCE_CHARSET(charset_num) \
1033 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1035 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1036 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1037 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1038 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1040 #define FROB_BOOLEAN_PROPERTY(prop) \
1041 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1043 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1044 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1045 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1046 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1047 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1048 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1049 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1051 else if (EQ (key, Qinput_charset_conversion))
1053 codesys->iso2022.input_conv =
1054 Dynarr_new (charset_conversion_spec);
1055 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1058 else if (EQ (key, Qoutput_charset_conversion))
1060 codesys->iso2022.output_conv =
1061 Dynarr_new (charset_conversion_spec);
1062 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1066 else if (EQ (key, Qccs_priority_list))
1068 codesys->ccs_priority_list = value;
1072 signal_simple_error ("Unrecognized property", key);
1075 else if (ty == CODESYS_UTF8)
1077 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1078 else if (EQ (key, Qcharset_g1))
1079 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1080 else if (EQ (key, Qcharset_g2))
1081 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1082 else if (EQ (key, Qcharset_g3))
1083 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3) = value;
1085 signal_simple_error ("Unrecognized property", key);
1087 else if (ty == CODESYS_BIG5)
1089 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1090 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1092 signal_simple_error ("Unrecognized property", key);
1095 else if (EQ (type, Qccl))
1098 struct ccl_program test_ccl;
1101 /* Check key first. */
1102 if (EQ (key, Qdecode))
1103 suffix = "-ccl-decode";
1104 else if (EQ (key, Qencode))
1105 suffix = "-ccl-encode";
1107 signal_simple_error ("Unrecognized property", key);
1109 /* If value is vector, register it as a ccl program
1110 associated with an newly created symbol for
1111 backward compatibility. */
1112 if (VECTORP (value))
1114 sym = Fintern (concat2 (Fsymbol_name (name),
1115 build_string (suffix)),
1117 Fregister_ccl_program (sym, value);
1121 CHECK_SYMBOL (value);
1124 /* check if the given ccl programs are valid. */
1125 if (setup_ccl_program (&test_ccl, sym) < 0)
1126 signal_simple_error ("Invalid CCL program", value);
1128 if (EQ (key, Qdecode))
1129 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1130 else if (EQ (key, Qencode))
1131 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1136 signal_simple_error ("Unrecognized property", key);
1140 if (need_to_setup_eol_systems)
1141 setup_eol_coding_systems (codesys);
1144 Lisp_Object codesys_obj;
1145 XSETCODING_SYSTEM (codesys_obj, codesys);
1146 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1151 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1152 Copy OLD-CODING-SYSTEM to NEW-NAME.
1153 If NEW-NAME does not name an existing coding system, a new one will
1156 (old_coding_system, new_name))
1158 Lisp_Object new_coding_system;
1159 old_coding_system = Fget_coding_system (old_coding_system);
1160 new_coding_system = Ffind_coding_system (new_name);
1161 if (NILP (new_coding_system))
1163 XSETCODING_SYSTEM (new_coding_system,
1164 allocate_coding_system
1165 (XCODING_SYSTEM_TYPE (old_coding_system),
1167 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1171 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1172 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1173 memcpy (((char *) to ) + sizeof (to->header),
1174 ((char *) from) + sizeof (from->header),
1175 sizeof (*from) - sizeof (from->header));
1176 to->name = new_name;
1178 return new_coding_system;
1181 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1182 Return t if OBJECT names a coding system, and is not a coding system alias.
1186 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1190 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1191 Return t if OBJECT is a coding system alias.
1192 All coding system aliases are created by `define-coding-system-alias'.
1196 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1200 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1201 Return the coding-system symbol for which symbol ALIAS is an alias.
1205 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1206 if (SYMBOLP (aliasee))
1209 signal_simple_error ("Symbol is not a coding system alias", alias);
1210 return Qnil; /* To keep the compiler happy */
1214 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1216 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1220 /* A maphash function, for removing dangling coding system aliases. */
1222 dangling_coding_system_alias_p (Lisp_Object alias,
1223 Lisp_Object aliasee,
1224 void *dangling_aliases)
1226 if (SYMBOLP (aliasee)
1227 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1229 (*(int *) dangling_aliases)++;
1236 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1237 Define symbol ALIAS as an alias for coding system ALIASEE.
1239 You can use this function to redefine an alias that has already been defined,
1240 but you cannot redefine a name which is the canonical name for a coding system.
1241 \(a canonical name of a coding system is what is returned when you call
1242 `coding-system-name' on a coding system).
1244 ALIASEE itself can be an alias, which allows you to define nested aliases.
1246 You are forbidden, however, from creating alias loops or `dangling' aliases.
1247 These will be detected, and an error will be signaled if you attempt to do so.
1249 If ALIASEE is nil, then ALIAS will simply be undefined.
1251 See also `coding-system-alias-p', `coding-system-aliasee',
1252 and `coding-system-canonical-name-p'.
1256 Lisp_Object real_coding_system, probe;
1258 CHECK_SYMBOL (alias);
1260 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1262 ("Symbol is the canonical name of a coding system and cannot be redefined",
1267 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1268 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1269 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1271 Fremhash (alias, Vcoding_system_hash_table);
1273 /* Undefine subsidiary aliases,
1274 presumably created by a previous call to this function */
1275 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1276 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1277 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1279 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1280 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1281 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1284 /* Undefine dangling coding system aliases. */
1286 int dangling_aliases;
1289 dangling_aliases = 0;
1290 elisp_map_remhash (dangling_coding_system_alias_p,
1291 Vcoding_system_hash_table,
1293 } while (dangling_aliases > 0);
1299 if (CODING_SYSTEMP (aliasee))
1300 aliasee = XCODING_SYSTEM_NAME (aliasee);
1302 /* Checks that aliasee names a coding-system */
1303 real_coding_system = Fget_coding_system (aliasee);
1305 /* Check for coding system alias loops */
1306 if (EQ (alias, aliasee))
1307 alias_loop: signal_simple_error_2
1308 ("Attempt to create a coding system alias loop", alias, aliasee);
1310 for (probe = aliasee;
1312 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1314 if (EQ (probe, alias))
1318 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1320 /* Set up aliases for subsidiaries.
1321 #### There must be a better way to handle subsidiary coding systems. */
1323 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1325 for (i = 0; i < countof (suffixes); i++)
1327 Lisp_Object alias_subsidiary =
1328 append_suffix_to_symbol (alias, suffixes[i]);
1329 Lisp_Object aliasee_subsidiary =
1330 append_suffix_to_symbol (aliasee, suffixes[i]);
1332 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1333 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1336 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1337 but it doesn't look intentional, so I'd rather return something
1338 meaningful or nothing at all. */
1343 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1345 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1346 Lisp_Object new_coding_system;
1348 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1349 return coding_system;
1353 case EOL_AUTODETECT: return coding_system;
1354 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1355 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1356 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1357 default: abort (); return Qnil;
1360 return NILP (new_coding_system) ? coding_system : new_coding_system;
1363 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1364 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1366 (coding_system, eol_type))
1368 coding_system = Fget_coding_system (coding_system);
1370 return subsidiary_coding_system (coding_system,
1371 symbol_to_eol_type (eol_type));
1375 /************************************************************************/
1376 /* Coding system accessors */
1377 /************************************************************************/
1379 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1380 Return the doc string for CODING-SYSTEM.
1384 coding_system = Fget_coding_system (coding_system);
1385 return XCODING_SYSTEM_DOC_STRING (coding_system);
1388 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1389 Return the type of CODING-SYSTEM.
1393 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1396 case CODESYS_AUTODETECT: return Qundecided;
1398 case CODESYS_SHIFT_JIS: return Qshift_jis;
1399 case CODESYS_ISO2022: return Qiso2022;
1400 case CODESYS_BIG5: return Qbig5;
1401 case CODESYS_UCS4: return Qucs4;
1402 case CODESYS_UTF16: return Qutf16;
1403 case CODESYS_UTF8: return Qutf8;
1404 case CODESYS_CCL: return Qccl;
1406 case CODESYS_NO_CONVERSION: return Qno_conversion;
1408 case CODESYS_INTERNAL: return Qinternal;
1415 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1418 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1420 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1423 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1424 Return initial charset of CODING-SYSTEM designated to GNUM.
1427 (coding_system, gnum))
1429 coding_system = Fget_coding_system (coding_system);
1432 return coding_system_charset (coding_system, XINT (gnum));
1436 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1437 Return the PROP property of CODING-SYSTEM.
1439 (coding_system, prop))
1442 enum coding_system_type type;
1444 coding_system = Fget_coding_system (coding_system);
1445 CHECK_SYMBOL (prop);
1446 type = XCODING_SYSTEM_TYPE (coding_system);
1448 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1449 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1452 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1454 case CODESYS_PROP_ALL_OK:
1457 case CODESYS_PROP_ISO2022:
1458 if (type != CODESYS_ISO2022)
1460 ("Property only valid in ISO2022 coding systems",
1464 case CODESYS_PROP_CCL:
1465 if (type != CODESYS_CCL)
1467 ("Property only valid in CCL coding systems",
1477 signal_simple_error ("Unrecognized property", prop);
1479 if (EQ (prop, Qname))
1480 return XCODING_SYSTEM_NAME (coding_system);
1481 else if (EQ (prop, Qtype))
1482 return Fcoding_system_type (coding_system);
1483 else if (EQ (prop, Qdoc_string))
1484 return XCODING_SYSTEM_DOC_STRING (coding_system);
1485 else if (EQ (prop, Qmnemonic))
1486 return XCODING_SYSTEM_MNEMONIC (coding_system);
1487 else if (EQ (prop, Qeol_type))
1488 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1489 else if (EQ (prop, Qeol_lf))
1490 return XCODING_SYSTEM_EOL_LF (coding_system);
1491 else if (EQ (prop, Qeol_crlf))
1492 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1493 else if (EQ (prop, Qeol_cr))
1494 return XCODING_SYSTEM_EOL_CR (coding_system);
1495 else if (EQ (prop, Qpost_read_conversion))
1496 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1497 else if (EQ (prop, Qpre_write_conversion))
1498 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1501 else if (EQ (prop, Qdisable_composition))
1502 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1503 else if (EQ (prop, Qenable_decomposition))
1504 return XCODING_SYSTEM_ENABLE_DECOMPOSITION (coding_system) ? Qt : Qnil;
1505 else if (EQ (prop, Quse_entity_reference))
1506 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1507 else if (EQ (prop, Qccs_priority_list))
1508 return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1510 else if (type == CODESYS_ISO2022)
1512 if (EQ (prop, Qcharset_g0))
1513 return coding_system_charset (coding_system, 0);
1514 else if (EQ (prop, Qcharset_g1))
1515 return coding_system_charset (coding_system, 1);
1516 else if (EQ (prop, Qcharset_g2))
1517 return coding_system_charset (coding_system, 2);
1518 else if (EQ (prop, Qcharset_g3))
1519 return coding_system_charset (coding_system, 3);
1521 #define FORCE_CHARSET(charset_num) \
1522 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1523 (coding_system, charset_num) ? Qt : Qnil)
1525 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1526 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1527 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1528 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1530 #define LISP_BOOLEAN(prop) \
1531 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1533 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1534 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1535 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1536 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1537 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1538 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1539 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1541 else if (EQ (prop, Qinput_charset_conversion))
1543 unparse_charset_conversion_specs
1544 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1545 else if (EQ (prop, Qoutput_charset_conversion))
1547 unparse_charset_conversion_specs
1548 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1552 else if (type == CODESYS_CCL)
1554 if (EQ (prop, Qdecode))
1555 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1556 else if (EQ (prop, Qencode))
1557 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1565 return Qnil; /* not reached */
1569 /************************************************************************/
1570 /* Coding category functions */
1571 /************************************************************************/
1574 decode_coding_category (Lisp_Object symbol)
1578 CHECK_SYMBOL (symbol);
1579 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1580 if (EQ (coding_category_symbol[i], symbol))
1583 signal_simple_error ("Unrecognized coding category", symbol);
1584 return 0; /* not reached */
1587 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1588 Return a list of all recognized coding categories.
1593 Lisp_Object list = Qnil;
1595 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1596 list = Fcons (coding_category_symbol[i], list);
1600 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1601 Change the priority order of the coding categories.
1602 LIST should be list of coding categories, in descending order of
1603 priority. Unspecified coding categories will be lower in priority
1604 than all specified ones, in the same relative order they were in
1609 int category_to_priority[CODING_CATEGORY_LAST];
1613 /* First generate a list that maps coding categories to priorities. */
1615 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1616 category_to_priority[i] = -1;
1618 /* Highest priority comes from the specified list. */
1620 EXTERNAL_LIST_LOOP (rest, list)
1622 int cat = decode_coding_category (XCAR (rest));
1624 if (category_to_priority[cat] >= 0)
1625 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1626 category_to_priority[cat] = i++;
1629 /* Now go through the existing categories by priority to retrieve
1630 the categories not yet specified and preserve their priority
1632 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1634 int cat = fcd->coding_category_by_priority[j];
1635 if (category_to_priority[cat] < 0)
1636 category_to_priority[cat] = i++;
1639 /* Now we need to construct the inverse of the mapping we just
1642 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1643 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1645 /* Phew! That was confusing. */
1649 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1650 Return a list of coding categories in descending order of priority.
1655 Lisp_Object list = Qnil;
1657 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1658 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1663 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1664 Change the coding system associated with a coding category.
1666 (coding_category, coding_system))
1668 int cat = decode_coding_category (coding_category);
1670 coding_system = Fget_coding_system (coding_system);
1671 fcd->coding_category_system[cat] = coding_system;
1675 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1676 Return the coding system associated with a coding category.
1680 int cat = decode_coding_category (coding_category);
1681 Lisp_Object sys = fcd->coding_category_system[cat];
1684 return XCODING_SYSTEM_NAME (sys);
1689 /************************************************************************/
1690 /* Detecting the encoding of data */
1691 /************************************************************************/
1693 struct detection_state
1695 eol_type_t eol_type;
1738 struct iso2022_decoder iso;
1740 int high_byte_count;
1741 unsigned int saw_single_shift:1;
1754 acceptable_control_char_p (int c)
1758 /* Allow and ignore control characters that you might
1759 reasonably see in a text file */
1764 case 8: /* backspace */
1765 case 11: /* vertical tab */
1766 case 12: /* form feed */
1767 case 26: /* MS-DOS C-z junk */
1768 case 31: /* '^_' -- for info */
1776 mask_has_at_most_one_bit_p (int mask)
1778 /* Perhaps the only thing useful you learn from intensive Microsoft
1779 technical interviews */
1780 return (mask & (mask - 1)) == 0;
1784 detect_eol_type (struct detection_state *st, const Extbyte *src,
1785 Lstream_data_count n)
1789 unsigned char c = *(unsigned char *)src++;
1792 if (st->eol.just_saw_cr)
1794 else if (st->eol.seen_anything)
1797 else if (st->eol.just_saw_cr)
1800 st->eol.just_saw_cr = 1;
1802 st->eol.just_saw_cr = 0;
1803 st->eol.seen_anything = 1;
1806 return EOL_AUTODETECT;
1809 /* Attempt to determine the encoding and EOL type of the given text.
1810 Before calling this function for the first type, you must initialize
1811 st->eol_type as appropriate and initialize st->mask to ~0.
1813 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1816 st->mask holds the determined coding category mask, or ~0 if only
1817 ASCII has been seen so far.
1821 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1822 is present in st->mask
1823 1 == definitive answers are here for both st->eol_type and st->mask
1827 detect_coding_type (struct detection_state *st, const Extbyte *src,
1828 Lstream_data_count n, int just_do_eol)
1830 if (st->eol_type == EOL_AUTODETECT)
1831 st->eol_type = detect_eol_type (st, src, n);
1834 return st->eol_type != EOL_AUTODETECT;
1836 if (!st->seen_non_ascii)
1838 for (; n; n--, src++)
1840 unsigned char c = *(unsigned char *) src;
1841 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1843 st->seen_non_ascii = 1;
1845 st->shift_jis.mask = ~0;
1848 st->utf16.mask = ~0;
1850 st->iso2022.mask = ~0;
1860 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1861 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1862 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1863 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1864 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1865 st->big5.mask = detect_coding_big5 (st, src, n);
1866 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1867 st->utf8.mask = detect_coding_utf8 (st, src, n);
1868 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1869 st->utf16.mask = detect_coding_utf16 (st, src, n);
1870 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1871 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1874 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1875 | st->utf8.mask | st->ucs4.mask;
1878 int retval = mask_has_at_most_one_bit_p (st->mask);
1879 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1880 return retval && st->eol_type != EOL_AUTODETECT;
1885 coding_system_from_mask (int mask)
1889 /* If the file was entirely or basically ASCII, use the
1890 default value of `buffer-file-coding-system'. */
1891 Lisp_Object retval =
1892 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1895 retval = Ffind_coding_system (retval);
1899 (Qbad_variable, Qwarning,
1900 "Invalid `default-buffer-file-coding-system', set to nil");
1901 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1905 retval = Fget_coding_system (Qraw_text);
1913 mask = postprocess_iso2022_mask (mask);
1915 /* Look through the coding categories by priority and find
1916 the first one that is allowed. */
1917 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1919 cat = fcd->coding_category_by_priority[i];
1920 if ((mask & (1 << cat)) &&
1921 !NILP (fcd->coding_category_system[cat]))
1925 return fcd->coding_category_system[cat];
1927 return Fget_coding_system (Qraw_text);
1931 /* Given a seekable read stream and potential coding system and EOL type
1932 as specified, do any autodetection that is called for. If the
1933 coding system and/or EOL type are not `autodetect', they will be left
1934 alone; but this function will never return an autodetect coding system
1937 This function does not automatically fetch subsidiary coding systems;
1938 that should be unnecessary with the explicit eol-type argument. */
1940 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1941 /* number of leading lines to check for a coding cookie */
1942 #define LINES_TO_CHECK 2
1945 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1946 eol_type_t *eol_type_in_out)
1948 struct detection_state decst;
1950 if (*eol_type_in_out == EOL_AUTODETECT)
1951 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1954 decst.eol_type = *eol_type_in_out;
1957 /* If autodetection is called for, do it now. */
1958 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1959 || *eol_type_in_out == EOL_AUTODETECT)
1962 Lisp_Object coding_system = Qnil;
1964 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1966 int lines_checked = 0;
1968 /* Look for initial "-*-"; mode line prefix */
1970 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1972 && lines_checked < LINES_TO_CHECK;
1974 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1976 Extbyte *local_vars_beg = p + 3;
1977 /* Look for final "-*-"; mode line suffix */
1978 for (p = local_vars_beg,
1979 scan_end = buf + nread - LENGTH ("-*-");
1981 && lines_checked < LINES_TO_CHECK;
1983 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1985 Extbyte *suffix = p;
1986 /* Look for "coding:" */
1987 for (p = local_vars_beg,
1988 scan_end = suffix - LENGTH ("coding:?");
1991 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1992 && (p == local_vars_beg
1993 || (*(p-1) == ' ' ||
1999 p += LENGTH ("coding:");
2000 while (*p == ' ' || *p == '\t') p++;
2002 /* Get coding system name */
2003 save = *suffix; *suffix = '\0';
2004 /* Characters valid in a MIME charset name (rfc 1521),
2005 and in a Lisp symbol name. */
2006 n = strspn ( (char *) p,
2007 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
2008 "abcdefghijklmnopqrstuvwxyz"
2014 save = p[n]; p[n] = '\0';
2016 Ffind_coding_system (intern ((char *) p));
2023 /* #### file must use standard EOLs or we miss 2d line */
2024 /* #### not to mention this is broken for UTF-16 DOS files */
2025 else if (*p == '\n' || *p == '\r')
2028 /* skip past multibyte (DOS) newline */
2029 if (*p == '\r' && *(p+1) == '\n') p++;
2033 /* #### file must use standard EOLs or we miss 2d line */
2034 /* #### not to mention this is broken for UTF-16 DOS files */
2035 else if (*p == '\n' || *p == '\r')
2038 /* skip past multibyte (DOS) newline */
2039 if (*p == '\r' && *(p+1) == '\n') p++;
2042 if (NILP (coding_system))
2045 if (detect_coding_type (&decst, buf, nread,
2046 XCODING_SYSTEM_TYPE (*codesys_in_out)
2047 != CODESYS_AUTODETECT))
2049 nread = Lstream_read (stream, buf, sizeof (buf));
2055 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2056 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2059 if (detect_coding_type (&decst, buf, nread, 1))
2061 nread = Lstream_read (stream, buf, sizeof (buf));
2067 *eol_type_in_out = decst.eol_type;
2068 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2070 if (NILP (coding_system))
2071 *codesys_in_out = coding_system_from_mask (decst.mask);
2073 *codesys_in_out = coding_system;
2077 /* If we absolutely can't determine the EOL type, just assume LF. */
2078 if (*eol_type_in_out == EOL_AUTODETECT)
2079 *eol_type_in_out = EOL_LF;
2081 Lstream_rewind (stream);
2084 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2085 Detect coding system of the text in the region between START and END.
2086 Return a list of possible coding systems ordered by priority.
2087 If only ASCII characters are found, return 'undecided or one of
2088 its subsidiary coding systems according to a detected end-of-line
2089 type. Optional arg BUFFER defaults to the current buffer.
2091 (start, end, buffer))
2093 Lisp_Object val = Qnil;
2094 struct buffer *buf = decode_buffer (buffer, 0);
2096 Lisp_Object instream, lb_instream;
2097 Lstream *istr, *lb_istr;
2098 struct detection_state decst;
2099 struct gcpro gcpro1, gcpro2;
2101 get_buffer_range_char (buf, start, end, &b, &e, 0);
2102 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2103 lb_istr = XLSTREAM (lb_instream);
2104 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2105 istr = XLSTREAM (instream);
2106 GCPRO2 (instream, lb_instream);
2108 decst.eol_type = EOL_AUTODETECT;
2112 Extbyte random_buffer[4096];
2113 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2117 if (detect_coding_type (&decst, random_buffer, nread, 0))
2121 if (decst.mask == ~0)
2122 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2130 decst.mask = postprocess_iso2022_mask (decst.mask);
2132 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2134 int sys = fcd->coding_category_by_priority[i];
2135 if (decst.mask & (1 << sys))
2137 Lisp_Object codesys = fcd->coding_category_system[sys];
2138 if (!NILP (codesys))
2139 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2140 val = Fcons (codesys, val);
2144 Lstream_close (istr);
2146 Lstream_delete (istr);
2147 Lstream_delete (lb_istr);
2152 /************************************************************************/
2153 /* Converting to internal Mule format ("decoding") */
2154 /************************************************************************/
2156 /* A decoding stream is a stream used for decoding text (i.e.
2157 converting from some external format to internal format).
2158 The decoding-stream object keeps track of the actual coding
2159 stream, the stream that is at the other end, and data that
2160 needs to be persistent across the lifetime of the stream. */
2162 /* Handle the EOL stuff related to just-read-in character C.
2163 EOL_TYPE is the EOL type of the coding stream.
2164 FLAGS is the current value of FLAGS in the coding stream, and may
2165 be modified by this macro. (The macro only looks at the
2166 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2167 bytes are to be written. You need to also define a local goto
2168 label "label_continue_loop" that is at the end of the main
2169 character-reading loop.
2171 If C is a CR character, then this macro handles it entirely and
2172 jumps to label_continue_loop. Otherwise, this macro does not add
2173 anything to DST, and continues normally. You should continue
2174 processing C normally after this macro. */
2176 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2180 if (eol_type == EOL_CR) \
2181 Dynarr_add (dst, '\n'); \
2182 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2183 Dynarr_add (dst, c); \
2185 flags |= CODING_STATE_CR; \
2186 goto label_continue_loop; \
2188 else if (flags & CODING_STATE_CR) \
2189 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2191 Dynarr_add (dst, '\r'); \
2192 flags &= ~CODING_STATE_CR; \
2196 /* C should be a binary character in the range 0 - 255; convert
2197 to internal format and add to Dynarr DST. */
2200 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2202 if (BYTE_ASCII_P (c)) \
2203 Dynarr_add (dst, c); \
2206 Dynarr_add (dst, (c >> 6) | 0xc0); \
2207 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2211 static void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2213 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2217 Dynarr_add (dst, c);
2219 else if ( c <= 0x7ff )
2221 Dynarr_add (dst, (c >> 6) | 0xc0);
2222 Dynarr_add (dst, (c & 0x3f) | 0x80);
2224 else if ( c <= 0xffff )
2226 Dynarr_add (dst, (c >> 12) | 0xe0);
2227 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2228 Dynarr_add (dst, (c & 0x3f) | 0x80);
2230 else if ( c <= 0x1fffff )
2232 Dynarr_add (dst, (c >> 18) | 0xf0);
2233 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2234 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2235 Dynarr_add (dst, (c & 0x3f) | 0x80);
2237 else if ( c <= 0x3ffffff )
2239 Dynarr_add (dst, (c >> 24) | 0xf8);
2240 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2241 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2242 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2243 Dynarr_add (dst, (c & 0x3f) | 0x80);
2247 Dynarr_add (dst, (c >> 30) | 0xfc);
2248 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2249 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2250 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2251 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2252 Dynarr_add (dst, (c & 0x3f) | 0x80);
2256 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2258 if (BYTE_ASCII_P (c)) \
2259 Dynarr_add (dst, c); \
2260 else if (BYTE_C1_P (c)) \
2262 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2263 Dynarr_add (dst, c + 0x20); \
2267 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2268 Dynarr_add (dst, c); \
2273 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2277 DECODE_ADD_BINARY_CHAR (ch, dst); \
2282 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2284 if (flags & CODING_STATE_END) \
2286 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2287 if (flags & CODING_STATE_CR) \
2288 Dynarr_add (dst, '\r'); \
2292 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2294 #define ER_BUF_SIZE 24
2296 struct decoding_stream
2298 /* Coding system that governs the conversion. */
2299 Lisp_Coding_System *codesys;
2301 /* Stream that we read the encoded data from or
2302 write the decoded data to. */
2305 /* If we are reading, then we can return only a fixed amount of
2306 data, so if the conversion resulted in too much data, we store it
2307 here for retrieval the next time around. */
2308 unsigned_char_dynarr *runoff;
2310 /* FLAGS holds flags indicating the current state of the decoding.
2311 Some of these flags are dependent on the coding system. */
2314 /* CPOS holds a partially built-up code-point of character. */
2317 /* EOL_TYPE specifies the type of end-of-line conversion that
2318 currently applies. We need to keep this separate from the
2319 EOL type stored in CODESYS because the latter might indicate
2320 automatic EOL-type detection while the former will always
2321 indicate a particular EOL type. */
2322 eol_type_t eol_type;
2324 /* Additional ISO2022 information. We define the structure above
2325 because it's also needed by the detection routines. */
2326 struct iso2022_decoder iso2022;
2328 /* Additional information (the state of the running CCL program)
2329 used by the CCL decoder. */
2330 struct ccl_program ccl;
2332 /* counter for UTF-8 or UCS-4 */
2333 unsigned char counter;
2337 unsigned char er_counter;
2338 unsigned char er_buf[ER_BUF_SIZE];
2340 unsigned combined_char_count;
2341 Emchar combined_chars[16];
2342 #ifdef USE_CONCORD_OBJECT_SYSTEM_TO_COMPOSE
2343 COS_object combining_table;
2345 Lisp_Object combining_table;
2346 #endif /* USE_CONCORD_OBJECT_SYSTEM_TO_COMPOSE */
2348 struct detection_state decst;
2351 static Lstream_data_count decoding_reader (Lstream *stream,
2352 unsigned char *data, Lstream_data_count size);
2353 static Lstream_data_count decoding_writer (Lstream *stream,
2354 const unsigned char *data, Lstream_data_count size);
2355 static int decoding_rewinder (Lstream *stream);
2356 static int decoding_seekable_p (Lstream *stream);
2357 static int decoding_flusher (Lstream *stream);
2358 static int decoding_closer (Lstream *stream);
2360 static Lisp_Object decoding_marker (Lisp_Object stream);
2362 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2363 sizeof (struct decoding_stream));
2366 decoding_marker (Lisp_Object stream)
2368 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2369 Lisp_Object str_obj;
2371 /* We do not need to mark the coding systems or charsets stored
2372 within the stream because they are stored in a global list
2373 and automatically marked. */
2375 XSETLSTREAM (str_obj, str);
2376 mark_object (str_obj);
2377 if (str->imp->marker)
2378 return (str->imp->marker) (str_obj);
2383 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2384 so we read data from the other end, decode it, and store it into DATA. */
2386 static Lstream_data_count
2387 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2389 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2390 unsigned char *orig_data = data;
2391 Lstream_data_count read_size;
2392 int error_occurred = 0;
2394 /* We need to interface to mule_decode(), which expects to take some
2395 amount of data and store the result into a Dynarr. We have
2396 mule_decode() store into str->runoff, and take data from there
2399 /* We loop until we have enough data, reading chunks from the other
2400 end and decoding it. */
2403 /* Take data from the runoff if we can. Make sure to take at
2404 most SIZE bytes, and delete the data from the runoff. */
2405 if (Dynarr_length (str->runoff) > 0)
2407 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2408 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2409 Dynarr_delete_many (str->runoff, 0, chunk);
2415 break; /* No more room for data */
2417 if (str->flags & CODING_STATE_END)
2418 /* This means that on the previous iteration, we hit the EOF on
2419 the other end. We loop once more so that mule_decode() can
2420 output any final stuff it may be holding, or any "go back
2421 to a sane state" escape sequences. (This latter makes sense
2422 during encoding.) */
2425 /* Exhausted the runoff, so get some more. DATA has at least
2426 SIZE bytes left of storage in it, so it's OK to read directly
2427 into it. (We'll be overwriting above, after we've decoded it
2428 into the runoff.) */
2429 read_size = Lstream_read (str->other_end, data, size);
2436 /* There might be some more end data produced in the translation.
2437 See the comment above. */
2438 str->flags |= CODING_STATE_END;
2439 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2442 if (data - orig_data == 0)
2443 return error_occurred ? -1 : 0;
2445 return data - orig_data;
2448 static Lstream_data_count
2449 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2451 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2452 Lstream_data_count retval;
2454 /* Decode all our data into the runoff, and then attempt to write
2455 it all out to the other end. Remove whatever chunk we succeeded
2457 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2458 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2459 Dynarr_length (str->runoff));
2461 Dynarr_delete_many (str->runoff, 0, retval);
2462 /* Do NOT return retval. The return value indicates how much
2463 of the incoming data was written, not how many bytes were
2469 reset_decoding_stream (struct decoding_stream *str)
2472 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2474 Lisp_Object coding_system;
2475 XSETCODING_SYSTEM (coding_system, str->codesys);
2476 reset_iso2022 (coding_system, &str->iso2022);
2478 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2480 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2486 str->er_counter = 0;
2487 str->combined_char_count = 0;
2488 #ifdef USE_CONCORD_OBJECT_SYSTEM_TO_COMPOSE
2489 str->combining_table = COS_NIL;
2491 str->combining_table = Qnil;
2492 #endif /* USE_CONCORD_OBJECT_SYSTEM_TO_COMPOSE */
2494 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2495 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2498 str->decst.eol_type = EOL_AUTODETECT;
2499 str->decst.mask = ~0;
2501 str->flags = str->cpos = 0;
2505 decoding_rewinder (Lstream *stream)
2507 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2508 reset_decoding_stream (str);
2509 Dynarr_reset (str->runoff);
2510 return Lstream_rewind (str->other_end);
2514 decoding_seekable_p (Lstream *stream)
2516 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2517 return Lstream_seekable_p (str->other_end);
2521 decoding_flusher (Lstream *stream)
2523 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2524 return Lstream_flush (str->other_end);
2528 decoding_closer (Lstream *stream)
2530 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2531 if (stream->flags & LSTREAM_FL_WRITE)
2533 str->flags |= CODING_STATE_END;
2534 decoding_writer (stream, 0, 0);
2536 Dynarr_free (str->runoff);
2538 #ifdef ENABLE_COMPOSITE_CHARS
2539 if (str->iso2022.composite_chars)
2540 Dynarr_free (str->iso2022.composite_chars);
2543 return Lstream_close (str->other_end);
2547 decoding_stream_coding_system (Lstream *stream)
2549 Lisp_Object coding_system;
2550 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2552 XSETCODING_SYSTEM (coding_system, str->codesys);
2553 return subsidiary_coding_system (coding_system, str->eol_type);
2557 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2559 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2560 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2562 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2563 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2564 reset_decoding_stream (str);
2567 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2568 stream for writing, no automatic code detection will be performed.
2569 The reason for this is that automatic code detection requires a
2570 seekable input. Things will also fail if you open a decoding
2571 stream for reading using a non-fully-specified coding system and
2572 a non-seekable input stream. */
2575 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2578 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2579 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2583 str->other_end = stream;
2584 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2585 str->eol_type = EOL_AUTODETECT;
2586 if (!strcmp (mode, "r")
2587 && Lstream_seekable_p (stream))
2588 /* We can determine the coding system now. */
2589 determine_real_coding_system (stream, &codesys, &str->eol_type);
2590 set_decoding_stream_coding_system (lstr, codesys);
2591 str->decst.eol_type = str->eol_type;
2592 str->decst.mask = ~0;
2593 XSETLSTREAM (obj, lstr);
2598 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2600 return make_decoding_stream_1 (stream, codesys, "r");
2604 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2606 return make_decoding_stream_1 (stream, codesys, "w");
2609 /* Note: the decode_coding_* functions all take the same
2610 arguments as mule_decode(), which is to say some SRC data of
2611 size N, which is to be stored into dynamic array DST.
2612 DECODING is the stream within which the decoding is
2613 taking place, but no data is actually read from or
2614 written to that stream; that is handled in decoding_reader()
2615 or decoding_writer(). This allows the same functions to
2616 be used for both reading and writing. */
2619 mule_decode (Lstream *decoding, const Extbyte *src,
2620 unsigned_char_dynarr *dst, Lstream_data_count n)
2622 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2624 /* If necessary, do encoding-detection now. We do this when
2625 we're a writing stream or a non-seekable reading stream,
2626 meaning that we can't just process the whole input,
2627 rewind, and start over. */
2629 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2630 str->eol_type == EOL_AUTODETECT)
2632 Lisp_Object codesys;
2634 XSETCODING_SYSTEM (codesys, str->codesys);
2635 detect_coding_type (&str->decst, src, n,
2636 CODING_SYSTEM_TYPE (str->codesys) !=
2637 CODESYS_AUTODETECT);
2638 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2639 str->decst.mask != ~0)
2640 /* #### This is cheesy. What we really ought to do is
2641 buffer up a certain amount of data so as to get a
2642 less random result. */
2643 codesys = coding_system_from_mask (str->decst.mask);
2644 str->eol_type = str->decst.eol_type;
2645 if (XCODING_SYSTEM (codesys) != str->codesys)
2647 /* Preserve the CODING_STATE_END flag in case it was set.
2648 If we erase it, bad things might happen. */
2649 int was_end = str->flags & CODING_STATE_END;
2650 set_decoding_stream_coding_system (decoding, codesys);
2652 str->flags |= CODING_STATE_END;
2656 switch (CODING_SYSTEM_TYPE (str->codesys))
2659 case CODESYS_INTERNAL:
2660 Dynarr_add_many (dst, src, n);
2663 case CODESYS_AUTODETECT:
2664 /* If we got this far and still haven't decided on the coding
2665 system, then do no conversion. */
2666 case CODESYS_NO_CONVERSION:
2667 decode_coding_no_conversion (decoding, src, dst, n);
2670 case CODESYS_SHIFT_JIS:
2671 decode_coding_sjis (decoding, src, dst, n);
2674 decode_coding_big5 (decoding, src, dst, n);
2677 decode_coding_ucs4 (decoding, src, dst, n);
2680 decode_coding_utf16 (decoding, src, dst, n);
2683 decode_coding_utf8 (decoding, src, dst, n);
2686 str->ccl.last_block = str->flags & CODING_STATE_END;
2687 /* When applying ccl program to stream, MUST NOT set NULL
2689 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2690 dst, n, 0, CCL_MODE_DECODING);
2692 case CODESYS_ISO2022:
2693 decode_coding_iso2022 (decoding, src, dst, n);
2701 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2702 Decode the text between START and END which is encoded in CODING-SYSTEM.
2703 This is useful if you've read in encoded text from a file without decoding
2704 it (e.g. you read in a JIS-formatted file but used the `binary' or
2705 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2706 Return length of decoded text.
2707 BUFFER defaults to the current buffer if unspecified.
2709 (start, end, coding_system, buffer))
2712 struct buffer *buf = decode_buffer (buffer, 0);
2713 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2714 Lstream *istr, *ostr;
2715 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2717 get_buffer_range_char (buf, start, end, &b, &e, 0);
2719 barf_if_buffer_read_only (buf, b, e);
2721 coding_system = Fget_coding_system (coding_system);
2722 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2723 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2724 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2726 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2727 Fget_coding_system (Qbinary));
2728 istr = XLSTREAM (instream);
2729 ostr = XLSTREAM (outstream);
2730 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2732 /* The chain of streams looks like this:
2734 [BUFFER] <----- send through
2735 ------> [ENCODE AS BINARY]
2736 ------> [DECODE AS SPECIFIED]
2742 char tempbuf[1024]; /* some random amount */
2743 Bufpos newpos, even_newer_pos;
2744 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2745 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2749 newpos = lisp_buffer_stream_startpos (istr);
2750 Lstream_write (ostr, tempbuf, size_in_bytes);
2751 even_newer_pos = lisp_buffer_stream_startpos (istr);
2752 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2755 Lstream_close (istr);
2756 Lstream_close (ostr);
2758 Lstream_delete (istr);
2759 Lstream_delete (ostr);
2760 Lstream_delete (XLSTREAM (de_outstream));
2761 Lstream_delete (XLSTREAM (lb_outstream));
2766 /************************************************************************/
2767 /* Converting to an external encoding ("encoding") */
2768 /************************************************************************/
2770 /* An encoding stream is an output stream. When you create the
2771 stream, you specify the coding system that governs the encoding
2772 and another stream that the resulting encoded data is to be
2773 sent to, and then start sending data to it. */
2775 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2777 struct encoding_stream
2779 /* Coding system that governs the conversion. */
2780 Lisp_Coding_System *codesys;
2782 /* Stream that we read the encoded data from or
2783 write the decoded data to. */
2786 /* If we are reading, then we can return only a fixed amount of
2787 data, so if the conversion resulted in too much data, we store it
2788 here for retrieval the next time around. */
2789 unsigned_char_dynarr *runoff;
2791 /* FLAGS holds flags indicating the current state of the encoding.
2792 Some of these flags are dependent on the coding system. */
2795 /* CH holds a partially built-up character. Since we only deal
2796 with one- and two-byte characters at the moment, we only use
2797 this to store the first byte of a two-byte character. */
2800 /* Additional information used by the ISO2022 encoder. */
2803 /* CHARSET holds the character sets currently assigned to the G0
2804 through G3 registers. It is initialized from the array
2805 INITIAL_CHARSET in CODESYS. */
2806 Lisp_Object charset[4];
2808 /* Which registers are currently invoked into the left (GL) and
2809 right (GR) halves of the 8-bit encoding space? */
2810 int register_left, register_right;
2812 /* Whether we need to explicitly designate the charset in the
2813 G? register before using it. It is initialized from the
2814 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2815 unsigned char force_charset_on_output[4];
2817 /* Other state variables that need to be preserved across
2819 Lisp_Object current_charset;
2821 int current_char_boundary;
2824 void (*encode_char) (struct encoding_stream *str, Emchar c,
2825 unsigned_char_dynarr *dst, unsigned int *flags);
2826 void (*finish) (struct encoding_stream *str,
2827 unsigned_char_dynarr *dst, unsigned int *flags);
2829 /* Additional information (the state of the running CCL program)
2830 used by the CCL encoder. */
2831 struct ccl_program ccl;
2835 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2836 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2837 Lstream_data_count size);
2838 static int encoding_rewinder (Lstream *stream);
2839 static int encoding_seekable_p (Lstream *stream);
2840 static int encoding_flusher (Lstream *stream);
2841 static int encoding_closer (Lstream *stream);
2843 static Lisp_Object encoding_marker (Lisp_Object stream);
2845 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2846 sizeof (struct encoding_stream));
2849 encoding_marker (Lisp_Object stream)
2851 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2852 Lisp_Object str_obj;
2854 /* We do not need to mark the coding systems or charsets stored
2855 within the stream because they are stored in a global list
2856 and automatically marked. */
2858 XSETLSTREAM (str_obj, str);
2859 mark_object (str_obj);
2860 if (str->imp->marker)
2861 return (str->imp->marker) (str_obj);
2866 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2867 so we read data from the other end, encode it, and store it into DATA. */
2869 static Lstream_data_count
2870 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2872 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2873 unsigned char *orig_data = data;
2874 Lstream_data_count read_size;
2875 int error_occurred = 0;
2877 /* We need to interface to mule_encode(), which expects to take some
2878 amount of data and store the result into a Dynarr. We have
2879 mule_encode() store into str->runoff, and take data from there
2882 /* We loop until we have enough data, reading chunks from the other
2883 end and encoding it. */
2886 /* Take data from the runoff if we can. Make sure to take at
2887 most SIZE bytes, and delete the data from the runoff. */
2888 if (Dynarr_length (str->runoff) > 0)
2890 int chunk = min ((int) size, Dynarr_length (str->runoff));
2891 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2892 Dynarr_delete_many (str->runoff, 0, chunk);
2898 break; /* No more room for data */
2900 if (str->flags & CODING_STATE_END)
2901 /* This means that on the previous iteration, we hit the EOF on
2902 the other end. We loop once more so that mule_encode() can
2903 output any final stuff it may be holding, or any "go back
2904 to a sane state" escape sequences. (This latter makes sense
2905 during encoding.) */
2908 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2909 left of storage in it, so it's OK to read directly into it.
2910 (We'll be overwriting above, after we've encoded it into the
2912 read_size = Lstream_read (str->other_end, data, size);
2919 /* There might be some more end data produced in the translation.
2920 See the comment above. */
2921 str->flags |= CODING_STATE_END;
2922 mule_encode (stream, data, str->runoff, read_size);
2925 if (data == orig_data)
2926 return error_occurred ? -1 : 0;
2928 return data - orig_data;
2931 static Lstream_data_count
2932 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2934 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2935 Lstream_data_count retval;
2937 /* Encode all our data into the runoff, and then attempt to write
2938 it all out to the other end. Remove whatever chunk we succeeded
2940 mule_encode (stream, data, str->runoff, size);
2941 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2942 Dynarr_length (str->runoff));
2944 Dynarr_delete_many (str->runoff, 0, retval);
2945 /* Do NOT return retval. The return value indicates how much
2946 of the incoming data was written, not how many bytes were
2952 reset_encoding_stream (struct encoding_stream *str)
2955 switch (CODING_SYSTEM_TYPE (str->codesys))
2957 case CODESYS_ISO2022:
2961 str->encode_char = &char_encode_iso2022;
2962 str->finish = &char_finish_iso2022;
2963 for (i = 0; i < 4; i++)
2965 str->iso2022.charset[i] =
2966 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2967 str->iso2022.force_charset_on_output[i] =
2968 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2970 str->iso2022.register_left = 0;
2971 str->iso2022.register_right = 1;
2972 str->iso2022.current_charset = Qnil;
2973 str->iso2022.current_half = 0;
2977 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2980 str->encode_char = &char_encode_utf8;
2981 str->finish = &char_finish_utf8;
2984 str->encode_char = &char_encode_utf16;
2985 str->finish = &char_finish_utf16;
2988 str->encode_char = &char_encode_ucs4;
2989 str->finish = &char_finish_ucs4;
2991 case CODESYS_SHIFT_JIS:
2992 str->encode_char = &char_encode_shift_jis;
2993 str->finish = &char_finish_shift_jis;
2996 str->encode_char = &char_encode_big5;
2997 str->finish = &char_finish_big5;
3003 str->iso2022.current_char_boundary = 0;
3004 str->flags = str->ch = 0;
3008 encoding_rewinder (Lstream *stream)
3010 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3011 reset_encoding_stream (str);
3012 Dynarr_reset (str->runoff);
3013 return Lstream_rewind (str->other_end);
3017 encoding_seekable_p (Lstream *stream)
3019 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3020 return Lstream_seekable_p (str->other_end);
3024 encoding_flusher (Lstream *stream)
3026 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3027 return Lstream_flush (str->other_end);
3031 encoding_closer (Lstream *stream)
3033 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3034 if (stream->flags & LSTREAM_FL_WRITE)
3036 str->flags |= CODING_STATE_END;
3037 encoding_writer (stream, 0, 0);
3039 Dynarr_free (str->runoff);
3040 return Lstream_close (str->other_end);
3044 encoding_stream_coding_system (Lstream *stream)
3046 Lisp_Object coding_system;
3047 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3049 XSETCODING_SYSTEM (coding_system, str->codesys);
3050 return coding_system;
3054 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3056 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3057 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3059 reset_encoding_stream (str);
3063 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3066 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3067 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3071 str->runoff = Dynarr_new (unsigned_char);
3072 str->other_end = stream;
3073 set_encoding_stream_coding_system (lstr, codesys);
3074 XSETLSTREAM (obj, lstr);
3079 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3081 return make_encoding_stream_1 (stream, codesys, "r");
3085 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3087 return make_encoding_stream_1 (stream, codesys, "w");
3090 /* Convert N bytes of internally-formatted data stored in SRC to an
3091 external format, according to the encoding stream ENCODING.
3092 Store the encoded data into DST. */
3095 mule_encode (Lstream *encoding, const Bufbyte *src,
3096 unsigned_char_dynarr *dst, Lstream_data_count n)
3098 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3100 switch (CODING_SYSTEM_TYPE (str->codesys))
3103 case CODESYS_INTERNAL:
3104 Dynarr_add_many (dst, src, n);
3107 case CODESYS_AUTODETECT:
3108 /* If we got this far and still haven't decided on the coding
3109 system, then do no conversion. */
3110 case CODESYS_NO_CONVERSION:
3111 encode_coding_no_conversion (encoding, src, dst, n);
3115 str->ccl.last_block = str->flags & CODING_STATE_END;
3116 /* When applying ccl program to stream, MUST NOT set NULL
3118 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3119 dst, n, 0, CCL_MODE_ENCODING);
3123 text_encode_generic (encoding, src, dst, n);
3127 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3128 Encode the text between START and END using CODING-SYSTEM.
3129 This will, for example, convert Japanese characters into stuff such as
3130 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3131 text. BUFFER defaults to the current buffer if unspecified.
3133 (start, end, coding_system, buffer))
3136 struct buffer *buf = decode_buffer (buffer, 0);
3137 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3138 Lstream *istr, *ostr;
3139 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3141 get_buffer_range_char (buf, start, end, &b, &e, 0);
3143 barf_if_buffer_read_only (buf, b, e);
3145 coding_system = Fget_coding_system (coding_system);
3146 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3147 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3148 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3149 Fget_coding_system (Qbinary));
3150 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3152 istr = XLSTREAM (instream);
3153 ostr = XLSTREAM (outstream);
3154 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3155 /* The chain of streams looks like this:
3157 [BUFFER] <----- send through
3158 ------> [ENCODE AS SPECIFIED]
3159 ------> [DECODE AS BINARY]
3164 char tempbuf[1024]; /* some random amount */
3165 Bufpos newpos, even_newer_pos;
3166 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3167 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3171 newpos = lisp_buffer_stream_startpos (istr);
3172 Lstream_write (ostr, tempbuf, size_in_bytes);
3173 even_newer_pos = lisp_buffer_stream_startpos (istr);
3174 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3180 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3181 Lstream_close (istr);
3182 Lstream_close (ostr);
3184 Lstream_delete (istr);
3185 Lstream_delete (ostr);
3186 Lstream_delete (XLSTREAM (de_outstream));
3187 Lstream_delete (XLSTREAM (lb_outstream));
3188 return make_int (retlen);
3195 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3196 unsigned_char_dynarr *dst, Lstream_data_count n)
3199 unsigned char char_boundary;
3200 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3201 unsigned int flags = str->flags;
3202 Emchar ch = str->ch;
3204 char_boundary = str->iso2022.current_char_boundary;
3210 if (char_boundary == 0)
3238 (*str->encode_char) (str, c, dst, &flags);
3240 else if (char_boundary == 1)
3242 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3248 ch = (ch << 6) | (c & 0x3f);
3253 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3255 (*str->finish) (str, dst, &flags);
3260 str->iso2022.current_char_boundary = char_boundary;
3265 /************************************************************************/
3266 /* entity reference */
3267 /************************************************************************/
3270 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3272 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3274 if ( str->er_counter > 0)
3276 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3277 str->er_counter = 0;
3281 EXFUN (Fregexp_quote, 1);
3283 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3284 unsigned_char_dynarr* dst);
3286 decode_add_er_char (struct decoding_stream *str, Emchar c,
3287 unsigned_char_dynarr* dst)
3289 if (str->er_counter == 0)
3291 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3294 str->er_buf[0] = '&';
3298 DECODE_ADD_UCS_CHAR (c, dst);
3302 Lisp_Object string = make_string (str->er_buf,
3309 Lisp_Object char_type;
3312 for ( rest = Vcoded_charset_entity_reference_alist;
3313 !NILP (rest); rest = Fcdr (rest) )
3319 char_type = XCDR (ccs);
3324 if (NILP (ccs = Ffind_charset (ccs)))
3333 pat = Fregexp_quote (pat);
3340 pat = concat3 (build_string ("^&"),
3341 pat, build_string ("\\([0-9]+\\)$"));
3344 else if (EQ (ret, Qx))
3346 pat = concat3 (build_string ("^&"),
3347 pat, build_string ("\\([0-9a-f]+\\)$"));
3350 else if (EQ (ret, QX))
3352 pat = concat3 (build_string ("^&"),
3353 pat, build_string ("\\([0-9A-F]+\\)$"));
3359 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3362 = XINT (Fstring_to_number
3363 (Fsubstring (string,
3364 Fmatch_beginning (make_int (1)),
3365 Fmatch_end (make_int (1))),
3369 ? DECODE_CHAR (ccs, code, 0)
3370 : decode_builtin_char (ccs, code);
3373 DECODE_ADD_UCS_CHAR (chr, dst);
3376 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3377 Dynarr_add (dst, ';');
3383 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3384 string, Qnil, Qnil)))
3387 = XUINT (Fstring_to_number
3388 (Fsubstring (string,
3389 Fmatch_beginning (make_int (1)),
3390 Fmatch_end (make_int (1))),
3393 DECODE_ADD_UCS_CHAR (code, dst);
3397 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3398 Dynarr_add (dst, ';');
3401 str->er_counter = 0;
3403 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3405 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3406 str->er_counter = 0;
3407 DECODE_ADD_UCS_CHAR (c, dst);
3410 str->er_buf[str->er_counter++] = c;
3413 void char_encode_as_entity_reference (Emchar ch, char* buf);
3415 char_encode_as_entity_reference (Emchar ch, char* buf)
3417 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3420 Lisp_Object char_type;
3421 int format_columns, idx;
3422 char format[ER_BUF_SIZE];
3424 while (!NILP (rest))
3430 char_type = XCDR (ccs);
3435 if (!NILP (ccs = Ffind_charset (ccs)))
3438 = charset_code_point (ccs, ch,
3440 CHAR_ALL : CHAR_ISOLATED_ONLY );
3442 if ( (code_point >= 0)
3443 && ( NILP (char_type)
3445 || ( charset_code_point (ccs, ch, CHAR_DEFINED_ONLY)
3449 || ( DECODE_CHAR (ccs, code_point, 0) != ch )
3458 if ( STRINGP (ret) &&
3459 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3462 strncpy (&format[1], XSTRING_DATA (ret), idx);
3472 format[idx++] = '%';
3473 format_columns = XINT (ret);
3474 if ( (2 <= format_columns) && (format_columns <= 8)
3475 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3477 format [idx++] = '0';
3478 format [idx++] = '0' + format_columns;
3487 format [idx++] = 'd';
3488 else if (EQ (ret, Qx))
3489 format [idx++] = 'x';
3490 else if (EQ (ret, QX))
3491 format [idx++] = 'X';
3494 format [idx++] = ';';
3497 sprintf (buf, format, code_point);
3504 sprintf (buf, "&MCS-%08X;", ch);
3508 /************************************************************************/
3509 /* character composition */
3510 /************************************************************************/
3511 extern Lisp_Object Qcomposition, Qrep_decomposition;
3514 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3516 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3520 for (i = 0; i < str->combined_char_count; i++)
3521 decode_add_er_char (str, str->combined_chars[i], dst);
3522 str->combined_char_count = 0;
3523 #ifdef USE_CONCORD_OBJECT_SYSTEM_TO_COMPOSE
3524 str->combining_table = COS_NIL;
3526 str->combining_table = Qnil;
3527 #endif /* USE_CONCORD_OBJECT_SYSTEM_TO_COMPOSE */
3530 extern CONCORD_DS concord_current_env;
3534 concord_setup_env_maybe ()
3536 if (concord_current_env == NULL)
3538 concord_open_env ("/usr/local/share/chise/1.0/db/");
3544 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3545 unsigned_char_dynarr* dst);
3547 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3548 Emchar character, unsigned_char_dynarr* dst)
3550 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3551 decode_add_er_char (str, character, dst);
3552 #ifdef USE_CONCORD_OBJECT_SYSTEM_TO_COMPOSE
3553 else if (!cos_cons_p (str->combining_table))
3557 /* concord_setup_env_maybe (); */
3558 open_chise_data_source_maybe ();
3559 ret = concord_object_get_attribute (cos_make_char (character),
3562 if (!cos_cons_p (ret))
3563 decode_add_er_char (str, character, dst);
3566 str->combined_chars[0] = character;
3567 str->combined_char_count = 1;
3568 str->combining_table = ret;
3574 = cos_cdr (cos_assoc (cos_make_char (character),
3575 str->combining_table));
3577 if (cos_char_p (ret))
3579 Emchar char2 = cos_char_id (ret);
3582 /* concord_setup_env_maybe (); */
3583 open_chise_data_source_maybe ();
3584 ret2 = concord_object_get_attribute (ret, COS_COMPOSITION);
3586 if (!cos_cons_p (ret2))
3588 decode_add_er_char (str, char2, dst);
3589 str->combined_char_count = 0;
3590 str->combining_table = COS_NIL;
3594 str->combined_chars[0] = char2;
3595 str->combined_char_count = 1;
3596 str->combining_table = ret2;
3601 /* concord_setup_env_maybe (); */
3602 open_chise_data_source_maybe ();
3603 ret = concord_object_get_attribute (cos_make_char (character),
3606 COMPOSE_FLUSH_CHARS (str, dst);
3607 if (!cos_cons_p (ret))
3608 decode_add_er_char (str, character, dst);
3611 str->combined_chars[0] = character;
3612 str->combined_char_count = 1;
3613 str->combining_table = ret;
3618 else if (!CONSP (str->combining_table))
3621 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3625 decode_add_er_char (str, character, dst);
3628 str->combined_chars[0] = character;
3629 str->combined_char_count = 1;
3630 str->combining_table = ret;
3636 = Fcdr (Fassq (make_char (character), str->combining_table));
3640 Emchar char2 = XCHARVAL (ret);
3641 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3646 decode_add_er_char (str, char2, dst);
3647 str->combined_char_count = 0;
3648 str->combining_table = Qnil;
3652 str->combined_chars[0] = char2;
3653 str->combined_char_count = 1;
3654 str->combining_table = ret2;
3659 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3662 COMPOSE_FLUSH_CHARS (str, dst);
3664 decode_add_er_char (str, character, dst);
3667 str->combined_chars[0] = character;
3668 str->combined_char_count = 1;
3669 str->combining_table = ret;
3673 #endif /* USE_CONCORD_OBJECT_SYSTEM_TO_COMPOSE */
3675 #else /* not UTF2000 */
3676 #define COMPOSE_FLUSH_CHARS(str, dst)
3677 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3678 #endif /* UTF2000 */
3681 /************************************************************************/
3682 /* Shift-JIS methods */
3683 /************************************************************************/
3685 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3686 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3687 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3688 encoded by "position-code + 0x80". A character of JISX0208
3689 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3690 position-codes are divided and shifted so that it fit in the range
3693 --- CODE RANGE of Shift-JIS ---
3694 (character set) (range)
3696 JISX0201-Kana 0xA0 .. 0xDF
3697 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3698 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3699 -------------------------------
3703 /* Is this the first byte of a Shift-JIS two-byte char? */
3705 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3706 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3708 /* Is this the second byte of a Shift-JIS two-byte char? */
3710 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3711 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3713 #define BYTE_SJIS_KATAKANA_P(c) \
3714 ((c) >= 0xA1 && (c) <= 0xDF)
3717 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3721 unsigned char c = *(unsigned char *)src++;
3722 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3724 if (st->shift_jis.in_second_byte)
3726 st->shift_jis.in_second_byte = 0;
3730 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3731 st->shift_jis.in_second_byte = 1;
3733 return CODING_CATEGORY_SHIFT_JIS_MASK;
3736 /* Convert Shift-JIS data to internal format. */
3739 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3740 unsigned_char_dynarr *dst, Lstream_data_count n)
3742 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3743 unsigned int flags = str->flags;
3744 unsigned int cpos = str->cpos;
3745 eol_type_t eol_type = str->eol_type;
3749 unsigned char c = *(unsigned char *)src++;
3753 /* Previous character was first byte of Shift-JIS Kanji char. */
3754 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3756 unsigned char e1, e2;
3758 DECODE_SJIS (cpos, c, e1, e2);
3760 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3764 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3765 Dynarr_add (dst, e1);
3766 Dynarr_add (dst, e2);
3771 DECODE_ADD_BINARY_CHAR (cpos, dst);
3772 DECODE_ADD_BINARY_CHAR (c, dst);
3778 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3779 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3781 else if (BYTE_SJIS_KATAKANA_P (c))
3784 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3787 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3788 Dynarr_add (dst, c);
3793 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3797 DECODE_ADD_BINARY_CHAR (c, dst);
3799 label_continue_loop:;
3802 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3808 /* Convert internal character representation to Shift_JIS. */
3811 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3812 unsigned_char_dynarr *dst, unsigned int *flags)
3814 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3818 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3819 Dynarr_add (dst, '\r');
3820 if (eol_type != EOL_CR)
3821 Dynarr_add (dst, ch);
3825 unsigned int s1, s2;
3827 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3829 if (code_point >= 0)
3830 Dynarr_add (dst, code_point);
3831 else if ((code_point
3832 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3835 ENCODE_SJIS ((code_point >> 8) | 0x80,
3836 (code_point & 0xFF) | 0x80, s1, s2);
3837 Dynarr_add (dst, s1);
3838 Dynarr_add (dst, s2);
3840 else if ((code_point
3841 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3843 Dynarr_add (dst, code_point | 0x80);
3844 else if ((code_point
3845 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3848 ENCODE_SJIS ((code_point >> 8) | 0x80,
3849 (code_point & 0xFF) | 0x80, s1, s2);
3850 Dynarr_add (dst, s1);
3851 Dynarr_add (dst, s2);
3853 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3855 Dynarr_add (dst, code_point);
3857 Dynarr_add (dst, '?');
3859 Lisp_Object charset;
3860 unsigned int c1, c2;
3862 BREAKUP_CHAR (ch, charset, c1, c2);
3864 if (EQ(charset, Vcharset_katakana_jisx0201))
3866 Dynarr_add (dst, c1 | 0x80);
3870 Dynarr_add (dst, c1);
3872 else if (EQ(charset, Vcharset_japanese_jisx0208))
3874 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3875 Dynarr_add (dst, s1);
3876 Dynarr_add (dst, s2);
3879 Dynarr_add (dst, '?');
3885 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3886 unsigned int *flags)
3890 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3891 Decode a JISX0208 character of Shift-JIS coding-system.
3892 CODE is the character code in Shift-JIS as a cons of type bytes.
3893 Return the corresponding character.
3897 unsigned char c1, c2, s1, s2;
3900 CHECK_INT (XCAR (code));
3901 CHECK_INT (XCDR (code));
3902 s1 = XINT (XCAR (code));
3903 s2 = XINT (XCDR (code));
3904 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3905 BYTE_SJIS_TWO_BYTE_2_P (s2))
3907 DECODE_SJIS (s1, s2, c1, c2);
3908 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3909 c1 & 0x7F, c2 & 0x7F));
3915 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3916 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3917 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3921 Lisp_Object charset;
3924 CHECK_CHAR_COERCE_INT (character);
3925 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3926 if (EQ (charset, Vcharset_japanese_jisx0208))
3928 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3929 return Fcons (make_int (s1), make_int (s2));
3936 /************************************************************************/
3938 /************************************************************************/
3940 /* BIG5 is a coding system encoding two character sets: ASCII and
3941 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3942 character set and is encoded in two-byte.
3944 --- CODE RANGE of BIG5 ---
3945 (character set) (range)
3947 Big5 (1st byte) 0xA1 .. 0xFE
3948 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3949 --------------------------
3951 Since the number of characters in Big5 is larger than maximum
3952 characters in Emacs' charset (96x96), it can't be handled as one
3953 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3954 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3955 contains frequently used characters and the latter contains less
3956 frequently used characters. */
3959 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3960 ((c) >= 0x81 && (c) <= 0xFE)
3962 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3963 ((c) >= 0xA1 && (c) <= 0xFE)
3966 /* Is this the second byte of a Shift-JIS two-byte char? */
3968 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3969 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3971 /* Number of Big5 characters which have the same code in 1st byte. */
3973 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3975 /* Code conversion macros. These are macros because they are used in
3976 inner loops during code conversion.
3978 Note that temporary variables in macros introduce the classic
3979 dynamic-scoping problems with variable names. We use capital-
3980 lettered variables in the assumption that XEmacs does not use
3981 capital letters in variables except in a very formalized way
3984 /* Convert Big5 code (b1, b2) into its internal string representation
3987 /* There is a much simpler way to split the Big5 charset into two.
3988 For the moment I'm going to leave the algorithm as-is because it
3989 claims to separate out the most-used characters into a single
3990 charset, which perhaps will lead to optimizations in various
3993 The way the algorithm works is something like this:
3995 Big5 can be viewed as a 94x157 charset, where the row is
3996 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3997 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3998 the split between low and high column numbers is apparently
3999 meaningless; ascending rows produce less and less frequent chars.
4000 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
4001 the first charset, and the upper half (0xC9 .. 0xFE) to the
4002 second. To do the conversion, we convert the character into
4003 a single number where 0 .. 156 is the first row, 157 .. 313
4004 is the second, etc. That way, the characters are ordered by
4005 decreasing frequency. Then we just chop the space in two
4006 and coerce the result into a 94x94 space.
4009 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
4011 int B1 = b1, B2 = b2; \
4013 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
4017 lb = LEADING_BYTE_CHINESE_BIG5_1; \
4021 lb = LEADING_BYTE_CHINESE_BIG5_2; \
4022 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
4024 c1 = I / (0xFF - 0xA1) + 0xA1; \
4025 c2 = I % (0xFF - 0xA1) + 0xA1; \
4028 /* Convert the internal string representation of a Big5 character
4029 (lb, c1, c2) into Big5 code (b1, b2). */
4031 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
4033 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
4035 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
4037 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
4039 b1 = I / BIG5_SAME_ROW + 0xA1; \
4040 b2 = I % BIG5_SAME_ROW; \
4041 b2 += b2 < 0x3F ? 0x40 : 0x62; \
4045 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4049 unsigned char c = *(unsigned char *)src++;
4050 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
4052 || (c >= 0x80 && c <= 0xA0)
4056 if (st->big5.in_second_byte)
4058 st->big5.in_second_byte = 0;
4059 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
4069 st->big5.in_second_byte = 1;
4071 return CODING_CATEGORY_BIG5_MASK;
4074 /* Convert Big5 data to internal format. */
4077 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
4078 unsigned_char_dynarr *dst, Lstream_data_count n)
4080 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4081 unsigned int flags = str->flags;
4082 unsigned int cpos = str->cpos;
4083 eol_type_t eol_type = str->eol_type;
4086 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4087 (decoding)->codesys, 1);
4092 unsigned char c = *(unsigned char *)src++;
4095 /* Previous character was first byte of Big5 char. */
4096 if (BYTE_BIG5_TWO_BYTE_2_P (c))
4099 int code_point = (cpos << 8) | c;
4100 Emchar char_id = decode_defined_char (ccs, code_point, 0);
4104 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
4105 DECODE_ADD_UCS_CHAR (char_id, dst);
4107 unsigned char b1, b2, b3;
4108 DECODE_BIG5 (cpos, c, b1, b2, b3);
4109 Dynarr_add (dst, b1);
4110 Dynarr_add (dst, b2);
4111 Dynarr_add (dst, b3);
4116 DECODE_ADD_BINARY_CHAR (cpos, dst);
4117 DECODE_ADD_BINARY_CHAR (c, dst);
4123 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4124 if (BYTE_BIG5_TWO_BYTE_1_P (c))
4126 decode_flush_er_chars (str, dst);
4131 decode_flush_er_chars (str, dst);
4132 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4133 DECODE_ADD_BINARY_CHAR (c, dst);
4137 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4138 decode_add_er_char (str, c, dst);
4141 label_continue_loop:;
4144 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4145 if (flags & CODING_STATE_END)
4147 decode_flush_er_chars (str, dst);
4148 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4149 if (flags & CODING_STATE_CR)
4150 Dynarr_add (dst, '\r');
4157 /* Convert internally-formatted data to Big5. */
4160 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4161 unsigned_char_dynarr *dst, unsigned int *flags)
4163 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4167 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4168 Dynarr_add (dst, '\r');
4169 if (eol_type != EOL_CR)
4170 Dynarr_add (dst, ch);
4177 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4179 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4180 Dynarr_add (dst, code_point);
4181 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4183 Dynarr_add (dst, code_point >> 8);
4184 Dynarr_add (dst, code_point & 0xFF);
4186 else if ((code_point
4187 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4189 Dynarr_add (dst, code_point >> 8);
4190 Dynarr_add (dst, code_point & 0xFF);
4192 else if ((code_point
4193 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4196 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4197 + ((code_point & 0xFF) - 33);
4198 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4199 unsigned char b2 = I % BIG5_SAME_ROW;
4201 b2 += b2 < 0x3F ? 0x40 : 0x62;
4202 Dynarr_add (dst, b1);
4203 Dynarr_add (dst, b2);
4205 else if ((code_point
4206 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4209 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4210 + ((code_point & 0xFF) - 33);
4211 unsigned char b1, b2;
4213 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4214 b1 = I / BIG5_SAME_ROW + 0xA1;
4215 b2 = I % BIG5_SAME_ROW;
4216 b2 += b2 < 0x3F ? 0x40 : 0x62;
4217 Dynarr_add (dst, b1);
4218 Dynarr_add (dst, b2);
4220 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4224 char_encode_as_entity_reference (ch, buf);
4225 Dynarr_add_many (dst, buf, strlen (buf));
4228 Dynarr_add (dst, '?');
4235 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4236 unsigned int *flags)
4241 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4242 Decode a Big5 character CODE of BIG5 coding-system.
4243 CODE is the character code in BIG5, a cons of two integers.
4244 Return the corresponding character.
4248 unsigned char c1, c2, b1, b2;
4251 CHECK_INT (XCAR (code));
4252 CHECK_INT (XCDR (code));
4253 b1 = XINT (XCAR (code));
4254 b2 = XINT (XCDR (code));
4255 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4256 BYTE_BIG5_TWO_BYTE_2_P (b2))
4258 Charset_ID leading_byte;
4259 Lisp_Object charset;
4260 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4261 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4262 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4268 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4269 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4270 Return the corresponding character code in Big5.
4274 Lisp_Object charset;
4277 CHECK_CHAR_COERCE_INT (character);
4278 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4279 if (EQ (charset, Vcharset_chinese_big5_1) ||
4280 EQ (charset, Vcharset_chinese_big5_2))
4282 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4284 return Fcons (make_int (b1), make_int (b2));
4291 /************************************************************************/
4293 /************************************************************************/
4296 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4300 unsigned char c = *(unsigned char *)src++;
4301 switch (st->ucs4.in_byte)
4310 st->ucs4.in_byte = 0;
4316 return CODING_CATEGORY_UCS4_MASK;
4320 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4321 unsigned_char_dynarr *dst, Lstream_data_count n)
4323 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4324 unsigned int flags = str->flags;
4325 unsigned int cpos = str->cpos;
4326 unsigned char counter = str->counter;
4330 unsigned char c = *(unsigned char *)src++;
4338 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4343 cpos = ( cpos << 8 ) | c;
4347 if (counter & CODING_STATE_END)
4348 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4352 str->counter = counter;
4356 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4357 unsigned_char_dynarr *dst, unsigned int *flags)
4359 Dynarr_add (dst, ch >> 24);
4360 Dynarr_add (dst, ch >> 16);
4361 Dynarr_add (dst, ch >> 8);
4362 Dynarr_add (dst, ch );
4366 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4367 unsigned int *flags)
4372 /************************************************************************/
4373 /* UTF-16 methods */
4374 /************************************************************************/
4377 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4379 return CODING_CATEGORY_UTF16_MASK;
4383 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4384 unsigned_char_dynarr *dst, Lstream_data_count n)
4386 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4387 unsigned int flags = str->flags;
4388 unsigned int cpos = str->cpos;
4389 unsigned char counter = str->counter & 3;
4390 unsigned char byte_order = str->counter >> 2;
4391 eol_type_t eol_type = str->eol_type;
4395 unsigned char c = *(unsigned char *)src++;
4401 else if (counter == 1)
4405 if (byte_order == 0)
4406 code = (c << 8) | cpos;
4408 code = (cpos << 8) | c;
4411 code = ((code & 0xFF) << 8) | (code >> 8);
4412 if ( byte_order == 0 )
4417 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4428 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4429 DECODE_ADD_UCS_CHAR (code, dst);
4433 else if (counter == 2)
4435 cpos = (cpos << 8) | c;
4443 ? (c << 8) | (cpos & 0xFF)
4444 : ((cpos & 0xFF) << 8) | c;
4446 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4451 label_continue_loop:;
4453 if (counter & CODING_STATE_END)
4454 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4458 str->counter = (byte_order << 2) | counter;
4462 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4463 unsigned_char_dynarr *dst, unsigned int *flags)
4467 Dynarr_add (dst, ch);
4468 Dynarr_add (dst, ch >> 8);
4472 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4473 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4475 Dynarr_add (dst, y);
4476 Dynarr_add (dst, y >> 8);
4477 Dynarr_add (dst, z);
4478 Dynarr_add (dst, z >> 8);
4483 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4484 unsigned int *flags)
4489 /************************************************************************/
4491 /************************************************************************/
4494 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4498 unsigned char c = *(unsigned char *)src++;
4499 switch (st->utf8.in_byte)
4502 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4505 st->utf8.in_byte = 5;
4507 st->utf8.in_byte = 4;
4509 st->utf8.in_byte = 3;
4511 st->utf8.in_byte = 2;
4513 st->utf8.in_byte = 1;
4518 if ((c & 0xc0) != 0x80)
4524 return CODING_CATEGORY_UTF8_MASK;
4528 decode_output_utf8_partial_char (unsigned char counter,
4530 unsigned_char_dynarr *dst)
4533 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4534 else if (counter == 4)
4536 if (cpos < (1 << 6))
4537 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4540 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4541 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4544 else if (counter == 3)
4546 if (cpos < (1 << 6))
4547 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4548 else if (cpos < (1 << 12))
4550 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4551 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4555 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4556 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4557 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4560 else if (counter == 2)
4562 if (cpos < (1 << 6))
4563 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4564 else if (cpos < (1 << 12))
4566 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4567 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4569 else if (cpos < (1 << 18))
4571 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4572 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4573 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4577 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4578 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4579 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4580 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4585 if (cpos < (1 << 6))
4586 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4587 else if (cpos < (1 << 12))
4589 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4590 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4592 else if (cpos < (1 << 18))
4594 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4595 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4596 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4598 else if (cpos < (1 << 24))
4600 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4601 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4602 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4603 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4607 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4608 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4609 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4610 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4611 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4617 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4618 unsigned_char_dynarr *dst, Lstream_data_count n)
4620 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4621 unsigned int flags = str->flags;
4622 unsigned int cpos = str->cpos;
4623 eol_type_t eol_type = str->eol_type;
4624 unsigned char counter = str->counter;
4626 int bom_flag = str->bom_flag;
4628 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4629 (decoding)->codesys, 0);
4634 unsigned char c = *(unsigned char *)src++;
4639 COMPOSE_FLUSH_CHARS (str, dst);
4640 decode_flush_er_chars (str, dst);
4641 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4643 if ( bom_flag == 0 )
4646 DECODE_ADD_UCS_CHAR (c, dst);
4648 else if ( c < 0xC0 )
4650 if ( bom_flag == 0 )
4653 /* decode_add_er_char (str, c, dst); */
4654 COMPOSE_ADD_CHAR (str, c, dst);
4658 /* decode_flush_er_chars (str, dst); */
4664 else if ( c < 0xF0 )
4669 else if ( c < 0xF8 )
4674 else if ( c < 0xFC )
4686 else if ( (c & 0xC0) == 0x80 )
4688 cpos = ( cpos << 6 ) | ( c & 0x3f );
4693 if ( bom_flag == 0 )
4695 if ( cpos == 0xFEFF )
4706 char_id = decode_defined_char (ccs, cpos, 0);
4713 COMPOSE_ADD_CHAR (str, char_id, dst);
4723 COMPOSE_FLUSH_CHARS (str, dst);
4724 decode_flush_er_chars (str, dst);
4725 decode_output_utf8_partial_char (counter, cpos, dst);
4726 DECODE_ADD_BINARY_CHAR (c, dst);
4730 label_continue_loop:;
4733 if (flags & CODING_STATE_END)
4735 COMPOSE_FLUSH_CHARS (str, dst);
4736 decode_flush_er_chars (str, dst);
4739 decode_output_utf8_partial_char (counter, cpos, dst);
4746 str->counter = counter;
4748 str->bom_flag = bom_flag;
4753 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4754 unsigned_char_dynarr *dst, unsigned int *flags)
4756 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4760 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4761 Dynarr_add (dst, '\r');
4762 if (eol_type != EOL_CR)
4763 Dynarr_add (dst, ch);
4765 else if (ch <= 0x7f)
4767 Dynarr_add (dst, ch);
4772 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4773 int code_point = charset_code_point (ucs_ccs, ch, 0);
4775 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4777 Lisp_Object map, ret;
4779 if ( CODING_SYSTEM_ENABLE_DECOMPOSITION (str->codesys) )
4781 Lisp_Object rest = Vdecomposition_feature_list;
4782 Lisp_Object decomp_f;
4783 Lisp_Object seq = Qnil;
4784 struct gcpro gcpro1;
4786 while ( CONSP (rest) )
4788 decomp_f = XCAR (rest);
4790 seq = Fchar_feature (make_char (ch), decomp_f, Qnil,
4800 Lisp_Object base = Fcar (seq);
4803 if ( CHARP (base) && CONSP (seq) )
4805 Lisp_Object comb = Fcar (seq);
4809 char_encode_utf8 (str, XCHAR (base), dst, flags);
4810 char_encode_utf8 (str, XCHAR (comb), dst, flags);
4817 map = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4819 && INTP (ret = Fchar_feature (make_char (ch),
4822 code_point = XINT (ret);
4823 else if ( !NILP (map =
4824 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4826 && INTP (ret = Fchar_feature (make_char (ch),
4829 code_point = XINT (ret);
4830 else if ( !NILP (map =
4831 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4833 && INTP (ret = Fchar_feature (make_char (ch),
4836 code_point = XINT (ret);
4837 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4841 char_encode_as_entity_reference (ch, buf);
4842 Dynarr_add_many (dst, buf, strlen (buf));
4848 if (code_point <= 0x7ff)
4850 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4851 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4853 else if (code_point <= 0xffff)
4855 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4856 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4857 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4859 else if (code_point <= 0x1fffff)
4861 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4862 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4863 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4864 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4866 else if (code_point <= 0x3ffffff)
4868 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4869 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4870 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4871 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4872 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4876 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4877 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4878 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4879 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4880 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4881 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4887 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4888 unsigned int *flags)
4893 /************************************************************************/
4894 /* ISO2022 methods */
4895 /************************************************************************/
4897 /* The following note describes the coding system ISO2022 briefly.
4898 Since the intention of this note is to help understand the
4899 functions in this file, some parts are NOT ACCURATE or OVERLY
4900 SIMPLIFIED. For thorough understanding, please refer to the
4901 original document of ISO2022.
4903 ISO2022 provides many mechanisms to encode several character sets
4904 in 7-bit and 8-bit environments. For 7-bit environments, all text
4905 is encoded using bytes less than 128. This may make the encoded
4906 text a little bit longer, but the text passes more easily through
4907 several gateways, some of which strip off MSB (Most Signigant Bit).
4909 There are two kinds of character sets: control character set and
4910 graphic character set. The former contains control characters such
4911 as `newline' and `escape' to provide control functions (control
4912 functions are also provided by escape sequences). The latter
4913 contains graphic characters such as 'A' and '-'. Emacs recognizes
4914 two control character sets and many graphic character sets.
4916 Graphic character sets are classified into one of the following
4917 four classes, according to the number of bytes (DIMENSION) and
4918 number of characters in one dimension (CHARS) of the set:
4919 - DIMENSION1_CHARS94
4920 - DIMENSION1_CHARS96
4921 - DIMENSION2_CHARS94
4922 - DIMENSION2_CHARS96
4924 In addition, each character set is assigned an identification tag,
4925 unique for each set, called "final character" (denoted as <F>
4926 hereafter). The <F> of each character set is decided by ECMA(*)
4927 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4928 (0x30..0x3F are for private use only).
4930 Note (*): ECMA = European Computer Manufacturers Association
4932 Here are examples of graphic character set [NAME(<F>)]:
4933 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4934 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4935 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4936 o DIMENSION2_CHARS96 -- none for the moment
4938 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4939 C0 [0x00..0x1F] -- control character plane 0
4940 GL [0x20..0x7F] -- graphic character plane 0
4941 C1 [0x80..0x9F] -- control character plane 1
4942 GR [0xA0..0xFF] -- graphic character plane 1
4944 A control character set is directly designated and invoked to C0 or
4945 C1 by an escape sequence. The most common case is that:
4946 - ISO646's control character set is designated/invoked to C0, and
4947 - ISO6429's control character set is designated/invoked to C1,
4948 and usually these designations/invocations are omitted in encoded
4949 text. In a 7-bit environment, only C0 can be used, and a control
4950 character for C1 is encoded by an appropriate escape sequence to
4951 fit into the environment. All control characters for C1 are
4952 defined to have corresponding escape sequences.
4954 A graphic character set is at first designated to one of four
4955 graphic registers (G0 through G3), then these graphic registers are
4956 invoked to GL or GR. These designations and invocations can be
4957 done independently. The most common case is that G0 is invoked to
4958 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4959 these invocations and designations are omitted in encoded text.
4960 In a 7-bit environment, only GL can be used.
4962 When a graphic character set of CHARS94 is invoked to GL, codes
4963 0x20 and 0x7F of the GL area work as control characters SPACE and
4964 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4967 There are two ways of invocation: locking-shift and single-shift.
4968 With locking-shift, the invocation lasts until the next different
4969 invocation, whereas with single-shift, the invocation affects the
4970 following character only and doesn't affect the locking-shift
4971 state. Invocations are done by the following control characters or
4974 ----------------------------------------------------------------------
4975 abbrev function cntrl escape seq description
4976 ----------------------------------------------------------------------
4977 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4978 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4979 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4980 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4981 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4982 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4983 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4984 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4985 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4986 ----------------------------------------------------------------------
4987 (*) These are not used by any known coding system.
4989 Control characters for these functions are defined by macros
4990 ISO_CODE_XXX in `coding.h'.
4992 Designations are done by the following escape sequences:
4993 ----------------------------------------------------------------------
4994 escape sequence description
4995 ----------------------------------------------------------------------
4996 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4997 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4998 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4999 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
5000 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
5001 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
5002 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
5003 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
5004 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
5005 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
5006 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
5007 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
5008 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
5009 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
5010 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
5011 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
5012 ----------------------------------------------------------------------
5014 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
5015 of dimension 1, chars 94, and final character <F>, etc...
5017 Note (*): Although these designations are not allowed in ISO2022,
5018 Emacs accepts them on decoding, and produces them on encoding
5019 CHARS96 character sets in a coding system which is characterized as
5020 7-bit environment, non-locking-shift, and non-single-shift.
5022 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
5023 '(' can be omitted. We refer to this as "short-form" hereafter.
5025 Now you may notice that there are a lot of ways for encoding the
5026 same multilingual text in ISO2022. Actually, there exist many
5027 coding systems such as Compound Text (used in X11's inter client
5028 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
5029 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
5030 localized platforms), and all of these are variants of ISO2022.
5032 In addition to the above, Emacs handles two more kinds of escape
5033 sequences: ISO6429's direction specification and Emacs' private
5034 sequence for specifying character composition.
5036 ISO6429's direction specification takes the following form:
5037 o CSI ']' -- end of the current direction
5038 o CSI '0' ']' -- end of the current direction
5039 o CSI '1' ']' -- start of left-to-right text
5040 o CSI '2' ']' -- start of right-to-left text
5041 The control character CSI (0x9B: control sequence introducer) is
5042 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
5044 Character composition specification takes the following form:
5045 o ESC '0' -- start character composition
5046 o ESC '1' -- end character composition
5047 Since these are not standard escape sequences of any ISO standard,
5048 their use with these meanings is restricted to Emacs only. */
5051 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
5055 for (i = 0; i < 4; i++)
5057 if (!NILP (coding_system))
5059 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
5061 iso->charset[i] = Qt;
5062 iso->invalid_designated[i] = 0;
5064 iso->esc = ISO_ESC_NOTHING;
5065 iso->esc_bytes_index = 0;
5066 iso->register_left = 0;
5067 iso->register_right = 1;
5068 iso->switched_dir_and_no_valid_charset_yet = 0;
5069 iso->invalid_switch_dir = 0;
5070 iso->output_direction_sequence = 0;
5071 iso->output_literally = 0;
5072 #ifdef ENABLE_COMPOSITE_CHARS
5073 if (iso->composite_chars)
5074 Dynarr_reset (iso->composite_chars);
5079 fit_to_be_escape_quoted (unsigned char c)
5096 /* Parse one byte of an ISO2022 escape sequence.
5097 If the result is an invalid escape sequence, return 0 and
5098 do not change anything in STR. Otherwise, if the result is
5099 an incomplete escape sequence, update ISO2022.ESC and
5100 ISO2022.ESC_BYTES and return -1. Otherwise, update
5101 all the state variables (but not ISO2022.ESC_BYTES) and
5104 If CHECK_INVALID_CHARSETS is non-zero, check for designation
5105 or invocation of an invalid character set and treat that as
5106 an unrecognized escape sequence. */
5109 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
5110 unsigned char c, unsigned int *flags,
5111 int check_invalid_charsets)
5113 /* (1) If we're at the end of a designation sequence, CS is the
5114 charset being designated and REG is the register to designate
5117 (2) If we're at the end of a locking-shift sequence, REG is
5118 the register to invoke and HALF (0 == left, 1 == right) is
5119 the half to invoke it into.
5121 (3) If we're at the end of a single-shift sequence, REG is
5122 the register to invoke. */
5123 Lisp_Object cs = Qnil;
5126 /* NOTE: This code does goto's all over the fucking place.
5127 The reason for this is that we're basically implementing
5128 a state machine here, and hierarchical languages like C
5129 don't really provide a clean way of doing this. */
5131 if (! (*flags & CODING_STATE_ESCAPE))
5132 /* At beginning of escape sequence; we need to reset our
5133 escape-state variables. */
5134 iso->esc = ISO_ESC_NOTHING;
5136 iso->output_literally = 0;
5137 iso->output_direction_sequence = 0;
5141 case ISO_ESC_NOTHING:
5142 iso->esc_bytes_index = 0;
5145 case ISO_CODE_ESC: /* Start escape sequence */
5146 *flags |= CODING_STATE_ESCAPE;
5150 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
5151 *flags |= CODING_STATE_ESCAPE;
5152 iso->esc = ISO_ESC_5_11;
5155 case ISO_CODE_SO: /* locking shift 1 */
5158 case ISO_CODE_SI: /* locking shift 0 */
5162 case ISO_CODE_SS2: /* single shift */
5165 case ISO_CODE_SS3: /* single shift */
5169 default: /* Other control characters */
5176 /**** single shift ****/
5178 case 'N': /* single shift 2 */
5181 case 'O': /* single shift 3 */
5185 /**** locking shift ****/
5187 case '~': /* locking shift 1 right */
5190 case 'n': /* locking shift 2 */
5193 case '}': /* locking shift 2 right */
5196 case 'o': /* locking shift 3 */
5199 case '|': /* locking shift 3 right */
5203 #ifdef ENABLE_COMPOSITE_CHARS
5204 /**** composite ****/
5207 iso->esc = ISO_ESC_START_COMPOSITE;
5208 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5209 CODING_STATE_COMPOSITE;
5213 iso->esc = ISO_ESC_END_COMPOSITE;
5214 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5215 ~CODING_STATE_COMPOSITE;
5217 #endif /* ENABLE_COMPOSITE_CHARS */
5219 /**** directionality ****/
5222 iso->esc = ISO_ESC_5_11;
5225 /**** designation ****/
5227 case '$': /* multibyte charset prefix */
5228 iso->esc = ISO_ESC_2_4;
5232 if (0x28 <= c && c <= 0x2F)
5234 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5238 /* This function is called with CODESYS equal to nil when
5239 doing coding-system detection. */
5241 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5242 && fit_to_be_escape_quoted (c))
5244 iso->esc = ISO_ESC_LITERAL;
5245 *flags &= CODING_STATE_ISO2022_LOCK;
5255 /**** directionality ****/
5257 case ISO_ESC_5_11: /* ISO6429 direction control */
5260 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5261 goto directionality;
5263 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5264 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5265 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5269 case ISO_ESC_5_11_0:
5272 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5273 goto directionality;
5277 case ISO_ESC_5_11_1:
5280 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5281 goto directionality;
5285 case ISO_ESC_5_11_2:
5288 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5289 goto directionality;
5294 iso->esc = ISO_ESC_DIRECTIONALITY;
5295 /* Various junk here to attempt to preserve the direction sequences
5296 literally in the text if they would otherwise be swallowed due
5297 to invalid designations that don't show up as actual charset
5298 changes in the text. */
5299 if (iso->invalid_switch_dir)
5301 /* We already inserted a direction switch literally into the
5302 text. We assume (#### this may not be right) that the
5303 next direction switch is the one going the other way,
5304 and we need to output that literally as well. */
5305 iso->output_literally = 1;
5306 iso->invalid_switch_dir = 0;
5312 /* If we are in the thrall of an invalid designation,
5313 then stick the directionality sequence literally into the
5314 output stream so it ends up in the original text again. */
5315 for (jj = 0; jj < 4; jj++)
5316 if (iso->invalid_designated[jj])
5320 iso->output_literally = 1;
5321 iso->invalid_switch_dir = 1;
5324 /* Indicate that we haven't yet seen a valid designation,
5325 so that if a switch-dir is directly followed by an
5326 invalid designation, both get inserted literally. */
5327 iso->switched_dir_and_no_valid_charset_yet = 1;
5332 /**** designation ****/
5335 if (0x28 <= c && c <= 0x2F)
5337 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5340 if (0x40 <= c && c <= 0x42)
5343 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5344 *flags & CODING_STATE_R2L ?
5345 CHARSET_RIGHT_TO_LEFT :
5346 CHARSET_LEFT_TO_RIGHT);
5357 if (c < '0' || c > '~')
5358 return 0; /* bad final byte */
5360 if (iso->esc >= ISO_ESC_2_8 &&
5361 iso->esc <= ISO_ESC_2_15)
5363 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5364 single = 1; /* single-byte */
5365 reg = (iso->esc - ISO_ESC_2_8) & 3;
5367 else if (iso->esc >= ISO_ESC_2_4_8 &&
5368 iso->esc <= ISO_ESC_2_4_15)
5370 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5371 single = -1; /* multi-byte */
5372 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5376 /* Can this ever be reached? -slb */
5380 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5381 *flags & CODING_STATE_R2L ?
5382 CHARSET_RIGHT_TO_LEFT :
5383 CHARSET_LEFT_TO_RIGHT);
5389 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5393 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5394 /* can't invoke something that ain't there. */
5396 iso->esc = ISO_ESC_SINGLE_SHIFT;
5397 *flags &= CODING_STATE_ISO2022_LOCK;
5399 *flags |= CODING_STATE_SS2;
5401 *flags |= CODING_STATE_SS3;
5405 if (check_invalid_charsets &&
5406 !CHARSETP (iso->charset[reg]))
5407 /* can't invoke something that ain't there. */
5410 iso->register_right = reg;
5412 iso->register_left = reg;
5413 *flags &= CODING_STATE_ISO2022_LOCK;
5414 iso->esc = ISO_ESC_LOCKING_SHIFT;
5418 if (NILP (cs) && check_invalid_charsets)
5420 iso->invalid_designated[reg] = 1;
5421 iso->charset[reg] = Vcharset_ascii;
5422 iso->esc = ISO_ESC_DESIGNATE;
5423 *flags &= CODING_STATE_ISO2022_LOCK;
5424 iso->output_literally = 1;
5425 if (iso->switched_dir_and_no_valid_charset_yet)
5427 /* We encountered a switch-direction followed by an
5428 invalid designation. Ensure that the switch-direction
5429 gets outputted; otherwise it will probably get eaten
5430 when the text is written out again. */
5431 iso->switched_dir_and_no_valid_charset_yet = 0;
5432 iso->output_direction_sequence = 1;
5433 /* And make sure that the switch-dir going the other
5434 way gets outputted, as well. */
5435 iso->invalid_switch_dir = 1;
5439 /* This function is called with CODESYS equal to nil when
5440 doing coding-system detection. */
5441 if (!NILP (codesys))
5443 charset_conversion_spec_dynarr *dyn =
5444 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5450 for (i = 0; i < Dynarr_length (dyn); i++)
5452 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5453 if (EQ (cs, spec->from_charset))
5454 cs = spec->to_charset;
5459 iso->charset[reg] = cs;
5460 iso->esc = ISO_ESC_DESIGNATE;
5461 *flags &= CODING_STATE_ISO2022_LOCK;
5462 if (iso->invalid_designated[reg])
5464 iso->invalid_designated[reg] = 0;
5465 iso->output_literally = 1;
5467 if (iso->switched_dir_and_no_valid_charset_yet)
5468 iso->switched_dir_and_no_valid_charset_yet = 0;
5473 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5477 /* #### There are serious deficiencies in the recognition mechanism
5478 here. This needs to be much smarter if it's going to cut it.
5479 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5480 it should be detected as Latin-1.
5481 All the ISO2022 stuff in this file should be synced up with the
5482 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5483 Perhaps we should wait till R2L works in FSF Emacs? */
5485 if (!st->iso2022.initted)
5487 reset_iso2022 (Qnil, &st->iso2022.iso);
5488 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5489 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5490 CODING_CATEGORY_ISO_8_1_MASK |
5491 CODING_CATEGORY_ISO_8_2_MASK |
5492 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5493 st->iso2022.flags = 0;
5494 st->iso2022.high_byte_count = 0;
5495 st->iso2022.saw_single_shift = 0;
5496 st->iso2022.initted = 1;
5499 mask = st->iso2022.mask;
5503 unsigned char c = *(unsigned char *)src++;
5506 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5507 st->iso2022.high_byte_count++;
5511 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5513 if (st->iso2022.high_byte_count & 1)
5514 /* odd number of high bytes; assume not iso-8-2 */
5515 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5517 st->iso2022.high_byte_count = 0;
5518 st->iso2022.saw_single_shift = 0;
5520 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5522 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5523 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5524 { /* control chars */
5527 /* Allow and ignore control characters that you might
5528 reasonably see in a text file */
5533 case 8: /* backspace */
5534 case 11: /* vertical tab */
5535 case 12: /* form feed */
5536 case 26: /* MS-DOS C-z junk */
5537 case 31: /* '^_' -- for info */
5538 goto label_continue_loop;
5545 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5548 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5549 &st->iso2022.flags, 0))
5551 switch (st->iso2022.iso.esc)
5553 case ISO_ESC_DESIGNATE:
5554 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5555 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5557 case ISO_ESC_LOCKING_SHIFT:
5558 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5559 goto ran_out_of_chars;
5560 case ISO_ESC_SINGLE_SHIFT:
5561 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5562 st->iso2022.saw_single_shift = 1;
5571 goto ran_out_of_chars;
5574 label_continue_loop:;
5583 postprocess_iso2022_mask (int mask)
5585 /* #### kind of cheesy */
5586 /* If seven-bit ISO is allowed, then assume that the encoding is
5587 entirely seven-bit and turn off the eight-bit ones. */
5588 if (mask & CODING_CATEGORY_ISO_7_MASK)
5589 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5590 CODING_CATEGORY_ISO_8_1_MASK |
5591 CODING_CATEGORY_ISO_8_2_MASK);
5595 /* If FLAGS is a null pointer or specifies right-to-left motion,
5596 output a switch-dir-to-left-to-right sequence to DST.
5597 Also update FLAGS if it is not a null pointer.
5598 If INTERNAL_P is set, we are outputting in internal format and
5599 need to handle the CSI differently. */
5602 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5603 unsigned_char_dynarr *dst,
5604 unsigned int *flags,
5607 if (!flags || (*flags & CODING_STATE_R2L))
5609 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5611 Dynarr_add (dst, ISO_CODE_ESC);
5612 Dynarr_add (dst, '[');
5614 else if (internal_p)
5615 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5617 Dynarr_add (dst, ISO_CODE_CSI);
5618 Dynarr_add (dst, '0');
5619 Dynarr_add (dst, ']');
5621 *flags &= ~CODING_STATE_R2L;
5625 /* If FLAGS is a null pointer or specifies a direction different from
5626 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5627 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5628 sequence to DST. Also update FLAGS if it is not a null pointer.
5629 If INTERNAL_P is set, we are outputting in internal format and
5630 need to handle the CSI differently. */
5633 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5634 unsigned_char_dynarr *dst, unsigned int *flags,
5637 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5638 direction == CHARSET_LEFT_TO_RIGHT)
5639 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5640 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5641 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5642 direction == CHARSET_RIGHT_TO_LEFT)
5644 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5646 Dynarr_add (dst, ISO_CODE_ESC);
5647 Dynarr_add (dst, '[');
5649 else if (internal_p)
5650 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5652 Dynarr_add (dst, ISO_CODE_CSI);
5653 Dynarr_add (dst, '2');
5654 Dynarr_add (dst, ']');
5656 *flags |= CODING_STATE_R2L;
5660 /* Convert ISO2022-format data to internal format. */
5663 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5664 unsigned_char_dynarr *dst, Lstream_data_count n)
5666 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5667 unsigned int flags = str->flags;
5668 unsigned int cpos = str->cpos;
5669 unsigned char counter = str->counter;
5670 eol_type_t eol_type = str->eol_type;
5671 #ifdef ENABLE_COMPOSITE_CHARS
5672 unsigned_char_dynarr *real_dst = dst;
5674 Lisp_Object coding_system;
5676 XSETCODING_SYSTEM (coding_system, str->codesys);
5678 #ifdef ENABLE_COMPOSITE_CHARS
5679 if (flags & CODING_STATE_COMPOSITE)
5680 dst = str->iso2022.composite_chars;
5681 #endif /* ENABLE_COMPOSITE_CHARS */
5685 unsigned char c = *(unsigned char *)src++;
5686 if (flags & CODING_STATE_ESCAPE)
5687 { /* Within ESC sequence */
5688 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5693 switch (str->iso2022.esc)
5695 #ifdef ENABLE_COMPOSITE_CHARS
5696 case ISO_ESC_START_COMPOSITE:
5697 if (str->iso2022.composite_chars)
5698 Dynarr_reset (str->iso2022.composite_chars);
5700 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5701 dst = str->iso2022.composite_chars;
5703 case ISO_ESC_END_COMPOSITE:
5705 Bufbyte comstr[MAX_EMCHAR_LEN];
5707 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5708 Dynarr_length (dst));
5710 len = set_charptr_emchar (comstr, emch);
5711 Dynarr_add_many (dst, comstr, len);
5714 #endif /* ENABLE_COMPOSITE_CHARS */
5716 case ISO_ESC_LITERAL:
5717 COMPOSE_FLUSH_CHARS (str, dst);
5718 decode_flush_er_chars (str, dst);
5719 DECODE_ADD_BINARY_CHAR (c, dst);
5723 /* Everything else handled already */
5728 /* Attempted error recovery. */
5729 if (str->iso2022.output_direction_sequence)
5730 ensure_correct_direction (flags & CODING_STATE_R2L ?
5731 CHARSET_RIGHT_TO_LEFT :
5732 CHARSET_LEFT_TO_RIGHT,
5733 str->codesys, dst, 0, 1);
5734 /* More error recovery. */
5735 if (!retval || str->iso2022.output_literally)
5737 /* Output the (possibly invalid) sequence */
5739 COMPOSE_FLUSH_CHARS (str, dst);
5740 decode_flush_er_chars (str, dst);
5741 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5742 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5743 flags &= CODING_STATE_ISO2022_LOCK;
5745 n++, src--;/* Repeat the loop with the same character. */
5748 /* No sense in reprocessing the final byte of the
5749 escape sequence; it could mess things up anyway.
5751 COMPOSE_FLUSH_CHARS (str, dst);
5752 decode_flush_er_chars (str, dst);
5753 DECODE_ADD_BINARY_CHAR (c, dst);
5759 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5760 { /* Control characters */
5762 /***** Error-handling *****/
5764 /* If we were in the middle of a character, dump out the
5765 partial character. */
5768 COMPOSE_FLUSH_CHARS (str, dst);
5769 decode_flush_er_chars (str, dst);
5773 DECODE_ADD_BINARY_CHAR
5774 ((unsigned char)(cpos >> (counter * 8)), dst);
5779 /* If we just saw a single-shift character, dump it out.
5780 This may dump out the wrong sort of single-shift character,
5781 but least it will give an indication that something went
5783 if (flags & CODING_STATE_SS2)
5785 COMPOSE_FLUSH_CHARS (str, dst);
5786 decode_flush_er_chars (str, dst);
5787 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5788 flags &= ~CODING_STATE_SS2;
5790 if (flags & CODING_STATE_SS3)
5792 COMPOSE_FLUSH_CHARS (str, dst);
5793 decode_flush_er_chars (str, dst);
5794 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5795 flags &= ~CODING_STATE_SS3;
5798 /***** Now handle the control characters. *****/
5804 COMPOSE_FLUSH_CHARS (str, dst);
5805 decode_flush_er_chars (str, dst);
5806 if (eol_type == EOL_CR)
5807 Dynarr_add (dst, '\n');
5808 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5809 Dynarr_add (dst, c);
5811 flags |= CODING_STATE_CR;
5812 goto label_continue_loop;
5814 else if (flags & CODING_STATE_CR)
5815 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5817 Dynarr_add (dst, '\r');
5818 flags &= ~CODING_STATE_CR;
5821 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5824 flags &= CODING_STATE_ISO2022_LOCK;
5826 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5828 COMPOSE_FLUSH_CHARS (str, dst);
5829 decode_flush_er_chars (str, dst);
5830 DECODE_ADD_BINARY_CHAR (c, dst);
5834 { /* Graphic characters */
5835 Lisp_Object charset;
5844 COMPOSE_FLUSH_CHARS (str, dst);
5845 decode_flush_er_chars (str, dst);
5846 if (eol_type == EOL_CR)
5847 Dynarr_add (dst, '\n');
5848 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5849 Dynarr_add (dst, c);
5851 flags |= CODING_STATE_CR;
5852 goto label_continue_loop;
5854 else if (flags & CODING_STATE_CR)
5855 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5857 Dynarr_add (dst, '\r');
5858 flags &= ~CODING_STATE_CR;
5861 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5864 /* Now determine the charset. */
5865 reg = ((flags & CODING_STATE_SS2) ? 2
5866 : (flags & CODING_STATE_SS3) ? 3
5867 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5868 : str->iso2022.register_left);
5869 charset = str->iso2022.charset[reg];
5871 /* Error checking: */
5872 if (! CHARSETP (charset)
5873 || str->iso2022.invalid_designated[reg]
5874 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5875 && XCHARSET_CHARS (charset) == 94))
5876 /* Mrmph. We are trying to invoke a register that has no
5877 or an invalid charset in it, or trying to add a character
5878 outside the range of the charset. Insert that char literally
5879 to preserve it for the output. */
5881 COMPOSE_FLUSH_CHARS (str, dst);
5882 decode_flush_er_chars (str, dst);
5886 DECODE_ADD_BINARY_CHAR
5887 ((unsigned char)(cpos >> (counter * 8)), dst);
5890 DECODE_ADD_BINARY_CHAR (c, dst);
5895 /* Things are probably hunky-dorey. */
5897 /* Fetch reverse charset, maybe. */
5898 if (((flags & CODING_STATE_R2L) &&
5899 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5901 (!(flags & CODING_STATE_R2L) &&
5902 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5904 Lisp_Object new_charset =
5905 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5906 if (!NILP (new_charset))
5907 charset = new_charset;
5912 if (XCHARSET_DIMENSION (charset) == counter)
5914 COMPOSE_ADD_CHAR (str,
5915 DECODE_CHAR (charset,
5916 ((cpos & 0x7F7F7F) << 8)
5923 cpos = (cpos << 8) | c;
5925 lb = XCHARSET_LEADING_BYTE (charset);
5926 switch (XCHARSET_REP_BYTES (charset))
5929 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5930 Dynarr_add (dst, c & 0x7F);
5933 case 2: /* one-byte official */
5934 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5935 Dynarr_add (dst, lb);
5936 Dynarr_add (dst, c | 0x80);
5939 case 3: /* one-byte private or two-byte official */
5940 if (XCHARSET_PRIVATE_P (charset))
5942 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5943 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5944 Dynarr_add (dst, lb);
5945 Dynarr_add (dst, c | 0x80);
5951 Dynarr_add (dst, lb);
5952 Dynarr_add (dst, ch | 0x80);
5953 Dynarr_add (dst, c | 0x80);
5961 default: /* two-byte private */
5964 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5965 Dynarr_add (dst, lb);
5966 Dynarr_add (dst, ch | 0x80);
5967 Dynarr_add (dst, c | 0x80);
5977 flags &= CODING_STATE_ISO2022_LOCK;
5980 label_continue_loop:;
5983 if (flags & CODING_STATE_END)
5985 COMPOSE_FLUSH_CHARS (str, dst);
5986 decode_flush_er_chars (str, dst);
5987 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5991 str->counter = counter;
5995 /***** ISO2022 encoder *****/
5997 /* Designate CHARSET into register REG. */
6000 iso2022_designate (Lisp_Object charset, unsigned char reg,
6001 struct encoding_stream *str, unsigned_char_dynarr *dst)
6003 static const char inter94[] = "()*+";
6004 static const char inter96[] = ",-./";
6005 unsigned short chars;
6006 unsigned char dimension;
6007 unsigned char final;
6008 Lisp_Object old_charset = str->iso2022.charset[reg];
6010 str->iso2022.charset[reg] = charset;
6011 if (!CHARSETP (charset))
6012 /* charset might be an initial nil or t. */
6014 chars = XCHARSET_CHARS (charset);
6015 dimension = XCHARSET_DIMENSION (charset);
6016 final = XCHARSET_FINAL (charset);
6017 if (!str->iso2022.force_charset_on_output[reg] &&
6018 CHARSETP (old_charset) &&
6019 XCHARSET_CHARS (old_charset) == chars &&
6020 XCHARSET_DIMENSION (old_charset) == dimension &&
6021 XCHARSET_FINAL (old_charset) == final)
6024 str->iso2022.force_charset_on_output[reg] = 0;
6027 charset_conversion_spec_dynarr *dyn =
6028 str->codesys->iso2022.output_conv;
6034 for (i = 0; i < Dynarr_length (dyn); i++)
6036 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
6037 if (EQ (charset, spec->from_charset))
6038 charset = spec->to_charset;
6043 Dynarr_add (dst, ISO_CODE_ESC);
6048 Dynarr_add (dst, inter94[reg]);
6051 Dynarr_add (dst, '$');
6053 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
6056 Dynarr_add (dst, inter94[reg]);
6061 Dynarr_add (dst, inter96[reg]);
6064 Dynarr_add (dst, '$');
6065 Dynarr_add (dst, inter96[reg]);
6069 Dynarr_add (dst, final);
6073 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
6075 if (str->iso2022.register_left != 0)
6077 Dynarr_add (dst, ISO_CODE_SI);
6078 str->iso2022.register_left = 0;
6083 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
6085 if (str->iso2022.register_left != 1)
6087 Dynarr_add (dst, ISO_CODE_SO);
6088 str->iso2022.register_left = 1;
6093 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
6094 unsigned_char_dynarr *dst, unsigned int *flags)
6096 unsigned char charmask;
6097 Lisp_Coding_System* codesys = str->codesys;
6098 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6100 Lisp_Object charset = str->iso2022.current_charset;
6101 int half = str->iso2022.current_half;
6102 int code_point = -1;
6106 restore_left_to_right_direction (codesys, dst, flags, 0);
6108 /* Make sure G0 contains ASCII */
6109 if ((ch > ' ' && ch < ISO_CODE_DEL)
6110 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
6112 ensure_normal_shift (str, dst);
6113 iso2022_designate (Vcharset_ascii, 0, str, dst);
6116 /* If necessary, restore everything to the default state
6118 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
6120 restore_left_to_right_direction (codesys, dst, flags, 0);
6122 ensure_normal_shift (str, dst);
6124 for (i = 0; i < 4; i++)
6126 Lisp_Object initial_charset =
6127 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6128 iso2022_designate (initial_charset, i, str, dst);
6133 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6134 Dynarr_add (dst, '\r');
6135 if (eol_type != EOL_CR)
6136 Dynarr_add (dst, ch);
6140 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6141 && fit_to_be_escape_quoted (ch))
6142 Dynarr_add (dst, ISO_CODE_ESC);
6143 Dynarr_add (dst, ch);
6146 else if ( (0x80 <= ch) && (ch <= 0x9f) )
6148 charmask = (half == 0 ? 0x00 : 0x80);
6150 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6151 && fit_to_be_escape_quoted (ch))
6152 Dynarr_add (dst, ISO_CODE_ESC);
6153 /* you asked for it ... */
6154 Dynarr_add (dst, ch);
6160 /* Now determine which register to use. */
6162 for (i = 0; i < 4; i++)
6164 if ((CHARSETP (charset = str->iso2022.charset[i])
6165 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
6169 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
6170 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
6178 Lisp_Object original_default_coded_charset_priority_list
6179 = Vdefault_coded_charset_priority_list;
6180 Vdefault_coded_charset_priority_list
6181 = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
6182 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6184 code_point = ENCODE_CHAR (ch, charset);
6185 if (XCHARSET_FINAL (charset))
6187 Vdefault_coded_charset_priority_list
6188 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6189 Vdefault_coded_charset_priority_list));
6191 Vdefault_coded_charset_priority_list
6192 = original_default_coded_charset_priority_list;
6193 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6195 code_point = ENCODE_CHAR (ch, charset);
6196 if (XCHARSET_FINAL (charset))
6198 Vdefault_coded_charset_priority_list
6199 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6200 Vdefault_coded_charset_priority_list));
6202 code_point = ENCODE_CHAR (ch, charset);
6203 if (!XCHARSET_FINAL (charset))
6205 charset = Vcharset_ascii;
6209 Vdefault_coded_charset_priority_list
6210 = original_default_coded_charset_priority_list;
6212 ensure_correct_direction (XCHARSET_DIRECTION (charset),
6213 codesys, dst, flags, 0);
6217 if (XCHARSET_GRAPHIC (charset) != 0)
6219 if (!NILP (str->iso2022.charset[1]) &&
6220 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6221 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6223 else if (!NILP (str->iso2022.charset[2]))
6225 else if (!NILP (str->iso2022.charset[3]))
6234 iso2022_designate (charset, reg, str, dst);
6236 /* Now invoke that register. */
6240 ensure_normal_shift (str, dst);
6244 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6246 ensure_shift_out (str, dst);
6253 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6255 Dynarr_add (dst, ISO_CODE_ESC);
6256 Dynarr_add (dst, 'N');
6261 Dynarr_add (dst, ISO_CODE_SS2);
6266 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6268 Dynarr_add (dst, ISO_CODE_ESC);
6269 Dynarr_add (dst, 'O');
6274 Dynarr_add (dst, ISO_CODE_SS3);
6282 charmask = (half == 0 ? 0x00 : 0x80);
6284 switch (XCHARSET_DIMENSION (charset))
6287 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6290 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6291 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6294 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6295 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6296 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6299 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6300 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6301 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6302 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6308 str->iso2022.current_charset = charset;
6309 str->iso2022.current_half = half;
6313 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6314 unsigned int *flags)
6316 Lisp_Coding_System* codesys = str->codesys;
6319 restore_left_to_right_direction (codesys, dst, flags, 0);
6320 ensure_normal_shift (str, dst);
6321 for (i = 0; i < 4; i++)
6323 Lisp_Object initial_charset
6324 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6325 iso2022_designate (initial_charset, i, str, dst);
6330 /************************************************************************/
6331 /* No-conversion methods */
6332 /************************************************************************/
6334 /* This is used when reading in "binary" files -- i.e. files that may
6335 contain all 256 possible byte values and that are not to be
6336 interpreted as being in any particular decoding. */
6338 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6339 unsigned_char_dynarr *dst, Lstream_data_count n)
6341 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6342 unsigned int flags = str->flags;
6343 unsigned int cpos = str->cpos;
6344 eol_type_t eol_type = str->eol_type;
6348 unsigned char c = *(unsigned char *)src++;
6350 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6351 DECODE_ADD_BINARY_CHAR (c, dst);
6352 label_continue_loop:;
6355 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6362 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6363 unsigned_char_dynarr *dst, Lstream_data_count n)
6366 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6367 unsigned int flags = str->flags;
6368 unsigned int ch = str->ch;
6369 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6371 unsigned char char_boundary = str->iso2022.current_char_boundary;
6378 if (char_boundary == 0)
6384 else if ( c >= 0xf8 )
6389 else if ( c >= 0xf0 )
6394 else if ( c >= 0xe0 )
6399 else if ( c >= 0xc0 )
6409 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6410 Dynarr_add (dst, '\r');
6411 if (eol_type != EOL_CR)
6412 Dynarr_add (dst, c);
6415 Dynarr_add (dst, c);
6418 else if (char_boundary == 1)
6420 ch = ( ch << 6 ) | ( c & 0x3f );
6421 Dynarr_add (dst, ch & 0xff);
6426 ch = ( ch << 6 ) | ( c & 0x3f );
6429 #else /* not UTF2000 */
6432 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6433 Dynarr_add (dst, '\r');
6434 if (eol_type != EOL_CR)
6435 Dynarr_add (dst, '\n');
6438 else if (BYTE_ASCII_P (c))
6441 Dynarr_add (dst, c);
6443 else if (BUFBYTE_LEADING_BYTE_P (c))
6446 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6447 c == LEADING_BYTE_CONTROL_1)
6450 Dynarr_add (dst, '~'); /* untranslatable character */
6454 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6455 Dynarr_add (dst, c);
6456 else if (ch == LEADING_BYTE_CONTROL_1)
6459 Dynarr_add (dst, c - 0x20);
6461 /* else it should be the second or third byte of an
6462 untranslatable character, so ignore it */
6465 #endif /* not UTF2000 */
6471 str->iso2022.current_char_boundary = char_boundary;
6477 /************************************************************************/
6478 /* Initialization */
6479 /************************************************************************/
6482 syms_of_file_coding (void)
6484 INIT_LRECORD_IMPLEMENTATION (coding_system);
6486 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6488 DEFSUBR (Fcoding_system_p);
6489 DEFSUBR (Ffind_coding_system);
6490 DEFSUBR (Fget_coding_system);
6491 DEFSUBR (Fcoding_system_list);
6492 DEFSUBR (Fcoding_system_name);
6493 DEFSUBR (Fmake_coding_system);
6494 DEFSUBR (Fcopy_coding_system);
6495 DEFSUBR (Fcoding_system_canonical_name_p);
6496 DEFSUBR (Fcoding_system_alias_p);
6497 DEFSUBR (Fcoding_system_aliasee);
6498 DEFSUBR (Fdefine_coding_system_alias);
6499 DEFSUBR (Fsubsidiary_coding_system);
6501 DEFSUBR (Fcoding_system_type);
6502 DEFSUBR (Fcoding_system_doc_string);
6504 DEFSUBR (Fcoding_system_charset);
6506 DEFSUBR (Fcoding_system_property);
6508 DEFSUBR (Fcoding_category_list);
6509 DEFSUBR (Fset_coding_priority_list);
6510 DEFSUBR (Fcoding_priority_list);
6511 DEFSUBR (Fset_coding_category_system);
6512 DEFSUBR (Fcoding_category_system);
6514 DEFSUBR (Fdetect_coding_region);
6515 DEFSUBR (Fdecode_coding_region);
6516 DEFSUBR (Fencode_coding_region);
6518 DEFSUBR (Fdecode_shift_jis_char);
6519 DEFSUBR (Fencode_shift_jis_char);
6520 DEFSUBR (Fdecode_big5_char);
6521 DEFSUBR (Fencode_big5_char);
6523 defsymbol (&Qcoding_systemp, "coding-system-p");
6524 defsymbol (&Qno_conversion, "no-conversion");
6525 defsymbol (&Qraw_text, "raw-text");
6527 defsymbol (&Qbig5, "big5");
6528 defsymbol (&Qshift_jis, "shift-jis");
6529 defsymbol (&Qucs4, "ucs-4");
6530 defsymbol (&Qutf8, "utf-8");
6531 defsymbol (&Qutf16, "utf-16");
6532 defsymbol (&Qccl, "ccl");
6533 defsymbol (&Qiso2022, "iso2022");
6535 defsymbol (&Qmnemonic, "mnemonic");
6536 defsymbol (&Qeol_type, "eol-type");
6537 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6538 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6540 defsymbol (&Qcr, "cr");
6541 defsymbol (&Qlf, "lf");
6542 defsymbol (&Qcrlf, "crlf");
6543 defsymbol (&Qeol_cr, "eol-cr");
6544 defsymbol (&Qeol_lf, "eol-lf");
6545 defsymbol (&Qeol_crlf, "eol-crlf");
6547 defsymbol (&Qcharset_g0, "charset-g0");
6548 defsymbol (&Qcharset_g1, "charset-g1");
6549 defsymbol (&Qcharset_g2, "charset-g2");
6550 defsymbol (&Qcharset_g3, "charset-g3");
6551 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6552 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6553 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6554 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6555 defsymbol (&Qno_iso6429, "no-iso6429");
6556 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6557 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6559 defsymbol (&Qshort, "short");
6560 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6561 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6562 defsymbol (&Qseven, "seven");
6563 defsymbol (&Qlock_shift, "lock-shift");
6564 defsymbol (&Qescape_quoted, "escape-quoted");
6567 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6568 defsymbol (&Qdisable_composition, "disable-composition");
6569 defsymbol (&Qenable_decomposition, "enable-decomposition");
6570 defsymbol (&Qccs_priority_list, "ccs-priority-list");
6571 defsymbol (&Quse_entity_reference, "use-entity-reference");
6572 defsymbol (&Qd, "d");
6573 defsymbol (&Qx, "x");
6574 defsymbol (&QX, "X");
6576 defsymbol (&Qencode, "encode");
6577 defsymbol (&Qdecode, "decode");
6580 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6582 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6584 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6586 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6588 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6590 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6592 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6594 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6596 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6598 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6601 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6606 lstream_type_create_file_coding (void)
6608 LSTREAM_HAS_METHOD (decoding, reader);
6609 LSTREAM_HAS_METHOD (decoding, writer);
6610 LSTREAM_HAS_METHOD (decoding, rewinder);
6611 LSTREAM_HAS_METHOD (decoding, seekable_p);
6612 LSTREAM_HAS_METHOD (decoding, flusher);
6613 LSTREAM_HAS_METHOD (decoding, closer);
6614 LSTREAM_HAS_METHOD (decoding, marker);
6616 LSTREAM_HAS_METHOD (encoding, reader);
6617 LSTREAM_HAS_METHOD (encoding, writer);
6618 LSTREAM_HAS_METHOD (encoding, rewinder);
6619 LSTREAM_HAS_METHOD (encoding, seekable_p);
6620 LSTREAM_HAS_METHOD (encoding, flusher);
6621 LSTREAM_HAS_METHOD (encoding, closer);
6622 LSTREAM_HAS_METHOD (encoding, marker);
6626 vars_of_file_coding (void)
6630 fcd = xnew (struct file_coding_dump);
6631 dump_add_root_struct_ptr (&fcd, &fcd_description);
6633 /* Initialize to something reasonable ... */
6634 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6636 fcd->coding_category_system[i] = Qnil;
6637 fcd->coding_category_by_priority[i] = i;
6640 Fprovide (intern ("file-coding"));
6642 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6643 Coding system used for TTY keyboard input.
6644 Not used under a windowing system.
6646 Vkeyboard_coding_system = Qnil;
6648 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6649 Coding system used for TTY display output.
6650 Not used under a windowing system.
6652 Vterminal_coding_system = Qnil;
6654 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6655 Overriding coding system used when reading from a file or process.
6656 You should bind this variable with `let', but do not set it globally.
6657 If this is non-nil, it specifies the coding system that will be used
6658 to decode input on read operations, such as from a file or process.
6659 It overrides `buffer-file-coding-system-for-read',
6660 `insert-file-contents-pre-hook', etc. Use those variables instead of
6661 this one for permanent changes to the environment. */ );
6662 Vcoding_system_for_read = Qnil;
6664 DEFVAR_LISP ("coding-system-for-write",
6665 &Vcoding_system_for_write /*
6666 Overriding coding system used when writing to a file or process.
6667 You should bind this variable with `let', but do not set it globally.
6668 If this is non-nil, it specifies the coding system that will be used
6669 to encode output for write operations, such as to a file or process.
6670 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6671 Use those variables instead of this one for permanent changes to the
6673 Vcoding_system_for_write = Qnil;
6675 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6676 Coding system used to convert pathnames when accessing files.
6678 Vfile_name_coding_system = Qnil;
6680 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6681 &Vcoded_charset_entity_reference_alist /*
6682 Alist of coded-charset vs corresponding entity-reference.
6683 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6684 CCS is coded-charset.
6685 CODE-COLUMNS is columns of code-point of entity-reference.
6686 CODE-TYPE is format type of code-point of entity-reference.
6687 `d' means decimal value and `x' means hexadecimal value.
6689 Vcoded_charset_entity_reference_alist = Qnil;
6691 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6692 Non-nil means the buffer contents are regarded as multi-byte form
6693 of characters, not a binary code. This affects the display, file I/O,
6694 and behaviors of various editing commands.
6696 Setting this to nil does not do anything.
6698 enable_multibyte_characters = 1;
6701 DEFVAR_LISP ("decomposition-feature-list",
6702 &Vdecomposition_feature_list /*
6703 List of `=decomposition@FOO' feature to encode characters as IVS.
6705 Vdecomposition_feature_list = Qnil;
6710 complex_vars_of_file_coding (void)
6712 staticpro (&Vcoding_system_hash_table);
6713 Vcoding_system_hash_table =
6714 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6716 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6717 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6719 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6721 struct codesys_prop csp; \
6723 csp.prop_type = (Prop_Type); \
6724 Dynarr_add (the_codesys_prop_dynarr, csp); \
6727 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6728 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6729 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6730 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6731 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6732 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6733 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6735 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6736 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6737 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6738 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6739 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6740 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6741 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6742 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6743 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6744 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6745 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6746 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6747 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6748 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6749 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6750 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6751 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6753 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6756 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6757 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6759 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6760 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qenable_decomposition);
6761 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6764 /* Need to create this here or we're really screwed. */
6766 (Qraw_text, Qno_conversion,
6767 build_string ("Raw text, which means it converts only line-break-codes."),
6768 list2 (Qmnemonic, build_string ("Raw")));
6771 (Qbinary, Qno_conversion,
6772 build_string ("Binary, which means it does not convert anything."),
6773 list4 (Qeol_type, Qlf,
6774 Qmnemonic, build_string ("Binary")));
6780 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6781 list2 (Qmnemonic, build_string ("MTF8")));
6784 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6786 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6788 Fdefine_coding_system_alias (Qterminal, Qbinary);
6789 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6791 /* Need this for bootstrapping */
6792 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6793 Fget_coding_system (Qraw_text);
6796 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6797 = Fget_coding_system (Qutf_8_mcs);
6800 #if defined(MULE) && !defined(UTF2000)
6804 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6805 fcd->ucs_to_mule_table[i] = Qnil;
6807 staticpro (&mule_to_ucs_table);
6808 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6809 #endif /* defined(MULE) && !defined(UTF2000) */