1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2011
7 This file is part of XEmacs.
9 XEmacs is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
14 XEmacs is distributed in the hope that it will be useful, but WITHOUT
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with XEmacs; see the file COPYING. If not, write to
21 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 Boston, MA 02111-1307, USA. */
24 /* Synched up with: Mule 2.3. Not in FSF. */
26 /* Rewritten by Ben Wing <ben@xemacs.org>. */
27 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
41 #include "file-coding.h"
43 Lisp_Object Qcoding_system_error;
45 Lisp_Object Vkeyboard_coding_system;
46 Lisp_Object Vterminal_coding_system;
47 Lisp_Object Vcoding_system_for_read;
48 Lisp_Object Vcoding_system_for_write;
49 Lisp_Object Vfile_name_coding_system;
51 Lisp_Object Vcoded_charset_entity_reference_alist;
53 /* Table of symbols identifying each coding category. */
54 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
58 struct file_coding_dump {
59 /* Coding system currently associated with each coding category. */
60 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
62 /* Table of all coding categories in decreasing order of priority.
63 This describes a permutation of the possible coding categories. */
64 int coding_category_by_priority[CODING_CATEGORY_LAST];
66 #if defined(MULE) && !defined(UTF2000)
67 Lisp_Object ucs_to_mule_table[65536];
71 static const struct lrecord_description fcd_description_1[] = {
72 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
73 #if defined(MULE) && !defined(UTF2000)
74 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
79 static const struct struct_description fcd_description = {
80 sizeof (struct file_coding_dump),
84 Lisp_Object mule_to_ucs_table;
86 Lisp_Object Qcoding_systemp;
88 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
89 /* Qinternal in general.c */
91 Lisp_Object Qmnemonic, Qeol_type;
92 Lisp_Object Qcr, Qcrlf, Qlf;
93 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
94 Lisp_Object Qpost_read_conversion;
95 Lisp_Object Qpre_write_conversion;
98 Lisp_Object Qucs4, Qutf16, Qutf8;
99 Lisp_Object Qbig5, Qshift_jis;
100 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
101 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
102 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
103 Lisp_Object Qno_iso6429;
104 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
105 Lisp_Object Qescape_quoted;
106 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
109 Lisp_Object Qutf_8_mcs;
110 Lisp_Object Qdisable_composition;
111 Lisp_Object Qccs_priority_list;
112 Lisp_Object Quse_entity_reference;
113 Lisp_Object Qd, Qx, QX;
114 Lisp_Object Vdecomposition_feature_list;
116 Lisp_Object Qencode, Qdecode;
118 Lisp_Object Vcoding_system_hash_table;
120 int enable_multibyte_characters;
123 /* Additional information used by the ISO2022 decoder and detector. */
124 struct iso2022_decoder
126 /* CHARSET holds the character sets currently assigned to the G0
127 through G3 variables. It is initialized from the array
128 INITIAL_CHARSET in CODESYS. */
129 Lisp_Object charset[4];
131 /* Which registers are currently invoked into the left (GL) and
132 right (GR) halves of the 8-bit encoding space? */
133 int register_left, register_right;
135 /* ISO_ESC holds a value indicating part of an escape sequence
136 that has already been seen. */
137 enum iso_esc_flag esc;
139 /* This records the bytes we've seen so far in an escape sequence,
140 in case the sequence is invalid (we spit out the bytes unchanged). */
141 unsigned char esc_bytes[8];
143 /* Index for next byte to store in ISO escape sequence. */
146 #ifdef ENABLE_COMPOSITE_CHARS
147 /* Stuff seen so far when composing a string. */
148 unsigned_char_dynarr *composite_chars;
151 /* If we saw an invalid designation sequence for a particular
152 register, we flag it here and switch to ASCII. The next time we
153 see a valid designation for this register, we turn off the flag
154 and do the designation normally, but pretend the sequence was
155 invalid. The effect of all this is that (most of the time) the
156 escape sequences for both the switch to the unknown charset, and
157 the switch back to the known charset, get inserted literally into
158 the buffer and saved out as such. The hope is that we can
159 preserve the escape sequences so that the resulting written out
160 file makes sense. If we don't do any of this, the designation
161 to the invalid charset will be preserved but that switch back
162 to the known charset will probably get eaten because it was
163 the same charset that was already present in the register. */
164 unsigned char invalid_designated[4];
166 /* We try to do similar things as above for direction-switching
167 sequences. If we encountered a direction switch while an
168 invalid designation was present, or an invalid designation
169 just after a direction switch (i.e. no valid designation
170 encountered yet), we insert the direction-switch escape
171 sequence literally into the output stream, and later on
172 insert the corresponding direction-restoring escape sequence
174 unsigned int switched_dir_and_no_valid_charset_yet :1;
175 unsigned int invalid_switch_dir :1;
177 /* Tells the decoder to output the escape sequence literally
178 even though it was valid. Used in the games we play to
179 avoid lossage when we encounter invalid designations. */
180 unsigned int output_literally :1;
181 /* We encountered a direction switch followed by an invalid
182 designation. We didn't output the direction switch
183 literally because we didn't know about the invalid designation;
184 but we have to do so now. */
185 unsigned int output_direction_sequence :1;
188 EXFUN (Fcopy_coding_system, 2);
190 struct detection_state;
193 text_encode_generic (Lstream *encoding, const Bufbyte *src,
194 unsigned_char_dynarr *dst, Lstream_data_count n);
196 static int detect_coding_sjis (struct detection_state *st,
197 const Extbyte *src, Lstream_data_count n);
198 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
199 unsigned_char_dynarr *dst, Lstream_data_count n);
200 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
201 unsigned_char_dynarr *dst, unsigned int *flags);
202 void char_finish_shift_jis (struct encoding_stream *str,
203 unsigned_char_dynarr *dst, unsigned int *flags);
205 static int detect_coding_big5 (struct detection_state *st,
206 const Extbyte *src, Lstream_data_count n);
207 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
208 unsigned_char_dynarr *dst, Lstream_data_count n);
209 void char_encode_big5 (struct encoding_stream *str, Emchar c,
210 unsigned_char_dynarr *dst, unsigned int *flags);
211 void char_finish_big5 (struct encoding_stream *str,
212 unsigned_char_dynarr *dst, unsigned int *flags);
214 static int detect_coding_ucs4 (struct detection_state *st,
215 const Extbyte *src, Lstream_data_count n);
216 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
217 unsigned_char_dynarr *dst, Lstream_data_count n);
218 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
219 unsigned_char_dynarr *dst, unsigned int *flags);
220 void char_finish_ucs4 (struct encoding_stream *str,
221 unsigned_char_dynarr *dst, unsigned int *flags);
223 static int detect_coding_utf16 (struct detection_state *st,
224 const Extbyte *src, Lstream_data_count n);
225 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
226 unsigned_char_dynarr *dst, Lstream_data_count n);
227 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
228 unsigned_char_dynarr *dst, unsigned int *flags);
229 void char_finish_utf16 (struct encoding_stream *str,
230 unsigned_char_dynarr *dst, unsigned int *flags);
232 static int detect_coding_utf8 (struct detection_state *st,
233 const Extbyte *src, Lstream_data_count n);
234 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
235 unsigned_char_dynarr *dst, Lstream_data_count n);
236 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
237 unsigned_char_dynarr *dst, unsigned int *flags);
238 void char_finish_utf8 (struct encoding_stream *str,
239 unsigned_char_dynarr *dst, unsigned int *flags);
241 static int postprocess_iso2022_mask (int mask);
242 static void reset_iso2022 (Lisp_Object coding_system,
243 struct iso2022_decoder *iso);
244 static int detect_coding_iso2022 (struct detection_state *st,
245 const Extbyte *src, Lstream_data_count n);
246 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
247 unsigned_char_dynarr *dst, Lstream_data_count n);
248 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
249 unsigned_char_dynarr *dst, unsigned int *flags);
250 void char_finish_iso2022 (struct encoding_stream *str,
251 unsigned_char_dynarr *dst, unsigned int *flags);
253 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
254 unsigned_char_dynarr *dst, Lstream_data_count n);
255 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
256 unsigned_char_dynarr *dst, Lstream_data_count n);
257 static void mule_decode (Lstream *decoding, const Extbyte *src,
258 unsigned_char_dynarr *dst, Lstream_data_count n);
259 static void mule_encode (Lstream *encoding, const Bufbyte *src,
260 unsigned_char_dynarr *dst, Lstream_data_count n);
262 typedef struct codesys_prop codesys_prop;
271 Dynarr_declare (codesys_prop);
272 } codesys_prop_dynarr;
274 static const struct lrecord_description codesys_prop_description_1[] = {
275 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
279 static const struct struct_description codesys_prop_description = {
280 sizeof (codesys_prop),
281 codesys_prop_description_1
284 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
285 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
289 static const struct struct_description codesys_prop_dynarr_description = {
290 sizeof (codesys_prop_dynarr),
291 codesys_prop_dynarr_description_1
294 codesys_prop_dynarr *the_codesys_prop_dynarr;
296 enum codesys_prop_enum
299 CODESYS_PROP_ISO2022,
304 /************************************************************************/
305 /* Coding system functions */
306 /************************************************************************/
308 static Lisp_Object mark_coding_system (Lisp_Object);
309 static void print_coding_system (Lisp_Object, Lisp_Object, int);
310 static void finalize_coding_system (void *header, int for_disksave);
313 static const struct lrecord_description ccs_description_1[] = {
314 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
315 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
319 static const struct struct_description ccs_description = {
320 sizeof (charset_conversion_spec),
324 static const struct lrecord_description ccsd_description_1[] = {
325 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
329 static const struct struct_description ccsd_description = {
330 sizeof (charset_conversion_spec_dynarr),
335 static const struct lrecord_description coding_system_description[] = {
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
341 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
342 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
343 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
345 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
346 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
347 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
348 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
349 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
351 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
357 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
358 mark_coding_system, print_coding_system,
359 finalize_coding_system,
360 0, 0, coding_system_description,
364 mark_coding_system (Lisp_Object obj)
366 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
368 mark_object (CODING_SYSTEM_NAME (codesys));
369 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
370 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
371 mark_object (CODING_SYSTEM_EOL_LF (codesys));
372 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
373 mark_object (CODING_SYSTEM_EOL_CR (codesys));
375 switch (CODING_SYSTEM_TYPE (codesys))
379 case CODESYS_ISO2022:
380 for (i = 0; i < 4; i++)
381 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
382 if (codesys->iso2022.input_conv)
384 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
386 struct charset_conversion_spec *ccs =
387 Dynarr_atp (codesys->iso2022.input_conv, i);
388 mark_object (ccs->from_charset);
389 mark_object (ccs->to_charset);
392 if (codesys->iso2022.output_conv)
394 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
396 struct charset_conversion_spec *ccs =
397 Dynarr_atp (codesys->iso2022.output_conv, i);
398 mark_object (ccs->from_charset);
399 mark_object (ccs->to_charset);
406 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
407 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
412 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
413 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
420 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
422 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
424 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
428 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
431 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
433 error ("printing unreadable object #<coding_system 0x%x>",
436 write_c_string ("#<coding_system ", printcharfun);
437 print_internal (c->name, printcharfun, 1);
438 write_c_string (">", printcharfun);
442 finalize_coding_system (void *header, int for_disksave)
444 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
445 /* Since coding systems never go away, this function is not
446 necessary. But it would be necessary if we changed things
447 so that coding systems could go away. */
448 if (!for_disksave) /* see comment in lstream.c */
450 switch (CODING_SYSTEM_TYPE (c))
453 case CODESYS_ISO2022:
454 if (c->iso2022.input_conv)
456 Dynarr_free (c->iso2022.input_conv);
457 c->iso2022.input_conv = 0;
459 if (c->iso2022.output_conv)
461 Dynarr_free (c->iso2022.output_conv);
462 c->iso2022.output_conv = 0;
473 symbol_to_eol_type (Lisp_Object symbol)
475 CHECK_SYMBOL (symbol);
476 if (NILP (symbol)) return EOL_AUTODETECT;
477 if (EQ (symbol, Qlf)) return EOL_LF;
478 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
479 if (EQ (symbol, Qcr)) return EOL_CR;
481 signal_simple_error ("Unrecognized eol type", symbol);
482 return EOL_AUTODETECT; /* not reached */
486 eol_type_to_symbol (eol_type_t type)
491 case EOL_LF: return Qlf;
492 case EOL_CRLF: return Qcrlf;
493 case EOL_CR: return Qcr;
494 case EOL_AUTODETECT: return Qnil;
499 setup_eol_coding_systems (Lisp_Coding_System *codesys)
501 Lisp_Object codesys_obj;
502 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
503 char *codesys_name = (char *) alloca (len + 7);
505 char *codesys_mnemonic=0;
507 Lisp_Object codesys_name_sym, sub_codesys_obj;
511 XSETCODING_SYSTEM (codesys_obj, codesys);
513 memcpy (codesys_name,
514 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
516 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
518 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
519 codesys_mnemonic = (char *) alloca (mlen + 7);
520 memcpy (codesys_mnemonic,
521 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
524 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
525 strcpy (codesys_name + len, "-" op_sys); \
527 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
528 codesys_name_sym = intern (codesys_name); \
529 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
530 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
532 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
533 build_string (codesys_mnemonic); \
534 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
537 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
538 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
539 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
542 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
543 Return t if OBJECT is a coding system.
544 A coding system is an object that defines how text containing multiple
545 character sets is encoded into a stream of (typically 8-bit) bytes.
546 The coding system is used to decode the stream into a series of
547 characters (which may be from multiple charsets) when the text is read
548 from a file or process, and is used to encode the text back into the
549 same format when it is written out to a file or process.
551 For example, many ISO2022-compliant coding systems (such as Compound
552 Text, which is used for inter-client data under the X Window System)
553 use escape sequences to switch between different charsets -- Japanese
554 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
555 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
556 `make-coding-system' for more information.
558 Coding systems are normally identified using a symbol, and the
559 symbol is accepted in place of the actual coding system object whenever
560 a coding system is called for. (This is similar to how faces work.)
564 return CODING_SYSTEMP (object) ? Qt : Qnil;
567 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
568 Retrieve the coding system of the given name.
570 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
571 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
572 If there is no such coding system, nil is returned. Otherwise the
573 associated coding system object is returned.
575 (coding_system_or_name))
577 if (NILP (coding_system_or_name))
578 coding_system_or_name = Qbinary;
579 else if (CODING_SYSTEMP (coding_system_or_name))
580 return coding_system_or_name;
582 CHECK_SYMBOL (coding_system_or_name);
586 coding_system_or_name =
587 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
589 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
590 return coding_system_or_name;
594 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
595 Retrieve the coding system of the given name.
596 Same as `find-coding-system' except that if there is no such
597 coding system, an error is signaled instead of returning nil.
601 Lisp_Object coding_system = Ffind_coding_system (name);
603 if (NILP (coding_system))
604 signal_simple_error ("No such coding system", name);
605 return coding_system;
608 /* We store the coding systems in hash tables with the names as the key and the
609 actual coding system object as the value. Occasionally we need to use them
610 in a list format. These routines provide us with that. */
611 struct coding_system_list_closure
613 Lisp_Object *coding_system_list;
617 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
618 void *coding_system_list_closure)
620 /* This function can GC */
621 struct coding_system_list_closure *cscl =
622 (struct coding_system_list_closure *) coding_system_list_closure;
623 Lisp_Object *coding_system_list = cscl->coding_system_list;
625 *coding_system_list = Fcons (key, *coding_system_list);
629 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
630 Return a list of the names of all defined coding systems.
634 Lisp_Object coding_system_list = Qnil;
636 struct coding_system_list_closure coding_system_list_closure;
638 GCPRO1 (coding_system_list);
639 coding_system_list_closure.coding_system_list = &coding_system_list;
640 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
641 &coding_system_list_closure);
644 return coding_system_list;
647 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
648 Return the name of the given coding system.
652 coding_system = Fget_coding_system (coding_system);
653 return XCODING_SYSTEM_NAME (coding_system);
656 static Lisp_Coding_System *
657 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
659 Lisp_Coding_System *codesys =
660 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
662 zero_lcrecord (codesys);
663 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
664 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
665 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
666 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
667 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
668 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
669 CODING_SYSTEM_TYPE (codesys) = type;
670 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
673 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
675 if (type == CODESYS_ISO2022)
678 for (i = 0; i < 4; i++)
679 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
682 if (type == CODESYS_UTF8)
684 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
686 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
688 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
690 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
693 else if (type == CODESYS_BIG5)
695 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
697 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
698 = Vcharset_chinese_big5;
699 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
701 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
705 else if (type == CODESYS_CCL)
707 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
708 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
711 CODING_SYSTEM_NAME (codesys) = name;
717 /* Given a list of charset conversion specs as specified in a Lisp
718 program, parse it into STORE_HERE. */
721 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
722 Lisp_Object spec_list)
726 EXTERNAL_LIST_LOOP (rest, spec_list)
728 Lisp_Object car = XCAR (rest);
729 Lisp_Object from, to;
730 struct charset_conversion_spec spec;
732 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
733 signal_simple_error ("Invalid charset conversion spec", car);
734 from = Fget_charset (XCAR (car));
735 to = Fget_charset (XCAR (XCDR (car)));
736 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
737 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
738 signal_simple_error_2
739 ("Attempted conversion between different charset types",
741 spec.from_charset = from;
742 spec.to_charset = to;
744 Dynarr_add (store_here, spec);
748 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
749 specs, return the equivalent as the Lisp programmer would see it.
751 If LOAD_HERE is 0, return Qnil. */
754 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
761 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
763 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
764 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
767 return Fnreverse (result);
772 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
773 Register symbol NAME as a coding system.
775 TYPE describes the conversion method used and should be one of
778 Automatic conversion. XEmacs attempts to detect the coding system
781 No conversion. Use this for binary files and such. On output,
782 graphic characters that are not in ASCII or Latin-1 will be
783 replaced by a ?. (For a no-conversion-encoded buffer, these
784 characters will only be present if you explicitly insert them.)
786 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
788 ISO 10646 UCS-4 encoding.
790 ISO 10646 UTF-8 encoding.
792 Any ISO2022-compliant encoding. Among other things, this includes
793 JIS (the Japanese encoding commonly used for e-mail), EUC (the
794 standard Unix encoding for Japanese and other languages), and
795 Compound Text (the encoding used in X11). You can specify more
796 specific information about the conversion with the PROPS argument.
798 Big5 (the encoding commonly used for Taiwanese).
800 The conversion is performed using a user-written pseudo-code
801 program. CCL (Code Conversion Language) is the name of this
804 Write out or read in the raw contents of the memory representing
805 the buffer's text. This is primarily useful for debugging
806 purposes, and is only enabled when XEmacs has been compiled with
807 DEBUG_XEMACS defined (via the --debug configure option).
808 WARNING: Reading in a file using 'internal conversion can result
809 in an internal inconsistency in the memory representing a
810 buffer's text, which will produce unpredictable results and may
811 cause XEmacs to crash. Under normal circumstances you should
812 never use 'internal conversion.
814 DOC-STRING is a string describing the coding system.
816 PROPS is a property list, describing the specific nature of the
817 character set. Recognized properties are:
820 String to be displayed in the modeline when this coding system is
824 End-of-line conversion to be used. It should be one of
827 Automatically detect the end-of-line type (LF, CRLF,
828 or CR). Also generate subsidiary coding systems named
829 `NAME-unix', `NAME-dos', and `NAME-mac', that are
830 identical to this coding system but have an EOL-TYPE
831 value of 'lf, 'crlf, and 'cr, respectively.
833 The end of a line is marked externally using ASCII LF.
834 Since this is also the way that XEmacs represents an
835 end-of-line internally, specifying this option results
836 in no end-of-line conversion. This is the standard
837 format for Unix text files.
839 The end of a line is marked externally using ASCII
840 CRLF. This is the standard format for MS-DOS text
843 The end of a line is marked externally using ASCII CR.
844 This is the standard format for Macintosh text files.
846 Automatically detect the end-of-line type but do not
847 generate subsidiary coding systems. (This value is
848 converted to nil when stored internally, and
849 `coding-system-property' will return nil.)
852 If non-nil, composition/decomposition for combining characters
855 'use-entity-reference
856 If non-nil, SGML style entity-reference is used for non-system-characters.
858 'post-read-conversion
859 Function called after a file has been read in, to perform the
860 decoding. Called with two arguments, START and END, denoting
861 a region of the current buffer to be decoded.
863 'pre-write-conversion
864 Function called before a file is written out, to perform the
865 encoding. Called with two arguments, START and END, denoting
866 a region of the current buffer to be encoded.
869 The following additional properties are recognized if TYPE is 'iso2022:
875 The character set initially designated to the G0 - G3 registers.
876 The value should be one of
878 -- A charset object (designate that character set)
879 -- nil (do not ever use this register)
880 -- t (no character set is initially designated to
881 the register, but may be later on; this automatically
882 sets the corresponding `force-g*-on-output' property)
888 If non-nil, send an explicit designation sequence on output before
889 using the specified register.
892 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
893 "ESC $ B" on output in place of the full designation sequences
894 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
897 If non-nil, don't designate ASCII to G0 at each end of line on output.
898 Setting this to non-nil also suppresses other state-resetting that
899 normally happens at the end of a line.
902 If non-nil, don't designate ASCII to G0 before control chars on output.
905 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
909 If non-nil, use locking-shift (SO/SI) instead of single-shift
910 or designation by escape sequence.
913 If non-nil, don't use ISO6429's direction specification.
916 If non-nil, literal control characters that are the same as
917 the beginning of a recognized ISO2022 or ISO6429 escape sequence
918 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
919 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
920 so that they can be properly distinguished from an escape sequence.
921 (Note that doing this results in a non-portable encoding.) This
922 encoding flag is used for byte-compiled files. Note that ESC
923 is a good choice for a quoting character because there are no
924 escape sequences whose second byte is a character from the Control-0
925 or Control-1 character sets; this is explicitly disallowed by the
928 'input-charset-conversion
929 A list of conversion specifications, specifying conversion of
930 characters in one charset to another when decoding is performed.
931 Each specification is a list of two elements: the source charset,
932 and the destination charset.
934 'output-charset-conversion
935 A list of conversion specifications, specifying conversion of
936 characters in one charset to another when encoding is performed.
937 The form of each specification is the same as for
938 'input-charset-conversion.
941 The following additional properties are recognized (and required)
945 CCL program used for decoding (converting to internal format).
948 CCL program used for encoding (converting to external format).
950 (name, type, doc_string, props))
952 Lisp_Coding_System *codesys;
953 enum coding_system_type ty;
954 int need_to_setup_eol_systems = 1;
956 /* Convert type to constant */
957 if (NILP (type) || EQ (type, Qundecided))
958 { ty = CODESYS_AUTODETECT; }
960 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
961 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
962 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
963 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
964 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
965 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
966 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
968 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
970 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
973 signal_simple_error ("Invalid coding system type", type);
977 codesys = allocate_coding_system (ty, name);
979 if (NILP (doc_string))
980 doc_string = build_string ("");
982 CHECK_STRING (doc_string);
983 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
986 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
988 if (EQ (key, Qmnemonic))
991 CHECK_STRING (value);
992 CODING_SYSTEM_MNEMONIC (codesys) = value;
995 else if (EQ (key, Qeol_type))
997 need_to_setup_eol_systems = NILP (value);
1000 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1003 else if (EQ (key, Qpost_read_conversion))
1004 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1005 else if (EQ (key, Qpre_write_conversion))
1006 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1008 else if (EQ (key, Qdisable_composition))
1009 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1010 else if (EQ (key, Quse_entity_reference))
1011 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1014 else if (ty == CODESYS_ISO2022)
1016 #define FROB_INITIAL_CHARSET(charset_num) \
1017 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1018 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1020 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1021 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1022 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1023 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1025 #define FROB_FORCE_CHARSET(charset_num) \
1026 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1028 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1029 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1030 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1031 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1033 #define FROB_BOOLEAN_PROPERTY(prop) \
1034 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1036 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1037 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1038 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1039 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1040 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1041 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1042 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1044 else if (EQ (key, Qinput_charset_conversion))
1046 codesys->iso2022.input_conv =
1047 Dynarr_new (charset_conversion_spec);
1048 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1051 else if (EQ (key, Qoutput_charset_conversion))
1053 codesys->iso2022.output_conv =
1054 Dynarr_new (charset_conversion_spec);
1055 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1059 else if (EQ (key, Qccs_priority_list))
1061 codesys->ccs_priority_list = value;
1065 signal_simple_error ("Unrecognized property", key);
1068 else if (ty == CODESYS_UTF8)
1070 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1071 else if (EQ (key, Qcharset_g1))
1072 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1073 else if (EQ (key, Qcharset_g2))
1074 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1076 signal_simple_error ("Unrecognized property", key);
1078 else if (ty == CODESYS_BIG5)
1080 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1081 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1083 signal_simple_error ("Unrecognized property", key);
1086 else if (EQ (type, Qccl))
1089 struct ccl_program test_ccl;
1092 /* Check key first. */
1093 if (EQ (key, Qdecode))
1094 suffix = "-ccl-decode";
1095 else if (EQ (key, Qencode))
1096 suffix = "-ccl-encode";
1098 signal_simple_error ("Unrecognized property", key);
1100 /* If value is vector, register it as a ccl program
1101 associated with an newly created symbol for
1102 backward compatibility. */
1103 if (VECTORP (value))
1105 sym = Fintern (concat2 (Fsymbol_name (name),
1106 build_string (suffix)),
1108 Fregister_ccl_program (sym, value);
1112 CHECK_SYMBOL (value);
1115 /* check if the given ccl programs are valid. */
1116 if (setup_ccl_program (&test_ccl, sym) < 0)
1117 signal_simple_error ("Invalid CCL program", value);
1119 if (EQ (key, Qdecode))
1120 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1121 else if (EQ (key, Qencode))
1122 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1127 signal_simple_error ("Unrecognized property", key);
1131 if (need_to_setup_eol_systems)
1132 setup_eol_coding_systems (codesys);
1135 Lisp_Object codesys_obj;
1136 XSETCODING_SYSTEM (codesys_obj, codesys);
1137 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1142 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1143 Copy OLD-CODING-SYSTEM to NEW-NAME.
1144 If NEW-NAME does not name an existing coding system, a new one will
1147 (old_coding_system, new_name))
1149 Lisp_Object new_coding_system;
1150 old_coding_system = Fget_coding_system (old_coding_system);
1151 new_coding_system = Ffind_coding_system (new_name);
1152 if (NILP (new_coding_system))
1154 XSETCODING_SYSTEM (new_coding_system,
1155 allocate_coding_system
1156 (XCODING_SYSTEM_TYPE (old_coding_system),
1158 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1162 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1163 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1164 memcpy (((char *) to ) + sizeof (to->header),
1165 ((char *) from) + sizeof (from->header),
1166 sizeof (*from) - sizeof (from->header));
1167 to->name = new_name;
1169 return new_coding_system;
1172 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1173 Return t if OBJECT names a coding system, and is not a coding system alias.
1177 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1181 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1182 Return t if OBJECT is a coding system alias.
1183 All coding system aliases are created by `define-coding-system-alias'.
1187 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1191 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1192 Return the coding-system symbol for which symbol ALIAS is an alias.
1196 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1197 if (SYMBOLP (aliasee))
1200 signal_simple_error ("Symbol is not a coding system alias", alias);
1201 return Qnil; /* To keep the compiler happy */
1205 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1207 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1211 /* A maphash function, for removing dangling coding system aliases. */
1213 dangling_coding_system_alias_p (Lisp_Object alias,
1214 Lisp_Object aliasee,
1215 void *dangling_aliases)
1217 if (SYMBOLP (aliasee)
1218 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1220 (*(int *) dangling_aliases)++;
1227 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1228 Define symbol ALIAS as an alias for coding system ALIASEE.
1230 You can use this function to redefine an alias that has already been defined,
1231 but you cannot redefine a name which is the canonical name for a coding system.
1232 \(a canonical name of a coding system is what is returned when you call
1233 `coding-system-name' on a coding system).
1235 ALIASEE itself can be an alias, which allows you to define nested aliases.
1237 You are forbidden, however, from creating alias loops or `dangling' aliases.
1238 These will be detected, and an error will be signaled if you attempt to do so.
1240 If ALIASEE is nil, then ALIAS will simply be undefined.
1242 See also `coding-system-alias-p', `coding-system-aliasee',
1243 and `coding-system-canonical-name-p'.
1247 Lisp_Object real_coding_system, probe;
1249 CHECK_SYMBOL (alias);
1251 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1253 ("Symbol is the canonical name of a coding system and cannot be redefined",
1258 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1259 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1260 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1262 Fremhash (alias, Vcoding_system_hash_table);
1264 /* Undefine subsidiary aliases,
1265 presumably created by a previous call to this function */
1266 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1267 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1268 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1270 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1271 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1272 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1275 /* Undefine dangling coding system aliases. */
1277 int dangling_aliases;
1280 dangling_aliases = 0;
1281 elisp_map_remhash (dangling_coding_system_alias_p,
1282 Vcoding_system_hash_table,
1284 } while (dangling_aliases > 0);
1290 if (CODING_SYSTEMP (aliasee))
1291 aliasee = XCODING_SYSTEM_NAME (aliasee);
1293 /* Checks that aliasee names a coding-system */
1294 real_coding_system = Fget_coding_system (aliasee);
1296 /* Check for coding system alias loops */
1297 if (EQ (alias, aliasee))
1298 alias_loop: signal_simple_error_2
1299 ("Attempt to create a coding system alias loop", alias, aliasee);
1301 for (probe = aliasee;
1303 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1305 if (EQ (probe, alias))
1309 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1311 /* Set up aliases for subsidiaries.
1312 #### There must be a better way to handle subsidiary coding systems. */
1314 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1316 for (i = 0; i < countof (suffixes); i++)
1318 Lisp_Object alias_subsidiary =
1319 append_suffix_to_symbol (alias, suffixes[i]);
1320 Lisp_Object aliasee_subsidiary =
1321 append_suffix_to_symbol (aliasee, suffixes[i]);
1323 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1324 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1327 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1328 but it doesn't look intentional, so I'd rather return something
1329 meaningful or nothing at all. */
1334 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1336 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1337 Lisp_Object new_coding_system;
1339 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1340 return coding_system;
1344 case EOL_AUTODETECT: return coding_system;
1345 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1346 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1347 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1348 default: abort (); return Qnil;
1351 return NILP (new_coding_system) ? coding_system : new_coding_system;
1354 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1355 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1357 (coding_system, eol_type))
1359 coding_system = Fget_coding_system (coding_system);
1361 return subsidiary_coding_system (coding_system,
1362 symbol_to_eol_type (eol_type));
1366 /************************************************************************/
1367 /* Coding system accessors */
1368 /************************************************************************/
1370 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1371 Return the doc string for CODING-SYSTEM.
1375 coding_system = Fget_coding_system (coding_system);
1376 return XCODING_SYSTEM_DOC_STRING (coding_system);
1379 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1380 Return the type of CODING-SYSTEM.
1384 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1387 case CODESYS_AUTODETECT: return Qundecided;
1389 case CODESYS_SHIFT_JIS: return Qshift_jis;
1390 case CODESYS_ISO2022: return Qiso2022;
1391 case CODESYS_BIG5: return Qbig5;
1392 case CODESYS_UCS4: return Qucs4;
1393 case CODESYS_UTF16: return Qutf16;
1394 case CODESYS_UTF8: return Qutf8;
1395 case CODESYS_CCL: return Qccl;
1397 case CODESYS_NO_CONVERSION: return Qno_conversion;
1399 case CODESYS_INTERNAL: return Qinternal;
1406 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1409 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1411 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1414 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1415 Return initial charset of CODING-SYSTEM designated to GNUM.
1418 (coding_system, gnum))
1420 coding_system = Fget_coding_system (coding_system);
1423 return coding_system_charset (coding_system, XINT (gnum));
1427 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1428 Return the PROP property of CODING-SYSTEM.
1430 (coding_system, prop))
1433 enum coding_system_type type;
1435 coding_system = Fget_coding_system (coding_system);
1436 CHECK_SYMBOL (prop);
1437 type = XCODING_SYSTEM_TYPE (coding_system);
1439 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1440 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1443 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1445 case CODESYS_PROP_ALL_OK:
1448 case CODESYS_PROP_ISO2022:
1449 if (type != CODESYS_ISO2022)
1451 ("Property only valid in ISO2022 coding systems",
1455 case CODESYS_PROP_CCL:
1456 if (type != CODESYS_CCL)
1458 ("Property only valid in CCL coding systems",
1468 signal_simple_error ("Unrecognized property", prop);
1470 if (EQ (prop, Qname))
1471 return XCODING_SYSTEM_NAME (coding_system);
1472 else if (EQ (prop, Qtype))
1473 return Fcoding_system_type (coding_system);
1474 else if (EQ (prop, Qdoc_string))
1475 return XCODING_SYSTEM_DOC_STRING (coding_system);
1476 else if (EQ (prop, Qmnemonic))
1477 return XCODING_SYSTEM_MNEMONIC (coding_system);
1478 else if (EQ (prop, Qeol_type))
1479 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1480 else if (EQ (prop, Qeol_lf))
1481 return XCODING_SYSTEM_EOL_LF (coding_system);
1482 else if (EQ (prop, Qeol_crlf))
1483 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1484 else if (EQ (prop, Qeol_cr))
1485 return XCODING_SYSTEM_EOL_CR (coding_system);
1486 else if (EQ (prop, Qpost_read_conversion))
1487 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1488 else if (EQ (prop, Qpre_write_conversion))
1489 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1492 else if (EQ (prop, Qdisable_composition))
1493 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1494 else if (EQ (prop, Quse_entity_reference))
1495 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1496 else if (EQ (prop, Qccs_priority_list))
1497 return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1499 else if (type == CODESYS_ISO2022)
1501 if (EQ (prop, Qcharset_g0))
1502 return coding_system_charset (coding_system, 0);
1503 else if (EQ (prop, Qcharset_g1))
1504 return coding_system_charset (coding_system, 1);
1505 else if (EQ (prop, Qcharset_g2))
1506 return coding_system_charset (coding_system, 2);
1507 else if (EQ (prop, Qcharset_g3))
1508 return coding_system_charset (coding_system, 3);
1510 #define FORCE_CHARSET(charset_num) \
1511 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1512 (coding_system, charset_num) ? Qt : Qnil)
1514 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1515 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1516 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1517 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1519 #define LISP_BOOLEAN(prop) \
1520 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1522 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1523 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1524 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1525 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1526 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1527 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1528 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1530 else if (EQ (prop, Qinput_charset_conversion))
1532 unparse_charset_conversion_specs
1533 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1534 else if (EQ (prop, Qoutput_charset_conversion))
1536 unparse_charset_conversion_specs
1537 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1541 else if (type == CODESYS_CCL)
1543 if (EQ (prop, Qdecode))
1544 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1545 else if (EQ (prop, Qencode))
1546 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1554 return Qnil; /* not reached */
1558 /************************************************************************/
1559 /* Coding category functions */
1560 /************************************************************************/
1563 decode_coding_category (Lisp_Object symbol)
1567 CHECK_SYMBOL (symbol);
1568 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1569 if (EQ (coding_category_symbol[i], symbol))
1572 signal_simple_error ("Unrecognized coding category", symbol);
1573 return 0; /* not reached */
1576 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1577 Return a list of all recognized coding categories.
1582 Lisp_Object list = Qnil;
1584 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1585 list = Fcons (coding_category_symbol[i], list);
1589 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1590 Change the priority order of the coding categories.
1591 LIST should be list of coding categories, in descending order of
1592 priority. Unspecified coding categories will be lower in priority
1593 than all specified ones, in the same relative order they were in
1598 int category_to_priority[CODING_CATEGORY_LAST];
1602 /* First generate a list that maps coding categories to priorities. */
1604 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1605 category_to_priority[i] = -1;
1607 /* Highest priority comes from the specified list. */
1609 EXTERNAL_LIST_LOOP (rest, list)
1611 int cat = decode_coding_category (XCAR (rest));
1613 if (category_to_priority[cat] >= 0)
1614 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1615 category_to_priority[cat] = i++;
1618 /* Now go through the existing categories by priority to retrieve
1619 the categories not yet specified and preserve their priority
1621 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1623 int cat = fcd->coding_category_by_priority[j];
1624 if (category_to_priority[cat] < 0)
1625 category_to_priority[cat] = i++;
1628 /* Now we need to construct the inverse of the mapping we just
1631 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1632 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1634 /* Phew! That was confusing. */
1638 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1639 Return a list of coding categories in descending order of priority.
1644 Lisp_Object list = Qnil;
1646 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1647 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1652 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1653 Change the coding system associated with a coding category.
1655 (coding_category, coding_system))
1657 int cat = decode_coding_category (coding_category);
1659 coding_system = Fget_coding_system (coding_system);
1660 fcd->coding_category_system[cat] = coding_system;
1664 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1665 Return the coding system associated with a coding category.
1669 int cat = decode_coding_category (coding_category);
1670 Lisp_Object sys = fcd->coding_category_system[cat];
1673 return XCODING_SYSTEM_NAME (sys);
1678 /************************************************************************/
1679 /* Detecting the encoding of data */
1680 /************************************************************************/
1682 struct detection_state
1684 eol_type_t eol_type;
1727 struct iso2022_decoder iso;
1729 int high_byte_count;
1730 unsigned int saw_single_shift:1;
1743 acceptable_control_char_p (int c)
1747 /* Allow and ignore control characters that you might
1748 reasonably see in a text file */
1753 case 8: /* backspace */
1754 case 11: /* vertical tab */
1755 case 12: /* form feed */
1756 case 26: /* MS-DOS C-z junk */
1757 case 31: /* '^_' -- for info */
1765 mask_has_at_most_one_bit_p (int mask)
1767 /* Perhaps the only thing useful you learn from intensive Microsoft
1768 technical interviews */
1769 return (mask & (mask - 1)) == 0;
1773 detect_eol_type (struct detection_state *st, const Extbyte *src,
1774 Lstream_data_count n)
1778 unsigned char c = *(unsigned char *)src++;
1781 if (st->eol.just_saw_cr)
1783 else if (st->eol.seen_anything)
1786 else if (st->eol.just_saw_cr)
1789 st->eol.just_saw_cr = 1;
1791 st->eol.just_saw_cr = 0;
1792 st->eol.seen_anything = 1;
1795 return EOL_AUTODETECT;
1798 /* Attempt to determine the encoding and EOL type of the given text.
1799 Before calling this function for the first type, you must initialize
1800 st->eol_type as appropriate and initialize st->mask to ~0.
1802 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1805 st->mask holds the determined coding category mask, or ~0 if only
1806 ASCII has been seen so far.
1810 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1811 is present in st->mask
1812 1 == definitive answers are here for both st->eol_type and st->mask
1816 detect_coding_type (struct detection_state *st, const Extbyte *src,
1817 Lstream_data_count n, int just_do_eol)
1819 if (st->eol_type == EOL_AUTODETECT)
1820 st->eol_type = detect_eol_type (st, src, n);
1823 return st->eol_type != EOL_AUTODETECT;
1825 if (!st->seen_non_ascii)
1827 for (; n; n--, src++)
1829 unsigned char c = *(unsigned char *) src;
1830 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1832 st->seen_non_ascii = 1;
1834 st->shift_jis.mask = ~0;
1837 st->utf16.mask = ~0;
1839 st->iso2022.mask = ~0;
1849 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1850 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1851 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1852 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1853 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1854 st->big5.mask = detect_coding_big5 (st, src, n);
1855 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1856 st->utf8.mask = detect_coding_utf8 (st, src, n);
1857 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1858 st->utf16.mask = detect_coding_utf16 (st, src, n);
1859 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1860 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1863 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1864 | st->utf8.mask | st->ucs4.mask;
1867 int retval = mask_has_at_most_one_bit_p (st->mask);
1868 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1869 return retval && st->eol_type != EOL_AUTODETECT;
1874 coding_system_from_mask (int mask)
1878 /* If the file was entirely or basically ASCII, use the
1879 default value of `buffer-file-coding-system'. */
1880 Lisp_Object retval =
1881 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1884 retval = Ffind_coding_system (retval);
1888 (Qbad_variable, Qwarning,
1889 "Invalid `default-buffer-file-coding-system', set to nil");
1890 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1894 retval = Fget_coding_system (Qraw_text);
1902 mask = postprocess_iso2022_mask (mask);
1904 /* Look through the coding categories by priority and find
1905 the first one that is allowed. */
1906 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1908 cat = fcd->coding_category_by_priority[i];
1909 if ((mask & (1 << cat)) &&
1910 !NILP (fcd->coding_category_system[cat]))
1914 return fcd->coding_category_system[cat];
1916 return Fget_coding_system (Qraw_text);
1920 /* Given a seekable read stream and potential coding system and EOL type
1921 as specified, do any autodetection that is called for. If the
1922 coding system and/or EOL type are not `autodetect', they will be left
1923 alone; but this function will never return an autodetect coding system
1926 This function does not automatically fetch subsidiary coding systems;
1927 that should be unnecessary with the explicit eol-type argument. */
1929 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1930 /* number of leading lines to check for a coding cookie */
1931 #define LINES_TO_CHECK 2
1934 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1935 eol_type_t *eol_type_in_out)
1937 struct detection_state decst;
1939 if (*eol_type_in_out == EOL_AUTODETECT)
1940 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1943 decst.eol_type = *eol_type_in_out;
1946 /* If autodetection is called for, do it now. */
1947 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1948 || *eol_type_in_out == EOL_AUTODETECT)
1951 Lisp_Object coding_system = Qnil;
1953 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1955 int lines_checked = 0;
1957 /* Look for initial "-*-"; mode line prefix */
1959 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1961 && lines_checked < LINES_TO_CHECK;
1963 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1965 Extbyte *local_vars_beg = p + 3;
1966 /* Look for final "-*-"; mode line suffix */
1967 for (p = local_vars_beg,
1968 scan_end = buf + nread - LENGTH ("-*-");
1970 && lines_checked < LINES_TO_CHECK;
1972 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1974 Extbyte *suffix = p;
1975 /* Look for "coding:" */
1976 for (p = local_vars_beg,
1977 scan_end = suffix - LENGTH ("coding:?");
1980 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1981 && (p == local_vars_beg
1982 || (*(p-1) == ' ' ||
1988 p += LENGTH ("coding:");
1989 while (*p == ' ' || *p == '\t') p++;
1991 /* Get coding system name */
1992 save = *suffix; *suffix = '\0';
1993 /* Characters valid in a MIME charset name (rfc 1521),
1994 and in a Lisp symbol name. */
1995 n = strspn ( (char *) p,
1996 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1997 "abcdefghijklmnopqrstuvwxyz"
2003 save = p[n]; p[n] = '\0';
2005 Ffind_coding_system (intern ((char *) p));
2012 /* #### file must use standard EOLs or we miss 2d line */
2013 /* #### not to mention this is broken for UTF-16 DOS files */
2014 else if (*p == '\n' || *p == '\r')
2017 /* skip past multibyte (DOS) newline */
2018 if (*p == '\r' && *(p+1) == '\n') p++;
2022 /* #### file must use standard EOLs or we miss 2d line */
2023 /* #### not to mention this is broken for UTF-16 DOS files */
2024 else if (*p == '\n' || *p == '\r')
2027 /* skip past multibyte (DOS) newline */
2028 if (*p == '\r' && *(p+1) == '\n') p++;
2031 if (NILP (coding_system))
2034 if (detect_coding_type (&decst, buf, nread,
2035 XCODING_SYSTEM_TYPE (*codesys_in_out)
2036 != CODESYS_AUTODETECT))
2038 nread = Lstream_read (stream, buf, sizeof (buf));
2044 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2045 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2048 if (detect_coding_type (&decst, buf, nread, 1))
2050 nread = Lstream_read (stream, buf, sizeof (buf));
2056 *eol_type_in_out = decst.eol_type;
2057 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2059 if (NILP (coding_system))
2060 *codesys_in_out = coding_system_from_mask (decst.mask);
2062 *codesys_in_out = coding_system;
2066 /* If we absolutely can't determine the EOL type, just assume LF. */
2067 if (*eol_type_in_out == EOL_AUTODETECT)
2068 *eol_type_in_out = EOL_LF;
2070 Lstream_rewind (stream);
2073 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2074 Detect coding system of the text in the region between START and END.
2075 Return a list of possible coding systems ordered by priority.
2076 If only ASCII characters are found, return 'undecided or one of
2077 its subsidiary coding systems according to a detected end-of-line
2078 type. Optional arg BUFFER defaults to the current buffer.
2080 (start, end, buffer))
2082 Lisp_Object val = Qnil;
2083 struct buffer *buf = decode_buffer (buffer, 0);
2085 Lisp_Object instream, lb_instream;
2086 Lstream *istr, *lb_istr;
2087 struct detection_state decst;
2088 struct gcpro gcpro1, gcpro2;
2090 get_buffer_range_char (buf, start, end, &b, &e, 0);
2091 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2092 lb_istr = XLSTREAM (lb_instream);
2093 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2094 istr = XLSTREAM (instream);
2095 GCPRO2 (instream, lb_instream);
2097 decst.eol_type = EOL_AUTODETECT;
2101 Extbyte random_buffer[4096];
2102 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2106 if (detect_coding_type (&decst, random_buffer, nread, 0))
2110 if (decst.mask == ~0)
2111 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2119 decst.mask = postprocess_iso2022_mask (decst.mask);
2121 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2123 int sys = fcd->coding_category_by_priority[i];
2124 if (decst.mask & (1 << sys))
2126 Lisp_Object codesys = fcd->coding_category_system[sys];
2127 if (!NILP (codesys))
2128 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2129 val = Fcons (codesys, val);
2133 Lstream_close (istr);
2135 Lstream_delete (istr);
2136 Lstream_delete (lb_istr);
2141 /************************************************************************/
2142 /* Converting to internal Mule format ("decoding") */
2143 /************************************************************************/
2145 /* A decoding stream is a stream used for decoding text (i.e.
2146 converting from some external format to internal format).
2147 The decoding-stream object keeps track of the actual coding
2148 stream, the stream that is at the other end, and data that
2149 needs to be persistent across the lifetime of the stream. */
2151 /* Handle the EOL stuff related to just-read-in character C.
2152 EOL_TYPE is the EOL type of the coding stream.
2153 FLAGS is the current value of FLAGS in the coding stream, and may
2154 be modified by this macro. (The macro only looks at the
2155 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2156 bytes are to be written. You need to also define a local goto
2157 label "label_continue_loop" that is at the end of the main
2158 character-reading loop.
2160 If C is a CR character, then this macro handles it entirely and
2161 jumps to label_continue_loop. Otherwise, this macro does not add
2162 anything to DST, and continues normally. You should continue
2163 processing C normally after this macro. */
2165 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2169 if (eol_type == EOL_CR) \
2170 Dynarr_add (dst, '\n'); \
2171 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2172 Dynarr_add (dst, c); \
2174 flags |= CODING_STATE_CR; \
2175 goto label_continue_loop; \
2177 else if (flags & CODING_STATE_CR) \
2178 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2180 Dynarr_add (dst, '\r'); \
2181 flags &= ~CODING_STATE_CR; \
2185 /* C should be a binary character in the range 0 - 255; convert
2186 to internal format and add to Dynarr DST. */
2189 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2191 if (BYTE_ASCII_P (c)) \
2192 Dynarr_add (dst, c); \
2195 Dynarr_add (dst, (c >> 6) | 0xc0); \
2196 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2200 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2202 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2206 Dynarr_add (dst, c);
2208 else if ( c <= 0x7ff )
2210 Dynarr_add (dst, (c >> 6) | 0xc0);
2211 Dynarr_add (dst, (c & 0x3f) | 0x80);
2213 else if ( c <= 0xffff )
2215 Dynarr_add (dst, (c >> 12) | 0xe0);
2216 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2217 Dynarr_add (dst, (c & 0x3f) | 0x80);
2219 else if ( c <= 0x1fffff )
2221 Dynarr_add (dst, (c >> 18) | 0xf0);
2222 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2223 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2224 Dynarr_add (dst, (c & 0x3f) | 0x80);
2226 else if ( c <= 0x3ffffff )
2228 Dynarr_add (dst, (c >> 24) | 0xf8);
2229 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2230 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2231 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2232 Dynarr_add (dst, (c & 0x3f) | 0x80);
2236 Dynarr_add (dst, (c >> 30) | 0xfc);
2237 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2238 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2239 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2240 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2241 Dynarr_add (dst, (c & 0x3f) | 0x80);
2245 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2247 if (BYTE_ASCII_P (c)) \
2248 Dynarr_add (dst, c); \
2249 else if (BYTE_C1_P (c)) \
2251 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2252 Dynarr_add (dst, c + 0x20); \
2256 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2257 Dynarr_add (dst, c); \
2262 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2266 DECODE_ADD_BINARY_CHAR (ch, dst); \
2271 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2273 if (flags & CODING_STATE_END) \
2275 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2276 if (flags & CODING_STATE_CR) \
2277 Dynarr_add (dst, '\r'); \
2281 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2283 #define ER_BUF_SIZE 24
2285 struct decoding_stream
2287 /* Coding system that governs the conversion. */
2288 Lisp_Coding_System *codesys;
2290 /* Stream that we read the encoded data from or
2291 write the decoded data to. */
2294 /* If we are reading, then we can return only a fixed amount of
2295 data, so if the conversion resulted in too much data, we store it
2296 here for retrieval the next time around. */
2297 unsigned_char_dynarr *runoff;
2299 /* FLAGS holds flags indicating the current state of the decoding.
2300 Some of these flags are dependent on the coding system. */
2303 /* CPOS holds a partially built-up code-point of character. */
2306 /* EOL_TYPE specifies the type of end-of-line conversion that
2307 currently applies. We need to keep this separate from the
2308 EOL type stored in CODESYS because the latter might indicate
2309 automatic EOL-type detection while the former will always
2310 indicate a particular EOL type. */
2311 eol_type_t eol_type;
2313 /* Additional ISO2022 information. We define the structure above
2314 because it's also needed by the detection routines. */
2315 struct iso2022_decoder iso2022;
2317 /* Additional information (the state of the running CCL program)
2318 used by the CCL decoder. */
2319 struct ccl_program ccl;
2321 /* counter for UTF-8 or UCS-4 */
2322 unsigned char counter;
2326 unsigned char er_counter;
2327 unsigned char er_buf[ER_BUF_SIZE];
2329 unsigned combined_char_count;
2330 Emchar combined_chars[16];
2331 Lisp_Object combining_table;
2333 struct detection_state decst;
2336 static Lstream_data_count decoding_reader (Lstream *stream,
2337 unsigned char *data, Lstream_data_count size);
2338 static Lstream_data_count decoding_writer (Lstream *stream,
2339 const unsigned char *data, Lstream_data_count size);
2340 static int decoding_rewinder (Lstream *stream);
2341 static int decoding_seekable_p (Lstream *stream);
2342 static int decoding_flusher (Lstream *stream);
2343 static int decoding_closer (Lstream *stream);
2345 static Lisp_Object decoding_marker (Lisp_Object stream);
2347 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2348 sizeof (struct decoding_stream));
2351 decoding_marker (Lisp_Object stream)
2353 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2354 Lisp_Object str_obj;
2356 /* We do not need to mark the coding systems or charsets stored
2357 within the stream because they are stored in a global list
2358 and automatically marked. */
2360 XSETLSTREAM (str_obj, str);
2361 mark_object (str_obj);
2362 if (str->imp->marker)
2363 return (str->imp->marker) (str_obj);
2368 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2369 so we read data from the other end, decode it, and store it into DATA. */
2371 static Lstream_data_count
2372 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2374 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2375 unsigned char *orig_data = data;
2376 Lstream_data_count read_size;
2377 int error_occurred = 0;
2379 /* We need to interface to mule_decode(), which expects to take some
2380 amount of data and store the result into a Dynarr. We have
2381 mule_decode() store into str->runoff, and take data from there
2384 /* We loop until we have enough data, reading chunks from the other
2385 end and decoding it. */
2388 /* Take data from the runoff if we can. Make sure to take at
2389 most SIZE bytes, and delete the data from the runoff. */
2390 if (Dynarr_length (str->runoff) > 0)
2392 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2393 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2394 Dynarr_delete_many (str->runoff, 0, chunk);
2400 break; /* No more room for data */
2402 if (str->flags & CODING_STATE_END)
2403 /* This means that on the previous iteration, we hit the EOF on
2404 the other end. We loop once more so that mule_decode() can
2405 output any final stuff it may be holding, or any "go back
2406 to a sane state" escape sequences. (This latter makes sense
2407 during encoding.) */
2410 /* Exhausted the runoff, so get some more. DATA has at least
2411 SIZE bytes left of storage in it, so it's OK to read directly
2412 into it. (We'll be overwriting above, after we've decoded it
2413 into the runoff.) */
2414 read_size = Lstream_read (str->other_end, data, size);
2421 /* There might be some more end data produced in the translation.
2422 See the comment above. */
2423 str->flags |= CODING_STATE_END;
2424 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2427 if (data - orig_data == 0)
2428 return error_occurred ? -1 : 0;
2430 return data - orig_data;
2433 static Lstream_data_count
2434 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2436 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2437 Lstream_data_count retval;
2439 /* Decode all our data into the runoff, and then attempt to write
2440 it all out to the other end. Remove whatever chunk we succeeded
2442 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2443 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2444 Dynarr_length (str->runoff));
2446 Dynarr_delete_many (str->runoff, 0, retval);
2447 /* Do NOT return retval. The return value indicates how much
2448 of the incoming data was written, not how many bytes were
2454 reset_decoding_stream (struct decoding_stream *str)
2457 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2459 Lisp_Object coding_system;
2460 XSETCODING_SYSTEM (coding_system, str->codesys);
2461 reset_iso2022 (coding_system, &str->iso2022);
2463 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2465 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2471 str->er_counter = 0;
2472 str->combined_char_count = 0;
2473 str->combining_table = Qnil;
2475 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2476 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2479 str->decst.eol_type = EOL_AUTODETECT;
2480 str->decst.mask = ~0;
2482 str->flags = str->cpos = 0;
2486 decoding_rewinder (Lstream *stream)
2488 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2489 reset_decoding_stream (str);
2490 Dynarr_reset (str->runoff);
2491 return Lstream_rewind (str->other_end);
2495 decoding_seekable_p (Lstream *stream)
2497 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2498 return Lstream_seekable_p (str->other_end);
2502 decoding_flusher (Lstream *stream)
2504 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2505 return Lstream_flush (str->other_end);
2509 decoding_closer (Lstream *stream)
2511 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2512 if (stream->flags & LSTREAM_FL_WRITE)
2514 str->flags |= CODING_STATE_END;
2515 decoding_writer (stream, 0, 0);
2517 Dynarr_free (str->runoff);
2519 #ifdef ENABLE_COMPOSITE_CHARS
2520 if (str->iso2022.composite_chars)
2521 Dynarr_free (str->iso2022.composite_chars);
2524 return Lstream_close (str->other_end);
2528 decoding_stream_coding_system (Lstream *stream)
2530 Lisp_Object coding_system;
2531 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2533 XSETCODING_SYSTEM (coding_system, str->codesys);
2534 return subsidiary_coding_system (coding_system, str->eol_type);
2538 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2540 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2541 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2543 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2544 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2545 reset_decoding_stream (str);
2548 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2549 stream for writing, no automatic code detection will be performed.
2550 The reason for this is that automatic code detection requires a
2551 seekable input. Things will also fail if you open a decoding
2552 stream for reading using a non-fully-specified coding system and
2553 a non-seekable input stream. */
2556 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2559 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2560 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2564 str->other_end = stream;
2565 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2566 str->eol_type = EOL_AUTODETECT;
2567 if (!strcmp (mode, "r")
2568 && Lstream_seekable_p (stream))
2569 /* We can determine the coding system now. */
2570 determine_real_coding_system (stream, &codesys, &str->eol_type);
2571 set_decoding_stream_coding_system (lstr, codesys);
2572 str->decst.eol_type = str->eol_type;
2573 str->decst.mask = ~0;
2574 XSETLSTREAM (obj, lstr);
2579 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2581 return make_decoding_stream_1 (stream, codesys, "r");
2585 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2587 return make_decoding_stream_1 (stream, codesys, "w");
2590 /* Note: the decode_coding_* functions all take the same
2591 arguments as mule_decode(), which is to say some SRC data of
2592 size N, which is to be stored into dynamic array DST.
2593 DECODING is the stream within which the decoding is
2594 taking place, but no data is actually read from or
2595 written to that stream; that is handled in decoding_reader()
2596 or decoding_writer(). This allows the same functions to
2597 be used for both reading and writing. */
2600 mule_decode (Lstream *decoding, const Extbyte *src,
2601 unsigned_char_dynarr *dst, Lstream_data_count n)
2603 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2605 /* If necessary, do encoding-detection now. We do this when
2606 we're a writing stream or a non-seekable reading stream,
2607 meaning that we can't just process the whole input,
2608 rewind, and start over. */
2610 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2611 str->eol_type == EOL_AUTODETECT)
2613 Lisp_Object codesys;
2615 XSETCODING_SYSTEM (codesys, str->codesys);
2616 detect_coding_type (&str->decst, src, n,
2617 CODING_SYSTEM_TYPE (str->codesys) !=
2618 CODESYS_AUTODETECT);
2619 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2620 str->decst.mask != ~0)
2621 /* #### This is cheesy. What we really ought to do is
2622 buffer up a certain amount of data so as to get a
2623 less random result. */
2624 codesys = coding_system_from_mask (str->decst.mask);
2625 str->eol_type = str->decst.eol_type;
2626 if (XCODING_SYSTEM (codesys) != str->codesys)
2628 /* Preserve the CODING_STATE_END flag in case it was set.
2629 If we erase it, bad things might happen. */
2630 int was_end = str->flags & CODING_STATE_END;
2631 set_decoding_stream_coding_system (decoding, codesys);
2633 str->flags |= CODING_STATE_END;
2637 switch (CODING_SYSTEM_TYPE (str->codesys))
2640 case CODESYS_INTERNAL:
2641 Dynarr_add_many (dst, src, n);
2644 case CODESYS_AUTODETECT:
2645 /* If we got this far and still haven't decided on the coding
2646 system, then do no conversion. */
2647 case CODESYS_NO_CONVERSION:
2648 decode_coding_no_conversion (decoding, src, dst, n);
2651 case CODESYS_SHIFT_JIS:
2652 decode_coding_sjis (decoding, src, dst, n);
2655 decode_coding_big5 (decoding, src, dst, n);
2658 decode_coding_ucs4 (decoding, src, dst, n);
2661 decode_coding_utf16 (decoding, src, dst, n);
2664 decode_coding_utf8 (decoding, src, dst, n);
2667 str->ccl.last_block = str->flags & CODING_STATE_END;
2668 /* When applying ccl program to stream, MUST NOT set NULL
2670 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2671 dst, n, 0, CCL_MODE_DECODING);
2673 case CODESYS_ISO2022:
2674 decode_coding_iso2022 (decoding, src, dst, n);
2682 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2683 Decode the text between START and END which is encoded in CODING-SYSTEM.
2684 This is useful if you've read in encoded text from a file without decoding
2685 it (e.g. you read in a JIS-formatted file but used the `binary' or
2686 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2687 Return length of decoded text.
2688 BUFFER defaults to the current buffer if unspecified.
2690 (start, end, coding_system, buffer))
2693 struct buffer *buf = decode_buffer (buffer, 0);
2694 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2695 Lstream *istr, *ostr;
2696 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2698 get_buffer_range_char (buf, start, end, &b, &e, 0);
2700 barf_if_buffer_read_only (buf, b, e);
2702 coding_system = Fget_coding_system (coding_system);
2703 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2704 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2705 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2707 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2708 Fget_coding_system (Qbinary));
2709 istr = XLSTREAM (instream);
2710 ostr = XLSTREAM (outstream);
2711 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2713 /* The chain of streams looks like this:
2715 [BUFFER] <----- send through
2716 ------> [ENCODE AS BINARY]
2717 ------> [DECODE AS SPECIFIED]
2723 char tempbuf[1024]; /* some random amount */
2724 Bufpos newpos, even_newer_pos;
2725 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2726 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2730 newpos = lisp_buffer_stream_startpos (istr);
2731 Lstream_write (ostr, tempbuf, size_in_bytes);
2732 even_newer_pos = lisp_buffer_stream_startpos (istr);
2733 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2736 Lstream_close (istr);
2737 Lstream_close (ostr);
2739 Lstream_delete (istr);
2740 Lstream_delete (ostr);
2741 Lstream_delete (XLSTREAM (de_outstream));
2742 Lstream_delete (XLSTREAM (lb_outstream));
2747 /************************************************************************/
2748 /* Converting to an external encoding ("encoding") */
2749 /************************************************************************/
2751 /* An encoding stream is an output stream. When you create the
2752 stream, you specify the coding system that governs the encoding
2753 and another stream that the resulting encoded data is to be
2754 sent to, and then start sending data to it. */
2756 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2758 struct encoding_stream
2760 /* Coding system that governs the conversion. */
2761 Lisp_Coding_System *codesys;
2763 /* Stream that we read the encoded data from or
2764 write the decoded data to. */
2767 /* If we are reading, then we can return only a fixed amount of
2768 data, so if the conversion resulted in too much data, we store it
2769 here for retrieval the next time around. */
2770 unsigned_char_dynarr *runoff;
2772 /* FLAGS holds flags indicating the current state of the encoding.
2773 Some of these flags are dependent on the coding system. */
2776 /* CH holds a partially built-up character. Since we only deal
2777 with one- and two-byte characters at the moment, we only use
2778 this to store the first byte of a two-byte character. */
2781 /* Additional information used by the ISO2022 encoder. */
2784 /* CHARSET holds the character sets currently assigned to the G0
2785 through G3 registers. It is initialized from the array
2786 INITIAL_CHARSET in CODESYS. */
2787 Lisp_Object charset[4];
2789 /* Which registers are currently invoked into the left (GL) and
2790 right (GR) halves of the 8-bit encoding space? */
2791 int register_left, register_right;
2793 /* Whether we need to explicitly designate the charset in the
2794 G? register before using it. It is initialized from the
2795 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2796 unsigned char force_charset_on_output[4];
2798 /* Other state variables that need to be preserved across
2800 Lisp_Object current_charset;
2802 int current_char_boundary;
2805 void (*encode_char) (struct encoding_stream *str, Emchar c,
2806 unsigned_char_dynarr *dst, unsigned int *flags);
2807 void (*finish) (struct encoding_stream *str,
2808 unsigned_char_dynarr *dst, unsigned int *flags);
2810 /* Additional information (the state of the running CCL program)
2811 used by the CCL encoder. */
2812 struct ccl_program ccl;
2816 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2817 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2818 Lstream_data_count size);
2819 static int encoding_rewinder (Lstream *stream);
2820 static int encoding_seekable_p (Lstream *stream);
2821 static int encoding_flusher (Lstream *stream);
2822 static int encoding_closer (Lstream *stream);
2824 static Lisp_Object encoding_marker (Lisp_Object stream);
2826 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2827 sizeof (struct encoding_stream));
2830 encoding_marker (Lisp_Object stream)
2832 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2833 Lisp_Object str_obj;
2835 /* We do not need to mark the coding systems or charsets stored
2836 within the stream because they are stored in a global list
2837 and automatically marked. */
2839 XSETLSTREAM (str_obj, str);
2840 mark_object (str_obj);
2841 if (str->imp->marker)
2842 return (str->imp->marker) (str_obj);
2847 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2848 so we read data from the other end, encode it, and store it into DATA. */
2850 static Lstream_data_count
2851 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2853 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2854 unsigned char *orig_data = data;
2855 Lstream_data_count read_size;
2856 int error_occurred = 0;
2858 /* We need to interface to mule_encode(), which expects to take some
2859 amount of data and store the result into a Dynarr. We have
2860 mule_encode() store into str->runoff, and take data from there
2863 /* We loop until we have enough data, reading chunks from the other
2864 end and encoding it. */
2867 /* Take data from the runoff if we can. Make sure to take at
2868 most SIZE bytes, and delete the data from the runoff. */
2869 if (Dynarr_length (str->runoff) > 0)
2871 int chunk = min ((int) size, Dynarr_length (str->runoff));
2872 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2873 Dynarr_delete_many (str->runoff, 0, chunk);
2879 break; /* No more room for data */
2881 if (str->flags & CODING_STATE_END)
2882 /* This means that on the previous iteration, we hit the EOF on
2883 the other end. We loop once more so that mule_encode() can
2884 output any final stuff it may be holding, or any "go back
2885 to a sane state" escape sequences. (This latter makes sense
2886 during encoding.) */
2889 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2890 left of storage in it, so it's OK to read directly into it.
2891 (We'll be overwriting above, after we've encoded it into the
2893 read_size = Lstream_read (str->other_end, data, size);
2900 /* There might be some more end data produced in the translation.
2901 See the comment above. */
2902 str->flags |= CODING_STATE_END;
2903 mule_encode (stream, data, str->runoff, read_size);
2906 if (data == orig_data)
2907 return error_occurred ? -1 : 0;
2909 return data - orig_data;
2912 static Lstream_data_count
2913 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2915 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2916 Lstream_data_count retval;
2918 /* Encode all our data into the runoff, and then attempt to write
2919 it all out to the other end. Remove whatever chunk we succeeded
2921 mule_encode (stream, data, str->runoff, size);
2922 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2923 Dynarr_length (str->runoff));
2925 Dynarr_delete_many (str->runoff, 0, retval);
2926 /* Do NOT return retval. The return value indicates how much
2927 of the incoming data was written, not how many bytes were
2933 reset_encoding_stream (struct encoding_stream *str)
2936 switch (CODING_SYSTEM_TYPE (str->codesys))
2938 case CODESYS_ISO2022:
2942 str->encode_char = &char_encode_iso2022;
2943 str->finish = &char_finish_iso2022;
2944 for (i = 0; i < 4; i++)
2946 str->iso2022.charset[i] =
2947 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2948 str->iso2022.force_charset_on_output[i] =
2949 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2951 str->iso2022.register_left = 0;
2952 str->iso2022.register_right = 1;
2953 str->iso2022.current_charset = Qnil;
2954 str->iso2022.current_half = 0;
2958 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2961 str->encode_char = &char_encode_utf8;
2962 str->finish = &char_finish_utf8;
2965 str->encode_char = &char_encode_utf16;
2966 str->finish = &char_finish_utf16;
2969 str->encode_char = &char_encode_ucs4;
2970 str->finish = &char_finish_ucs4;
2972 case CODESYS_SHIFT_JIS:
2973 str->encode_char = &char_encode_shift_jis;
2974 str->finish = &char_finish_shift_jis;
2977 str->encode_char = &char_encode_big5;
2978 str->finish = &char_finish_big5;
2984 str->iso2022.current_char_boundary = 0;
2985 str->flags = str->ch = 0;
2989 encoding_rewinder (Lstream *stream)
2991 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2992 reset_encoding_stream (str);
2993 Dynarr_reset (str->runoff);
2994 return Lstream_rewind (str->other_end);
2998 encoding_seekable_p (Lstream *stream)
3000 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3001 return Lstream_seekable_p (str->other_end);
3005 encoding_flusher (Lstream *stream)
3007 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3008 return Lstream_flush (str->other_end);
3012 encoding_closer (Lstream *stream)
3014 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3015 if (stream->flags & LSTREAM_FL_WRITE)
3017 str->flags |= CODING_STATE_END;
3018 encoding_writer (stream, 0, 0);
3020 Dynarr_free (str->runoff);
3021 return Lstream_close (str->other_end);
3025 encoding_stream_coding_system (Lstream *stream)
3027 Lisp_Object coding_system;
3028 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3030 XSETCODING_SYSTEM (coding_system, str->codesys);
3031 return coding_system;
3035 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3037 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3038 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3040 reset_encoding_stream (str);
3044 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3047 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3048 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3052 str->runoff = Dynarr_new (unsigned_char);
3053 str->other_end = stream;
3054 set_encoding_stream_coding_system (lstr, codesys);
3055 XSETLSTREAM (obj, lstr);
3060 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3062 return make_encoding_stream_1 (stream, codesys, "r");
3066 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3068 return make_encoding_stream_1 (stream, codesys, "w");
3071 /* Convert N bytes of internally-formatted data stored in SRC to an
3072 external format, according to the encoding stream ENCODING.
3073 Store the encoded data into DST. */
3076 mule_encode (Lstream *encoding, const Bufbyte *src,
3077 unsigned_char_dynarr *dst, Lstream_data_count n)
3079 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3081 switch (CODING_SYSTEM_TYPE (str->codesys))
3084 case CODESYS_INTERNAL:
3085 Dynarr_add_many (dst, src, n);
3088 case CODESYS_AUTODETECT:
3089 /* If we got this far and still haven't decided on the coding
3090 system, then do no conversion. */
3091 case CODESYS_NO_CONVERSION:
3092 encode_coding_no_conversion (encoding, src, dst, n);
3096 str->ccl.last_block = str->flags & CODING_STATE_END;
3097 /* When applying ccl program to stream, MUST NOT set NULL
3099 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3100 dst, n, 0, CCL_MODE_ENCODING);
3104 text_encode_generic (encoding, src, dst, n);
3108 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3109 Encode the text between START and END using CODING-SYSTEM.
3110 This will, for example, convert Japanese characters into stuff such as
3111 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3112 text. BUFFER defaults to the current buffer if unspecified.
3114 (start, end, coding_system, buffer))
3117 struct buffer *buf = decode_buffer (buffer, 0);
3118 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3119 Lstream *istr, *ostr;
3120 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3122 get_buffer_range_char (buf, start, end, &b, &e, 0);
3124 barf_if_buffer_read_only (buf, b, e);
3126 coding_system = Fget_coding_system (coding_system);
3127 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3128 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3129 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3130 Fget_coding_system (Qbinary));
3131 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3133 istr = XLSTREAM (instream);
3134 ostr = XLSTREAM (outstream);
3135 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3136 /* The chain of streams looks like this:
3138 [BUFFER] <----- send through
3139 ------> [ENCODE AS SPECIFIED]
3140 ------> [DECODE AS BINARY]
3145 char tempbuf[1024]; /* some random amount */
3146 Bufpos newpos, even_newer_pos;
3147 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3148 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3152 newpos = lisp_buffer_stream_startpos (istr);
3153 Lstream_write (ostr, tempbuf, size_in_bytes);
3154 even_newer_pos = lisp_buffer_stream_startpos (istr);
3155 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3161 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3162 Lstream_close (istr);
3163 Lstream_close (ostr);
3165 Lstream_delete (istr);
3166 Lstream_delete (ostr);
3167 Lstream_delete (XLSTREAM (de_outstream));
3168 Lstream_delete (XLSTREAM (lb_outstream));
3169 return make_int (retlen);
3176 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3177 unsigned_char_dynarr *dst, Lstream_data_count n)
3180 unsigned char char_boundary;
3181 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3182 unsigned int flags = str->flags;
3183 Emchar ch = str->ch;
3185 char_boundary = str->iso2022.current_char_boundary;
3191 if (char_boundary == 0)
3219 (*str->encode_char) (str, c, dst, &flags);
3221 else if (char_boundary == 1)
3223 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3229 ch = (ch << 6) | (c & 0x3f);
3234 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3236 (*str->finish) (str, dst, &flags);
3241 str->iso2022.current_char_boundary = char_boundary;
3246 /************************************************************************/
3247 /* entity reference */
3248 /************************************************************************/
3251 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3253 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3255 if ( str->er_counter > 0)
3257 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3258 str->er_counter = 0;
3262 EXFUN (Fregexp_quote, 1);
3264 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3265 unsigned_char_dynarr* dst);
3267 decode_add_er_char (struct decoding_stream *str, Emchar c,
3268 unsigned_char_dynarr* dst)
3270 if (str->er_counter == 0)
3272 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3275 str->er_buf[0] = '&';
3279 DECODE_ADD_UCS_CHAR (c, dst);
3283 Lisp_Object string = make_string (str->er_buf,
3290 Lisp_Object char_type;
3293 for ( rest = Vcoded_charset_entity_reference_alist;
3294 !NILP (rest); rest = Fcdr (rest) )
3300 char_type = XCDR (ccs);
3305 if (NILP (ccs = Ffind_charset (ccs)))
3314 pat = Fregexp_quote (pat);
3321 pat = concat3 (build_string ("^&"),
3322 pat, build_string ("\\([0-9]+\\)$"));
3325 else if (EQ (ret, Qx))
3327 pat = concat3 (build_string ("^&"),
3328 pat, build_string ("\\([0-9a-f]+\\)$"));
3331 else if (EQ (ret, QX))
3333 pat = concat3 (build_string ("^&"),
3334 pat, build_string ("\\([0-9A-F]+\\)$"));
3340 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3343 = XINT (Fstring_to_number
3344 (Fsubstring (string,
3345 Fmatch_beginning (make_int (1)),
3346 Fmatch_end (make_int (1))),
3350 ? DECODE_CHAR (ccs, code, 0)
3351 : decode_builtin_char (ccs, code);
3354 DECODE_ADD_UCS_CHAR (chr, dst);
3357 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3358 Dynarr_add (dst, ';');
3364 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3365 string, Qnil, Qnil)))
3368 = XUINT (Fstring_to_number
3369 (Fsubstring (string,
3370 Fmatch_beginning (make_int (1)),
3371 Fmatch_end (make_int (1))),
3374 DECODE_ADD_UCS_CHAR (code, dst);
3378 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3379 Dynarr_add (dst, ';');
3382 str->er_counter = 0;
3384 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3386 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3387 str->er_counter = 0;
3388 DECODE_ADD_UCS_CHAR (c, dst);
3391 str->er_buf[str->er_counter++] = c;
3394 void char_encode_as_entity_reference (Emchar ch, char* buf);
3396 char_encode_as_entity_reference (Emchar ch, char* buf)
3398 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3401 Lisp_Object char_type;
3402 int format_columns, idx;
3403 char format[ER_BUF_SIZE];
3405 while (!NILP (rest))
3411 char_type = XCDR (ccs);
3416 if (!NILP (ccs = Ffind_charset (ccs)))
3418 int code_point = charset_code_point (ccs, ch, 0);
3420 if ( (code_point >= 0)
3421 && (NILP (char_type)
3422 || DECODE_CHAR (ccs, code_point, 0) != ch) )
3428 if ( STRINGP (ret) &&
3429 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3432 strncpy (&format[1], XSTRING_DATA (ret), idx);
3442 format[idx++] = '%';
3443 format_columns = XINT (ret);
3444 if ( (2 <= format_columns) && (format_columns <= 8)
3445 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3447 format [idx++] = '0';
3448 format [idx++] = '0' + format_columns;
3457 format [idx++] = 'd';
3458 else if (EQ (ret, Qx))
3459 format [idx++] = 'x';
3460 else if (EQ (ret, QX))
3461 format [idx++] = 'X';
3464 format [idx++] = ';';
3467 sprintf (buf, format, code_point);
3474 sprintf (buf, "&MCS-%08X;", ch);
3478 /************************************************************************/
3479 /* character composition */
3480 /************************************************************************/
3481 extern Lisp_Object Qcomposition, Qrep_decomposition;
3484 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3486 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3490 for (i = 0; i < str->combined_char_count; i++)
3491 decode_add_er_char (str, str->combined_chars[i], dst);
3492 str->combined_char_count = 0;
3493 str->combining_table = Qnil;
3496 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3497 unsigned_char_dynarr* dst);
3499 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3500 Emchar character, unsigned_char_dynarr* dst)
3502 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3503 decode_add_er_char (str, character, dst);
3504 else if (!CONSP (str->combining_table))
3507 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3511 decode_add_er_char (str, character, dst);
3514 str->combined_chars[0] = character;
3515 str->combined_char_count = 1;
3516 str->combining_table = ret;
3522 = Fcdr (Fassq (make_char (character), str->combining_table));
3526 Emchar char2 = XCHARVAL (ret);
3527 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3532 decode_add_er_char (str, char2, dst);
3533 str->combined_char_count = 0;
3534 str->combining_table = Qnil;
3538 str->combined_chars[0] = char2;
3539 str->combined_char_count = 1;
3540 str->combining_table = ret2;
3545 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3548 COMPOSE_FLUSH_CHARS (str, dst);
3550 decode_add_er_char (str, character, dst);
3553 str->combined_chars[0] = character;
3554 str->combined_char_count = 1;
3555 str->combining_table = ret;
3560 #else /* not UTF2000 */
3561 #define COMPOSE_FLUSH_CHARS(str, dst)
3562 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3563 #endif /* UTF2000 */
3566 /************************************************************************/
3567 /* Shift-JIS methods */
3568 /************************************************************************/
3570 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3571 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3572 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3573 encoded by "position-code + 0x80". A character of JISX0208
3574 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3575 position-codes are divided and shifted so that it fit in the range
3578 --- CODE RANGE of Shift-JIS ---
3579 (character set) (range)
3581 JISX0201-Kana 0xA0 .. 0xDF
3582 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3583 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3584 -------------------------------
3588 /* Is this the first byte of a Shift-JIS two-byte char? */
3590 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3591 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3593 /* Is this the second byte of a Shift-JIS two-byte char? */
3595 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3596 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3598 #define BYTE_SJIS_KATAKANA_P(c) \
3599 ((c) >= 0xA1 && (c) <= 0xDF)
3602 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3606 unsigned char c = *(unsigned char *)src++;
3607 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3609 if (st->shift_jis.in_second_byte)
3611 st->shift_jis.in_second_byte = 0;
3615 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3616 st->shift_jis.in_second_byte = 1;
3618 return CODING_CATEGORY_SHIFT_JIS_MASK;
3621 /* Convert Shift-JIS data to internal format. */
3624 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3625 unsigned_char_dynarr *dst, Lstream_data_count n)
3627 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3628 unsigned int flags = str->flags;
3629 unsigned int cpos = str->cpos;
3630 eol_type_t eol_type = str->eol_type;
3634 unsigned char c = *(unsigned char *)src++;
3638 /* Previous character was first byte of Shift-JIS Kanji char. */
3639 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3641 unsigned char e1, e2;
3643 DECODE_SJIS (cpos, c, e1, e2);
3645 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3649 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3650 Dynarr_add (dst, e1);
3651 Dynarr_add (dst, e2);
3656 DECODE_ADD_BINARY_CHAR (cpos, dst);
3657 DECODE_ADD_BINARY_CHAR (c, dst);
3663 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3664 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3666 else if (BYTE_SJIS_KATAKANA_P (c))
3669 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3672 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3673 Dynarr_add (dst, c);
3678 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3682 DECODE_ADD_BINARY_CHAR (c, dst);
3684 label_continue_loop:;
3687 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3693 /* Convert internal character representation to Shift_JIS. */
3696 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3697 unsigned_char_dynarr *dst, unsigned int *flags)
3699 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3703 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3704 Dynarr_add (dst, '\r');
3705 if (eol_type != EOL_CR)
3706 Dynarr_add (dst, ch);
3710 unsigned int s1, s2;
3712 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3714 if (code_point >= 0)
3715 Dynarr_add (dst, code_point);
3716 else if ((code_point
3717 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3720 ENCODE_SJIS ((code_point >> 8) | 0x80,
3721 (code_point & 0xFF) | 0x80, s1, s2);
3722 Dynarr_add (dst, s1);
3723 Dynarr_add (dst, s2);
3725 else if ((code_point
3726 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3728 Dynarr_add (dst, code_point | 0x80);
3729 else if ((code_point
3730 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3733 ENCODE_SJIS ((code_point >> 8) | 0x80,
3734 (code_point & 0xFF) | 0x80, s1, s2);
3735 Dynarr_add (dst, s1);
3736 Dynarr_add (dst, s2);
3738 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3740 Dynarr_add (dst, code_point);
3742 Dynarr_add (dst, '?');
3744 Lisp_Object charset;
3745 unsigned int c1, c2;
3747 BREAKUP_CHAR (ch, charset, c1, c2);
3749 if (EQ(charset, Vcharset_katakana_jisx0201))
3751 Dynarr_add (dst, c1 | 0x80);
3755 Dynarr_add (dst, c1);
3757 else if (EQ(charset, Vcharset_japanese_jisx0208))
3759 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3760 Dynarr_add (dst, s1);
3761 Dynarr_add (dst, s2);
3764 Dynarr_add (dst, '?');
3770 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3771 unsigned int *flags)
3775 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3776 Decode a JISX0208 character of Shift-JIS coding-system.
3777 CODE is the character code in Shift-JIS as a cons of type bytes.
3778 Return the corresponding character.
3782 unsigned char c1, c2, s1, s2;
3785 CHECK_INT (XCAR (code));
3786 CHECK_INT (XCDR (code));
3787 s1 = XINT (XCAR (code));
3788 s2 = XINT (XCDR (code));
3789 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3790 BYTE_SJIS_TWO_BYTE_2_P (s2))
3792 DECODE_SJIS (s1, s2, c1, c2);
3793 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3794 c1 & 0x7F, c2 & 0x7F));
3800 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3801 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3802 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3806 Lisp_Object charset;
3809 CHECK_CHAR_COERCE_INT (character);
3810 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3811 if (EQ (charset, Vcharset_japanese_jisx0208))
3813 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3814 return Fcons (make_int (s1), make_int (s2));
3821 /************************************************************************/
3823 /************************************************************************/
3825 /* BIG5 is a coding system encoding two character sets: ASCII and
3826 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3827 character set and is encoded in two-byte.
3829 --- CODE RANGE of BIG5 ---
3830 (character set) (range)
3832 Big5 (1st byte) 0xA1 .. 0xFE
3833 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3834 --------------------------
3836 Since the number of characters in Big5 is larger than maximum
3837 characters in Emacs' charset (96x96), it can't be handled as one
3838 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3839 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3840 contains frequently used characters and the latter contains less
3841 frequently used characters. */
3844 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3845 ((c) >= 0x81 && (c) <= 0xFE)
3847 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3848 ((c) >= 0xA1 && (c) <= 0xFE)
3851 /* Is this the second byte of a Shift-JIS two-byte char? */
3853 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3854 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3856 /* Number of Big5 characters which have the same code in 1st byte. */
3858 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3860 /* Code conversion macros. These are macros because they are used in
3861 inner loops during code conversion.
3863 Note that temporary variables in macros introduce the classic
3864 dynamic-scoping problems with variable names. We use capital-
3865 lettered variables in the assumption that XEmacs does not use
3866 capital letters in variables except in a very formalized way
3869 /* Convert Big5 code (b1, b2) into its internal string representation
3872 /* There is a much simpler way to split the Big5 charset into two.
3873 For the moment I'm going to leave the algorithm as-is because it
3874 claims to separate out the most-used characters into a single
3875 charset, which perhaps will lead to optimizations in various
3878 The way the algorithm works is something like this:
3880 Big5 can be viewed as a 94x157 charset, where the row is
3881 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3882 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3883 the split between low and high column numbers is apparently
3884 meaningless; ascending rows produce less and less frequent chars.
3885 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3886 the first charset, and the upper half (0xC9 .. 0xFE) to the
3887 second. To do the conversion, we convert the character into
3888 a single number where 0 .. 156 is the first row, 157 .. 313
3889 is the second, etc. That way, the characters are ordered by
3890 decreasing frequency. Then we just chop the space in two
3891 and coerce the result into a 94x94 space.
3894 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3896 int B1 = b1, B2 = b2; \
3898 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3902 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3906 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3907 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3909 c1 = I / (0xFF - 0xA1) + 0xA1; \
3910 c2 = I % (0xFF - 0xA1) + 0xA1; \
3913 /* Convert the internal string representation of a Big5 character
3914 (lb, c1, c2) into Big5 code (b1, b2). */
3916 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3918 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3920 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3922 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3924 b1 = I / BIG5_SAME_ROW + 0xA1; \
3925 b2 = I % BIG5_SAME_ROW; \
3926 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3930 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3934 unsigned char c = *(unsigned char *)src++;
3935 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3937 || (c >= 0x80 && c <= 0xA0)
3941 if (st->big5.in_second_byte)
3943 st->big5.in_second_byte = 0;
3944 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3954 st->big5.in_second_byte = 1;
3956 return CODING_CATEGORY_BIG5_MASK;
3959 /* Convert Big5 data to internal format. */
3962 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3963 unsigned_char_dynarr *dst, Lstream_data_count n)
3965 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3966 unsigned int flags = str->flags;
3967 unsigned int cpos = str->cpos;
3968 eol_type_t eol_type = str->eol_type;
3971 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3972 (decoding)->codesys, 1);
3977 unsigned char c = *(unsigned char *)src++;
3980 /* Previous character was first byte of Big5 char. */
3981 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3984 int code_point = (cpos << 8) | c;
3985 Emchar char_id = decode_defined_char (ccs, code_point, 0);
3989 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
3990 DECODE_ADD_UCS_CHAR (char_id, dst);
3992 unsigned char b1, b2, b3;
3993 DECODE_BIG5 (cpos, c, b1, b2, b3);
3994 Dynarr_add (dst, b1);
3995 Dynarr_add (dst, b2);
3996 Dynarr_add (dst, b3);
4001 DECODE_ADD_BINARY_CHAR (cpos, dst);
4002 DECODE_ADD_BINARY_CHAR (c, dst);
4008 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4009 if (BYTE_BIG5_TWO_BYTE_1_P (c))
4011 decode_flush_er_chars (str, dst);
4016 decode_flush_er_chars (str, dst);
4017 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4018 DECODE_ADD_BINARY_CHAR (c, dst);
4022 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4023 decode_add_er_char (str, c, dst);
4026 label_continue_loop:;
4029 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4030 if (flags & CODING_STATE_END)
4032 decode_flush_er_chars (str, dst);
4033 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4034 if (flags & CODING_STATE_CR)
4035 Dynarr_add (dst, '\r');
4042 /* Convert internally-formatted data to Big5. */
4045 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4046 unsigned_char_dynarr *dst, unsigned int *flags)
4048 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4052 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4053 Dynarr_add (dst, '\r');
4054 if (eol_type != EOL_CR)
4055 Dynarr_add (dst, ch);
4062 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4064 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4065 Dynarr_add (dst, code_point);
4066 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4068 Dynarr_add (dst, code_point >> 8);
4069 Dynarr_add (dst, code_point & 0xFF);
4071 else if ((code_point
4072 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4074 Dynarr_add (dst, code_point >> 8);
4075 Dynarr_add (dst, code_point & 0xFF);
4077 else if ((code_point
4078 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4081 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4082 + ((code_point & 0xFF) - 33);
4083 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4084 unsigned char b2 = I % BIG5_SAME_ROW;
4086 b2 += b2 < 0x3F ? 0x40 : 0x62;
4087 Dynarr_add (dst, b1);
4088 Dynarr_add (dst, b2);
4090 else if ((code_point
4091 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4094 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4095 + ((code_point & 0xFF) - 33);
4096 unsigned char b1, b2;
4098 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4099 b1 = I / BIG5_SAME_ROW + 0xA1;
4100 b2 = I % BIG5_SAME_ROW;
4101 b2 += b2 < 0x3F ? 0x40 : 0x62;
4102 Dynarr_add (dst, b1);
4103 Dynarr_add (dst, b2);
4105 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4109 char_encode_as_entity_reference (ch, buf);
4110 Dynarr_add_many (dst, buf, strlen (buf));
4113 Dynarr_add (dst, '?');
4120 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4121 unsigned int *flags)
4126 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4127 Decode a Big5 character CODE of BIG5 coding-system.
4128 CODE is the character code in BIG5, a cons of two integers.
4129 Return the corresponding character.
4133 unsigned char c1, c2, b1, b2;
4136 CHECK_INT (XCAR (code));
4137 CHECK_INT (XCDR (code));
4138 b1 = XINT (XCAR (code));
4139 b2 = XINT (XCDR (code));
4140 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4141 BYTE_BIG5_TWO_BYTE_2_P (b2))
4143 Charset_ID leading_byte;
4144 Lisp_Object charset;
4145 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4146 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4147 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4153 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4154 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4155 Return the corresponding character code in Big5.
4159 Lisp_Object charset;
4162 CHECK_CHAR_COERCE_INT (character);
4163 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4164 if (EQ (charset, Vcharset_chinese_big5_1) ||
4165 EQ (charset, Vcharset_chinese_big5_2))
4167 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4169 return Fcons (make_int (b1), make_int (b2));
4176 /************************************************************************/
4178 /************************************************************************/
4181 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4185 unsigned char c = *(unsigned char *)src++;
4186 switch (st->ucs4.in_byte)
4195 st->ucs4.in_byte = 0;
4201 return CODING_CATEGORY_UCS4_MASK;
4205 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4206 unsigned_char_dynarr *dst, Lstream_data_count n)
4208 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4209 unsigned int flags = str->flags;
4210 unsigned int cpos = str->cpos;
4211 unsigned char counter = str->counter;
4215 unsigned char c = *(unsigned char *)src++;
4223 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4228 cpos = ( cpos << 8 ) | c;
4232 if (counter & CODING_STATE_END)
4233 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4237 str->counter = counter;
4241 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4242 unsigned_char_dynarr *dst, unsigned int *flags)
4244 Dynarr_add (dst, ch >> 24);
4245 Dynarr_add (dst, ch >> 16);
4246 Dynarr_add (dst, ch >> 8);
4247 Dynarr_add (dst, ch );
4251 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4252 unsigned int *flags)
4257 /************************************************************************/
4258 /* UTF-16 methods */
4259 /************************************************************************/
4262 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4264 return CODING_CATEGORY_UTF16_MASK;
4268 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4269 unsigned_char_dynarr *dst, Lstream_data_count n)
4271 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4272 unsigned int flags = str->flags;
4273 unsigned int cpos = str->cpos;
4274 unsigned char counter = str->counter & 3;
4275 unsigned char byte_order = str->counter >> 2;
4276 eol_type_t eol_type = str->eol_type;
4280 unsigned char c = *(unsigned char *)src++;
4286 else if (counter == 1)
4290 if (byte_order == 0)
4291 code = (c << 8) | cpos;
4293 code = (cpos << 8) | c;
4296 code = ((code & 0xFF) << 8) | (code >> 8);
4297 if ( byte_order == 0 )
4302 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4313 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4314 DECODE_ADD_UCS_CHAR (code, dst);
4318 else if (counter == 2)
4320 cpos = (cpos << 8) | c;
4328 ? (c << 8) | (cpos & 0xFF)
4329 : ((cpos & 0xFF) << 8) | c;
4331 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4336 label_continue_loop:;
4338 if (counter & CODING_STATE_END)
4339 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4343 str->counter = (byte_order << 2) | counter;
4347 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4348 unsigned_char_dynarr *dst, unsigned int *flags)
4352 Dynarr_add (dst, ch);
4353 Dynarr_add (dst, ch >> 8);
4357 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4358 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4360 Dynarr_add (dst, y);
4361 Dynarr_add (dst, y >> 8);
4362 Dynarr_add (dst, z);
4363 Dynarr_add (dst, z >> 8);
4368 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4369 unsigned int *flags)
4374 /************************************************************************/
4376 /************************************************************************/
4379 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4383 unsigned char c = *(unsigned char *)src++;
4384 switch (st->utf8.in_byte)
4387 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4390 st->utf8.in_byte = 5;
4392 st->utf8.in_byte = 4;
4394 st->utf8.in_byte = 3;
4396 st->utf8.in_byte = 2;
4398 st->utf8.in_byte = 1;
4403 if ((c & 0xc0) != 0x80)
4409 return CODING_CATEGORY_UTF8_MASK;
4413 decode_output_utf8_partial_char (unsigned char counter,
4415 unsigned_char_dynarr *dst)
4418 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4419 else if (counter == 4)
4421 if (cpos < (1 << 6))
4422 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4425 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4426 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4429 else if (counter == 3)
4431 if (cpos < (1 << 6))
4432 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4433 else if (cpos < (1 << 12))
4435 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4436 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4440 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4441 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4442 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4445 else if (counter == 2)
4447 if (cpos < (1 << 6))
4448 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4449 else if (cpos < (1 << 12))
4451 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4452 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4454 else if (cpos < (1 << 18))
4456 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4457 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4458 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4462 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4463 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4464 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4465 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4470 if (cpos < (1 << 6))
4471 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4472 else if (cpos < (1 << 12))
4474 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4475 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4477 else if (cpos < (1 << 18))
4479 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4480 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4481 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4483 else if (cpos < (1 << 24))
4485 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4486 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4487 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4488 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4492 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4493 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4494 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4495 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4496 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4502 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4503 unsigned_char_dynarr *dst, Lstream_data_count n)
4505 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4506 unsigned int flags = str->flags;
4507 unsigned int cpos = str->cpos;
4508 eol_type_t eol_type = str->eol_type;
4509 unsigned char counter = str->counter;
4511 int bom_flag = str->bom_flag;
4513 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4514 (decoding)->codesys, 0);
4519 unsigned char c = *(unsigned char *)src++;
4524 COMPOSE_FLUSH_CHARS (str, dst);
4525 decode_flush_er_chars (str, dst);
4526 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4528 if ( bom_flag == 0 )
4531 DECODE_ADD_UCS_CHAR (c, dst);
4533 else if ( c < 0xC0 )
4535 if ( bom_flag == 0 )
4538 /* decode_add_er_char (str, c, dst); */
4539 COMPOSE_ADD_CHAR (str, c, dst);
4543 /* decode_flush_er_chars (str, dst); */
4549 else if ( c < 0xF0 )
4554 else if ( c < 0xF8 )
4559 else if ( c < 0xFC )
4571 else if ( (c & 0xC0) == 0x80 )
4573 cpos = ( cpos << 6 ) | ( c & 0x3f );
4578 if ( bom_flag == 0 )
4580 if ( cpos == 0xFEFF )
4591 char_id = decode_defined_char (ccs, cpos, 0);
4598 COMPOSE_ADD_CHAR (str, char_id, dst);
4608 COMPOSE_FLUSH_CHARS (str, dst);
4609 decode_flush_er_chars (str, dst);
4610 decode_output_utf8_partial_char (counter, cpos, dst);
4611 DECODE_ADD_BINARY_CHAR (c, dst);
4615 label_continue_loop:;
4618 if (flags & CODING_STATE_END)
4620 COMPOSE_FLUSH_CHARS (str, dst);
4621 decode_flush_er_chars (str, dst);
4624 decode_output_utf8_partial_char (counter, cpos, dst);
4631 str->counter = counter;
4633 str->bom_flag = bom_flag;
4638 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4639 unsigned_char_dynarr *dst, unsigned int *flags)
4641 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4645 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4646 Dynarr_add (dst, '\r');
4647 if (eol_type != EOL_CR)
4648 Dynarr_add (dst, ch);
4650 else if (ch <= 0x7f)
4652 Dynarr_add (dst, ch);
4657 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4658 int code_point = charset_code_point (ucs_ccs, ch, 0);
4660 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4662 Lisp_Object rest = Vdecomposition_feature_list;
4663 Lisp_Object decomp_f;
4664 Lisp_Object seq = Qnil;
4665 Lisp_Object map, ret;
4666 struct gcpro gcpro1;
4668 while ( CONSP (rest) )
4670 decomp_f = XCAR (rest);
4672 seq = Fchar_feature (make_char (ch), decomp_f, Qnil,
4682 Lisp_Object base = Fcar (seq);
4685 if ( CHARP (base) && CONSP (seq) )
4687 Lisp_Object comb = Fcar (seq);
4691 char_encode_utf8 (str, XCHAR (base), dst, flags);
4692 char_encode_utf8 (str, XCHAR (comb), dst, flags);
4698 map = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4700 && INTP (ret = Fchar_feature (make_char (ch),
4703 code_point = XINT (ret);
4704 else if ( !NILP (map =
4705 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4707 && INTP (ret = Fchar_feature (make_char (ch),
4710 code_point = XINT (ret);
4711 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4715 char_encode_as_entity_reference (ch, buf);
4716 Dynarr_add_many (dst, buf, strlen (buf));
4722 if (code_point <= 0x7ff)
4724 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4725 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4727 else if (code_point <= 0xffff)
4729 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4730 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4731 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4733 else if (code_point <= 0x1fffff)
4735 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4736 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4737 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4738 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4740 else if (code_point <= 0x3ffffff)
4742 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4743 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4744 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4745 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4746 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4750 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4751 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4752 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4753 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4754 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4755 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4761 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4762 unsigned int *flags)
4767 /************************************************************************/
4768 /* ISO2022 methods */
4769 /************************************************************************/
4771 /* The following note describes the coding system ISO2022 briefly.
4772 Since the intention of this note is to help understand the
4773 functions in this file, some parts are NOT ACCURATE or OVERLY
4774 SIMPLIFIED. For thorough understanding, please refer to the
4775 original document of ISO2022.
4777 ISO2022 provides many mechanisms to encode several character sets
4778 in 7-bit and 8-bit environments. For 7-bit environments, all text
4779 is encoded using bytes less than 128. This may make the encoded
4780 text a little bit longer, but the text passes more easily through
4781 several gateways, some of which strip off MSB (Most Signigant Bit).
4783 There are two kinds of character sets: control character set and
4784 graphic character set. The former contains control characters such
4785 as `newline' and `escape' to provide control functions (control
4786 functions are also provided by escape sequences). The latter
4787 contains graphic characters such as 'A' and '-'. Emacs recognizes
4788 two control character sets and many graphic character sets.
4790 Graphic character sets are classified into one of the following
4791 four classes, according to the number of bytes (DIMENSION) and
4792 number of characters in one dimension (CHARS) of the set:
4793 - DIMENSION1_CHARS94
4794 - DIMENSION1_CHARS96
4795 - DIMENSION2_CHARS94
4796 - DIMENSION2_CHARS96
4798 In addition, each character set is assigned an identification tag,
4799 unique for each set, called "final character" (denoted as <F>
4800 hereafter). The <F> of each character set is decided by ECMA(*)
4801 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4802 (0x30..0x3F are for private use only).
4804 Note (*): ECMA = European Computer Manufacturers Association
4806 Here are examples of graphic character set [NAME(<F>)]:
4807 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4808 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4809 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4810 o DIMENSION2_CHARS96 -- none for the moment
4812 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4813 C0 [0x00..0x1F] -- control character plane 0
4814 GL [0x20..0x7F] -- graphic character plane 0
4815 C1 [0x80..0x9F] -- control character plane 1
4816 GR [0xA0..0xFF] -- graphic character plane 1
4818 A control character set is directly designated and invoked to C0 or
4819 C1 by an escape sequence. The most common case is that:
4820 - ISO646's control character set is designated/invoked to C0, and
4821 - ISO6429's control character set is designated/invoked to C1,
4822 and usually these designations/invocations are omitted in encoded
4823 text. In a 7-bit environment, only C0 can be used, and a control
4824 character for C1 is encoded by an appropriate escape sequence to
4825 fit into the environment. All control characters for C1 are
4826 defined to have corresponding escape sequences.
4828 A graphic character set is at first designated to one of four
4829 graphic registers (G0 through G3), then these graphic registers are
4830 invoked to GL or GR. These designations and invocations can be
4831 done independently. The most common case is that G0 is invoked to
4832 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4833 these invocations and designations are omitted in encoded text.
4834 In a 7-bit environment, only GL can be used.
4836 When a graphic character set of CHARS94 is invoked to GL, codes
4837 0x20 and 0x7F of the GL area work as control characters SPACE and
4838 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4841 There are two ways of invocation: locking-shift and single-shift.
4842 With locking-shift, the invocation lasts until the next different
4843 invocation, whereas with single-shift, the invocation affects the
4844 following character only and doesn't affect the locking-shift
4845 state. Invocations are done by the following control characters or
4848 ----------------------------------------------------------------------
4849 abbrev function cntrl escape seq description
4850 ----------------------------------------------------------------------
4851 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4852 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4853 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4854 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4855 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4856 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4857 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4858 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4859 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4860 ----------------------------------------------------------------------
4861 (*) These are not used by any known coding system.
4863 Control characters for these functions are defined by macros
4864 ISO_CODE_XXX in `coding.h'.
4866 Designations are done by the following escape sequences:
4867 ----------------------------------------------------------------------
4868 escape sequence description
4869 ----------------------------------------------------------------------
4870 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4871 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4872 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4873 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4874 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4875 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4876 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4877 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4878 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4879 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4880 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4881 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4882 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4883 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4884 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4885 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4886 ----------------------------------------------------------------------
4888 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4889 of dimension 1, chars 94, and final character <F>, etc...
4891 Note (*): Although these designations are not allowed in ISO2022,
4892 Emacs accepts them on decoding, and produces them on encoding
4893 CHARS96 character sets in a coding system which is characterized as
4894 7-bit environment, non-locking-shift, and non-single-shift.
4896 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4897 '(' can be omitted. We refer to this as "short-form" hereafter.
4899 Now you may notice that there are a lot of ways for encoding the
4900 same multilingual text in ISO2022. Actually, there exist many
4901 coding systems such as Compound Text (used in X11's inter client
4902 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4903 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4904 localized platforms), and all of these are variants of ISO2022.
4906 In addition to the above, Emacs handles two more kinds of escape
4907 sequences: ISO6429's direction specification and Emacs' private
4908 sequence for specifying character composition.
4910 ISO6429's direction specification takes the following form:
4911 o CSI ']' -- end of the current direction
4912 o CSI '0' ']' -- end of the current direction
4913 o CSI '1' ']' -- start of left-to-right text
4914 o CSI '2' ']' -- start of right-to-left text
4915 The control character CSI (0x9B: control sequence introducer) is
4916 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4918 Character composition specification takes the following form:
4919 o ESC '0' -- start character composition
4920 o ESC '1' -- end character composition
4921 Since these are not standard escape sequences of any ISO standard,
4922 their use with these meanings is restricted to Emacs only. */
4925 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4929 for (i = 0; i < 4; i++)
4931 if (!NILP (coding_system))
4933 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4935 iso->charset[i] = Qt;
4936 iso->invalid_designated[i] = 0;
4938 iso->esc = ISO_ESC_NOTHING;
4939 iso->esc_bytes_index = 0;
4940 iso->register_left = 0;
4941 iso->register_right = 1;
4942 iso->switched_dir_and_no_valid_charset_yet = 0;
4943 iso->invalid_switch_dir = 0;
4944 iso->output_direction_sequence = 0;
4945 iso->output_literally = 0;
4946 #ifdef ENABLE_COMPOSITE_CHARS
4947 if (iso->composite_chars)
4948 Dynarr_reset (iso->composite_chars);
4953 fit_to_be_escape_quoted (unsigned char c)
4970 /* Parse one byte of an ISO2022 escape sequence.
4971 If the result is an invalid escape sequence, return 0 and
4972 do not change anything in STR. Otherwise, if the result is
4973 an incomplete escape sequence, update ISO2022.ESC and
4974 ISO2022.ESC_BYTES and return -1. Otherwise, update
4975 all the state variables (but not ISO2022.ESC_BYTES) and
4978 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4979 or invocation of an invalid character set and treat that as
4980 an unrecognized escape sequence. */
4983 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4984 unsigned char c, unsigned int *flags,
4985 int check_invalid_charsets)
4987 /* (1) If we're at the end of a designation sequence, CS is the
4988 charset being designated and REG is the register to designate
4991 (2) If we're at the end of a locking-shift sequence, REG is
4992 the register to invoke and HALF (0 == left, 1 == right) is
4993 the half to invoke it into.
4995 (3) If we're at the end of a single-shift sequence, REG is
4996 the register to invoke. */
4997 Lisp_Object cs = Qnil;
5000 /* NOTE: This code does goto's all over the fucking place.
5001 The reason for this is that we're basically implementing
5002 a state machine here, and hierarchical languages like C
5003 don't really provide a clean way of doing this. */
5005 if (! (*flags & CODING_STATE_ESCAPE))
5006 /* At beginning of escape sequence; we need to reset our
5007 escape-state variables. */
5008 iso->esc = ISO_ESC_NOTHING;
5010 iso->output_literally = 0;
5011 iso->output_direction_sequence = 0;
5015 case ISO_ESC_NOTHING:
5016 iso->esc_bytes_index = 0;
5019 case ISO_CODE_ESC: /* Start escape sequence */
5020 *flags |= CODING_STATE_ESCAPE;
5024 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
5025 *flags |= CODING_STATE_ESCAPE;
5026 iso->esc = ISO_ESC_5_11;
5029 case ISO_CODE_SO: /* locking shift 1 */
5032 case ISO_CODE_SI: /* locking shift 0 */
5036 case ISO_CODE_SS2: /* single shift */
5039 case ISO_CODE_SS3: /* single shift */
5043 default: /* Other control characters */
5050 /**** single shift ****/
5052 case 'N': /* single shift 2 */
5055 case 'O': /* single shift 3 */
5059 /**** locking shift ****/
5061 case '~': /* locking shift 1 right */
5064 case 'n': /* locking shift 2 */
5067 case '}': /* locking shift 2 right */
5070 case 'o': /* locking shift 3 */
5073 case '|': /* locking shift 3 right */
5077 #ifdef ENABLE_COMPOSITE_CHARS
5078 /**** composite ****/
5081 iso->esc = ISO_ESC_START_COMPOSITE;
5082 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5083 CODING_STATE_COMPOSITE;
5087 iso->esc = ISO_ESC_END_COMPOSITE;
5088 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5089 ~CODING_STATE_COMPOSITE;
5091 #endif /* ENABLE_COMPOSITE_CHARS */
5093 /**** directionality ****/
5096 iso->esc = ISO_ESC_5_11;
5099 /**** designation ****/
5101 case '$': /* multibyte charset prefix */
5102 iso->esc = ISO_ESC_2_4;
5106 if (0x28 <= c && c <= 0x2F)
5108 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5112 /* This function is called with CODESYS equal to nil when
5113 doing coding-system detection. */
5115 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5116 && fit_to_be_escape_quoted (c))
5118 iso->esc = ISO_ESC_LITERAL;
5119 *flags &= CODING_STATE_ISO2022_LOCK;
5129 /**** directionality ****/
5131 case ISO_ESC_5_11: /* ISO6429 direction control */
5134 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5135 goto directionality;
5137 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5138 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5139 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5143 case ISO_ESC_5_11_0:
5146 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5147 goto directionality;
5151 case ISO_ESC_5_11_1:
5154 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5155 goto directionality;
5159 case ISO_ESC_5_11_2:
5162 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5163 goto directionality;
5168 iso->esc = ISO_ESC_DIRECTIONALITY;
5169 /* Various junk here to attempt to preserve the direction sequences
5170 literally in the text if they would otherwise be swallowed due
5171 to invalid designations that don't show up as actual charset
5172 changes in the text. */
5173 if (iso->invalid_switch_dir)
5175 /* We already inserted a direction switch literally into the
5176 text. We assume (#### this may not be right) that the
5177 next direction switch is the one going the other way,
5178 and we need to output that literally as well. */
5179 iso->output_literally = 1;
5180 iso->invalid_switch_dir = 0;
5186 /* If we are in the thrall of an invalid designation,
5187 then stick the directionality sequence literally into the
5188 output stream so it ends up in the original text again. */
5189 for (jj = 0; jj < 4; jj++)
5190 if (iso->invalid_designated[jj])
5194 iso->output_literally = 1;
5195 iso->invalid_switch_dir = 1;
5198 /* Indicate that we haven't yet seen a valid designation,
5199 so that if a switch-dir is directly followed by an
5200 invalid designation, both get inserted literally. */
5201 iso->switched_dir_and_no_valid_charset_yet = 1;
5206 /**** designation ****/
5209 if (0x28 <= c && c <= 0x2F)
5211 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5214 if (0x40 <= c && c <= 0x42)
5217 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5218 *flags & CODING_STATE_R2L ?
5219 CHARSET_RIGHT_TO_LEFT :
5220 CHARSET_LEFT_TO_RIGHT);
5231 if (c < '0' || c > '~')
5232 return 0; /* bad final byte */
5234 if (iso->esc >= ISO_ESC_2_8 &&
5235 iso->esc <= ISO_ESC_2_15)
5237 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5238 single = 1; /* single-byte */
5239 reg = (iso->esc - ISO_ESC_2_8) & 3;
5241 else if (iso->esc >= ISO_ESC_2_4_8 &&
5242 iso->esc <= ISO_ESC_2_4_15)
5244 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5245 single = -1; /* multi-byte */
5246 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5250 /* Can this ever be reached? -slb */
5254 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5255 *flags & CODING_STATE_R2L ?
5256 CHARSET_RIGHT_TO_LEFT :
5257 CHARSET_LEFT_TO_RIGHT);
5263 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5267 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5268 /* can't invoke something that ain't there. */
5270 iso->esc = ISO_ESC_SINGLE_SHIFT;
5271 *flags &= CODING_STATE_ISO2022_LOCK;
5273 *flags |= CODING_STATE_SS2;
5275 *flags |= CODING_STATE_SS3;
5279 if (check_invalid_charsets &&
5280 !CHARSETP (iso->charset[reg]))
5281 /* can't invoke something that ain't there. */
5284 iso->register_right = reg;
5286 iso->register_left = reg;
5287 *flags &= CODING_STATE_ISO2022_LOCK;
5288 iso->esc = ISO_ESC_LOCKING_SHIFT;
5292 if (NILP (cs) && check_invalid_charsets)
5294 iso->invalid_designated[reg] = 1;
5295 iso->charset[reg] = Vcharset_ascii;
5296 iso->esc = ISO_ESC_DESIGNATE;
5297 *flags &= CODING_STATE_ISO2022_LOCK;
5298 iso->output_literally = 1;
5299 if (iso->switched_dir_and_no_valid_charset_yet)
5301 /* We encountered a switch-direction followed by an
5302 invalid designation. Ensure that the switch-direction
5303 gets outputted; otherwise it will probably get eaten
5304 when the text is written out again. */
5305 iso->switched_dir_and_no_valid_charset_yet = 0;
5306 iso->output_direction_sequence = 1;
5307 /* And make sure that the switch-dir going the other
5308 way gets outputted, as well. */
5309 iso->invalid_switch_dir = 1;
5313 /* This function is called with CODESYS equal to nil when
5314 doing coding-system detection. */
5315 if (!NILP (codesys))
5317 charset_conversion_spec_dynarr *dyn =
5318 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5324 for (i = 0; i < Dynarr_length (dyn); i++)
5326 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5327 if (EQ (cs, spec->from_charset))
5328 cs = spec->to_charset;
5333 iso->charset[reg] = cs;
5334 iso->esc = ISO_ESC_DESIGNATE;
5335 *flags &= CODING_STATE_ISO2022_LOCK;
5336 if (iso->invalid_designated[reg])
5338 iso->invalid_designated[reg] = 0;
5339 iso->output_literally = 1;
5341 if (iso->switched_dir_and_no_valid_charset_yet)
5342 iso->switched_dir_and_no_valid_charset_yet = 0;
5347 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5351 /* #### There are serious deficiencies in the recognition mechanism
5352 here. This needs to be much smarter if it's going to cut it.
5353 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5354 it should be detected as Latin-1.
5355 All the ISO2022 stuff in this file should be synced up with the
5356 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5357 Perhaps we should wait till R2L works in FSF Emacs? */
5359 if (!st->iso2022.initted)
5361 reset_iso2022 (Qnil, &st->iso2022.iso);
5362 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5363 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5364 CODING_CATEGORY_ISO_8_1_MASK |
5365 CODING_CATEGORY_ISO_8_2_MASK |
5366 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5367 st->iso2022.flags = 0;
5368 st->iso2022.high_byte_count = 0;
5369 st->iso2022.saw_single_shift = 0;
5370 st->iso2022.initted = 1;
5373 mask = st->iso2022.mask;
5377 unsigned char c = *(unsigned char *)src++;
5380 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5381 st->iso2022.high_byte_count++;
5385 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5387 if (st->iso2022.high_byte_count & 1)
5388 /* odd number of high bytes; assume not iso-8-2 */
5389 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5391 st->iso2022.high_byte_count = 0;
5392 st->iso2022.saw_single_shift = 0;
5394 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5396 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5397 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5398 { /* control chars */
5401 /* Allow and ignore control characters that you might
5402 reasonably see in a text file */
5407 case 8: /* backspace */
5408 case 11: /* vertical tab */
5409 case 12: /* form feed */
5410 case 26: /* MS-DOS C-z junk */
5411 case 31: /* '^_' -- for info */
5412 goto label_continue_loop;
5419 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5422 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5423 &st->iso2022.flags, 0))
5425 switch (st->iso2022.iso.esc)
5427 case ISO_ESC_DESIGNATE:
5428 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5429 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5431 case ISO_ESC_LOCKING_SHIFT:
5432 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5433 goto ran_out_of_chars;
5434 case ISO_ESC_SINGLE_SHIFT:
5435 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5436 st->iso2022.saw_single_shift = 1;
5445 goto ran_out_of_chars;
5448 label_continue_loop:;
5457 postprocess_iso2022_mask (int mask)
5459 /* #### kind of cheesy */
5460 /* If seven-bit ISO is allowed, then assume that the encoding is
5461 entirely seven-bit and turn off the eight-bit ones. */
5462 if (mask & CODING_CATEGORY_ISO_7_MASK)
5463 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5464 CODING_CATEGORY_ISO_8_1_MASK |
5465 CODING_CATEGORY_ISO_8_2_MASK);
5469 /* If FLAGS is a null pointer or specifies right-to-left motion,
5470 output a switch-dir-to-left-to-right sequence to DST.
5471 Also update FLAGS if it is not a null pointer.
5472 If INTERNAL_P is set, we are outputting in internal format and
5473 need to handle the CSI differently. */
5476 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5477 unsigned_char_dynarr *dst,
5478 unsigned int *flags,
5481 if (!flags || (*flags & CODING_STATE_R2L))
5483 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5485 Dynarr_add (dst, ISO_CODE_ESC);
5486 Dynarr_add (dst, '[');
5488 else if (internal_p)
5489 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5491 Dynarr_add (dst, ISO_CODE_CSI);
5492 Dynarr_add (dst, '0');
5493 Dynarr_add (dst, ']');
5495 *flags &= ~CODING_STATE_R2L;
5499 /* If FLAGS is a null pointer or specifies a direction different from
5500 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5501 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5502 sequence to DST. Also update FLAGS if it is not a null pointer.
5503 If INTERNAL_P is set, we are outputting in internal format and
5504 need to handle the CSI differently. */
5507 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5508 unsigned_char_dynarr *dst, unsigned int *flags,
5511 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5512 direction == CHARSET_LEFT_TO_RIGHT)
5513 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5514 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5515 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5516 direction == CHARSET_RIGHT_TO_LEFT)
5518 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5520 Dynarr_add (dst, ISO_CODE_ESC);
5521 Dynarr_add (dst, '[');
5523 else if (internal_p)
5524 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5526 Dynarr_add (dst, ISO_CODE_CSI);
5527 Dynarr_add (dst, '2');
5528 Dynarr_add (dst, ']');
5530 *flags |= CODING_STATE_R2L;
5534 /* Convert ISO2022-format data to internal format. */
5537 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5538 unsigned_char_dynarr *dst, Lstream_data_count n)
5540 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5541 unsigned int flags = str->flags;
5542 unsigned int cpos = str->cpos;
5543 unsigned char counter = str->counter;
5544 eol_type_t eol_type = str->eol_type;
5545 #ifdef ENABLE_COMPOSITE_CHARS
5546 unsigned_char_dynarr *real_dst = dst;
5548 Lisp_Object coding_system;
5550 XSETCODING_SYSTEM (coding_system, str->codesys);
5552 #ifdef ENABLE_COMPOSITE_CHARS
5553 if (flags & CODING_STATE_COMPOSITE)
5554 dst = str->iso2022.composite_chars;
5555 #endif /* ENABLE_COMPOSITE_CHARS */
5559 unsigned char c = *(unsigned char *)src++;
5560 if (flags & CODING_STATE_ESCAPE)
5561 { /* Within ESC sequence */
5562 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5567 switch (str->iso2022.esc)
5569 #ifdef ENABLE_COMPOSITE_CHARS
5570 case ISO_ESC_START_COMPOSITE:
5571 if (str->iso2022.composite_chars)
5572 Dynarr_reset (str->iso2022.composite_chars);
5574 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5575 dst = str->iso2022.composite_chars;
5577 case ISO_ESC_END_COMPOSITE:
5579 Bufbyte comstr[MAX_EMCHAR_LEN];
5581 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5582 Dynarr_length (dst));
5584 len = set_charptr_emchar (comstr, emch);
5585 Dynarr_add_many (dst, comstr, len);
5588 #endif /* ENABLE_COMPOSITE_CHARS */
5590 case ISO_ESC_LITERAL:
5591 COMPOSE_FLUSH_CHARS (str, dst);
5592 decode_flush_er_chars (str, dst);
5593 DECODE_ADD_BINARY_CHAR (c, dst);
5597 /* Everything else handled already */
5602 /* Attempted error recovery. */
5603 if (str->iso2022.output_direction_sequence)
5604 ensure_correct_direction (flags & CODING_STATE_R2L ?
5605 CHARSET_RIGHT_TO_LEFT :
5606 CHARSET_LEFT_TO_RIGHT,
5607 str->codesys, dst, 0, 1);
5608 /* More error recovery. */
5609 if (!retval || str->iso2022.output_literally)
5611 /* Output the (possibly invalid) sequence */
5613 COMPOSE_FLUSH_CHARS (str, dst);
5614 decode_flush_er_chars (str, dst);
5615 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5616 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5617 flags &= CODING_STATE_ISO2022_LOCK;
5619 n++, src--;/* Repeat the loop with the same character. */
5622 /* No sense in reprocessing the final byte of the
5623 escape sequence; it could mess things up anyway.
5625 COMPOSE_FLUSH_CHARS (str, dst);
5626 decode_flush_er_chars (str, dst);
5627 DECODE_ADD_BINARY_CHAR (c, dst);
5633 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5634 { /* Control characters */
5636 /***** Error-handling *****/
5638 /* If we were in the middle of a character, dump out the
5639 partial character. */
5642 COMPOSE_FLUSH_CHARS (str, dst);
5643 decode_flush_er_chars (str, dst);
5647 DECODE_ADD_BINARY_CHAR
5648 ((unsigned char)(cpos >> (counter * 8)), dst);
5653 /* If we just saw a single-shift character, dump it out.
5654 This may dump out the wrong sort of single-shift character,
5655 but least it will give an indication that something went
5657 if (flags & CODING_STATE_SS2)
5659 COMPOSE_FLUSH_CHARS (str, dst);
5660 decode_flush_er_chars (str, dst);
5661 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5662 flags &= ~CODING_STATE_SS2;
5664 if (flags & CODING_STATE_SS3)
5666 COMPOSE_FLUSH_CHARS (str, dst);
5667 decode_flush_er_chars (str, dst);
5668 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5669 flags &= ~CODING_STATE_SS3;
5672 /***** Now handle the control characters. *****/
5678 COMPOSE_FLUSH_CHARS (str, dst);
5679 decode_flush_er_chars (str, dst);
5680 if (eol_type == EOL_CR)
5681 Dynarr_add (dst, '\n');
5682 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5683 Dynarr_add (dst, c);
5685 flags |= CODING_STATE_CR;
5686 goto label_continue_loop;
5688 else if (flags & CODING_STATE_CR)
5689 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5691 Dynarr_add (dst, '\r');
5692 flags &= ~CODING_STATE_CR;
5695 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5698 flags &= CODING_STATE_ISO2022_LOCK;
5700 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5702 COMPOSE_FLUSH_CHARS (str, dst);
5703 decode_flush_er_chars (str, dst);
5704 DECODE_ADD_BINARY_CHAR (c, dst);
5708 { /* Graphic characters */
5709 Lisp_Object charset;
5718 COMPOSE_FLUSH_CHARS (str, dst);
5719 decode_flush_er_chars (str, dst);
5720 if (eol_type == EOL_CR)
5721 Dynarr_add (dst, '\n');
5722 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5723 Dynarr_add (dst, c);
5725 flags |= CODING_STATE_CR;
5726 goto label_continue_loop;
5728 else if (flags & CODING_STATE_CR)
5729 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5731 Dynarr_add (dst, '\r');
5732 flags &= ~CODING_STATE_CR;
5735 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5738 /* Now determine the charset. */
5739 reg = ((flags & CODING_STATE_SS2) ? 2
5740 : (flags & CODING_STATE_SS3) ? 3
5741 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5742 : str->iso2022.register_left);
5743 charset = str->iso2022.charset[reg];
5745 /* Error checking: */
5746 if (! CHARSETP (charset)
5747 || str->iso2022.invalid_designated[reg]
5748 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5749 && XCHARSET_CHARS (charset) == 94))
5750 /* Mrmph. We are trying to invoke a register that has no
5751 or an invalid charset in it, or trying to add a character
5752 outside the range of the charset. Insert that char literally
5753 to preserve it for the output. */
5755 COMPOSE_FLUSH_CHARS (str, dst);
5756 decode_flush_er_chars (str, dst);
5760 DECODE_ADD_BINARY_CHAR
5761 ((unsigned char)(cpos >> (counter * 8)), dst);
5764 DECODE_ADD_BINARY_CHAR (c, dst);
5769 /* Things are probably hunky-dorey. */
5771 /* Fetch reverse charset, maybe. */
5772 if (((flags & CODING_STATE_R2L) &&
5773 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5775 (!(flags & CODING_STATE_R2L) &&
5776 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5778 Lisp_Object new_charset =
5779 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5780 if (!NILP (new_charset))
5781 charset = new_charset;
5786 if (XCHARSET_DIMENSION (charset) == counter)
5788 COMPOSE_ADD_CHAR (str,
5789 DECODE_CHAR (charset,
5790 ((cpos & 0x7F7F7F) << 8)
5797 cpos = (cpos << 8) | c;
5799 lb = XCHARSET_LEADING_BYTE (charset);
5800 switch (XCHARSET_REP_BYTES (charset))
5803 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5804 Dynarr_add (dst, c & 0x7F);
5807 case 2: /* one-byte official */
5808 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5809 Dynarr_add (dst, lb);
5810 Dynarr_add (dst, c | 0x80);
5813 case 3: /* one-byte private or two-byte official */
5814 if (XCHARSET_PRIVATE_P (charset))
5816 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5817 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5818 Dynarr_add (dst, lb);
5819 Dynarr_add (dst, c | 0x80);
5825 Dynarr_add (dst, lb);
5826 Dynarr_add (dst, ch | 0x80);
5827 Dynarr_add (dst, c | 0x80);
5835 default: /* two-byte private */
5838 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5839 Dynarr_add (dst, lb);
5840 Dynarr_add (dst, ch | 0x80);
5841 Dynarr_add (dst, c | 0x80);
5851 flags &= CODING_STATE_ISO2022_LOCK;
5854 label_continue_loop:;
5857 if (flags & CODING_STATE_END)
5859 COMPOSE_FLUSH_CHARS (str, dst);
5860 decode_flush_er_chars (str, dst);
5861 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5865 str->counter = counter;
5869 /***** ISO2022 encoder *****/
5871 /* Designate CHARSET into register REG. */
5874 iso2022_designate (Lisp_Object charset, unsigned char reg,
5875 struct encoding_stream *str, unsigned_char_dynarr *dst)
5877 static const char inter94[] = "()*+";
5878 static const char inter96[] = ",-./";
5879 unsigned short chars;
5880 unsigned char dimension;
5881 unsigned char final;
5882 Lisp_Object old_charset = str->iso2022.charset[reg];
5884 str->iso2022.charset[reg] = charset;
5885 if (!CHARSETP (charset))
5886 /* charset might be an initial nil or t. */
5888 chars = XCHARSET_CHARS (charset);
5889 dimension = XCHARSET_DIMENSION (charset);
5890 final = XCHARSET_FINAL (charset);
5891 if (!str->iso2022.force_charset_on_output[reg] &&
5892 CHARSETP (old_charset) &&
5893 XCHARSET_CHARS (old_charset) == chars &&
5894 XCHARSET_DIMENSION (old_charset) == dimension &&
5895 XCHARSET_FINAL (old_charset) == final)
5898 str->iso2022.force_charset_on_output[reg] = 0;
5901 charset_conversion_spec_dynarr *dyn =
5902 str->codesys->iso2022.output_conv;
5908 for (i = 0; i < Dynarr_length (dyn); i++)
5910 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5911 if (EQ (charset, spec->from_charset))
5912 charset = spec->to_charset;
5917 Dynarr_add (dst, ISO_CODE_ESC);
5922 Dynarr_add (dst, inter94[reg]);
5925 Dynarr_add (dst, '$');
5927 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5930 Dynarr_add (dst, inter94[reg]);
5935 Dynarr_add (dst, inter96[reg]);
5938 Dynarr_add (dst, '$');
5939 Dynarr_add (dst, inter96[reg]);
5943 Dynarr_add (dst, final);
5947 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5949 if (str->iso2022.register_left != 0)
5951 Dynarr_add (dst, ISO_CODE_SI);
5952 str->iso2022.register_left = 0;
5957 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5959 if (str->iso2022.register_left != 1)
5961 Dynarr_add (dst, ISO_CODE_SO);
5962 str->iso2022.register_left = 1;
5967 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5968 unsigned_char_dynarr *dst, unsigned int *flags)
5970 unsigned char charmask;
5971 Lisp_Coding_System* codesys = str->codesys;
5972 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5974 Lisp_Object charset = str->iso2022.current_charset;
5975 int half = str->iso2022.current_half;
5976 int code_point = -1;
5980 restore_left_to_right_direction (codesys, dst, flags, 0);
5982 /* Make sure G0 contains ASCII */
5983 if ((ch > ' ' && ch < ISO_CODE_DEL)
5984 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5986 ensure_normal_shift (str, dst);
5987 iso2022_designate (Vcharset_ascii, 0, str, dst);
5990 /* If necessary, restore everything to the default state
5992 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5994 restore_left_to_right_direction (codesys, dst, flags, 0);
5996 ensure_normal_shift (str, dst);
5998 for (i = 0; i < 4; i++)
6000 Lisp_Object initial_charset =
6001 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6002 iso2022_designate (initial_charset, i, str, dst);
6007 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6008 Dynarr_add (dst, '\r');
6009 if (eol_type != EOL_CR)
6010 Dynarr_add (dst, ch);
6014 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6015 && fit_to_be_escape_quoted (ch))
6016 Dynarr_add (dst, ISO_CODE_ESC);
6017 Dynarr_add (dst, ch);
6020 else if ( (0x80 <= ch) && (ch <= 0x9f) )
6022 charmask = (half == 0 ? 0x00 : 0x80);
6024 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6025 && fit_to_be_escape_quoted (ch))
6026 Dynarr_add (dst, ISO_CODE_ESC);
6027 /* you asked for it ... */
6028 Dynarr_add (dst, ch);
6034 /* Now determine which register to use. */
6036 for (i = 0; i < 4; i++)
6038 if ((CHARSETP (charset = str->iso2022.charset[i])
6039 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
6043 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
6044 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
6052 Lisp_Object original_default_coded_charset_priority_list
6053 = Vdefault_coded_charset_priority_list;
6054 Vdefault_coded_charset_priority_list
6055 = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
6056 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6058 code_point = ENCODE_CHAR (ch, charset);
6059 if (XCHARSET_FINAL (charset))
6061 Vdefault_coded_charset_priority_list
6062 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6063 Vdefault_coded_charset_priority_list));
6065 Vdefault_coded_charset_priority_list
6066 = original_default_coded_charset_priority_list;
6067 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6069 code_point = ENCODE_CHAR (ch, charset);
6070 if (XCHARSET_FINAL (charset))
6072 Vdefault_coded_charset_priority_list
6073 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6074 Vdefault_coded_charset_priority_list));
6076 code_point = ENCODE_CHAR (ch, charset);
6077 if (!XCHARSET_FINAL (charset))
6079 charset = Vcharset_ascii;
6083 Vdefault_coded_charset_priority_list
6084 = original_default_coded_charset_priority_list;
6086 ensure_correct_direction (XCHARSET_DIRECTION (charset),
6087 codesys, dst, flags, 0);
6091 if (XCHARSET_GRAPHIC (charset) != 0)
6093 if (!NILP (str->iso2022.charset[1]) &&
6094 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6095 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6097 else if (!NILP (str->iso2022.charset[2]))
6099 else if (!NILP (str->iso2022.charset[3]))
6108 iso2022_designate (charset, reg, str, dst);
6110 /* Now invoke that register. */
6114 ensure_normal_shift (str, dst);
6118 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6120 ensure_shift_out (str, dst);
6127 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6129 Dynarr_add (dst, ISO_CODE_ESC);
6130 Dynarr_add (dst, 'N');
6135 Dynarr_add (dst, ISO_CODE_SS2);
6140 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6142 Dynarr_add (dst, ISO_CODE_ESC);
6143 Dynarr_add (dst, 'O');
6148 Dynarr_add (dst, ISO_CODE_SS3);
6156 charmask = (half == 0 ? 0x00 : 0x80);
6158 switch (XCHARSET_DIMENSION (charset))
6161 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6164 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6165 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6168 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6169 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6170 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6173 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6174 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6175 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6176 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6182 str->iso2022.current_charset = charset;
6183 str->iso2022.current_half = half;
6187 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6188 unsigned int *flags)
6190 Lisp_Coding_System* codesys = str->codesys;
6193 restore_left_to_right_direction (codesys, dst, flags, 0);
6194 ensure_normal_shift (str, dst);
6195 for (i = 0; i < 4; i++)
6197 Lisp_Object initial_charset
6198 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6199 iso2022_designate (initial_charset, i, str, dst);
6204 /************************************************************************/
6205 /* No-conversion methods */
6206 /************************************************************************/
6208 /* This is used when reading in "binary" files -- i.e. files that may
6209 contain all 256 possible byte values and that are not to be
6210 interpreted as being in any particular decoding. */
6212 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6213 unsigned_char_dynarr *dst, Lstream_data_count n)
6215 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6216 unsigned int flags = str->flags;
6217 unsigned int cpos = str->cpos;
6218 eol_type_t eol_type = str->eol_type;
6222 unsigned char c = *(unsigned char *)src++;
6224 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6225 DECODE_ADD_BINARY_CHAR (c, dst);
6226 label_continue_loop:;
6229 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6236 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6237 unsigned_char_dynarr *dst, Lstream_data_count n)
6240 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6241 unsigned int flags = str->flags;
6242 unsigned int ch = str->ch;
6243 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6245 unsigned char char_boundary = str->iso2022.current_char_boundary;
6252 if (char_boundary == 0)
6258 else if ( c >= 0xf8 )
6263 else if ( c >= 0xf0 )
6268 else if ( c >= 0xe0 )
6273 else if ( c >= 0xc0 )
6283 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6284 Dynarr_add (dst, '\r');
6285 if (eol_type != EOL_CR)
6286 Dynarr_add (dst, c);
6289 Dynarr_add (dst, c);
6292 else if (char_boundary == 1)
6294 ch = ( ch << 6 ) | ( c & 0x3f );
6295 Dynarr_add (dst, ch & 0xff);
6300 ch = ( ch << 6 ) | ( c & 0x3f );
6303 #else /* not UTF2000 */
6306 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6307 Dynarr_add (dst, '\r');
6308 if (eol_type != EOL_CR)
6309 Dynarr_add (dst, '\n');
6312 else if (BYTE_ASCII_P (c))
6315 Dynarr_add (dst, c);
6317 else if (BUFBYTE_LEADING_BYTE_P (c))
6320 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6321 c == LEADING_BYTE_CONTROL_1)
6324 Dynarr_add (dst, '~'); /* untranslatable character */
6328 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6329 Dynarr_add (dst, c);
6330 else if (ch == LEADING_BYTE_CONTROL_1)
6333 Dynarr_add (dst, c - 0x20);
6335 /* else it should be the second or third byte of an
6336 untranslatable character, so ignore it */
6339 #endif /* not UTF2000 */
6345 str->iso2022.current_char_boundary = char_boundary;
6351 /************************************************************************/
6352 /* Initialization */
6353 /************************************************************************/
6356 syms_of_file_coding (void)
6358 INIT_LRECORD_IMPLEMENTATION (coding_system);
6360 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6362 DEFSUBR (Fcoding_system_p);
6363 DEFSUBR (Ffind_coding_system);
6364 DEFSUBR (Fget_coding_system);
6365 DEFSUBR (Fcoding_system_list);
6366 DEFSUBR (Fcoding_system_name);
6367 DEFSUBR (Fmake_coding_system);
6368 DEFSUBR (Fcopy_coding_system);
6369 DEFSUBR (Fcoding_system_canonical_name_p);
6370 DEFSUBR (Fcoding_system_alias_p);
6371 DEFSUBR (Fcoding_system_aliasee);
6372 DEFSUBR (Fdefine_coding_system_alias);
6373 DEFSUBR (Fsubsidiary_coding_system);
6375 DEFSUBR (Fcoding_system_type);
6376 DEFSUBR (Fcoding_system_doc_string);
6378 DEFSUBR (Fcoding_system_charset);
6380 DEFSUBR (Fcoding_system_property);
6382 DEFSUBR (Fcoding_category_list);
6383 DEFSUBR (Fset_coding_priority_list);
6384 DEFSUBR (Fcoding_priority_list);
6385 DEFSUBR (Fset_coding_category_system);
6386 DEFSUBR (Fcoding_category_system);
6388 DEFSUBR (Fdetect_coding_region);
6389 DEFSUBR (Fdecode_coding_region);
6390 DEFSUBR (Fencode_coding_region);
6392 DEFSUBR (Fdecode_shift_jis_char);
6393 DEFSUBR (Fencode_shift_jis_char);
6394 DEFSUBR (Fdecode_big5_char);
6395 DEFSUBR (Fencode_big5_char);
6397 defsymbol (&Qcoding_systemp, "coding-system-p");
6398 defsymbol (&Qno_conversion, "no-conversion");
6399 defsymbol (&Qraw_text, "raw-text");
6401 defsymbol (&Qbig5, "big5");
6402 defsymbol (&Qshift_jis, "shift-jis");
6403 defsymbol (&Qucs4, "ucs-4");
6404 defsymbol (&Qutf8, "utf-8");
6405 defsymbol (&Qutf16, "utf-16");
6406 defsymbol (&Qccl, "ccl");
6407 defsymbol (&Qiso2022, "iso2022");
6409 defsymbol (&Qmnemonic, "mnemonic");
6410 defsymbol (&Qeol_type, "eol-type");
6411 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6412 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6414 defsymbol (&Qcr, "cr");
6415 defsymbol (&Qlf, "lf");
6416 defsymbol (&Qcrlf, "crlf");
6417 defsymbol (&Qeol_cr, "eol-cr");
6418 defsymbol (&Qeol_lf, "eol-lf");
6419 defsymbol (&Qeol_crlf, "eol-crlf");
6421 defsymbol (&Qcharset_g0, "charset-g0");
6422 defsymbol (&Qcharset_g1, "charset-g1");
6423 defsymbol (&Qcharset_g2, "charset-g2");
6424 defsymbol (&Qcharset_g3, "charset-g3");
6425 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6426 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6427 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6428 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6429 defsymbol (&Qno_iso6429, "no-iso6429");
6430 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6431 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6433 defsymbol (&Qshort, "short");
6434 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6435 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6436 defsymbol (&Qseven, "seven");
6437 defsymbol (&Qlock_shift, "lock-shift");
6438 defsymbol (&Qescape_quoted, "escape-quoted");
6441 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6442 defsymbol (&Qdisable_composition, "disable-composition");
6443 defsymbol (&Qccs_priority_list, "ccs-priority-list");
6444 defsymbol (&Quse_entity_reference, "use-entity-reference");
6445 defsymbol (&Qd, "d");
6446 defsymbol (&Qx, "x");
6447 defsymbol (&QX, "X");
6449 defsymbol (&Qencode, "encode");
6450 defsymbol (&Qdecode, "decode");
6453 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6455 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6457 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6459 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6461 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6463 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6465 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6467 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6469 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6471 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6474 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6479 lstream_type_create_file_coding (void)
6481 LSTREAM_HAS_METHOD (decoding, reader);
6482 LSTREAM_HAS_METHOD (decoding, writer);
6483 LSTREAM_HAS_METHOD (decoding, rewinder);
6484 LSTREAM_HAS_METHOD (decoding, seekable_p);
6485 LSTREAM_HAS_METHOD (decoding, flusher);
6486 LSTREAM_HAS_METHOD (decoding, closer);
6487 LSTREAM_HAS_METHOD (decoding, marker);
6489 LSTREAM_HAS_METHOD (encoding, reader);
6490 LSTREAM_HAS_METHOD (encoding, writer);
6491 LSTREAM_HAS_METHOD (encoding, rewinder);
6492 LSTREAM_HAS_METHOD (encoding, seekable_p);
6493 LSTREAM_HAS_METHOD (encoding, flusher);
6494 LSTREAM_HAS_METHOD (encoding, closer);
6495 LSTREAM_HAS_METHOD (encoding, marker);
6499 vars_of_file_coding (void)
6503 fcd = xnew (struct file_coding_dump);
6504 dump_add_root_struct_ptr (&fcd, &fcd_description);
6506 /* Initialize to something reasonable ... */
6507 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6509 fcd->coding_category_system[i] = Qnil;
6510 fcd->coding_category_by_priority[i] = i;
6513 Fprovide (intern ("file-coding"));
6515 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6516 Coding system used for TTY keyboard input.
6517 Not used under a windowing system.
6519 Vkeyboard_coding_system = Qnil;
6521 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6522 Coding system used for TTY display output.
6523 Not used under a windowing system.
6525 Vterminal_coding_system = Qnil;
6527 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6528 Overriding coding system used when reading from a file or process.
6529 You should bind this variable with `let', but do not set it globally.
6530 If this is non-nil, it specifies the coding system that will be used
6531 to decode input on read operations, such as from a file or process.
6532 It overrides `buffer-file-coding-system-for-read',
6533 `insert-file-contents-pre-hook', etc. Use those variables instead of
6534 this one for permanent changes to the environment. */ );
6535 Vcoding_system_for_read = Qnil;
6537 DEFVAR_LISP ("coding-system-for-write",
6538 &Vcoding_system_for_write /*
6539 Overriding coding system used when writing to a file or process.
6540 You should bind this variable with `let', but do not set it globally.
6541 If this is non-nil, it specifies the coding system that will be used
6542 to encode output for write operations, such as to a file or process.
6543 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6544 Use those variables instead of this one for permanent changes to the
6546 Vcoding_system_for_write = Qnil;
6548 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6549 Coding system used to convert pathnames when accessing files.
6551 Vfile_name_coding_system = Qnil;
6553 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6554 &Vcoded_charset_entity_reference_alist /*
6555 Alist of coded-charset vs corresponding entity-reference.
6556 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6557 CCS is coded-charset.
6558 CODE-COLUMNS is columns of code-point of entity-reference.
6559 CODE-TYPE is format type of code-point of entity-reference.
6560 `d' means decimal value and `x' means hexadecimal value.
6562 Vcoded_charset_entity_reference_alist = Qnil;
6564 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6565 Non-nil means the buffer contents are regarded as multi-byte form
6566 of characters, not a binary code. This affects the display, file I/O,
6567 and behaviors of various editing commands.
6569 Setting this to nil does not do anything.
6571 enable_multibyte_characters = 1;
6574 DEFVAR_LISP ("decomposition-feature-list",
6575 &Vdecomposition_feature_list /*
6576 List of `=decomposition@FOO' feature to encode characters as IVS.
6578 Vdecomposition_feature_list = Qnil;
6583 complex_vars_of_file_coding (void)
6585 staticpro (&Vcoding_system_hash_table);
6586 Vcoding_system_hash_table =
6587 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6589 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6590 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6592 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6594 struct codesys_prop csp; \
6596 csp.prop_type = (Prop_Type); \
6597 Dynarr_add (the_codesys_prop_dynarr, csp); \
6600 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6601 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6602 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6603 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6604 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6605 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6606 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6608 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6609 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6610 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6611 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6612 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6613 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6614 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6615 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6616 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6617 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6618 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6619 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6620 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6621 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6622 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6623 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6624 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6626 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6629 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6630 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6632 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6633 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6636 /* Need to create this here or we're really screwed. */
6638 (Qraw_text, Qno_conversion,
6639 build_string ("Raw text, which means it converts only line-break-codes."),
6640 list2 (Qmnemonic, build_string ("Raw")));
6643 (Qbinary, Qno_conversion,
6644 build_string ("Binary, which means it does not convert anything."),
6645 list4 (Qeol_type, Qlf,
6646 Qmnemonic, build_string ("Binary")));
6652 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6653 list2 (Qmnemonic, build_string ("MTF8")));
6656 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6658 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6660 Fdefine_coding_system_alias (Qterminal, Qbinary);
6661 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6663 /* Need this for bootstrapping */
6664 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6665 Fget_coding_system (Qraw_text);
6668 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6669 = Fget_coding_system (Qutf_8_mcs);
6672 #if defined(MULE) && !defined(UTF2000)
6676 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6677 fcd->ucs_to_mule_table[i] = Qnil;
6679 staticpro (&mule_to_ucs_table);
6680 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6681 #endif /* defined(MULE) && !defined(UTF2000) */