1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
39 #include "file-coding.h"
41 Lisp_Object Qcoding_system_error;
43 Lisp_Object Vkeyboard_coding_system;
44 Lisp_Object Vterminal_coding_system;
45 Lisp_Object Vcoding_system_for_read;
46 Lisp_Object Vcoding_system_for_write;
47 Lisp_Object Vfile_name_coding_system;
49 /* Table of symbols identifying each coding category. */
50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
54 struct file_coding_dump {
55 /* Coding system currently associated with each coding category. */
56 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
58 /* Table of all coding categories in decreasing order of priority.
59 This describes a permutation of the possible coding categories. */
60 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
62 #if defined(MULE) && !defined(UTF2000)
63 Lisp_Object ucs_to_mule_table[65536];
67 static const struct lrecord_description fcd_description_1[] = {
68 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
69 #if defined(MULE) && !defined(UTF2000)
70 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), 65536 },
75 static const struct struct_description fcd_description = {
76 sizeof (struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
105 Lisp_Object Qdisable_composition;
107 Lisp_Object Qencode, Qdecode;
109 Lisp_Object Vcoding_system_hash_table;
111 int enable_multibyte_characters;
114 /* Additional information used by the ISO2022 decoder and detector. */
115 struct iso2022_decoder
117 /* CHARSET holds the character sets currently assigned to the G0
118 through G3 variables. It is initialized from the array
119 INITIAL_CHARSET in CODESYS. */
120 Lisp_Object charset[4];
122 /* Which registers are currently invoked into the left (GL) and
123 right (GR) halves of the 8-bit encoding space? */
124 int register_left, register_right;
126 /* ISO_ESC holds a value indicating part of an escape sequence
127 that has already been seen. */
128 enum iso_esc_flag esc;
130 /* This records the bytes we've seen so far in an escape sequence,
131 in case the sequence is invalid (we spit out the bytes unchanged). */
132 unsigned char esc_bytes[8];
134 /* Index for next byte to store in ISO escape sequence. */
137 #ifdef ENABLE_COMPOSITE_CHARS
138 /* Stuff seen so far when composing a string. */
139 unsigned_char_dynarr *composite_chars;
142 /* If we saw an invalid designation sequence for a particular
143 register, we flag it here and switch to ASCII. The next time we
144 see a valid designation for this register, we turn off the flag
145 and do the designation normally, but pretend the sequence was
146 invalid. The effect of all this is that (most of the time) the
147 escape sequences for both the switch to the unknown charset, and
148 the switch back to the known charset, get inserted literally into
149 the buffer and saved out as such. The hope is that we can
150 preserve the escape sequences so that the resulting written out
151 file makes sense. If we don't do any of this, the designation
152 to the invalid charset will be preserved but that switch back
153 to the known charset will probably get eaten because it was
154 the same charset that was already present in the register. */
155 unsigned char invalid_designated[4];
157 /* We try to do similar things as above for direction-switching
158 sequences. If we encountered a direction switch while an
159 invalid designation was present, or an invalid designation
160 just after a direction switch (i.e. no valid designation
161 encountered yet), we insert the direction-switch escape
162 sequence literally into the output stream, and later on
163 insert the corresponding direction-restoring escape sequence
165 unsigned int switched_dir_and_no_valid_charset_yet :1;
166 unsigned int invalid_switch_dir :1;
168 /* Tells the decoder to output the escape sequence literally
169 even though it was valid. Used in the games we play to
170 avoid lossage when we encounter invalid designations. */
171 unsigned int output_literally :1;
172 /* We encountered a direction switch followed by an invalid
173 designation. We didn't output the direction switch
174 literally because we didn't know about the invalid designation;
175 but we have to do so now. */
176 unsigned int output_direction_sequence :1;
179 EXFUN (Fcopy_coding_system, 2);
181 struct detection_state;
184 text_encode_generic (Lstream *encoding, const unsigned char *src,
185 unsigned_char_dynarr *dst, unsigned int n);
187 static int detect_coding_sjis (struct detection_state *st,
188 const unsigned char *src,
190 static void decode_coding_sjis (Lstream *decoding,
191 const unsigned char *src,
192 unsigned_char_dynarr *dst,
194 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
195 unsigned_char_dynarr *dst, unsigned int *flags);
196 void char_finish_shift_jis (struct encoding_stream *str,
197 unsigned_char_dynarr *dst, unsigned int *flags);
199 static int detect_coding_big5 (struct detection_state *st,
200 const unsigned char *src,
202 static void decode_coding_big5 (Lstream *decoding,
203 const unsigned char *src,
204 unsigned_char_dynarr *dst, unsigned int n);
205 static void encode_coding_big5 (Lstream *encoding,
206 const unsigned char *src,
207 unsigned_char_dynarr *dst, unsigned int n);
208 static int detect_coding_ucs4 (struct detection_state *st,
209 const unsigned char *src,
211 static void decode_coding_ucs4 (Lstream *decoding,
212 const unsigned char *src,
213 unsigned_char_dynarr *dst, unsigned int n);
214 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
215 unsigned_char_dynarr *dst, unsigned int *flags);
216 void char_finish_ucs4 (struct encoding_stream *str,
217 unsigned_char_dynarr *dst, unsigned int *flags);
219 static int detect_coding_utf8 (struct detection_state *st,
220 const unsigned char *src,
222 static void decode_coding_utf8 (Lstream *decoding,
223 const unsigned char *src,
224 unsigned_char_dynarr *dst, unsigned int n);
225 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
226 unsigned_char_dynarr *dst, unsigned int *flags);
227 void char_finish_utf8 (struct encoding_stream *str,
228 unsigned_char_dynarr *dst, unsigned int *flags);
230 static int postprocess_iso2022_mask (int mask);
231 static void reset_iso2022 (Lisp_Object coding_system,
232 struct iso2022_decoder *iso);
233 static int detect_coding_iso2022 (struct detection_state *st,
234 const unsigned char *src,
236 static void decode_coding_iso2022 (Lstream *decoding,
237 const unsigned char *src,
238 unsigned_char_dynarr *dst, unsigned int n);
239 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
240 unsigned_char_dynarr *dst, unsigned int *flags);
241 void char_finish_iso2022 (struct encoding_stream *str,
242 unsigned_char_dynarr *dst, unsigned int *flags);
244 static void decode_coding_no_conversion (Lstream *decoding,
245 const unsigned char *src,
246 unsigned_char_dynarr *dst,
248 static void encode_coding_no_conversion (Lstream *encoding,
249 const unsigned char *src,
250 unsigned_char_dynarr *dst,
252 static void mule_decode (Lstream *decoding, const unsigned char *src,
253 unsigned_char_dynarr *dst, unsigned int n);
254 static void mule_encode (Lstream *encoding, const unsigned char *src,
255 unsigned_char_dynarr *dst, unsigned int n);
257 typedef struct codesys_prop codesys_prop;
266 Dynarr_declare (codesys_prop);
267 } codesys_prop_dynarr;
269 static const struct lrecord_description codesys_prop_description_1[] = {
270 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
274 static const struct struct_description codesys_prop_description = {
275 sizeof (codesys_prop),
276 codesys_prop_description_1
279 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
280 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
284 static const struct struct_description codesys_prop_dynarr_description = {
285 sizeof (codesys_prop_dynarr),
286 codesys_prop_dynarr_description_1
289 codesys_prop_dynarr *the_codesys_prop_dynarr;
291 enum codesys_prop_enum
294 CODESYS_PROP_ISO2022,
299 /************************************************************************/
300 /* Coding system functions */
301 /************************************************************************/
303 static Lisp_Object mark_coding_system (Lisp_Object);
304 static void print_coding_system (Lisp_Object, Lisp_Object, int);
305 static void finalize_coding_system (void *header, int for_disksave);
308 static const struct lrecord_description ccs_description_1[] = {
309 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
310 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
314 static const struct struct_description ccs_description = {
315 sizeof (charset_conversion_spec),
319 static const struct lrecord_description ccsd_description_1[] = {
320 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
324 static const struct struct_description ccsd_description = {
325 sizeof (charset_conversion_spec_dynarr),
330 static const struct lrecord_description coding_system_description[] = {
331 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
332 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
333 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
334 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
335 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
340 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
341 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
342 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
343 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
344 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
349 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
350 mark_coding_system, print_coding_system,
351 finalize_coding_system,
352 0, 0, coding_system_description,
356 mark_coding_system (Lisp_Object obj)
358 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
360 mark_object (CODING_SYSTEM_NAME (codesys));
361 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
362 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
363 mark_object (CODING_SYSTEM_EOL_LF (codesys));
364 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
365 mark_object (CODING_SYSTEM_EOL_CR (codesys));
367 switch (CODING_SYSTEM_TYPE (codesys))
371 case CODESYS_ISO2022:
372 for (i = 0; i < 4; i++)
373 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
374 if (codesys->iso2022.input_conv)
376 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
378 struct charset_conversion_spec *ccs =
379 Dynarr_atp (codesys->iso2022.input_conv, i);
380 mark_object (ccs->from_charset);
381 mark_object (ccs->to_charset);
384 if (codesys->iso2022.output_conv)
386 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
388 struct charset_conversion_spec *ccs =
389 Dynarr_atp (codesys->iso2022.output_conv, i);
390 mark_object (ccs->from_charset);
391 mark_object (ccs->to_charset);
397 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
398 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
405 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
406 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
410 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
413 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
415 error ("printing unreadable object #<coding_system 0x%x>",
418 write_c_string ("#<coding_system ", printcharfun);
419 print_internal (c->name, printcharfun, 1);
420 write_c_string (">", printcharfun);
424 finalize_coding_system (void *header, int for_disksave)
426 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
427 /* Since coding systems never go away, this function is not
428 necessary. But it would be necessary if we changed things
429 so that coding systems could go away. */
430 if (!for_disksave) /* see comment in lstream.c */
432 switch (CODING_SYSTEM_TYPE (c))
435 case CODESYS_ISO2022:
436 if (c->iso2022.input_conv)
438 Dynarr_free (c->iso2022.input_conv);
439 c->iso2022.input_conv = 0;
441 if (c->iso2022.output_conv)
443 Dynarr_free (c->iso2022.output_conv);
444 c->iso2022.output_conv = 0;
455 symbol_to_eol_type (Lisp_Object symbol)
457 CHECK_SYMBOL (symbol);
458 if (NILP (symbol)) return EOL_AUTODETECT;
459 if (EQ (symbol, Qlf)) return EOL_LF;
460 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
461 if (EQ (symbol, Qcr)) return EOL_CR;
463 signal_simple_error ("Unrecognized eol type", symbol);
464 return EOL_AUTODETECT; /* not reached */
468 eol_type_to_symbol (eol_type_t type)
473 case EOL_LF: return Qlf;
474 case EOL_CRLF: return Qcrlf;
475 case EOL_CR: return Qcr;
476 case EOL_AUTODETECT: return Qnil;
481 setup_eol_coding_systems (Lisp_Coding_System *codesys)
483 Lisp_Object codesys_obj;
484 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
485 char *codesys_name = (char *) alloca (len + 7);
487 char *codesys_mnemonic=0;
489 Lisp_Object codesys_name_sym, sub_codesys_obj;
493 XSETCODING_SYSTEM (codesys_obj, codesys);
495 memcpy (codesys_name,
496 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
498 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
500 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
501 codesys_mnemonic = (char *) alloca (mlen + 7);
502 memcpy (codesys_mnemonic,
503 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
506 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
507 strcpy (codesys_name + len, "-" op_sys); \
509 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
510 codesys_name_sym = intern (codesys_name); \
511 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
512 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
514 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
515 build_string (codesys_mnemonic); \
516 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
519 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
520 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
521 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
524 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
525 Return t if OBJECT is a coding system.
526 A coding system is an object that defines how text containing multiple
527 character sets is encoded into a stream of (typically 8-bit) bytes.
528 The coding system is used to decode the stream into a series of
529 characters (which may be from multiple charsets) when the text is read
530 from a file or process, and is used to encode the text back into the
531 same format when it is written out to a file or process.
533 For example, many ISO2022-compliant coding systems (such as Compound
534 Text, which is used for inter-client data under the X Window System)
535 use escape sequences to switch between different charsets -- Japanese
536 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
537 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
538 `make-coding-system' for more information.
540 Coding systems are normally identified using a symbol, and the
541 symbol is accepted in place of the actual coding system object whenever
542 a coding system is called for. (This is similar to how faces work.)
546 return CODING_SYSTEMP (object) ? Qt : Qnil;
549 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
550 Retrieve the coding system of the given name.
552 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
553 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
554 If there is no such coding system, nil is returned. Otherwise the
555 associated coding system object is returned.
557 (coding_system_or_name))
559 if (NILP (coding_system_or_name))
560 coding_system_or_name = Qbinary;
561 else if (CODING_SYSTEMP (coding_system_or_name))
562 return coding_system_or_name;
564 CHECK_SYMBOL (coding_system_or_name);
568 coding_system_or_name =
569 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
571 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
572 return coding_system_or_name;
576 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
577 Retrieve the coding system of the given name.
578 Same as `find-coding-system' except that if there is no such
579 coding system, an error is signaled instead of returning nil.
583 Lisp_Object coding_system = Ffind_coding_system (name);
585 if (NILP (coding_system))
586 signal_simple_error ("No such coding system", name);
587 return coding_system;
590 /* We store the coding systems in hash tables with the names as the key and the
591 actual coding system object as the value. Occasionally we need to use them
592 in a list format. These routines provide us with that. */
593 struct coding_system_list_closure
595 Lisp_Object *coding_system_list;
599 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
600 void *coding_system_list_closure)
602 /* This function can GC */
603 struct coding_system_list_closure *cscl =
604 (struct coding_system_list_closure *) coding_system_list_closure;
605 Lisp_Object *coding_system_list = cscl->coding_system_list;
607 *coding_system_list = Fcons (key, *coding_system_list);
611 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
612 Return a list of the names of all defined coding systems.
616 Lisp_Object coding_system_list = Qnil;
618 struct coding_system_list_closure coding_system_list_closure;
620 GCPRO1 (coding_system_list);
621 coding_system_list_closure.coding_system_list = &coding_system_list;
622 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
623 &coding_system_list_closure);
626 return coding_system_list;
629 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
630 Return the name of the given coding system.
634 coding_system = Fget_coding_system (coding_system);
635 return XCODING_SYSTEM_NAME (coding_system);
638 static Lisp_Coding_System *
639 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
641 Lisp_Coding_System *codesys =
642 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
644 zero_lcrecord (codesys);
645 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
646 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
647 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
648 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
649 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
650 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
651 CODING_SYSTEM_TYPE (codesys) = type;
652 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
654 if (type == CODESYS_ISO2022)
657 for (i = 0; i < 4; i++)
658 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
660 else if (type == CODESYS_CCL)
662 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
663 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
666 CODING_SYSTEM_NAME (codesys) = name;
672 /* Given a list of charset conversion specs as specified in a Lisp
673 program, parse it into STORE_HERE. */
676 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
677 Lisp_Object spec_list)
681 EXTERNAL_LIST_LOOP (rest, spec_list)
683 Lisp_Object car = XCAR (rest);
684 Lisp_Object from, to;
685 struct charset_conversion_spec spec;
687 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
688 signal_simple_error ("Invalid charset conversion spec", car);
689 from = Fget_charset (XCAR (car));
690 to = Fget_charset (XCAR (XCDR (car)));
691 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
692 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
693 signal_simple_error_2
694 ("Attempted conversion between different charset types",
696 spec.from_charset = from;
697 spec.to_charset = to;
699 Dynarr_add (store_here, spec);
703 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
704 specs, return the equivalent as the Lisp programmer would see it.
706 If LOAD_HERE is 0, return Qnil. */
709 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
716 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
718 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
719 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
722 return Fnreverse (result);
727 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
728 Register symbol NAME as a coding system.
730 TYPE describes the conversion method used and should be one of
733 Automatic conversion. XEmacs attempts to detect the coding system
736 No conversion. Use this for binary files and such. On output,
737 graphic characters that are not in ASCII or Latin-1 will be
738 replaced by a ?. (For a no-conversion-encoded buffer, these
739 characters will only be present if you explicitly insert them.)
741 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
743 ISO 10646 UCS-4 encoding.
745 ISO 10646 UTF-8 encoding.
747 Any ISO2022-compliant encoding. Among other things, this includes
748 JIS (the Japanese encoding commonly used for e-mail), EUC (the
749 standard Unix encoding for Japanese and other languages), and
750 Compound Text (the encoding used in X11). You can specify more
751 specific information about the conversion with the FLAGS argument.
753 Big5 (the encoding commonly used for Taiwanese).
755 The conversion is performed using a user-written pseudo-code
756 program. CCL (Code Conversion Language) is the name of this
759 Write out or read in the raw contents of the memory representing
760 the buffer's text. This is primarily useful for debugging
761 purposes, and is only enabled when XEmacs has been compiled with
762 DEBUG_XEMACS defined (via the --debug configure option).
763 WARNING: Reading in a file using 'internal conversion can result
764 in an internal inconsistency in the memory representing a
765 buffer's text, which will produce unpredictable results and may
766 cause XEmacs to crash. Under normal circumstances you should
767 never use 'internal conversion.
769 DOC-STRING is a string describing the coding system.
771 PROPS is a property list, describing the specific nature of the
772 character set. Recognized properties are:
775 String to be displayed in the modeline when this coding system is
779 End-of-line conversion to be used. It should be one of
782 Automatically detect the end-of-line type (LF, CRLF,
783 or CR). Also generate subsidiary coding systems named
784 `NAME-unix', `NAME-dos', and `NAME-mac', that are
785 identical to this coding system but have an EOL-TYPE
786 value of 'lf, 'crlf, and 'cr, respectively.
788 The end of a line is marked externally using ASCII LF.
789 Since this is also the way that XEmacs represents an
790 end-of-line internally, specifying this option results
791 in no end-of-line conversion. This is the standard
792 format for Unix text files.
794 The end of a line is marked externally using ASCII
795 CRLF. This is the standard format for MS-DOS text
798 The end of a line is marked externally using ASCII CR.
799 This is the standard format for Macintosh text files.
801 Automatically detect the end-of-line type but do not
802 generate subsidiary coding systems. (This value is
803 converted to nil when stored internally, and
804 `coding-system-property' will return nil.)
807 If non-nil, composition/decomposition for combining characters
810 'post-read-conversion
811 Function called after a file has been read in, to perform the
812 decoding. Called with two arguments, BEG and END, denoting
813 a region of the current buffer to be decoded.
815 'pre-write-conversion
816 Function called before a file is written out, to perform the
817 encoding. Called with two arguments, BEG and END, denoting
818 a region of the current buffer to be encoded.
821 The following additional properties are recognized if TYPE is 'iso2022:
827 The character set initially designated to the G0 - G3 registers.
828 The value should be one of
830 -- A charset object (designate that character set)
831 -- nil (do not ever use this register)
832 -- t (no character set is initially designated to
833 the register, but may be later on; this automatically
834 sets the corresponding `force-g*-on-output' property)
840 If non-nil, send an explicit designation sequence on output before
841 using the specified register.
844 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
845 "ESC $ B" on output in place of the full designation sequences
846 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
849 If non-nil, don't designate ASCII to G0 at each end of line on output.
850 Setting this to non-nil also suppresses other state-resetting that
851 normally happens at the end of a line.
854 If non-nil, don't designate ASCII to G0 before control chars on output.
857 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
861 If non-nil, use locking-shift (SO/SI) instead of single-shift
862 or designation by escape sequence.
865 If non-nil, don't use ISO6429's direction specification.
868 If non-nil, literal control characters that are the same as
869 the beginning of a recognized ISO2022 or ISO6429 escape sequence
870 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
871 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
872 so that they can be properly distinguished from an escape sequence.
873 (Note that doing this results in a non-portable encoding.) This
874 encoding flag is used for byte-compiled files. Note that ESC
875 is a good choice for a quoting character because there are no
876 escape sequences whose second byte is a character from the Control-0
877 or Control-1 character sets; this is explicitly disallowed by the
880 'input-charset-conversion
881 A list of conversion specifications, specifying conversion of
882 characters in one charset to another when decoding is performed.
883 Each specification is a list of two elements: the source charset,
884 and the destination charset.
886 'output-charset-conversion
887 A list of conversion specifications, specifying conversion of
888 characters in one charset to another when encoding is performed.
889 The form of each specification is the same as for
890 'input-charset-conversion.
893 The following additional properties are recognized (and required)
897 CCL program used for decoding (converting to internal format).
900 CCL program used for encoding (converting to external format).
902 (name, type, doc_string, props))
904 Lisp_Coding_System *codesys;
905 Lisp_Object rest, key, value;
906 enum coding_system_type ty;
907 int need_to_setup_eol_systems = 1;
909 /* Convert type to constant */
910 if (NILP (type) || EQ (type, Qundecided))
911 { ty = CODESYS_AUTODETECT; }
913 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
914 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
915 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
916 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
917 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
918 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
920 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
922 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
925 signal_simple_error ("Invalid coding system type", type);
929 codesys = allocate_coding_system (ty, name);
931 if (NILP (doc_string))
932 doc_string = build_string ("");
934 CHECK_STRING (doc_string);
935 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
937 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
939 if (EQ (key, Qmnemonic))
942 CHECK_STRING (value);
943 CODING_SYSTEM_MNEMONIC (codesys) = value;
946 else if (EQ (key, Qeol_type))
948 need_to_setup_eol_systems = NILP (value);
951 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
954 else if (EQ (key, Qpost_read_conversion))
955 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
956 else if (EQ (key, Qpre_write_conversion))
957 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
959 else if (EQ (key, Qdisable_composition))
960 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
963 else if (ty == CODESYS_ISO2022)
965 #define FROB_INITIAL_CHARSET(charset_num) \
966 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
967 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
969 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
970 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
971 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
972 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
974 #define FROB_FORCE_CHARSET(charset_num) \
975 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
977 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
978 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
979 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
980 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
982 #define FROB_BOOLEAN_PROPERTY(prop) \
983 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
985 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
986 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
987 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
988 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
989 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
990 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
991 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
993 else if (EQ (key, Qinput_charset_conversion))
995 codesys->iso2022.input_conv =
996 Dynarr_new (charset_conversion_spec);
997 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1000 else if (EQ (key, Qoutput_charset_conversion))
1002 codesys->iso2022.output_conv =
1003 Dynarr_new (charset_conversion_spec);
1004 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1008 signal_simple_error ("Unrecognized property", key);
1010 else if (EQ (type, Qccl))
1012 if (EQ (key, Qdecode))
1014 CHECK_VECTOR (value);
1015 CODING_SYSTEM_CCL_DECODE (codesys) = value;
1017 else if (EQ (key, Qencode))
1019 CHECK_VECTOR (value);
1020 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
1023 signal_simple_error ("Unrecognized property", key);
1027 signal_simple_error ("Unrecognized property", key);
1030 if (need_to_setup_eol_systems)
1031 setup_eol_coding_systems (codesys);
1034 Lisp_Object codesys_obj;
1035 XSETCODING_SYSTEM (codesys_obj, codesys);
1036 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1041 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1042 Copy OLD-CODING-SYSTEM to NEW-NAME.
1043 If NEW-NAME does not name an existing coding system, a new one will
1046 (old_coding_system, new_name))
1048 Lisp_Object new_coding_system;
1049 old_coding_system = Fget_coding_system (old_coding_system);
1050 new_coding_system = Ffind_coding_system (new_name);
1051 if (NILP (new_coding_system))
1053 XSETCODING_SYSTEM (new_coding_system,
1054 allocate_coding_system
1055 (XCODING_SYSTEM_TYPE (old_coding_system),
1057 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1061 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1062 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1063 memcpy (((char *) to ) + sizeof (to->header),
1064 ((char *) from) + sizeof (from->header),
1065 sizeof (*from) - sizeof (from->header));
1066 to->name = new_name;
1068 return new_coding_system;
1071 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1072 Return t if OBJECT names a coding system, and is not a coding system alias.
1076 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1080 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1081 Return t if OBJECT is a coding system alias.
1082 All coding system aliases are created by `define-coding-system-alias'.
1086 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1090 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1091 Return the coding-system symbol for which symbol ALIAS is an alias.
1095 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1096 if (SYMBOLP (aliasee))
1099 signal_simple_error ("Symbol is not a coding system alias", alias);
1103 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1105 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1109 /* A maphash function, for removing dangling coding system aliases. */
1111 dangling_coding_system_alias_p (Lisp_Object alias,
1112 Lisp_Object aliasee,
1113 void *dangling_aliases)
1115 if (SYMBOLP (aliasee)
1116 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1118 (*(int *) dangling_aliases)++;
1125 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1126 Define symbol ALIAS as an alias for coding system ALIASEE.
1128 You can use this function to redefine an alias that has already been defined,
1129 but you cannot redefine a name which is the canonical name for a coding system.
1130 \(a canonical name of a coding system is what is returned when you call
1131 `coding-system-name' on a coding system).
1133 ALIASEE itself can be an alias, which allows you to define nested aliases.
1135 You are forbidden, however, from creating alias loops or `dangling' aliases.
1136 These will be detected, and an error will be signaled if you attempt to do so.
1138 If ALIASEE is nil, then ALIAS will simply be undefined.
1140 See also `coding-system-alias-p', `coding-system-aliasee',
1141 and `coding-system-canonical-name-p'.
1145 Lisp_Object real_coding_system, probe;
1147 CHECK_SYMBOL (alias);
1149 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1151 ("Symbol is the canonical name of a coding system and cannot be redefined",
1156 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1157 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1158 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1160 Fremhash (alias, Vcoding_system_hash_table);
1162 /* Undefine subsidiary aliases,
1163 presumably created by a previous call to this function */
1164 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1165 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1166 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1168 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1169 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1170 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1173 /* Undefine dangling coding system aliases. */
1175 int dangling_aliases;
1178 dangling_aliases = 0;
1179 elisp_map_remhash (dangling_coding_system_alias_p,
1180 Vcoding_system_hash_table,
1182 } while (dangling_aliases > 0);
1188 if (CODING_SYSTEMP (aliasee))
1189 aliasee = XCODING_SYSTEM_NAME (aliasee);
1191 /* Checks that aliasee names a coding-system */
1192 real_coding_system = Fget_coding_system (aliasee);
1194 /* Check for coding system alias loops */
1195 if (EQ (alias, aliasee))
1196 alias_loop: signal_simple_error_2
1197 ("Attempt to create a coding system alias loop", alias, aliasee);
1199 for (probe = aliasee;
1201 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1203 if (EQ (probe, alias))
1207 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1209 /* Set up aliases for subsidiaries.
1210 #### There must be a better way to handle subsidiary coding systems. */
1212 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1214 for (i = 0; i < countof (suffixes); i++)
1216 Lisp_Object alias_subsidiary =
1217 append_suffix_to_symbol (alias, suffixes[i]);
1218 Lisp_Object aliasee_subsidiary =
1219 append_suffix_to_symbol (aliasee, suffixes[i]);
1221 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1222 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1225 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1226 but it doesn't look intentional, so I'd rather return something
1227 meaningful or nothing at all. */
1232 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1234 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1235 Lisp_Object new_coding_system;
1237 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1238 return coding_system;
1242 case EOL_AUTODETECT: return coding_system;
1243 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1244 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1245 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1249 return NILP (new_coding_system) ? coding_system : new_coding_system;
1252 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1253 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1255 (coding_system, eol_type))
1257 coding_system = Fget_coding_system (coding_system);
1259 return subsidiary_coding_system (coding_system,
1260 symbol_to_eol_type (eol_type));
1264 /************************************************************************/
1265 /* Coding system accessors */
1266 /************************************************************************/
1268 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1269 Return the doc string for CODING-SYSTEM.
1273 coding_system = Fget_coding_system (coding_system);
1274 return XCODING_SYSTEM_DOC_STRING (coding_system);
1277 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1278 Return the type of CODING-SYSTEM.
1282 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1285 case CODESYS_AUTODETECT: return Qundecided;
1287 case CODESYS_SHIFT_JIS: return Qshift_jis;
1288 case CODESYS_ISO2022: return Qiso2022;
1289 case CODESYS_BIG5: return Qbig5;
1290 case CODESYS_UCS4: return Qucs4;
1291 case CODESYS_UTF8: return Qutf8;
1292 case CODESYS_CCL: return Qccl;
1294 case CODESYS_NO_CONVERSION: return Qno_conversion;
1296 case CODESYS_INTERNAL: return Qinternal;
1303 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1306 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1308 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1311 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1312 Return initial charset of CODING-SYSTEM designated to GNUM.
1315 (coding_system, gnum))
1317 coding_system = Fget_coding_system (coding_system);
1320 return coding_system_charset (coding_system, XINT (gnum));
1324 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1325 Return the PROP property of CODING-SYSTEM.
1327 (coding_system, prop))
1330 enum coding_system_type type;
1332 coding_system = Fget_coding_system (coding_system);
1333 CHECK_SYMBOL (prop);
1334 type = XCODING_SYSTEM_TYPE (coding_system);
1336 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1337 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1340 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1342 case CODESYS_PROP_ALL_OK:
1345 case CODESYS_PROP_ISO2022:
1346 if (type != CODESYS_ISO2022)
1348 ("Property only valid in ISO2022 coding systems",
1352 case CODESYS_PROP_CCL:
1353 if (type != CODESYS_CCL)
1355 ("Property only valid in CCL coding systems",
1365 signal_simple_error ("Unrecognized property", prop);
1367 if (EQ (prop, Qname))
1368 return XCODING_SYSTEM_NAME (coding_system);
1369 else if (EQ (prop, Qtype))
1370 return Fcoding_system_type (coding_system);
1371 else if (EQ (prop, Qdoc_string))
1372 return XCODING_SYSTEM_DOC_STRING (coding_system);
1373 else if (EQ (prop, Qmnemonic))
1374 return XCODING_SYSTEM_MNEMONIC (coding_system);
1375 else if (EQ (prop, Qeol_type))
1376 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1377 else if (EQ (prop, Qeol_lf))
1378 return XCODING_SYSTEM_EOL_LF (coding_system);
1379 else if (EQ (prop, Qeol_crlf))
1380 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1381 else if (EQ (prop, Qeol_cr))
1382 return XCODING_SYSTEM_EOL_CR (coding_system);
1383 else if (EQ (prop, Qpost_read_conversion))
1384 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1385 else if (EQ (prop, Qpre_write_conversion))
1386 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1388 else if (type == CODESYS_ISO2022)
1390 if (EQ (prop, Qcharset_g0))
1391 return coding_system_charset (coding_system, 0);
1392 else if (EQ (prop, Qcharset_g1))
1393 return coding_system_charset (coding_system, 1);
1394 else if (EQ (prop, Qcharset_g2))
1395 return coding_system_charset (coding_system, 2);
1396 else if (EQ (prop, Qcharset_g3))
1397 return coding_system_charset (coding_system, 3);
1399 #define FORCE_CHARSET(charset_num) \
1400 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1401 (coding_system, charset_num) ? Qt : Qnil)
1403 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1404 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1405 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1406 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1408 #define LISP_BOOLEAN(prop) \
1409 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1411 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1412 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1413 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1414 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1415 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1416 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1417 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1419 else if (EQ (prop, Qinput_charset_conversion))
1421 unparse_charset_conversion_specs
1422 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1423 else if (EQ (prop, Qoutput_charset_conversion))
1425 unparse_charset_conversion_specs
1426 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1430 else if (type == CODESYS_CCL)
1432 if (EQ (prop, Qdecode))
1433 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1434 else if (EQ (prop, Qencode))
1435 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1443 return Qnil; /* not reached */
1447 /************************************************************************/
1448 /* Coding category functions */
1449 /************************************************************************/
1452 decode_coding_category (Lisp_Object symbol)
1456 CHECK_SYMBOL (symbol);
1457 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1458 if (EQ (coding_category_symbol[i], symbol))
1461 signal_simple_error ("Unrecognized coding category", symbol);
1462 return 0; /* not reached */
1465 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1466 Return a list of all recognized coding categories.
1471 Lisp_Object list = Qnil;
1473 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1474 list = Fcons (coding_category_symbol[i], list);
1478 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1479 Change the priority order of the coding categories.
1480 LIST should be list of coding categories, in descending order of
1481 priority. Unspecified coding categories will be lower in priority
1482 than all specified ones, in the same relative order they were in
1487 int category_to_priority[CODING_CATEGORY_LAST + 1];
1491 /* First generate a list that maps coding categories to priorities. */
1493 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1494 category_to_priority[i] = -1;
1496 /* Highest priority comes from the specified list. */
1498 EXTERNAL_LIST_LOOP (rest, list)
1500 int cat = decode_coding_category (XCAR (rest));
1502 if (category_to_priority[cat] >= 0)
1503 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1504 category_to_priority[cat] = i++;
1507 /* Now go through the existing categories by priority to retrieve
1508 the categories not yet specified and preserve their priority
1510 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1512 int cat = fcd->coding_category_by_priority[j];
1513 if (category_to_priority[cat] < 0)
1514 category_to_priority[cat] = i++;
1517 /* Now we need to construct the inverse of the mapping we just
1520 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1521 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1523 /* Phew! That was confusing. */
1527 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1528 Return a list of coding categories in descending order of priority.
1533 Lisp_Object list = Qnil;
1535 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1536 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1541 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1542 Change the coding system associated with a coding category.
1544 (coding_category, coding_system))
1546 int cat = decode_coding_category (coding_category);
1548 coding_system = Fget_coding_system (coding_system);
1549 fcd->coding_category_system[cat] = coding_system;
1553 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1554 Return the coding system associated with a coding category.
1558 int cat = decode_coding_category (coding_category);
1559 Lisp_Object sys = fcd->coding_category_system[cat];
1562 return XCODING_SYSTEM_NAME (sys);
1567 /************************************************************************/
1568 /* Detecting the encoding of data */
1569 /************************************************************************/
1571 struct detection_state
1573 eol_type_t eol_type;
1609 struct iso2022_decoder iso;
1611 int high_byte_count;
1612 unsigned int saw_single_shift:1;
1625 acceptable_control_char_p (int c)
1629 /* Allow and ignore control characters that you might
1630 reasonably see in a text file */
1635 case 8: /* backspace */
1636 case 11: /* vertical tab */
1637 case 12: /* form feed */
1638 case 26: /* MS-DOS C-z junk */
1639 case 31: /* '^_' -- for info */
1647 mask_has_at_most_one_bit_p (int mask)
1649 /* Perhaps the only thing useful you learn from intensive Microsoft
1650 technical interviews */
1651 return (mask & (mask - 1)) == 0;
1655 detect_eol_type (struct detection_state *st, const unsigned char *src,
1665 if (st->eol.just_saw_cr)
1667 else if (st->eol.seen_anything)
1670 else if (st->eol.just_saw_cr)
1673 st->eol.just_saw_cr = 1;
1675 st->eol.just_saw_cr = 0;
1676 st->eol.seen_anything = 1;
1679 return EOL_AUTODETECT;
1682 /* Attempt to determine the encoding and EOL type of the given text.
1683 Before calling this function for the first type, you must initialize
1684 st->eol_type as appropriate and initialize st->mask to ~0.
1686 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1689 st->mask holds the determined coding category mask, or ~0 if only
1690 ASCII has been seen so far.
1694 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1695 is present in st->mask
1696 1 == definitive answers are here for both st->eol_type and st->mask
1700 detect_coding_type (struct detection_state *st, const Extbyte *src,
1701 unsigned int n, int just_do_eol)
1705 if (st->eol_type == EOL_AUTODETECT)
1706 st->eol_type = detect_eol_type (st, src, n);
1709 return st->eol_type != EOL_AUTODETECT;
1711 if (!st->seen_non_ascii)
1713 for (; n; n--, src++)
1716 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1718 st->seen_non_ascii = 1;
1720 st->shift_jis.mask = ~0;
1724 st->iso2022.mask = ~0;
1734 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1735 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1736 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1737 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1738 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1739 st->big5.mask = detect_coding_big5 (st, src, n);
1740 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1741 st->utf8.mask = detect_coding_utf8 (st, src, n);
1742 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1743 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1746 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1747 | st->utf8.mask | st->ucs4.mask;
1750 int retval = mask_has_at_most_one_bit_p (st->mask);
1751 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1752 return retval && st->eol_type != EOL_AUTODETECT;
1757 coding_system_from_mask (int mask)
1761 /* If the file was entirely or basically ASCII, use the
1762 default value of `buffer-file-coding-system'. */
1763 Lisp_Object retval =
1764 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1767 retval = Ffind_coding_system (retval);
1771 (Qbad_variable, Qwarning,
1772 "Invalid `default-buffer-file-coding-system', set to nil");
1773 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1777 retval = Fget_coding_system (Qraw_text);
1785 mask = postprocess_iso2022_mask (mask);
1787 /* Look through the coding categories by priority and find
1788 the first one that is allowed. */
1789 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1791 cat = fcd->coding_category_by_priority[i];
1792 if ((mask & (1 << cat)) &&
1793 !NILP (fcd->coding_category_system[cat]))
1797 return fcd->coding_category_system[cat];
1799 return Fget_coding_system (Qraw_text);
1803 /* Given a seekable read stream and potential coding system and EOL type
1804 as specified, do any autodetection that is called for. If the
1805 coding system and/or EOL type are not `autodetect', they will be left
1806 alone; but this function will never return an autodetect coding system
1809 This function does not automatically fetch subsidiary coding systems;
1810 that should be unnecessary with the explicit eol-type argument. */
1812 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1815 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1816 eol_type_t *eol_type_in_out)
1818 struct detection_state decst;
1820 if (*eol_type_in_out == EOL_AUTODETECT)
1821 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1824 decst.eol_type = *eol_type_in_out;
1827 /* If autodetection is called for, do it now. */
1828 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1829 || *eol_type_in_out == EOL_AUTODETECT)
1832 Lisp_Object coding_system = Qnil;
1834 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1837 /* Look for initial "-*-"; mode line prefix */
1839 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1844 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1846 Extbyte *local_vars_beg = p + 3;
1847 /* Look for final "-*-"; mode line suffix */
1848 for (p = local_vars_beg,
1849 scan_end = buf + nread - LENGTH ("-*-");
1854 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1856 Extbyte *suffix = p;
1857 /* Look for "coding:" */
1858 for (p = local_vars_beg,
1859 scan_end = suffix - LENGTH ("coding:?");
1862 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1863 && (p == local_vars_beg
1864 || (*(p-1) == ' ' ||
1870 p += LENGTH ("coding:");
1871 while (*p == ' ' || *p == '\t') p++;
1873 /* Get coding system name */
1874 save = *suffix; *suffix = '\0';
1875 /* Characters valid in a MIME charset name (rfc 1521),
1876 and in a Lisp symbol name. */
1877 n = strspn ( (char *) p,
1878 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1879 "abcdefghijklmnopqrstuvwxyz"
1885 save = p[n]; p[n] = '\0';
1887 Ffind_coding_system (intern ((char *) p));
1897 if (NILP (coding_system))
1900 if (detect_coding_type (&decst, buf, nread,
1901 XCODING_SYSTEM_TYPE (*codesys_in_out)
1902 != CODESYS_AUTODETECT))
1904 nread = Lstream_read (stream, buf, sizeof (buf));
1910 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1911 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1914 if (detect_coding_type (&decst, buf, nread, 1))
1916 nread = Lstream_read (stream, buf, sizeof (buf));
1922 *eol_type_in_out = decst.eol_type;
1923 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1925 if (NILP (coding_system))
1926 *codesys_in_out = coding_system_from_mask (decst.mask);
1928 *codesys_in_out = coding_system;
1932 /* If we absolutely can't determine the EOL type, just assume LF. */
1933 if (*eol_type_in_out == EOL_AUTODETECT)
1934 *eol_type_in_out = EOL_LF;
1936 Lstream_rewind (stream);
1939 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1940 Detect coding system of the text in the region between START and END.
1941 Returned a list of possible coding systems ordered by priority.
1942 If only ASCII characters are found, it returns 'undecided or one of
1943 its subsidiary coding systems according to a detected end-of-line
1944 type. Optional arg BUFFER defaults to the current buffer.
1946 (start, end, buffer))
1948 Lisp_Object val = Qnil;
1949 struct buffer *buf = decode_buffer (buffer, 0);
1951 Lisp_Object instream, lb_instream;
1952 Lstream *istr, *lb_istr;
1953 struct detection_state decst;
1954 struct gcpro gcpro1, gcpro2;
1956 get_buffer_range_char (buf, start, end, &b, &e, 0);
1957 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1958 lb_istr = XLSTREAM (lb_instream);
1959 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1960 istr = XLSTREAM (instream);
1961 GCPRO2 (instream, lb_instream);
1963 decst.eol_type = EOL_AUTODETECT;
1967 unsigned char random_buffer[4096];
1968 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1972 if (detect_coding_type (&decst, random_buffer, nread, 0))
1976 if (decst.mask == ~0)
1977 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1985 decst.mask = postprocess_iso2022_mask (decst.mask);
1987 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1989 int sys = fcd->coding_category_by_priority[i];
1990 if (decst.mask & (1 << sys))
1992 Lisp_Object codesys = fcd->coding_category_system[sys];
1993 if (!NILP (codesys))
1994 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1995 val = Fcons (codesys, val);
1999 Lstream_close (istr);
2001 Lstream_delete (istr);
2002 Lstream_delete (lb_istr);
2007 /************************************************************************/
2008 /* Converting to internal Mule format ("decoding") */
2009 /************************************************************************/
2011 /* A decoding stream is a stream used for decoding text (i.e.
2012 converting from some external format to internal format).
2013 The decoding-stream object keeps track of the actual coding
2014 stream, the stream that is at the other end, and data that
2015 needs to be persistent across the lifetime of the stream. */
2017 /* Handle the EOL stuff related to just-read-in character C.
2018 EOL_TYPE is the EOL type of the coding stream.
2019 FLAGS is the current value of FLAGS in the coding stream, and may
2020 be modified by this macro. (The macro only looks at the
2021 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2022 bytes are to be written. You need to also define a local goto
2023 label "label_continue_loop" that is at the end of the main
2024 character-reading loop.
2026 If C is a CR character, then this macro handles it entirely and
2027 jumps to label_continue_loop. Otherwise, this macro does not add
2028 anything to DST, and continues normally. You should continue
2029 processing C normally after this macro. */
2031 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2035 if (eol_type == EOL_CR) \
2036 Dynarr_add (dst, '\n'); \
2037 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2038 Dynarr_add (dst, c); \
2040 flags |= CODING_STATE_CR; \
2041 goto label_continue_loop; \
2043 else if (flags & CODING_STATE_CR) \
2044 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2046 Dynarr_add (dst, '\r'); \
2047 flags &= ~CODING_STATE_CR; \
2051 /* C should be a binary character in the range 0 - 255; convert
2052 to internal format and add to Dynarr DST. */
2055 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2057 if (BYTE_ASCII_P (c)) \
2058 Dynarr_add (dst, c); \
2061 Dynarr_add (dst, (c >> 6) | 0xc0); \
2062 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2066 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2068 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2072 Dynarr_add (dst, c);
2074 else if ( c <= 0x7ff )
2076 Dynarr_add (dst, (c >> 6) | 0xc0);
2077 Dynarr_add (dst, (c & 0x3f) | 0x80);
2079 else if ( c <= 0xffff )
2081 Dynarr_add (dst, (c >> 12) | 0xe0);
2082 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2083 Dynarr_add (dst, (c & 0x3f) | 0x80);
2085 else if ( c <= 0x1fffff )
2087 Dynarr_add (dst, (c >> 18) | 0xf0);
2088 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2089 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2090 Dynarr_add (dst, (c & 0x3f) | 0x80);
2092 else if ( c <= 0x3ffffff )
2094 Dynarr_add (dst, (c >> 24) | 0xf8);
2095 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2096 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2097 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2098 Dynarr_add (dst, (c & 0x3f) | 0x80);
2102 Dynarr_add (dst, (c >> 30) | 0xfc);
2103 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2104 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2105 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2106 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2107 Dynarr_add (dst, (c & 0x3f) | 0x80);
2111 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2113 if (BYTE_ASCII_P (c)) \
2114 Dynarr_add (dst, c); \
2115 else if (BYTE_C1_P (c)) \
2117 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2118 Dynarr_add (dst, c + 0x20); \
2122 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2123 Dynarr_add (dst, c); \
2128 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2132 DECODE_ADD_BINARY_CHAR (ch, dst); \
2137 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2139 if (flags & CODING_STATE_END) \
2141 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2142 if (flags & CODING_STATE_CR) \
2143 Dynarr_add (dst, '\r'); \
2147 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2149 struct decoding_stream
2151 /* Coding system that governs the conversion. */
2152 Lisp_Coding_System *codesys;
2154 /* Stream that we read the encoded data from or
2155 write the decoded data to. */
2158 /* If we are reading, then we can return only a fixed amount of
2159 data, so if the conversion resulted in too much data, we store it
2160 here for retrieval the next time around. */
2161 unsigned_char_dynarr *runoff;
2163 /* FLAGS holds flags indicating the current state of the decoding.
2164 Some of these flags are dependent on the coding system. */
2167 /* CPOS holds a partially built-up code-point of character. */
2170 /* EOL_TYPE specifies the type of end-of-line conversion that
2171 currently applies. We need to keep this separate from the
2172 EOL type stored in CODESYS because the latter might indicate
2173 automatic EOL-type detection while the former will always
2174 indicate a particular EOL type. */
2175 eol_type_t eol_type;
2177 /* Additional ISO2022 information. We define the structure above
2178 because it's also needed by the detection routines. */
2179 struct iso2022_decoder iso2022;
2181 /* Additional information (the state of the running CCL program)
2182 used by the CCL decoder. */
2183 struct ccl_program ccl;
2185 /* counter for UTF-8 or UCS-4 */
2186 unsigned char counter;
2189 unsigned combined_char_count;
2190 Emchar combined_chars[16];
2191 Lisp_Object combining_table;
2193 struct detection_state decst;
2197 extern Lisp_Object Vcharacter_composition_table;
2200 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2202 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2206 for (i = 0; i < str->combined_char_count; i++)
2207 DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2208 str->combined_char_count = 0;
2209 str->combining_table = Qnil;
2212 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2213 unsigned_char_dynarr* dst);
2215 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2216 Emchar character, unsigned_char_dynarr* dst)
2218 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2219 DECODE_ADD_UCS_CHAR (character, dst);
2220 else if (!CHAR_ID_TABLE_P (str->combining_table))
2223 = get_char_id_table (character, Vcharacter_composition_table);
2226 DECODE_ADD_UCS_CHAR (character, dst);
2229 str->combined_chars[0] = character;
2230 str->combined_char_count = 1;
2231 str->combining_table = ret;
2237 = get_char_id_table (character, str->combining_table);
2241 Emchar char2 = XCHARVAL (ret);
2242 ret = get_char_id_table (char2, Vcharacter_composition_table);
2245 DECODE_ADD_UCS_CHAR (char2, dst);
2246 str->combined_char_count = 0;
2247 str->combining_table = Qnil;
2251 str->combined_chars[0] = char2;
2252 str->combined_char_count = 1;
2253 str->combining_table = ret;
2256 else if (CHAR_ID_TABLE_P (ret))
2258 str->combined_chars[str->combined_char_count++] = character;
2259 str->combining_table = ret;
2263 COMPOSE_FLUSH_CHARS (str, dst);
2264 DECODE_ADD_UCS_CHAR (character, dst);
2268 #else /* not UTF2000 */
2269 #define COMPOSE_FLUSH_CHARS(str, dst)
2270 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2271 #endif /* UTF2000 */
2273 static ssize_t decoding_reader (Lstream *stream,
2274 unsigned char *data, size_t size);
2275 static ssize_t decoding_writer (Lstream *stream,
2276 const unsigned char *data, size_t size);
2277 static int decoding_rewinder (Lstream *stream);
2278 static int decoding_seekable_p (Lstream *stream);
2279 static int decoding_flusher (Lstream *stream);
2280 static int decoding_closer (Lstream *stream);
2282 static Lisp_Object decoding_marker (Lisp_Object stream);
2284 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2285 sizeof (struct decoding_stream));
2288 decoding_marker (Lisp_Object stream)
2290 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2291 Lisp_Object str_obj;
2293 /* We do not need to mark the coding systems or charsets stored
2294 within the stream because they are stored in a global list
2295 and automatically marked. */
2297 XSETLSTREAM (str_obj, str);
2298 mark_object (str_obj);
2299 if (str->imp->marker)
2300 return (str->imp->marker) (str_obj);
2305 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2306 so we read data from the other end, decode it, and store it into DATA. */
2309 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2311 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2312 unsigned char *orig_data = data;
2314 int error_occurred = 0;
2316 /* We need to interface to mule_decode(), which expects to take some
2317 amount of data and store the result into a Dynarr. We have
2318 mule_decode() store into str->runoff, and take data from there
2321 /* We loop until we have enough data, reading chunks from the other
2322 end and decoding it. */
2325 /* Take data from the runoff if we can. Make sure to take at
2326 most SIZE bytes, and delete the data from the runoff. */
2327 if (Dynarr_length (str->runoff) > 0)
2329 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2330 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2331 Dynarr_delete_many (str->runoff, 0, chunk);
2337 break; /* No more room for data */
2339 if (str->flags & CODING_STATE_END)
2340 /* This means that on the previous iteration, we hit the EOF on
2341 the other end. We loop once more so that mule_decode() can
2342 output any final stuff it may be holding, or any "go back
2343 to a sane state" escape sequences. (This latter makes sense
2344 during encoding.) */
2347 /* Exhausted the runoff, so get some more. DATA has at least
2348 SIZE bytes left of storage in it, so it's OK to read directly
2349 into it. (We'll be overwriting above, after we've decoded it
2350 into the runoff.) */
2351 read_size = Lstream_read (str->other_end, data, size);
2358 /* There might be some more end data produced in the translation.
2359 See the comment above. */
2360 str->flags |= CODING_STATE_END;
2361 mule_decode (stream, data, str->runoff, read_size);
2364 if (data - orig_data == 0)
2365 return error_occurred ? -1 : 0;
2367 return data - orig_data;
2371 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2373 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2376 /* Decode all our data into the runoff, and then attempt to write
2377 it all out to the other end. Remove whatever chunk we succeeded
2379 mule_decode (stream, data, str->runoff, size);
2380 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2381 Dynarr_length (str->runoff));
2383 Dynarr_delete_many (str->runoff, 0, retval);
2384 /* Do NOT return retval. The return value indicates how much
2385 of the incoming data was written, not how many bytes were
2391 reset_decoding_stream (struct decoding_stream *str)
2394 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2396 Lisp_Object coding_system;
2397 XSETCODING_SYSTEM (coding_system, str->codesys);
2398 reset_iso2022 (coding_system, &str->iso2022);
2400 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2402 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2407 str->combined_char_count = 0;
2408 str->combining_table = Qnil;
2410 str->flags = str->cpos = 0;
2414 decoding_rewinder (Lstream *stream)
2416 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2417 reset_decoding_stream (str);
2418 Dynarr_reset (str->runoff);
2419 return Lstream_rewind (str->other_end);
2423 decoding_seekable_p (Lstream *stream)
2425 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2426 return Lstream_seekable_p (str->other_end);
2430 decoding_flusher (Lstream *stream)
2432 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2433 return Lstream_flush (str->other_end);
2437 decoding_closer (Lstream *stream)
2439 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2440 if (stream->flags & LSTREAM_FL_WRITE)
2442 str->flags |= CODING_STATE_END;
2443 decoding_writer (stream, 0, 0);
2445 Dynarr_free (str->runoff);
2447 #ifdef ENABLE_COMPOSITE_CHARS
2448 if (str->iso2022.composite_chars)
2449 Dynarr_free (str->iso2022.composite_chars);
2452 return Lstream_close (str->other_end);
2456 decoding_stream_coding_system (Lstream *stream)
2458 Lisp_Object coding_system;
2459 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2461 XSETCODING_SYSTEM (coding_system, str->codesys);
2462 return subsidiary_coding_system (coding_system, str->eol_type);
2466 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2468 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2469 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2471 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2472 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2473 reset_decoding_stream (str);
2476 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2477 stream for writing, no automatic code detection will be performed.
2478 The reason for this is that automatic code detection requires a
2479 seekable input. Things will also fail if you open a decoding
2480 stream for reading using a non-fully-specified coding system and
2481 a non-seekable input stream. */
2484 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2487 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2488 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2492 str->other_end = stream;
2493 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2494 str->eol_type = EOL_AUTODETECT;
2495 if (!strcmp (mode, "r")
2496 && Lstream_seekable_p (stream))
2497 /* We can determine the coding system now. */
2498 determine_real_coding_system (stream, &codesys, &str->eol_type);
2499 set_decoding_stream_coding_system (lstr, codesys);
2500 str->decst.eol_type = str->eol_type;
2501 str->decst.mask = ~0;
2502 XSETLSTREAM (obj, lstr);
2507 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2509 return make_decoding_stream_1 (stream, codesys, "r");
2513 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2515 return make_decoding_stream_1 (stream, codesys, "w");
2518 /* Note: the decode_coding_* functions all take the same
2519 arguments as mule_decode(), which is to say some SRC data of
2520 size N, which is to be stored into dynamic array DST.
2521 DECODING is the stream within which the decoding is
2522 taking place, but no data is actually read from or
2523 written to that stream; that is handled in decoding_reader()
2524 or decoding_writer(). This allows the same functions to
2525 be used for both reading and writing. */
2528 mule_decode (Lstream *decoding, const unsigned char *src,
2529 unsigned_char_dynarr *dst, unsigned int n)
2531 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2533 /* If necessary, do encoding-detection now. We do this when
2534 we're a writing stream or a non-seekable reading stream,
2535 meaning that we can't just process the whole input,
2536 rewind, and start over. */
2538 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2539 str->eol_type == EOL_AUTODETECT)
2541 Lisp_Object codesys;
2543 XSETCODING_SYSTEM (codesys, str->codesys);
2544 detect_coding_type (&str->decst, src, n,
2545 CODING_SYSTEM_TYPE (str->codesys) !=
2546 CODESYS_AUTODETECT);
2547 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2548 str->decst.mask != ~0)
2549 /* #### This is cheesy. What we really ought to do is
2550 buffer up a certain amount of data so as to get a
2551 less random result. */
2552 codesys = coding_system_from_mask (str->decst.mask);
2553 str->eol_type = str->decst.eol_type;
2554 if (XCODING_SYSTEM (codesys) != str->codesys)
2556 /* Preserve the CODING_STATE_END flag in case it was set.
2557 If we erase it, bad things might happen. */
2558 int was_end = str->flags & CODING_STATE_END;
2559 set_decoding_stream_coding_system (decoding, codesys);
2561 str->flags |= CODING_STATE_END;
2565 switch (CODING_SYSTEM_TYPE (str->codesys))
2568 case CODESYS_INTERNAL:
2569 Dynarr_add_many (dst, src, n);
2572 case CODESYS_AUTODETECT:
2573 /* If we got this far and still haven't decided on the coding
2574 system, then do no conversion. */
2575 case CODESYS_NO_CONVERSION:
2576 decode_coding_no_conversion (decoding, src, dst, n);
2579 case CODESYS_SHIFT_JIS:
2580 decode_coding_sjis (decoding, src, dst, n);
2583 decode_coding_big5 (decoding, src, dst, n);
2586 decode_coding_ucs4 (decoding, src, dst, n);
2589 decode_coding_utf8 (decoding, src, dst, n);
2592 str->ccl.last_block = str->flags & CODING_STATE_END;
2593 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2595 case CODESYS_ISO2022:
2596 decode_coding_iso2022 (decoding, src, dst, n);
2604 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2605 Decode the text between START and END which is encoded in CODING-SYSTEM.
2606 This is useful if you've read in encoded text from a file without decoding
2607 it (e.g. you read in a JIS-formatted file but used the `binary' or
2608 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2609 Return length of decoded text.
2610 BUFFER defaults to the current buffer if unspecified.
2612 (start, end, coding_system, buffer))
2615 struct buffer *buf = decode_buffer (buffer, 0);
2616 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2617 Lstream *istr, *ostr;
2618 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2620 get_buffer_range_char (buf, start, end, &b, &e, 0);
2622 barf_if_buffer_read_only (buf, b, e);
2624 coding_system = Fget_coding_system (coding_system);
2625 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2626 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2627 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2629 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2630 Fget_coding_system (Qbinary));
2631 istr = XLSTREAM (instream);
2632 ostr = XLSTREAM (outstream);
2633 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2635 /* The chain of streams looks like this:
2637 [BUFFER] <----- send through
2638 ------> [ENCODE AS BINARY]
2639 ------> [DECODE AS SPECIFIED]
2645 char tempbuf[1024]; /* some random amount */
2646 Bufpos newpos, even_newer_pos;
2647 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2648 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2652 newpos = lisp_buffer_stream_startpos (istr);
2653 Lstream_write (ostr, tempbuf, size_in_bytes);
2654 even_newer_pos = lisp_buffer_stream_startpos (istr);
2655 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2658 Lstream_close (istr);
2659 Lstream_close (ostr);
2661 Lstream_delete (istr);
2662 Lstream_delete (ostr);
2663 Lstream_delete (XLSTREAM (de_outstream));
2664 Lstream_delete (XLSTREAM (lb_outstream));
2669 /************************************************************************/
2670 /* Converting to an external encoding ("encoding") */
2671 /************************************************************************/
2673 /* An encoding stream is an output stream. When you create the
2674 stream, you specify the coding system that governs the encoding
2675 and another stream that the resulting encoded data is to be
2676 sent to, and then start sending data to it. */
2678 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2680 struct encoding_stream
2682 /* Coding system that governs the conversion. */
2683 Lisp_Coding_System *codesys;
2685 /* Stream that we read the encoded data from or
2686 write the decoded data to. */
2689 /* If we are reading, then we can return only a fixed amount of
2690 data, so if the conversion resulted in too much data, we store it
2691 here for retrieval the next time around. */
2692 unsigned_char_dynarr *runoff;
2694 /* FLAGS holds flags indicating the current state of the encoding.
2695 Some of these flags are dependent on the coding system. */
2698 /* CH holds a partially built-up character. Since we only deal
2699 with one- and two-byte characters at the moment, we only use
2700 this to store the first byte of a two-byte character. */
2703 /* Additional information used by the ISO2022 encoder. */
2706 /* CHARSET holds the character sets currently assigned to the G0
2707 through G3 registers. It is initialized from the array
2708 INITIAL_CHARSET in CODESYS. */
2709 Lisp_Object charset[4];
2711 /* Which registers are currently invoked into the left (GL) and
2712 right (GR) halves of the 8-bit encoding space? */
2713 int register_left, register_right;
2715 /* Whether we need to explicitly designate the charset in the
2716 G? register before using it. It is initialized from the
2717 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2718 unsigned char force_charset_on_output[4];
2720 /* Other state variables that need to be preserved across
2722 Lisp_Object current_charset;
2724 int current_char_boundary;
2727 void (*encode_char) (struct encoding_stream *str, Emchar c,
2728 unsigned_char_dynarr *dst, unsigned int *flags);
2729 void (*finish) (struct encoding_stream *str,
2730 unsigned_char_dynarr *dst, unsigned int *flags);
2732 /* Additional information (the state of the running CCL program)
2733 used by the CCL encoder. */
2734 struct ccl_program ccl;
2738 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2739 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2741 static int encoding_rewinder (Lstream *stream);
2742 static int encoding_seekable_p (Lstream *stream);
2743 static int encoding_flusher (Lstream *stream);
2744 static int encoding_closer (Lstream *stream);
2746 static Lisp_Object encoding_marker (Lisp_Object stream);
2748 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2749 sizeof (struct encoding_stream));
2752 encoding_marker (Lisp_Object stream)
2754 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2755 Lisp_Object str_obj;
2757 /* We do not need to mark the coding systems or charsets stored
2758 within the stream because they are stored in a global list
2759 and automatically marked. */
2761 XSETLSTREAM (str_obj, str);
2762 mark_object (str_obj);
2763 if (str->imp->marker)
2764 return (str->imp->marker) (str_obj);
2769 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2770 so we read data from the other end, encode it, and store it into DATA. */
2773 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2775 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2776 unsigned char *orig_data = data;
2778 int error_occurred = 0;
2780 /* We need to interface to mule_encode(), which expects to take some
2781 amount of data and store the result into a Dynarr. We have
2782 mule_encode() store into str->runoff, and take data from there
2785 /* We loop until we have enough data, reading chunks from the other
2786 end and encoding it. */
2789 /* Take data from the runoff if we can. Make sure to take at
2790 most SIZE bytes, and delete the data from the runoff. */
2791 if (Dynarr_length (str->runoff) > 0)
2793 int chunk = min ((int) size, Dynarr_length (str->runoff));
2794 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2795 Dynarr_delete_many (str->runoff, 0, chunk);
2801 break; /* No more room for data */
2803 if (str->flags & CODING_STATE_END)
2804 /* This means that on the previous iteration, we hit the EOF on
2805 the other end. We loop once more so that mule_encode() can
2806 output any final stuff it may be holding, or any "go back
2807 to a sane state" escape sequences. (This latter makes sense
2808 during encoding.) */
2811 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2812 left of storage in it, so it's OK to read directly into it.
2813 (We'll be overwriting above, after we've encoded it into the
2815 read_size = Lstream_read (str->other_end, data, size);
2822 /* There might be some more end data produced in the translation.
2823 See the comment above. */
2824 str->flags |= CODING_STATE_END;
2825 mule_encode (stream, data, str->runoff, read_size);
2828 if (data == orig_data)
2829 return error_occurred ? -1 : 0;
2831 return data - orig_data;
2835 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2837 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2840 /* Encode all our data into the runoff, and then attempt to write
2841 it all out to the other end. Remove whatever chunk we succeeded
2843 mule_encode (stream, data, str->runoff, size);
2844 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2845 Dynarr_length (str->runoff));
2847 Dynarr_delete_many (str->runoff, 0, retval);
2848 /* Do NOT return retval. The return value indicates how much
2849 of the incoming data was written, not how many bytes were
2855 reset_encoding_stream (struct encoding_stream *str)
2858 switch (CODING_SYSTEM_TYPE (str->codesys))
2860 case CODESYS_ISO2022:
2864 str->encode_char = &char_encode_iso2022;
2865 str->finish = &char_finish_iso2022;
2866 for (i = 0; i < 4; i++)
2868 str->iso2022.charset[i] =
2869 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2870 str->iso2022.force_charset_on_output[i] =
2871 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2873 str->iso2022.register_left = 0;
2874 str->iso2022.register_right = 1;
2875 str->iso2022.current_charset = Qnil;
2876 str->iso2022.current_half = 0;
2880 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2883 str->encode_char = &char_encode_utf8;
2884 str->finish = &char_finish_utf8;
2887 str->encode_char = &char_encode_ucs4;
2888 str->finish = &char_finish_ucs4;
2890 case CODESYS_SHIFT_JIS:
2891 str->encode_char = &char_encode_shift_jis;
2892 str->finish = &char_finish_shift_jis;
2898 str->iso2022.current_char_boundary = 0;
2899 str->flags = str->ch = 0;
2903 encoding_rewinder (Lstream *stream)
2905 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2906 reset_encoding_stream (str);
2907 Dynarr_reset (str->runoff);
2908 return Lstream_rewind (str->other_end);
2912 encoding_seekable_p (Lstream *stream)
2914 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2915 return Lstream_seekable_p (str->other_end);
2919 encoding_flusher (Lstream *stream)
2921 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2922 return Lstream_flush (str->other_end);
2926 encoding_closer (Lstream *stream)
2928 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2929 if (stream->flags & LSTREAM_FL_WRITE)
2931 str->flags |= CODING_STATE_END;
2932 encoding_writer (stream, 0, 0);
2934 Dynarr_free (str->runoff);
2935 return Lstream_close (str->other_end);
2939 encoding_stream_coding_system (Lstream *stream)
2941 Lisp_Object coding_system;
2942 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2944 XSETCODING_SYSTEM (coding_system, str->codesys);
2945 return coding_system;
2949 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2951 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2952 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2954 reset_encoding_stream (str);
2958 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2961 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2962 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2966 str->runoff = Dynarr_new (unsigned_char);
2967 str->other_end = stream;
2968 set_encoding_stream_coding_system (lstr, codesys);
2969 XSETLSTREAM (obj, lstr);
2974 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2976 return make_encoding_stream_1 (stream, codesys, "r");
2980 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2982 return make_encoding_stream_1 (stream, codesys, "w");
2985 /* Convert N bytes of internally-formatted data stored in SRC to an
2986 external format, according to the encoding stream ENCODING.
2987 Store the encoded data into DST. */
2990 mule_encode (Lstream *encoding, const unsigned char *src,
2991 unsigned_char_dynarr *dst, unsigned int n)
2993 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2995 switch (CODING_SYSTEM_TYPE (str->codesys))
2998 case CODESYS_INTERNAL:
2999 Dynarr_add_many (dst, src, n);
3002 case CODESYS_AUTODETECT:
3003 /* If we got this far and still haven't decided on the coding
3004 system, then do no conversion. */
3005 case CODESYS_NO_CONVERSION:
3006 encode_coding_no_conversion (encoding, src, dst, n);
3010 encode_coding_big5 (encoding, src, dst, n);
3013 str->ccl.last_block = str->flags & CODING_STATE_END;
3014 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
3018 text_encode_generic (encoding, src, dst, n);
3022 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3023 Encode the text between START and END using CODING-SYSTEM.
3024 This will, for example, convert Japanese characters into stuff such as
3025 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3026 text. BUFFER defaults to the current buffer if unspecified.
3028 (start, end, coding_system, buffer))
3031 struct buffer *buf = decode_buffer (buffer, 0);
3032 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3033 Lstream *istr, *ostr;
3034 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3036 get_buffer_range_char (buf, start, end, &b, &e, 0);
3038 barf_if_buffer_read_only (buf, b, e);
3040 coding_system = Fget_coding_system (coding_system);
3041 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3042 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3043 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3044 Fget_coding_system (Qbinary));
3045 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3047 istr = XLSTREAM (instream);
3048 ostr = XLSTREAM (outstream);
3049 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3050 /* The chain of streams looks like this:
3052 [BUFFER] <----- send through
3053 ------> [ENCODE AS SPECIFIED]
3054 ------> [DECODE AS BINARY]
3059 char tempbuf[1024]; /* some random amount */
3060 Bufpos newpos, even_newer_pos;
3061 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3062 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3066 newpos = lisp_buffer_stream_startpos (istr);
3067 Lstream_write (ostr, tempbuf, size_in_bytes);
3068 even_newer_pos = lisp_buffer_stream_startpos (istr);
3069 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3075 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3076 Lstream_close (istr);
3077 Lstream_close (ostr);
3079 Lstream_delete (istr);
3080 Lstream_delete (ostr);
3081 Lstream_delete (XLSTREAM (de_outstream));
3082 Lstream_delete (XLSTREAM (lb_outstream));
3083 return make_int (retlen);
3090 text_encode_generic (Lstream *encoding, const unsigned char *src,
3091 unsigned_char_dynarr *dst, unsigned int n)
3094 unsigned char char_boundary;
3095 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3096 unsigned int flags = str->flags;
3097 Emchar ch = str->ch;
3099 char_boundary = str->iso2022.current_char_boundary;
3105 if (char_boundary == 0)
3133 (*str->encode_char) (str, c, dst, &flags);
3135 else if (char_boundary == 1)
3137 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3143 ch = (ch << 6) | (c & 0x3f);
3148 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3150 (*str->finish) (str, dst, &flags);
3155 str->iso2022.current_char_boundary = char_boundary;
3159 /************************************************************************/
3160 /* Shift-JIS methods */
3161 /************************************************************************/
3163 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3164 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3165 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3166 encoded by "position-code + 0x80". A character of JISX0208
3167 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3168 position-codes are divided and shifted so that it fit in the range
3171 --- CODE RANGE of Shift-JIS ---
3172 (character set) (range)
3174 JISX0201-Kana 0xA0 .. 0xDF
3175 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3176 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3177 -------------------------------
3181 /* Is this the first byte of a Shift-JIS two-byte char? */
3183 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3184 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3186 /* Is this the second byte of a Shift-JIS two-byte char? */
3188 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3189 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3191 #define BYTE_SJIS_KATAKANA_P(c) \
3192 ((c) >= 0xA1 && (c) <= 0xDF)
3195 detect_coding_sjis (struct detection_state *st, const unsigned char *src,
3203 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3205 if (st->shift_jis.in_second_byte)
3207 st->shift_jis.in_second_byte = 0;
3211 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3212 st->shift_jis.in_second_byte = 1;
3214 return CODING_CATEGORY_SHIFT_JIS_MASK;
3217 /* Convert Shift-JIS data to internal format. */
3220 decode_coding_sjis (Lstream *decoding, const unsigned char *src,
3221 unsigned_char_dynarr *dst, unsigned int n)
3224 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3225 unsigned int flags = str->flags;
3226 unsigned int cpos = str->cpos;
3227 eol_type_t eol_type = str->eol_type;
3235 /* Previous character was first byte of Shift-JIS Kanji char. */
3236 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3238 unsigned char e1, e2;
3240 DECODE_SJIS (cpos, c, e1, e2);
3242 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3246 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3247 Dynarr_add (dst, e1);
3248 Dynarr_add (dst, e2);
3253 DECODE_ADD_BINARY_CHAR (cpos, dst);
3254 DECODE_ADD_BINARY_CHAR (c, dst);
3260 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3261 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3263 else if (BYTE_SJIS_KATAKANA_P (c))
3266 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3269 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3270 Dynarr_add (dst, c);
3275 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3279 DECODE_ADD_BINARY_CHAR (c, dst);
3281 label_continue_loop:;
3284 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3290 /* Convert internal character representation to Shift_JIS. */
3293 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3294 unsigned_char_dynarr *dst, unsigned int *flags)
3296 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3300 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3301 Dynarr_add (dst, '\r');
3302 if (eol_type != EOL_CR)
3303 Dynarr_add (dst, ch);
3307 unsigned int s1, s2;
3309 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3311 if (code_point >= 0)
3312 Dynarr_add (dst, code_point);
3313 else if ((code_point
3314 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3317 ENCODE_SJIS ((code_point >> 8) | 0x80,
3318 (code_point & 0xFF) | 0x80, s1, s2);
3319 Dynarr_add (dst, s1);
3320 Dynarr_add (dst, s2);
3322 else if ((code_point
3323 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3325 Dynarr_add (dst, code_point | 0x80);
3326 else if ((code_point
3327 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3330 ENCODE_SJIS ((code_point >> 8) | 0x80,
3331 (code_point & 0xFF) | 0x80, s1, s2);
3332 Dynarr_add (dst, s1);
3333 Dynarr_add (dst, s2);
3335 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3337 Dynarr_add (dst, code_point);
3339 Dynarr_add (dst, '?');
3341 Lisp_Object charset;
3342 unsigned int c1, c2;
3344 BREAKUP_CHAR (ch, charset, c1, c2);
3346 if (EQ(charset, Vcharset_katakana_jisx0201))
3348 Dynarr_add (dst, c1 | 0x80);
3352 Dynarr_add (dst, c1);
3354 else if (EQ(charset, Vcharset_japanese_jisx0208))
3356 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3357 Dynarr_add (dst, s1);
3358 Dynarr_add (dst, s2);
3361 Dynarr_add (dst, '?');
3367 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3368 unsigned int *flags)
3372 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3373 Decode a JISX0208 character of Shift-JIS coding-system.
3374 CODE is the character code in Shift-JIS as a cons of type bytes.
3375 Return the corresponding character.
3379 unsigned char c1, c2, s1, s2;
3382 CHECK_INT (XCAR (code));
3383 CHECK_INT (XCDR (code));
3384 s1 = XINT (XCAR (code));
3385 s2 = XINT (XCDR (code));
3386 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3387 BYTE_SJIS_TWO_BYTE_2_P (s2))
3389 DECODE_SJIS (s1, s2, c1, c2);
3390 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3391 c1 & 0x7F, c2 & 0x7F));
3397 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3398 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3399 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3403 Lisp_Object charset;
3406 CHECK_CHAR_COERCE_INT (ch);
3407 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3408 if (EQ (charset, Vcharset_japanese_jisx0208))
3410 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3411 return Fcons (make_int (s1), make_int (s2));
3418 /************************************************************************/
3420 /************************************************************************/
3422 /* BIG5 is a coding system encoding two character sets: ASCII and
3423 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3424 character set and is encoded in two-byte.
3426 --- CODE RANGE of BIG5 ---
3427 (character set) (range)
3429 Big5 (1st byte) 0xA1 .. 0xFE
3430 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3431 --------------------------
3433 Since the number of characters in Big5 is larger than maximum
3434 characters in Emacs' charset (96x96), it can't be handled as one
3435 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3436 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3437 contains frequently used characters and the latter contains less
3438 frequently used characters. */
3440 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3441 ((c) >= 0xA1 && (c) <= 0xFE)
3443 /* Is this the second byte of a Shift-JIS two-byte char? */
3445 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3446 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3448 /* Number of Big5 characters which have the same code in 1st byte. */
3450 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3452 /* Code conversion macros. These are macros because they are used in
3453 inner loops during code conversion.
3455 Note that temporary variables in macros introduce the classic
3456 dynamic-scoping problems with variable names. We use capital-
3457 lettered variables in the assumption that XEmacs does not use
3458 capital letters in variables except in a very formalized way
3461 /* Convert Big5 code (b1, b2) into its internal string representation
3464 /* There is a much simpler way to split the Big5 charset into two.
3465 For the moment I'm going to leave the algorithm as-is because it
3466 claims to separate out the most-used characters into a single
3467 charset, which perhaps will lead to optimizations in various
3470 The way the algorithm works is something like this:
3472 Big5 can be viewed as a 94x157 charset, where the row is
3473 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3474 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3475 the split between low and high column numbers is apparently
3476 meaningless; ascending rows produce less and less frequent chars.
3477 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3478 the first charset, and the upper half (0xC9 .. 0xFE) to the
3479 second. To do the conversion, we convert the character into
3480 a single number where 0 .. 156 is the first row, 157 .. 313
3481 is the second, etc. That way, the characters are ordered by
3482 decreasing frequency. Then we just chop the space in two
3483 and coerce the result into a 94x94 space.
3486 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3488 int B1 = b1, B2 = b2; \
3490 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3494 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3498 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3499 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3501 c1 = I / (0xFF - 0xA1) + 0xA1; \
3502 c2 = I % (0xFF - 0xA1) + 0xA1; \
3505 /* Convert the internal string representation of a Big5 character
3506 (lb, c1, c2) into Big5 code (b1, b2). */
3508 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3510 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3512 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3514 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3516 b1 = I / BIG5_SAME_ROW + 0xA1; \
3517 b2 = I % BIG5_SAME_ROW; \
3518 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3522 detect_coding_big5 (struct detection_state *st, const unsigned char *src,
3530 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3531 (c >= 0x80 && c <= 0xA0))
3533 if (st->big5.in_second_byte)
3535 st->big5.in_second_byte = 0;
3536 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3540 st->big5.in_second_byte = 1;
3542 return CODING_CATEGORY_BIG5_MASK;
3545 /* Convert Big5 data to internal format. */
3548 decode_coding_big5 (Lstream *decoding, const unsigned char *src,
3549 unsigned_char_dynarr *dst, unsigned int n)
3552 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3553 unsigned int flags = str->flags;
3554 unsigned int cpos = str->cpos;
3555 eol_type_t eol_type = str->eol_type;
3562 /* Previous character was first byte of Big5 char. */
3563 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3567 (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c),
3570 unsigned char b1, b2, b3;
3571 DECODE_BIG5 (cpos, c, b1, b2, b3);
3572 Dynarr_add (dst, b1);
3573 Dynarr_add (dst, b2);
3574 Dynarr_add (dst, b3);
3579 DECODE_ADD_BINARY_CHAR (cpos, dst);
3580 DECODE_ADD_BINARY_CHAR (c, dst);
3586 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3587 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3590 DECODE_ADD_BINARY_CHAR (c, dst);
3592 label_continue_loop:;
3595 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3601 /* Convert internally-formatted data to Big5. */
3604 encode_coding_big5 (Lstream *encoding, const unsigned char *src,
3605 unsigned_char_dynarr *dst, unsigned int n)
3609 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3610 unsigned int flags = str->flags;
3611 unsigned int ch = str->ch;
3612 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3619 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3620 Dynarr_add (dst, '\r');
3621 if (eol_type != EOL_CR)
3622 Dynarr_add (dst, '\n');
3624 else if (BYTE_ASCII_P (c))
3627 Dynarr_add (dst, c);
3629 else if (BUFBYTE_LEADING_BYTE_P (c))
3631 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3632 c == LEADING_BYTE_CHINESE_BIG5_2)
3634 /* A recognized leading byte. */
3636 continue; /* not done with this character. */
3638 /* otherwise just ignore this character. */
3640 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3641 ch == LEADING_BYTE_CHINESE_BIG5_2)
3643 /* Previous char was a recognized leading byte. */
3645 continue; /* not done with this character. */
3649 /* Encountering second byte of a Big5 character. */
3650 unsigned char b1, b2;
3652 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3653 Dynarr_add (dst, b1);
3654 Dynarr_add (dst, b2);
3666 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3667 Decode a Big5 character CODE of BIG5 coding-system.
3668 CODE is the character code in BIG5, a cons of two integers.
3669 Return the corresponding character.
3673 unsigned char c1, c2, b1, b2;
3676 CHECK_INT (XCAR (code));
3677 CHECK_INT (XCDR (code));
3678 b1 = XINT (XCAR (code));
3679 b2 = XINT (XCDR (code));
3680 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3681 BYTE_BIG5_TWO_BYTE_2_P (b2))
3683 Charset_ID leading_byte;
3684 Lisp_Object charset;
3685 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3686 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3687 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3693 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3694 Encode the Big5 character CH to BIG5 coding-system.
3695 Return the corresponding character code in Big5.
3699 Lisp_Object charset;
3702 CHECK_CHAR_COERCE_INT (ch);
3703 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3704 if (EQ (charset, Vcharset_chinese_big5_1) ||
3705 EQ (charset, Vcharset_chinese_big5_2))
3707 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3709 return Fcons (make_int (b1), make_int (b2));
3716 /************************************************************************/
3718 /************************************************************************/
3721 detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
3727 switch (st->ucs4.in_byte)
3736 st->ucs4.in_byte = 0;
3742 return CODING_CATEGORY_UCS4_MASK;
3746 decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
3747 unsigned_char_dynarr *dst, unsigned int n)
3749 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3750 unsigned int flags = str->flags;
3751 unsigned int cpos = str->cpos;
3752 unsigned char counter = str->counter;
3756 unsigned char c = *src++;
3764 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3769 cpos = ( cpos << 8 ) | c;
3773 if (counter & CODING_STATE_END)
3774 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3778 str->counter = counter;
3782 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3783 unsigned_char_dynarr *dst, unsigned int *flags)
3785 Dynarr_add (dst, ch >> 24);
3786 Dynarr_add (dst, ch >> 16);
3787 Dynarr_add (dst, ch >> 8);
3788 Dynarr_add (dst, ch );
3792 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3793 unsigned int *flags)
3798 /************************************************************************/
3800 /************************************************************************/
3803 detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
3808 unsigned char c = *src++;
3809 switch (st->utf8.in_byte)
3812 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3815 st->utf8.in_byte = 5;
3817 st->utf8.in_byte = 4;
3819 st->utf8.in_byte = 3;
3821 st->utf8.in_byte = 2;
3823 st->utf8.in_byte = 1;
3828 if ((c & 0xc0) != 0x80)
3834 return CODING_CATEGORY_UTF8_MASK;
3838 decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
3839 unsigned_char_dynarr *dst, unsigned int n)
3841 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3842 unsigned int flags = str->flags;
3843 unsigned int cpos = str->cpos;
3844 eol_type_t eol_type = str->eol_type;
3845 unsigned char counter = str->counter;
3849 unsigned char c = *src++;
3858 else if ( c >= 0xf8 )
3863 else if ( c >= 0xf0 )
3868 else if ( c >= 0xe0 )
3873 else if ( c >= 0xc0 )
3880 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3881 DECODE_ADD_UCS_CHAR (c, dst);
3885 cpos = ( cpos << 6 ) | ( c & 0x3f );
3886 DECODE_ADD_UCS_CHAR (cpos, dst);
3891 cpos = ( cpos << 6 ) | ( c & 0x3f );
3894 label_continue_loop:;
3897 if (flags & CODING_STATE_END)
3898 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3902 str->counter = counter;
3906 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3907 unsigned_char_dynarr *dst, unsigned int *flags)
3909 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3913 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3914 Dynarr_add (dst, '\r');
3915 if (eol_type != EOL_CR)
3916 Dynarr_add (dst, ch);
3918 else if (ch <= 0x7f)
3920 Dynarr_add (dst, ch);
3922 else if (ch <= 0x7ff)
3924 Dynarr_add (dst, (ch >> 6) | 0xc0);
3925 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3927 else if (ch <= 0xffff)
3929 Dynarr_add (dst, (ch >> 12) | 0xe0);
3930 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3931 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3933 else if (ch <= 0x1fffff)
3935 Dynarr_add (dst, (ch >> 18) | 0xf0);
3936 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3937 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3938 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3940 else if (ch <= 0x3ffffff)
3942 Dynarr_add (dst, (ch >> 24) | 0xf8);
3943 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3944 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3945 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3946 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3950 Dynarr_add (dst, (ch >> 30) | 0xfc);
3951 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3952 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3953 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3954 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3955 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3960 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3961 unsigned int *flags)
3966 /************************************************************************/
3967 /* ISO2022 methods */
3968 /************************************************************************/
3970 /* The following note describes the coding system ISO2022 briefly.
3971 Since the intention of this note is to help understand the
3972 functions in this file, some parts are NOT ACCURATE or OVERLY
3973 SIMPLIFIED. For thorough understanding, please refer to the
3974 original document of ISO2022.
3976 ISO2022 provides many mechanisms to encode several character sets
3977 in 7-bit and 8-bit environments. For 7-bit environments, all text
3978 is encoded using bytes less than 128. This may make the encoded
3979 text a little bit longer, but the text passes more easily through
3980 several gateways, some of which strip off MSB (Most Signigant Bit).
3982 There are two kinds of character sets: control character set and
3983 graphic character set. The former contains control characters such
3984 as `newline' and `escape' to provide control functions (control
3985 functions are also provided by escape sequences). The latter
3986 contains graphic characters such as 'A' and '-'. Emacs recognizes
3987 two control character sets and many graphic character sets.
3989 Graphic character sets are classified into one of the following
3990 four classes, according to the number of bytes (DIMENSION) and
3991 number of characters in one dimension (CHARS) of the set:
3992 - DIMENSION1_CHARS94
3993 - DIMENSION1_CHARS96
3994 - DIMENSION2_CHARS94
3995 - DIMENSION2_CHARS96
3997 In addition, each character set is assigned an identification tag,
3998 unique for each set, called "final character" (denoted as <F>
3999 hereafter). The <F> of each character set is decided by ECMA(*)
4000 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4001 (0x30..0x3F are for private use only).
4003 Note (*): ECMA = European Computer Manufacturers Association
4005 Here are examples of graphic character set [NAME(<F>)]:
4006 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4007 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4008 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4009 o DIMENSION2_CHARS96 -- none for the moment
4011 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4012 C0 [0x00..0x1F] -- control character plane 0
4013 GL [0x20..0x7F] -- graphic character plane 0
4014 C1 [0x80..0x9F] -- control character plane 1
4015 GR [0xA0..0xFF] -- graphic character plane 1
4017 A control character set is directly designated and invoked to C0 or
4018 C1 by an escape sequence. The most common case is that:
4019 - ISO646's control character set is designated/invoked to C0, and
4020 - ISO6429's control character set is designated/invoked to C1,
4021 and usually these designations/invocations are omitted in encoded
4022 text. In a 7-bit environment, only C0 can be used, and a control
4023 character for C1 is encoded by an appropriate escape sequence to
4024 fit into the environment. All control characters for C1 are
4025 defined to have corresponding escape sequences.
4027 A graphic character set is at first designated to one of four
4028 graphic registers (G0 through G3), then these graphic registers are
4029 invoked to GL or GR. These designations and invocations can be
4030 done independently. The most common case is that G0 is invoked to
4031 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4032 these invocations and designations are omitted in encoded text.
4033 In a 7-bit environment, only GL can be used.
4035 When a graphic character set of CHARS94 is invoked to GL, codes
4036 0x20 and 0x7F of the GL area work as control characters SPACE and
4037 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4040 There are two ways of invocation: locking-shift and single-shift.
4041 With locking-shift, the invocation lasts until the next different
4042 invocation, whereas with single-shift, the invocation affects the
4043 following character only and doesn't affect the locking-shift
4044 state. Invocations are done by the following control characters or
4047 ----------------------------------------------------------------------
4048 abbrev function cntrl escape seq description
4049 ----------------------------------------------------------------------
4050 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4051 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4052 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4053 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4054 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4055 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4056 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4057 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4058 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4059 ----------------------------------------------------------------------
4060 (*) These are not used by any known coding system.
4062 Control characters for these functions are defined by macros
4063 ISO_CODE_XXX in `coding.h'.
4065 Designations are done by the following escape sequences:
4066 ----------------------------------------------------------------------
4067 escape sequence description
4068 ----------------------------------------------------------------------
4069 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4070 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4071 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4072 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4073 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4074 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4075 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4076 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4077 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4078 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4079 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4080 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4081 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4082 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4083 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4084 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4085 ----------------------------------------------------------------------
4087 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4088 of dimension 1, chars 94, and final character <F>, etc...
4090 Note (*): Although these designations are not allowed in ISO2022,
4091 Emacs accepts them on decoding, and produces them on encoding
4092 CHARS96 character sets in a coding system which is characterized as
4093 7-bit environment, non-locking-shift, and non-single-shift.
4095 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4096 '(' can be omitted. We refer to this as "short-form" hereafter.
4098 Now you may notice that there are a lot of ways for encoding the
4099 same multilingual text in ISO2022. Actually, there exist many
4100 coding systems such as Compound Text (used in X11's inter client
4101 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4102 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4103 localized platforms), and all of these are variants of ISO2022.
4105 In addition to the above, Emacs handles two more kinds of escape
4106 sequences: ISO6429's direction specification and Emacs' private
4107 sequence for specifying character composition.
4109 ISO6429's direction specification takes the following form:
4110 o CSI ']' -- end of the current direction
4111 o CSI '0' ']' -- end of the current direction
4112 o CSI '1' ']' -- start of left-to-right text
4113 o CSI '2' ']' -- start of right-to-left text
4114 The control character CSI (0x9B: control sequence introducer) is
4115 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4117 Character composition specification takes the following form:
4118 o ESC '0' -- start character composition
4119 o ESC '1' -- end character composition
4120 Since these are not standard escape sequences of any ISO standard,
4121 their use with these meanings is restricted to Emacs only. */
4124 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4128 for (i = 0; i < 4; i++)
4130 if (!NILP (coding_system))
4132 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4134 iso->charset[i] = Qt;
4135 iso->invalid_designated[i] = 0;
4137 iso->esc = ISO_ESC_NOTHING;
4138 iso->esc_bytes_index = 0;
4139 iso->register_left = 0;
4140 iso->register_right = 1;
4141 iso->switched_dir_and_no_valid_charset_yet = 0;
4142 iso->invalid_switch_dir = 0;
4143 iso->output_direction_sequence = 0;
4144 iso->output_literally = 0;
4145 #ifdef ENABLE_COMPOSITE_CHARS
4146 if (iso->composite_chars)
4147 Dynarr_reset (iso->composite_chars);
4152 fit_to_be_escape_quoted (unsigned char c)
4169 /* Parse one byte of an ISO2022 escape sequence.
4170 If the result is an invalid escape sequence, return 0 and
4171 do not change anything in STR. Otherwise, if the result is
4172 an incomplete escape sequence, update ISO2022.ESC and
4173 ISO2022.ESC_BYTES and return -1. Otherwise, update
4174 all the state variables (but not ISO2022.ESC_BYTES) and
4177 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4178 or invocation of an invalid character set and treat that as
4179 an unrecognized escape sequence. */
4182 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4183 unsigned char c, unsigned int *flags,
4184 int check_invalid_charsets)
4186 /* (1) If we're at the end of a designation sequence, CS is the
4187 charset being designated and REG is the register to designate
4190 (2) If we're at the end of a locking-shift sequence, REG is
4191 the register to invoke and HALF (0 == left, 1 == right) is
4192 the half to invoke it into.
4194 (3) If we're at the end of a single-shift sequence, REG is
4195 the register to invoke. */
4196 Lisp_Object cs = Qnil;
4199 /* NOTE: This code does goto's all over the fucking place.
4200 The reason for this is that we're basically implementing
4201 a state machine here, and hierarchical languages like C
4202 don't really provide a clean way of doing this. */
4204 if (! (*flags & CODING_STATE_ESCAPE))
4205 /* At beginning of escape sequence; we need to reset our
4206 escape-state variables. */
4207 iso->esc = ISO_ESC_NOTHING;
4209 iso->output_literally = 0;
4210 iso->output_direction_sequence = 0;
4214 case ISO_ESC_NOTHING:
4215 iso->esc_bytes_index = 0;
4218 case ISO_CODE_ESC: /* Start escape sequence */
4219 *flags |= CODING_STATE_ESCAPE;
4223 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4224 *flags |= CODING_STATE_ESCAPE;
4225 iso->esc = ISO_ESC_5_11;
4228 case ISO_CODE_SO: /* locking shift 1 */
4231 case ISO_CODE_SI: /* locking shift 0 */
4235 case ISO_CODE_SS2: /* single shift */
4238 case ISO_CODE_SS3: /* single shift */
4242 default: /* Other control characters */
4249 /**** single shift ****/
4251 case 'N': /* single shift 2 */
4254 case 'O': /* single shift 3 */
4258 /**** locking shift ****/
4260 case '~': /* locking shift 1 right */
4263 case 'n': /* locking shift 2 */
4266 case '}': /* locking shift 2 right */
4269 case 'o': /* locking shift 3 */
4272 case '|': /* locking shift 3 right */
4276 #ifdef ENABLE_COMPOSITE_CHARS
4277 /**** composite ****/
4280 iso->esc = ISO_ESC_START_COMPOSITE;
4281 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4282 CODING_STATE_COMPOSITE;
4286 iso->esc = ISO_ESC_END_COMPOSITE;
4287 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4288 ~CODING_STATE_COMPOSITE;
4290 #endif /* ENABLE_COMPOSITE_CHARS */
4292 /**** directionality ****/
4295 iso->esc = ISO_ESC_5_11;
4298 /**** designation ****/
4300 case '$': /* multibyte charset prefix */
4301 iso->esc = ISO_ESC_2_4;
4305 if (0x28 <= c && c <= 0x2F)
4307 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4311 /* This function is called with CODESYS equal to nil when
4312 doing coding-system detection. */
4314 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4315 && fit_to_be_escape_quoted (c))
4317 iso->esc = ISO_ESC_LITERAL;
4318 *flags &= CODING_STATE_ISO2022_LOCK;
4328 /**** directionality ****/
4330 case ISO_ESC_5_11: /* ISO6429 direction control */
4333 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4334 goto directionality;
4336 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4337 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4338 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4342 case ISO_ESC_5_11_0:
4345 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4346 goto directionality;
4350 case ISO_ESC_5_11_1:
4353 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4354 goto directionality;
4358 case ISO_ESC_5_11_2:
4361 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4362 goto directionality;
4367 iso->esc = ISO_ESC_DIRECTIONALITY;
4368 /* Various junk here to attempt to preserve the direction sequences
4369 literally in the text if they would otherwise be swallowed due
4370 to invalid designations that don't show up as actual charset
4371 changes in the text. */
4372 if (iso->invalid_switch_dir)
4374 /* We already inserted a direction switch literally into the
4375 text. We assume (#### this may not be right) that the
4376 next direction switch is the one going the other way,
4377 and we need to output that literally as well. */
4378 iso->output_literally = 1;
4379 iso->invalid_switch_dir = 0;
4385 /* If we are in the thrall of an invalid designation,
4386 then stick the directionality sequence literally into the
4387 output stream so it ends up in the original text again. */
4388 for (jj = 0; jj < 4; jj++)
4389 if (iso->invalid_designated[jj])
4393 iso->output_literally = 1;
4394 iso->invalid_switch_dir = 1;
4397 /* Indicate that we haven't yet seen a valid designation,
4398 so that if a switch-dir is directly followed by an
4399 invalid designation, both get inserted literally. */
4400 iso->switched_dir_and_no_valid_charset_yet = 1;
4405 /**** designation ****/
4408 if (0x28 <= c && c <= 0x2F)
4410 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4413 if (0x40 <= c && c <= 0x42)
4416 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4417 *flags & CODING_STATE_R2L ?
4418 CHARSET_RIGHT_TO_LEFT :
4419 CHARSET_LEFT_TO_RIGHT);
4430 if (c < '0' || c > '~')
4431 return 0; /* bad final byte */
4433 if (iso->esc >= ISO_ESC_2_8 &&
4434 iso->esc <= ISO_ESC_2_15)
4436 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4437 single = 1; /* single-byte */
4438 reg = (iso->esc - ISO_ESC_2_8) & 3;
4440 else if (iso->esc >= ISO_ESC_2_4_8 &&
4441 iso->esc <= ISO_ESC_2_4_15)
4443 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4444 single = -1; /* multi-byte */
4445 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4449 /* Can this ever be reached? -slb */
4453 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4454 *flags & CODING_STATE_R2L ?
4455 CHARSET_RIGHT_TO_LEFT :
4456 CHARSET_LEFT_TO_RIGHT);
4462 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4466 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4467 /* can't invoke something that ain't there. */
4469 iso->esc = ISO_ESC_SINGLE_SHIFT;
4470 *flags &= CODING_STATE_ISO2022_LOCK;
4472 *flags |= CODING_STATE_SS2;
4474 *flags |= CODING_STATE_SS3;
4478 if (check_invalid_charsets &&
4479 !CHARSETP (iso->charset[reg]))
4480 /* can't invoke something that ain't there. */
4483 iso->register_right = reg;
4485 iso->register_left = reg;
4486 *flags &= CODING_STATE_ISO2022_LOCK;
4487 iso->esc = ISO_ESC_LOCKING_SHIFT;
4491 if (NILP (cs) && check_invalid_charsets)
4493 iso->invalid_designated[reg] = 1;
4494 iso->charset[reg] = Vcharset_ascii;
4495 iso->esc = ISO_ESC_DESIGNATE;
4496 *flags &= CODING_STATE_ISO2022_LOCK;
4497 iso->output_literally = 1;
4498 if (iso->switched_dir_and_no_valid_charset_yet)
4500 /* We encountered a switch-direction followed by an
4501 invalid designation. Ensure that the switch-direction
4502 gets outputted; otherwise it will probably get eaten
4503 when the text is written out again. */
4504 iso->switched_dir_and_no_valid_charset_yet = 0;
4505 iso->output_direction_sequence = 1;
4506 /* And make sure that the switch-dir going the other
4507 way gets outputted, as well. */
4508 iso->invalid_switch_dir = 1;
4512 /* This function is called with CODESYS equal to nil when
4513 doing coding-system detection. */
4514 if (!NILP (codesys))
4516 charset_conversion_spec_dynarr *dyn =
4517 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4523 for (i = 0; i < Dynarr_length (dyn); i++)
4525 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4526 if (EQ (cs, spec->from_charset))
4527 cs = spec->to_charset;
4532 iso->charset[reg] = cs;
4533 iso->esc = ISO_ESC_DESIGNATE;
4534 *flags &= CODING_STATE_ISO2022_LOCK;
4535 if (iso->invalid_designated[reg])
4537 iso->invalid_designated[reg] = 0;
4538 iso->output_literally = 1;
4540 if (iso->switched_dir_and_no_valid_charset_yet)
4541 iso->switched_dir_and_no_valid_charset_yet = 0;
4546 detect_coding_iso2022 (struct detection_state *st, const unsigned char *src,
4551 /* #### There are serious deficiencies in the recognition mechanism
4552 here. This needs to be much smarter if it's going to cut it.
4553 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4554 it should be detected as Latin-1.
4555 All the ISO2022 stuff in this file should be synced up with the
4556 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4557 Perhaps we should wait till R2L works in FSF Emacs? */
4559 if (!st->iso2022.initted)
4561 reset_iso2022 (Qnil, &st->iso2022.iso);
4562 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4563 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4564 CODING_CATEGORY_ISO_8_1_MASK |
4565 CODING_CATEGORY_ISO_8_2_MASK |
4566 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4567 st->iso2022.flags = 0;
4568 st->iso2022.high_byte_count = 0;
4569 st->iso2022.saw_single_shift = 0;
4570 st->iso2022.initted = 1;
4573 mask = st->iso2022.mask;
4580 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4581 st->iso2022.high_byte_count++;
4585 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4587 if (st->iso2022.high_byte_count & 1)
4588 /* odd number of high bytes; assume not iso-8-2 */
4589 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4591 st->iso2022.high_byte_count = 0;
4592 st->iso2022.saw_single_shift = 0;
4594 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4596 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4597 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4598 { /* control chars */
4601 /* Allow and ignore control characters that you might
4602 reasonably see in a text file */
4607 case 8: /* backspace */
4608 case 11: /* vertical tab */
4609 case 12: /* form feed */
4610 case 26: /* MS-DOS C-z junk */
4611 case 31: /* '^_' -- for info */
4612 goto label_continue_loop;
4619 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4622 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4623 &st->iso2022.flags, 0))
4625 switch (st->iso2022.iso.esc)
4627 case ISO_ESC_DESIGNATE:
4628 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4629 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4631 case ISO_ESC_LOCKING_SHIFT:
4632 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4633 goto ran_out_of_chars;
4634 case ISO_ESC_SINGLE_SHIFT:
4635 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4636 st->iso2022.saw_single_shift = 1;
4645 goto ran_out_of_chars;
4648 label_continue_loop:;
4657 postprocess_iso2022_mask (int mask)
4659 /* #### kind of cheesy */
4660 /* If seven-bit ISO is allowed, then assume that the encoding is
4661 entirely seven-bit and turn off the eight-bit ones. */
4662 if (mask & CODING_CATEGORY_ISO_7_MASK)
4663 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4664 CODING_CATEGORY_ISO_8_1_MASK |
4665 CODING_CATEGORY_ISO_8_2_MASK);
4669 /* If FLAGS is a null pointer or specifies right-to-left motion,
4670 output a switch-dir-to-left-to-right sequence to DST.
4671 Also update FLAGS if it is not a null pointer.
4672 If INTERNAL_P is set, we are outputting in internal format and
4673 need to handle the CSI differently. */
4676 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4677 unsigned_char_dynarr *dst,
4678 unsigned int *flags,
4681 if (!flags || (*flags & CODING_STATE_R2L))
4683 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4685 Dynarr_add (dst, ISO_CODE_ESC);
4686 Dynarr_add (dst, '[');
4688 else if (internal_p)
4689 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4691 Dynarr_add (dst, ISO_CODE_CSI);
4692 Dynarr_add (dst, '0');
4693 Dynarr_add (dst, ']');
4695 *flags &= ~CODING_STATE_R2L;
4699 /* If FLAGS is a null pointer or specifies a direction different from
4700 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4701 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4702 sequence to DST. Also update FLAGS if it is not a null pointer.
4703 If INTERNAL_P is set, we are outputting in internal format and
4704 need to handle the CSI differently. */
4707 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4708 unsigned_char_dynarr *dst, unsigned int *flags,
4711 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4712 direction == CHARSET_LEFT_TO_RIGHT)
4713 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4714 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4715 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4716 direction == CHARSET_RIGHT_TO_LEFT)
4718 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4720 Dynarr_add (dst, ISO_CODE_ESC);
4721 Dynarr_add (dst, '[');
4723 else if (internal_p)
4724 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4726 Dynarr_add (dst, ISO_CODE_CSI);
4727 Dynarr_add (dst, '2');
4728 Dynarr_add (dst, ']');
4730 *flags |= CODING_STATE_R2L;
4734 /* Convert ISO2022-format data to internal format. */
4737 decode_coding_iso2022 (Lstream *decoding, const unsigned char *src,
4738 unsigned_char_dynarr *dst, unsigned int n)
4740 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4741 unsigned int flags = str->flags;
4742 unsigned int cpos = str->cpos;
4743 unsigned char counter = str->counter;
4744 eol_type_t eol_type = str->eol_type;
4745 #ifdef ENABLE_COMPOSITE_CHARS
4746 unsigned_char_dynarr *real_dst = dst;
4748 Lisp_Object coding_system;
4750 XSETCODING_SYSTEM (coding_system, str->codesys);
4752 #ifdef ENABLE_COMPOSITE_CHARS
4753 if (flags & CODING_STATE_COMPOSITE)
4754 dst = str->iso2022.composite_chars;
4755 #endif /* ENABLE_COMPOSITE_CHARS */
4759 unsigned char c = *src++;
4760 if (flags & CODING_STATE_ESCAPE)
4761 { /* Within ESC sequence */
4762 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4767 switch (str->iso2022.esc)
4769 #ifdef ENABLE_COMPOSITE_CHARS
4770 case ISO_ESC_START_COMPOSITE:
4771 if (str->iso2022.composite_chars)
4772 Dynarr_reset (str->iso2022.composite_chars);
4774 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4775 dst = str->iso2022.composite_chars;
4777 case ISO_ESC_END_COMPOSITE:
4779 Bufbyte comstr[MAX_EMCHAR_LEN];
4781 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4782 Dynarr_length (dst));
4784 len = set_charptr_emchar (comstr, emch);
4785 Dynarr_add_many (dst, comstr, len);
4788 #endif /* ENABLE_COMPOSITE_CHARS */
4790 case ISO_ESC_LITERAL:
4791 COMPOSE_FLUSH_CHARS (str, dst);
4792 DECODE_ADD_BINARY_CHAR (c, dst);
4796 /* Everything else handled already */
4801 /* Attempted error recovery. */
4802 if (str->iso2022.output_direction_sequence)
4803 ensure_correct_direction (flags & CODING_STATE_R2L ?
4804 CHARSET_RIGHT_TO_LEFT :
4805 CHARSET_LEFT_TO_RIGHT,
4806 str->codesys, dst, 0, 1);
4807 /* More error recovery. */
4808 if (!retval || str->iso2022.output_literally)
4810 /* Output the (possibly invalid) sequence */
4812 COMPOSE_FLUSH_CHARS (str, dst);
4813 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4814 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4815 flags &= CODING_STATE_ISO2022_LOCK;
4817 n++, src--;/* Repeat the loop with the same character. */
4820 /* No sense in reprocessing the final byte of the
4821 escape sequence; it could mess things up anyway.
4823 COMPOSE_FLUSH_CHARS (str, dst);
4824 DECODE_ADD_BINARY_CHAR (c, dst);
4830 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4831 { /* Control characters */
4833 /***** Error-handling *****/
4835 /* If we were in the middle of a character, dump out the
4836 partial character. */
4839 COMPOSE_FLUSH_CHARS (str, dst);
4843 DECODE_ADD_BINARY_CHAR
4844 ((unsigned char)(cpos >> (counter * 8)), dst);
4849 /* If we just saw a single-shift character, dump it out.
4850 This may dump out the wrong sort of single-shift character,
4851 but least it will give an indication that something went
4853 if (flags & CODING_STATE_SS2)
4855 COMPOSE_FLUSH_CHARS (str, dst);
4856 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4857 flags &= ~CODING_STATE_SS2;
4859 if (flags & CODING_STATE_SS3)
4861 COMPOSE_FLUSH_CHARS (str, dst);
4862 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4863 flags &= ~CODING_STATE_SS3;
4866 /***** Now handle the control characters. *****/
4872 COMPOSE_FLUSH_CHARS (str, dst);
4873 if (eol_type == EOL_CR)
4874 Dynarr_add (dst, '\n');
4875 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4876 Dynarr_add (dst, c);
4878 flags |= CODING_STATE_CR;
4879 goto label_continue_loop;
4881 else if (flags & CODING_STATE_CR)
4882 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
4884 Dynarr_add (dst, '\r');
4885 flags &= ~CODING_STATE_CR;
4888 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4891 flags &= CODING_STATE_ISO2022_LOCK;
4893 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4895 COMPOSE_FLUSH_CHARS (str, dst);
4896 DECODE_ADD_BINARY_CHAR (c, dst);
4900 { /* Graphic characters */
4901 Lisp_Object charset;
4910 COMPOSE_FLUSH_CHARS (str, dst);
4911 if (eol_type == EOL_CR)
4912 Dynarr_add (dst, '\n');
4913 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4914 Dynarr_add (dst, c);
4916 flags |= CODING_STATE_CR;
4917 goto label_continue_loop;
4919 else if (flags & CODING_STATE_CR)
4920 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
4922 Dynarr_add (dst, '\r');
4923 flags &= ~CODING_STATE_CR;
4926 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4929 /* Now determine the charset. */
4930 reg = ((flags & CODING_STATE_SS2) ? 2
4931 : (flags & CODING_STATE_SS3) ? 3
4932 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4933 : str->iso2022.register_left);
4934 charset = str->iso2022.charset[reg];
4936 /* Error checking: */
4937 if (! CHARSETP (charset)
4938 || str->iso2022.invalid_designated[reg]
4939 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4940 && XCHARSET_CHARS (charset) == 94))
4941 /* Mrmph. We are trying to invoke a register that has no
4942 or an invalid charset in it, or trying to add a character
4943 outside the range of the charset. Insert that char literally
4944 to preserve it for the output. */
4946 COMPOSE_FLUSH_CHARS (str, dst);
4950 DECODE_ADD_BINARY_CHAR
4951 ((unsigned char)(cpos >> (counter * 8)), dst);
4954 DECODE_ADD_BINARY_CHAR (c, dst);
4959 /* Things are probably hunky-dorey. */
4961 /* Fetch reverse charset, maybe. */
4962 if (((flags & CODING_STATE_R2L) &&
4963 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4965 (!(flags & CODING_STATE_R2L) &&
4966 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4968 Lisp_Object new_charset =
4969 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4970 if (!NILP (new_charset))
4971 charset = new_charset;
4976 if (XCHARSET_DIMENSION (charset) == counter)
4978 COMPOSE_ADD_CHAR (str,
4979 DECODE_CHAR (charset,
4980 ((cpos & 0x7F7F7F) << 8)
4987 cpos = (cpos << 8) | c;
4989 lb = XCHARSET_LEADING_BYTE (charset);
4990 switch (XCHARSET_REP_BYTES (charset))
4993 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4994 Dynarr_add (dst, c & 0x7F);
4997 case 2: /* one-byte official */
4998 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4999 Dynarr_add (dst, lb);
5000 Dynarr_add (dst, c | 0x80);
5003 case 3: /* one-byte private or two-byte official */
5004 if (XCHARSET_PRIVATE_P (charset))
5006 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5007 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5008 Dynarr_add (dst, lb);
5009 Dynarr_add (dst, c | 0x80);
5015 Dynarr_add (dst, lb);
5016 Dynarr_add (dst, ch | 0x80);
5017 Dynarr_add (dst, c | 0x80);
5025 default: /* two-byte private */
5028 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5029 Dynarr_add (dst, lb);
5030 Dynarr_add (dst, ch | 0x80);
5031 Dynarr_add (dst, c | 0x80);
5041 flags &= CODING_STATE_ISO2022_LOCK;
5044 label_continue_loop:;
5047 if (flags & CODING_STATE_END)
5049 COMPOSE_FLUSH_CHARS (str, dst);
5050 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5054 str->counter = counter;
5058 /***** ISO2022 encoder *****/
5060 /* Designate CHARSET into register REG. */
5063 iso2022_designate (Lisp_Object charset, unsigned char reg,
5064 struct encoding_stream *str, unsigned_char_dynarr *dst)
5066 static const char inter94[] = "()*+";
5067 static const char inter96[] = ",-./";
5068 unsigned short chars;
5069 unsigned char dimension;
5070 unsigned char final;
5071 Lisp_Object old_charset = str->iso2022.charset[reg];
5073 str->iso2022.charset[reg] = charset;
5074 if (!CHARSETP (charset))
5075 /* charset might be an initial nil or t. */
5077 chars = XCHARSET_CHARS (charset);
5078 dimension = XCHARSET_DIMENSION (charset);
5079 final = XCHARSET_FINAL (charset);
5080 if (!str->iso2022.force_charset_on_output[reg] &&
5081 CHARSETP (old_charset) &&
5082 XCHARSET_CHARS (old_charset) == chars &&
5083 XCHARSET_DIMENSION (old_charset) == dimension &&
5084 XCHARSET_FINAL (old_charset) == final)
5087 str->iso2022.force_charset_on_output[reg] = 0;
5090 charset_conversion_spec_dynarr *dyn =
5091 str->codesys->iso2022.output_conv;
5097 for (i = 0; i < Dynarr_length (dyn); i++)
5099 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5100 if (EQ (charset, spec->from_charset))
5101 charset = spec->to_charset;
5106 Dynarr_add (dst, ISO_CODE_ESC);
5111 Dynarr_add (dst, inter94[reg]);
5114 Dynarr_add (dst, '$');
5116 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5119 Dynarr_add (dst, inter94[reg]);
5124 Dynarr_add (dst, inter96[reg]);
5127 Dynarr_add (dst, '$');
5128 Dynarr_add (dst, inter96[reg]);
5132 Dynarr_add (dst, final);
5136 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5138 if (str->iso2022.register_left != 0)
5140 Dynarr_add (dst, ISO_CODE_SI);
5141 str->iso2022.register_left = 0;
5146 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5148 if (str->iso2022.register_left != 1)
5150 Dynarr_add (dst, ISO_CODE_SO);
5151 str->iso2022.register_left = 1;
5156 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5157 unsigned_char_dynarr *dst, unsigned int *flags)
5159 unsigned char charmask;
5160 Lisp_Coding_System* codesys = str->codesys;
5161 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5163 Lisp_Object charset = str->iso2022.current_charset;
5164 int half = str->iso2022.current_half;
5165 int code_point = -1;
5169 restore_left_to_right_direction (codesys, dst, flags, 0);
5171 /* Make sure G0 contains ASCII */
5172 if ((ch > ' ' && ch < ISO_CODE_DEL)
5173 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5175 ensure_normal_shift (str, dst);
5176 iso2022_designate (Vcharset_ascii, 0, str, dst);
5179 /* If necessary, restore everything to the default state
5181 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5183 restore_left_to_right_direction (codesys, dst, flags, 0);
5185 ensure_normal_shift (str, dst);
5187 for (i = 0; i < 4; i++)
5189 Lisp_Object initial_charset =
5190 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5191 iso2022_designate (initial_charset, i, str, dst);
5196 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5197 Dynarr_add (dst, '\r');
5198 if (eol_type != EOL_CR)
5199 Dynarr_add (dst, ch);
5203 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5204 && fit_to_be_escape_quoted (ch))
5205 Dynarr_add (dst, ISO_CODE_ESC);
5206 Dynarr_add (dst, ch);
5209 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5211 charmask = (half == 0 ? 0x00 : 0x80);
5213 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5214 && fit_to_be_escape_quoted (ch))
5215 Dynarr_add (dst, ISO_CODE_ESC);
5216 /* you asked for it ... */
5217 Dynarr_add (dst, ch);
5223 /* Now determine which register to use. */
5225 for (i = 0; i < 4; i++)
5227 if ((CHARSETP (charset = str->iso2022.charset[i])
5228 && ((code_point = charset_code_point (charset, ch)) >= 0))
5232 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5233 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5241 Lisp_Object original_default_coded_charset_priority_list
5242 = Vdefault_coded_charset_priority_list;
5244 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5246 code_point = ENCODE_CHAR (ch, charset);
5247 if (XCHARSET_FINAL (charset))
5249 Vdefault_coded_charset_priority_list
5250 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5251 Vdefault_coded_charset_priority_list));
5253 code_point = ENCODE_CHAR (ch, charset);
5254 if (!XCHARSET_FINAL (charset))
5256 charset = Vcharset_ascii;
5260 Vdefault_coded_charset_priority_list
5261 = original_default_coded_charset_priority_list;
5263 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5264 codesys, dst, flags, 0);
5268 if (XCHARSET_GRAPHIC (charset) != 0)
5270 if (!NILP (str->iso2022.charset[1]) &&
5271 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5272 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5274 else if (!NILP (str->iso2022.charset[2]))
5276 else if (!NILP (str->iso2022.charset[3]))
5285 iso2022_designate (charset, reg, str, dst);
5287 /* Now invoke that register. */
5291 ensure_normal_shift (str, dst);
5295 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5297 ensure_shift_out (str, dst);
5304 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5306 Dynarr_add (dst, ISO_CODE_ESC);
5307 Dynarr_add (dst, 'N');
5312 Dynarr_add (dst, ISO_CODE_SS2);
5317 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5319 Dynarr_add (dst, ISO_CODE_ESC);
5320 Dynarr_add (dst, 'O');
5325 Dynarr_add (dst, ISO_CODE_SS3);
5333 charmask = (half == 0 ? 0x00 : 0x80);
5335 switch (XCHARSET_DIMENSION (charset))
5338 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5341 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5342 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5345 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5346 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5347 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5350 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5351 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5352 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5353 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5359 str->iso2022.current_charset = charset;
5360 str->iso2022.current_half = half;
5364 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5365 unsigned int *flags)
5367 Lisp_Coding_System* codesys = str->codesys;
5370 restore_left_to_right_direction (codesys, dst, flags, 0);
5371 ensure_normal_shift (str, dst);
5372 for (i = 0; i < 4; i++)
5374 Lisp_Object initial_charset
5375 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5376 iso2022_designate (initial_charset, i, str, dst);
5381 /************************************************************************/
5382 /* No-conversion methods */
5383 /************************************************************************/
5385 /* This is used when reading in "binary" files -- i.e. files that may
5386 contain all 256 possible byte values and that are not to be
5387 interpreted as being in any particular decoding. */
5389 decode_coding_no_conversion (Lstream *decoding, const unsigned char *src,
5390 unsigned_char_dynarr *dst, unsigned int n)
5393 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5394 unsigned int flags = str->flags;
5395 unsigned int cpos = str->cpos;
5396 eol_type_t eol_type = str->eol_type;
5402 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5403 DECODE_ADD_BINARY_CHAR (c, dst);
5404 label_continue_loop:;
5407 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5414 encode_coding_no_conversion (Lstream *encoding, const unsigned char *src,
5415 unsigned_char_dynarr *dst, unsigned int n)
5418 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5419 unsigned int flags = str->flags;
5420 unsigned int ch = str->ch;
5421 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5423 unsigned char char_boundary = str->iso2022.current_char_boundary;
5430 if (char_boundary == 0)
5436 else if ( c >= 0xf8 )
5441 else if ( c >= 0xf0 )
5446 else if ( c >= 0xe0 )
5451 else if ( c >= 0xc0 )
5461 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5462 Dynarr_add (dst, '\r');
5463 if (eol_type != EOL_CR)
5464 Dynarr_add (dst, c);
5467 Dynarr_add (dst, c);
5470 else if (char_boundary == 1)
5472 ch = ( ch << 6 ) | ( c & 0x3f );
5473 Dynarr_add (dst, ch & 0xff);
5478 ch = ( ch << 6 ) | ( c & 0x3f );
5481 #else /* not UTF2000 */
5484 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5485 Dynarr_add (dst, '\r');
5486 if (eol_type != EOL_CR)
5487 Dynarr_add (dst, '\n');
5490 else if (BYTE_ASCII_P (c))
5493 Dynarr_add (dst, c);
5495 else if (BUFBYTE_LEADING_BYTE_P (c))
5498 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5499 c == LEADING_BYTE_CONTROL_1)
5502 Dynarr_add (dst, '~'); /* untranslatable character */
5506 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5507 Dynarr_add (dst, c);
5508 else if (ch == LEADING_BYTE_CONTROL_1)
5511 Dynarr_add (dst, c - 0x20);
5513 /* else it should be the second or third byte of an
5514 untranslatable character, so ignore it */
5517 #endif /* not UTF2000 */
5523 str->iso2022.current_char_boundary = char_boundary;
5529 /************************************************************************/
5530 /* Initialization */
5531 /************************************************************************/
5534 syms_of_file_coding (void)
5536 INIT_LRECORD_IMPLEMENTATION (coding_system);
5538 deferror (&Qcoding_system_error, "coding-system-error",
5539 "Coding-system error", Qio_error);
5541 DEFSUBR (Fcoding_system_p);
5542 DEFSUBR (Ffind_coding_system);
5543 DEFSUBR (Fget_coding_system);
5544 DEFSUBR (Fcoding_system_list);
5545 DEFSUBR (Fcoding_system_name);
5546 DEFSUBR (Fmake_coding_system);
5547 DEFSUBR (Fcopy_coding_system);
5548 DEFSUBR (Fcoding_system_canonical_name_p);
5549 DEFSUBR (Fcoding_system_alias_p);
5550 DEFSUBR (Fcoding_system_aliasee);
5551 DEFSUBR (Fdefine_coding_system_alias);
5552 DEFSUBR (Fsubsidiary_coding_system);
5554 DEFSUBR (Fcoding_system_type);
5555 DEFSUBR (Fcoding_system_doc_string);
5557 DEFSUBR (Fcoding_system_charset);
5559 DEFSUBR (Fcoding_system_property);
5561 DEFSUBR (Fcoding_category_list);
5562 DEFSUBR (Fset_coding_priority_list);
5563 DEFSUBR (Fcoding_priority_list);
5564 DEFSUBR (Fset_coding_category_system);
5565 DEFSUBR (Fcoding_category_system);
5567 DEFSUBR (Fdetect_coding_region);
5568 DEFSUBR (Fdecode_coding_region);
5569 DEFSUBR (Fencode_coding_region);
5571 DEFSUBR (Fdecode_shift_jis_char);
5572 DEFSUBR (Fencode_shift_jis_char);
5573 DEFSUBR (Fdecode_big5_char);
5574 DEFSUBR (Fencode_big5_char);
5576 defsymbol (&Qcoding_systemp, "coding-system-p");
5577 defsymbol (&Qno_conversion, "no-conversion");
5578 defsymbol (&Qraw_text, "raw-text");
5580 defsymbol (&Qbig5, "big5");
5581 defsymbol (&Qshift_jis, "shift-jis");
5582 defsymbol (&Qucs4, "ucs-4");
5583 defsymbol (&Qutf8, "utf-8");
5584 defsymbol (&Qccl, "ccl");
5585 defsymbol (&Qiso2022, "iso2022");
5587 defsymbol (&Qmnemonic, "mnemonic");
5588 defsymbol (&Qeol_type, "eol-type");
5589 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5590 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5592 defsymbol (&Qcr, "cr");
5593 defsymbol (&Qlf, "lf");
5594 defsymbol (&Qcrlf, "crlf");
5595 defsymbol (&Qeol_cr, "eol-cr");
5596 defsymbol (&Qeol_lf, "eol-lf");
5597 defsymbol (&Qeol_crlf, "eol-crlf");
5599 defsymbol (&Qcharset_g0, "charset-g0");
5600 defsymbol (&Qcharset_g1, "charset-g1");
5601 defsymbol (&Qcharset_g2, "charset-g2");
5602 defsymbol (&Qcharset_g3, "charset-g3");
5603 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5604 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5605 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5606 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5607 defsymbol (&Qno_iso6429, "no-iso6429");
5608 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5609 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5611 defsymbol (&Qshort, "short");
5612 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5613 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5614 defsymbol (&Qseven, "seven");
5615 defsymbol (&Qlock_shift, "lock-shift");
5616 defsymbol (&Qescape_quoted, "escape-quoted");
5619 defsymbol (&Qdisable_composition, "disable-composition");
5621 defsymbol (&Qencode, "encode");
5622 defsymbol (&Qdecode, "decode");
5625 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5627 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5629 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5631 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5633 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5635 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5637 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5639 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5641 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5644 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5649 lstream_type_create_file_coding (void)
5651 LSTREAM_HAS_METHOD (decoding, reader);
5652 LSTREAM_HAS_METHOD (decoding, writer);
5653 LSTREAM_HAS_METHOD (decoding, rewinder);
5654 LSTREAM_HAS_METHOD (decoding, seekable_p);
5655 LSTREAM_HAS_METHOD (decoding, flusher);
5656 LSTREAM_HAS_METHOD (decoding, closer);
5657 LSTREAM_HAS_METHOD (decoding, marker);
5659 LSTREAM_HAS_METHOD (encoding, reader);
5660 LSTREAM_HAS_METHOD (encoding, writer);
5661 LSTREAM_HAS_METHOD (encoding, rewinder);
5662 LSTREAM_HAS_METHOD (encoding, seekable_p);
5663 LSTREAM_HAS_METHOD (encoding, flusher);
5664 LSTREAM_HAS_METHOD (encoding, closer);
5665 LSTREAM_HAS_METHOD (encoding, marker);
5669 vars_of_file_coding (void)
5673 fcd = xnew (struct file_coding_dump);
5674 dumpstruct (&fcd, &fcd_description);
5676 /* Initialize to something reasonable ... */
5677 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5679 fcd->coding_category_system[i] = Qnil;
5680 fcd->coding_category_by_priority[i] = i;
5683 Fprovide (intern ("file-coding"));
5685 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5686 Coding system used for TTY keyboard input.
5687 Not used under a windowing system.
5689 Vkeyboard_coding_system = Qnil;
5691 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5692 Coding system used for TTY display output.
5693 Not used under a windowing system.
5695 Vterminal_coding_system = Qnil;
5697 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5698 Overriding coding system used when reading from a file or process.
5699 You should bind this variable with `let', but do not set it globally.
5700 If this is non-nil, it specifies the coding system that will be used
5701 to decode input on read operations, such as from a file or process.
5702 It overrides `buffer-file-coding-system-for-read',
5703 `insert-file-contents-pre-hook', etc. Use those variables instead of
5704 this one for permanent changes to the environment. */ );
5705 Vcoding_system_for_read = Qnil;
5707 DEFVAR_LISP ("coding-system-for-write",
5708 &Vcoding_system_for_write /*
5709 Overriding coding system used when writing to a file or process.
5710 You should bind this variable with `let', but do not set it globally.
5711 If this is non-nil, it specifies the coding system that will be used
5712 to encode output for write operations, such as to a file or process.
5713 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5714 Use those variables instead of this one for permanent changes to the
5716 Vcoding_system_for_write = Qnil;
5718 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5719 Coding system used to convert pathnames when accessing files.
5721 Vfile_name_coding_system = Qnil;
5723 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5724 Non-nil means the buffer contents are regarded as multi-byte form
5725 of characters, not a binary code. This affects the display, file I/O,
5726 and behaviors of various editing commands.
5728 Setting this to nil does not do anything.
5730 enable_multibyte_characters = 1;
5734 complex_vars_of_file_coding (void)
5736 staticpro (&Vcoding_system_hash_table);
5737 Vcoding_system_hash_table =
5738 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5740 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5741 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5743 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5745 struct codesys_prop csp; \
5747 csp.prop_type = (Prop_Type); \
5748 Dynarr_add (the_codesys_prop_dynarr, csp); \
5751 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5752 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5753 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5754 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5755 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5756 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5757 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5759 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5760 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5761 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5762 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5763 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5764 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5765 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5766 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5767 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5768 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5769 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5770 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5771 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5772 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5773 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5774 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5775 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5777 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5778 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5780 /* Need to create this here or we're really screwed. */
5782 (Qraw_text, Qno_conversion,
5783 build_string ("Raw text, which means it converts only line-break-codes."),
5784 list2 (Qmnemonic, build_string ("Raw")));
5787 (Qbinary, Qno_conversion,
5788 build_string ("Binary, which means it does not convert anything."),
5789 list4 (Qeol_type, Qlf,
5790 Qmnemonic, build_string ("Binary")));
5795 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5796 list2 (Qmnemonic, build_string ("UTF8")));
5799 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5801 Fdefine_coding_system_alias (Qfile_name, Qbinary);
5803 Fdefine_coding_system_alias (Qterminal, Qbinary);
5804 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5806 /* Need this for bootstrapping */
5807 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5808 Fget_coding_system (Qraw_text);
5811 fcd->coding_category_system[CODING_CATEGORY_UTF8]
5812 = Fget_coding_system (Qutf8);
5815 #if defined(MULE) && !defined(UTF2000)
5819 for (i = 0; i < 65536; i++)
5820 fcd->ucs_to_mule_table[i] = Qnil;
5822 staticpro (&mule_to_ucs_table);
5823 mule_to_ucs_table = Fmake_char_table(Qgeneric);
5824 #endif /* defined(MULE) && !defined(UTF2000) */