src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26
  27 #include <config.h>
  28 #include "lisp.h"
  29
  30 #include "buffer.h"
  31 #include "elhash.h"
  32 #include "insdel.h"
  33 #include "lstream.h"
  34 #include "opaque.h"
  35 #ifdef MULE
  36 #include "mule-ccl.h"
  37 #include "chartab.h"
  38 #endif
  39 #include "file-coding.h"
  40
  41 Lisp_Object Qcoding_system_error;
  42
  43 Lisp_Object Vkeyboard_coding_system;
  44 Lisp_Object Vterminal_coding_system;
  45 Lisp_Object Vcoding_system_for_read;
  46 Lisp_Object Vcoding_system_for_write;
  47 Lisp_Object Vfile_name_coding_system;
  48
  49 /* Table of symbols identifying each coding category. */
  50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  51
  52
  53
  54 struct file_coding_dump {
  55   /* Coding system currently associated with each coding category. */
  56   Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  57
  58   /* Table of all coding categories in decreasing order of priority.
  59      This describes a permutation of the possible coding categories. */
  60   int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  61
  62 #if defined(MULE) && !defined(UTF2000)
  63   Lisp_Object ucs_to_mule_table[65536];
  64 #endif
  65 } *fcd;
  66
  67 static const struct lrecord_description fcd_description_1[] = {
  68   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
  69 #if defined(MULE) && !defined(UTF2000)
  70   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  71 #endif
  72   { XD_END }
  73 };
  74
  75 static const struct struct_description fcd_description = {
  76   sizeof (struct file_coding_dump),
  77   fcd_description_1
  78 };
  79
  80 Lisp_Object mule_to_ucs_table;
  81
  82 Lisp_Object Qcoding_systemp;
  83
  84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  85 /* Qinternal in general.c */
  86
  87 Lisp_Object Qmnemonic, Qeol_type;
  88 Lisp_Object Qcr, Qcrlf, Qlf;
  89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  90 Lisp_Object Qpost_read_conversion;
  91 Lisp_Object Qpre_write_conversion;
  92
  93 #ifdef MULE
  94 Lisp_Object Qucs4, Qutf8;
  95 Lisp_Object Qbig5, Qshift_jis;
  96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  99 Lisp_Object Qno_iso6429;
 100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 101 Lisp_Object Qescape_quoted;
 102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 103 #endif
 104 #ifdef UTF2000
 105 Lisp_Object Qdisable_composition;
 106 #endif
 107 Lisp_Object Qencode, Qdecode;
 108
 109 Lisp_Object Vcoding_system_hash_table;
 110
 111 int enable_multibyte_characters;
 112
 113 #ifdef MULE
 114 /* Additional information used by the ISO2022 decoder and detector. */
 115 struct iso2022_decoder
 116 {
 117   /* CHARSET holds the character sets currently assigned to the G0
 118      through G3 variables.  It is initialized from the array
 119      INITIAL_CHARSET in CODESYS. */
 120   Lisp_Object charset[4];
 121
 122   /* Which registers are currently invoked into the left (GL) and
 123      right (GR) halves of the 8-bit encoding space? */
 124   int register_left, register_right;
 125
 126   /* ISO_ESC holds a value indicating part of an escape sequence
 127      that has already been seen. */
 128   enum iso_esc_flag esc;
 129
 130   /* This records the bytes we've seen so far in an escape sequence,
 131      in case the sequence is invalid (we spit out the bytes unchanged). */
 132   unsigned char esc_bytes[8];
 133
 134   /* Index for next byte to store in ISO escape sequence. */
 135   int esc_bytes_index;
 136
 137 #ifdef ENABLE_COMPOSITE_CHARS
 138   /* Stuff seen so far when composing a string. */
 139   unsigned_char_dynarr *composite_chars;
 140 #endif
 141
 142   /* If we saw an invalid designation sequence for a particular
 143      register, we flag it here and switch to ASCII.  The next time we
 144      see a valid designation for this register, we turn off the flag
 145      and do the designation normally, but pretend the sequence was
 146      invalid.  The effect of all this is that (most of the time) the
 147      escape sequences for both the switch to the unknown charset, and
 148      the switch back to the known charset, get inserted literally into
 149      the buffer and saved out as such.  The hope is that we can
 150      preserve the escape sequences so that the resulting written out
 151      file makes sense.  If we don't do any of this, the designation
 152      to the invalid charset will be preserved but that switch back
 153      to the known charset will probably get eaten because it was
 154      the same charset that was already present in the register. */
 155   unsigned char invalid_designated[4];
 156
 157   /* We try to do similar things as above for direction-switching
 158      sequences.  If we encountered a direction switch while an
 159      invalid designation was present, or an invalid designation
 160      just after a direction switch (i.e. no valid designation
 161      encountered yet), we insert the direction-switch escape
 162      sequence literally into the output stream, and later on
 163      insert the corresponding direction-restoring escape sequence
 164      literally also. */
 165   unsigned int switched_dir_and_no_valid_charset_yet :1;
 166   unsigned int invalid_switch_dir :1;
 167
 168   /* Tells the decoder to output the escape sequence literally
 169      even though it was valid.  Used in the games we play to
 170      avoid lossage when we encounter invalid designations. */
 171   unsigned int output_literally :1;
 172   /* We encountered a direction switch followed by an invalid
 173      designation.  We didn't output the direction switch
 174      literally because we didn't know about the invalid designation;
 175      but we have to do so now. */
 176   unsigned int output_direction_sequence :1;
 177 };
 178 #endif /* MULE */
 179 EXFUN (Fcopy_coding_system, 2);
 180 #ifdef MULE
 181 struct detection_state;
 182
 183 static void
 184 text_encode_generic (Lstream *encoding, const Bufbyte *src,
 185                      unsigned_char_dynarr *dst, size_t n);
 186
 187 static int detect_coding_sjis (struct detection_state *st,
 188                                const Extbyte *src, size_t n);
 189 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 190                                 unsigned_char_dynarr *dst, size_t n);
 191 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 192                             unsigned_char_dynarr *dst, unsigned int *flags);
 193 void char_finish_shift_jis (struct encoding_stream *str,
 194                             unsigned_char_dynarr *dst, unsigned int *flags);
 195
 196 static int detect_coding_big5 (struct detection_state *st,
 197                                const Extbyte *src, size_t n);
 198 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 199                                 unsigned_char_dynarr *dst, size_t n);
 200 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
 201                                 unsigned_char_dynarr *dst, size_t n);
 202
 203 static int detect_coding_ucs4 (struct detection_state *st,
 204                                const Extbyte *src, size_t n);
 205 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 206                                 unsigned_char_dynarr *dst, size_t n);
 207 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 208                        unsigned_char_dynarr *dst, unsigned int *flags);
 209 void char_finish_ucs4 (struct encoding_stream *str,
 210                        unsigned_char_dynarr *dst, unsigned int *flags);
 211
 212 static int detect_coding_utf8 (struct detection_state *st,
 213                                const Extbyte *src, size_t n);
 214 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 215                                 unsigned_char_dynarr *dst, size_t n);
 216 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 217                        unsigned_char_dynarr *dst, unsigned int *flags);
 218 void char_finish_utf8 (struct encoding_stream *str,
 219                        unsigned_char_dynarr *dst, unsigned int *flags);
 220
 221 static int postprocess_iso2022_mask (int mask);
 222 static void reset_iso2022 (Lisp_Object coding_system,
 223                            struct iso2022_decoder *iso);
 224 static int detect_coding_iso2022 (struct detection_state *st,
 225                                   const Extbyte *src, size_t n);
 226 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 227                                    unsigned_char_dynarr *dst, size_t n);
 228 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 229                           unsigned_char_dynarr *dst, unsigned int *flags);
 230 void char_finish_iso2022 (struct encoding_stream *str,
 231                           unsigned_char_dynarr *dst, unsigned int *flags);
 232 #endif /* MULE */
 233 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 234                                          unsigned_char_dynarr *dst, size_t n);
 235 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 236                                          unsigned_char_dynarr *dst, size_t n);
 237 static void mule_decode (Lstream *decoding, const Extbyte *src,
 238                          unsigned_char_dynarr *dst, size_t n);
 239 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 240                          unsigned_char_dynarr *dst, size_t n);
 241
 242 typedef struct codesys_prop codesys_prop;
 243 struct codesys_prop
 244 {
 245   Lisp_Object sym;
 246   int prop_type;
 247 };
 248
 249 typedef struct
 250 {
 251   Dynarr_declare (codesys_prop);
 252 } codesys_prop_dynarr;
 253
 254 static const struct lrecord_description codesys_prop_description_1[] = {
 255   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 256   { XD_END }
 257 };
 258
 259 static const struct struct_description codesys_prop_description = {
 260   sizeof (codesys_prop),
 261   codesys_prop_description_1
 262 };
 263
 264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 265   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 266   { XD_END }
 267 };
 268
 269 static const struct struct_description codesys_prop_dynarr_description = {
 270   sizeof (codesys_prop_dynarr),
 271   codesys_prop_dynarr_description_1
 272 };
 273
 274 codesys_prop_dynarr *the_codesys_prop_dynarr;
 275
 276 enum codesys_prop_enum
 277 {
 278   CODESYS_PROP_ALL_OK,
 279   CODESYS_PROP_ISO2022,
 280   CODESYS_PROP_CCL
 281 };
 282
 283 \f
 284 /************************************************************************/
 285 /*                       Coding system functions                        */
 286 /************************************************************************/
 287
 288 static Lisp_Object mark_coding_system (Lisp_Object);
 289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 290 static void finalize_coding_system (void *header, int for_disksave);
 291
 292 #ifdef MULE
 293 static const struct lrecord_description ccs_description_1[] = {
 294   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 295   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 296   { XD_END }
 297 };
 298
 299 static const struct struct_description ccs_description = {
 300   sizeof (charset_conversion_spec),
 301   ccs_description_1
 302 };
 303
 304 static const struct lrecord_description ccsd_description_1[] = {
 305   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 306   { XD_END }
 307 };
 308
 309 static const struct struct_description ccsd_description = {
 310   sizeof (charset_conversion_spec_dynarr),
 311   ccsd_description_1
 312 };
 313 #endif
 314
 315 static const struct lrecord_description coding_system_description[] = {
 316   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 317   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 318   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 319   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 320   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 321   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 322   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 323   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 324 #ifdef MULE
 325   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 326   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 327   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 328   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 329   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 330 #endif
 331   { XD_END }
 332 };
 333
 334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 335                                mark_coding_system, print_coding_system,
 336                                finalize_coding_system,
 337                                0, 0, coding_system_description,
 338                                Lisp_Coding_System);
 339
 340 static Lisp_Object
 341 mark_coding_system (Lisp_Object obj)
 342 {
 343   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 344
 345   mark_object (CODING_SYSTEM_NAME (codesys));
 346   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 347   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 348   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 349   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 350   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 351
 352   switch (CODING_SYSTEM_TYPE (codesys))
 353     {
 354 #ifdef MULE
 355       int i;
 356     case CODESYS_ISO2022:
 357       for (i = 0; i < 4; i++)
 358         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 359       if (codesys->iso2022.input_conv)
 360         {
 361           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 362             {
 363               struct charset_conversion_spec *ccs =
 364                 Dynarr_atp (codesys->iso2022.input_conv, i);
 365               mark_object (ccs->from_charset);
 366               mark_object (ccs->to_charset);
 367             }
 368         }
 369       if (codesys->iso2022.output_conv)
 370         {
 371           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 372             {
 373               struct charset_conversion_spec *ccs =
 374                 Dynarr_atp (codesys->iso2022.output_conv, i);
 375               mark_object (ccs->from_charset);
 376               mark_object (ccs->to_charset);
 377             }
 378         }
 379       break;
 380
 381     case CODESYS_CCL:
 382       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 383       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 384       break;
 385 #endif /* MULE */
 386     default:
 387       break;
 388     }
 389
 390   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 391   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 392 }
 393
 394 static void
 395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 396                      int escapeflag)
 397 {
 398   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 399   if (print_readably)
 400     error ("printing unreadable object #<coding_system 0x%x>",
 401            c->header.uid);
 402
 403   write_c_string ("#<coding_system ", printcharfun);
 404   print_internal (c->name, printcharfun, 1);
 405   write_c_string (">", printcharfun);
 406 }
 407
 408 static void
 409 finalize_coding_system (void *header, int for_disksave)
 410 {
 411   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 412   /* Since coding systems never go away, this function is not
 413      necessary.  But it would be necessary if we changed things
 414      so that coding systems could go away. */
 415   if (!for_disksave) /* see comment in lstream.c */
 416     {
 417       switch (CODING_SYSTEM_TYPE (c))
 418         {
 419 #ifdef MULE
 420         case CODESYS_ISO2022:
 421           if (c->iso2022.input_conv)
 422             {
 423               Dynarr_free (c->iso2022.input_conv);
 424               c->iso2022.input_conv = 0;
 425             }
 426           if (c->iso2022.output_conv)
 427             {
 428               Dynarr_free (c->iso2022.output_conv);
 429               c->iso2022.output_conv = 0;
 430             }
 431           break;
 432 #endif /* MULE */
 433         default:
 434           break;
 435         }
 436     }
 437 }
 438
 439 static eol_type_t
 440 symbol_to_eol_type (Lisp_Object symbol)
 441 {
 442   CHECK_SYMBOL (symbol);
 443   if (NILP (symbol))      return EOL_AUTODETECT;
 444   if (EQ (symbol, Qlf))   return EOL_LF;
 445   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 446   if (EQ (symbol, Qcr))   return EOL_CR;
 447
 448   signal_simple_error ("Unrecognized eol type", symbol);
 449   return EOL_AUTODETECT; /* not reached */
 450 }
 451
 452 static Lisp_Object
 453 eol_type_to_symbol (eol_type_t type)
 454 {
 455   switch (type)
 456     {
 457     default: abort ();
 458     case EOL_LF:         return Qlf;
 459     case EOL_CRLF:       return Qcrlf;
 460     case EOL_CR:         return Qcr;
 461     case EOL_AUTODETECT: return Qnil;
 462     }
 463 }
 464
 465 static void
 466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 467 {
 468   Lisp_Object codesys_obj;
 469   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 470   char *codesys_name = (char *) alloca (len + 7);
 471   int mlen = -1;
 472   char *codesys_mnemonic=0;
 473
 474   Lisp_Object codesys_name_sym, sub_codesys_obj;
 475
 476   /* kludge */
 477
 478   XSETCODING_SYSTEM (codesys_obj, codesys);
 479
 480   memcpy (codesys_name,
 481           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 482
 483   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 484     {
 485       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 486       codesys_mnemonic = (char *) alloca (mlen + 7);
 487       memcpy (codesys_mnemonic,
 488               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 489     }
 490
 491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 492   strcpy (codesys_name + len, "-" op_sys);                                      \
 493   if (mlen != -1)                                                               \
 494     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 495   codesys_name_sym = intern (codesys_name);                                     \
 496   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 497   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 498   if (mlen != -1)                                                               \
 499     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 500       build_string (codesys_mnemonic);                                          \
 501   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 502 } while (0)
 503
 504   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 505   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 506   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 507 }
 508
 509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 510 Return t if OBJECT is a coding system.
 511 A coding system is an object that defines how text containing multiple
 512 character sets is encoded into a stream of (typically 8-bit) bytes.
 513 The coding system is used to decode the stream into a series of
 514 characters (which may be from multiple charsets) when the text is read
 515 from a file or process, and is used to encode the text back into the
 516 same format when it is written out to a file or process.
 517
 518 For example, many ISO2022-compliant coding systems (such as Compound
 519 Text, which is used for inter-client data under the X Window System)
 520 use escape sequences to switch between different charsets -- Japanese
 521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 523 `make-coding-system' for more information.
 524
 525 Coding systems are normally identified using a symbol, and the
 526 symbol is accepted in place of the actual coding system object whenever
 527 a coding system is called for. (This is similar to how faces work.)
 528 */
 529        (object))
 530 {
 531   return CODING_SYSTEMP (object) ? Qt : Qnil;
 532 }
 533
 534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 535 Retrieve the coding system of the given name.
 536
 537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 538 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 539 If there is no such coding system, nil is returned.  Otherwise the
 540 associated coding system object is returned.
 541 */
 542        (coding_system_or_name))
 543 {
 544   if (NILP (coding_system_or_name))
 545     coding_system_or_name = Qbinary;
 546   else if (CODING_SYSTEMP (coding_system_or_name))
 547     return coding_system_or_name;
 548   else
 549     CHECK_SYMBOL (coding_system_or_name);
 550
 551   while (1)
 552     {
 553       coding_system_or_name =
 554         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 555
 556       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 557         return coding_system_or_name;
 558     }
 559 }
 560
 561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 562 Retrieve the coding system of the given name.
 563 Same as `find-coding-system' except that if there is no such
 564 coding system, an error is signaled instead of returning nil.
 565 */
 566        (name))
 567 {
 568   Lisp_Object coding_system = Ffind_coding_system (name);
 569
 570   if (NILP (coding_system))
 571     signal_simple_error ("No such coding system", name);
 572   return coding_system;
 573 }
 574
 575 /* We store the coding systems in hash tables with the names as the key and the
 576    actual coding system object as the value.  Occasionally we need to use them
 577    in a list format.  These routines provide us with that. */
 578 struct coding_system_list_closure
 579 {
 580   Lisp_Object *coding_system_list;
 581 };
 582
 583 static int
 584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 585                                   void *coding_system_list_closure)
 586 {
 587   /* This function can GC */
 588   struct coding_system_list_closure *cscl =
 589     (struct coding_system_list_closure *) coding_system_list_closure;
 590   Lisp_Object *coding_system_list = cscl->coding_system_list;
 591
 592   *coding_system_list = Fcons (key, *coding_system_list);
 593   return 0;
 594 }
 595
 596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 597 Return a list of the names of all defined coding systems.
 598 */
 599        ())
 600 {
 601   Lisp_Object coding_system_list = Qnil;
 602   struct gcpro gcpro1;
 603   struct coding_system_list_closure coding_system_list_closure;
 604
 605   GCPRO1 (coding_system_list);
 606   coding_system_list_closure.coding_system_list = &coding_system_list;
 607   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 608                  &coding_system_list_closure);
 609   UNGCPRO;
 610
 611   return coding_system_list;
 612 }
 613
 614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 615 Return the name of the given coding system.
 616 */
 617        (coding_system))
 618 {
 619   coding_system = Fget_coding_system (coding_system);
 620   return XCODING_SYSTEM_NAME (coding_system);
 621 }
 622
 623 static Lisp_Coding_System *
 624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 625 {
 626   Lisp_Coding_System *codesys =
 627     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 628
 629   zero_lcrecord (codesys);
 630   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 631   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 632   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 633   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 634   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 635   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 636   CODING_SYSTEM_TYPE     (codesys) = type;
 637   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 638 #ifdef MULE
 639   if (type == CODESYS_ISO2022)
 640     {
 641       int i;
 642       for (i = 0; i < 4; i++)
 643         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 644     }
 645   else if (type == CODESYS_CCL)
 646     {
 647       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 648       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 649     }
 650 #endif /* MULE */
 651   CODING_SYSTEM_NAME (codesys) = name;
 652
 653   return codesys;
 654 }
 655
 656 #ifdef MULE
 657 /* Given a list of charset conversion specs as specified in a Lisp
 658    program, parse it into STORE_HERE. */
 659
 660 static void
 661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 662                                 Lisp_Object spec_list)
 663 {
 664   Lisp_Object rest;
 665
 666   EXTERNAL_LIST_LOOP (rest, spec_list)
 667     {
 668       Lisp_Object car = XCAR (rest);
 669       Lisp_Object from, to;
 670       struct charset_conversion_spec spec;
 671
 672       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 673         signal_simple_error ("Invalid charset conversion spec", car);
 674       from = Fget_charset (XCAR (car));
 675       to = Fget_charset (XCAR (XCDR (car)));
 676       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 677            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 678         signal_simple_error_2
 679           ("Attempted conversion between different charset types",
 680            from, to);
 681       spec.from_charset = from;
 682       spec.to_charset = to;
 683
 684       Dynarr_add (store_here, spec);
 685     }
 686 }
 687
 688 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 689    specs, return the equivalent as the Lisp programmer would see it.
 690
 691    If LOAD_HERE is 0, return Qnil. */
 692
 693 static Lisp_Object
 694 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 695 {
 696   int i;
 697   Lisp_Object result;
 698
 699   if (!load_here)
 700     return Qnil;
 701   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 702     {
 703       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 704       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 705     }
 706
 707   return Fnreverse (result);
 708 }
 709
 710 #endif /* MULE */
 711
 712 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 713 Register symbol NAME as a coding system.
 714
 715 TYPE describes the conversion method used and should be one of
 716
 717 nil or 'undecided
 718      Automatic conversion.  XEmacs attempts to detect the coding system
 719      used in the file.
 720 'no-conversion
 721      No conversion.  Use this for binary files and such.  On output,
 722      graphic characters that are not in ASCII or Latin-1 will be
 723      replaced by a ?. (For a no-conversion-encoded buffer, these
 724      characters will only be present if you explicitly insert them.)
 725 'shift-jis
 726      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 727 'ucs-4
 728      ISO 10646 UCS-4 encoding.
 729 'utf-8
 730      ISO 10646 UTF-8 encoding.
 731 'iso2022
 732      Any ISO2022-compliant encoding.  Among other things, this includes
 733      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 734      standard Unix encoding for Japanese and other languages), and
 735      Compound Text (the encoding used in X11).  You can specify more
 736      specific information about the conversion with the PROPS argument.
 737 'big5
 738      Big5 (the encoding commonly used for Taiwanese).
 739 'ccl
 740      The conversion is performed using a user-written pseudo-code
 741      program.  CCL (Code Conversion Language) is the name of this
 742      pseudo-code.
 743 'internal
 744      Write out or read in the raw contents of the memory representing
 745      the buffer's text.  This is primarily useful for debugging
 746      purposes, and is only enabled when XEmacs has been compiled with
 747      DEBUG_XEMACS defined (via the --debug configure option).
 748      WARNING: Reading in a file using 'internal conversion can result
 749      in an internal inconsistency in the memory representing a
 750      buffer's text, which will produce unpredictable results and may
 751      cause XEmacs to crash.  Under normal circumstances you should
 752      never use 'internal conversion.
 753
 754 DOC-STRING is a string describing the coding system.
 755
 756 PROPS is a property list, describing the specific nature of the
 757 character set.  Recognized properties are:
 758
 759 'mnemonic
 760      String to be displayed in the modeline when this coding system is
 761      active.
 762
 763 'eol-type
 764      End-of-line conversion to be used.  It should be one of
 765
 766         nil
 767                 Automatically detect the end-of-line type (LF, CRLF,
 768                 or CR).  Also generate subsidiary coding systems named
 769                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 770                 identical to this coding system but have an EOL-TYPE
 771                 value of 'lf, 'crlf, and 'cr, respectively.
 772         'lf
 773                 The end of a line is marked externally using ASCII LF.
 774                 Since this is also the way that XEmacs represents an
 775                 end-of-line internally, specifying this option results
 776                 in no end-of-line conversion.  This is the standard
 777                 format for Unix text files.
 778         'crlf
 779                 The end of a line is marked externally using ASCII
 780                 CRLF.  This is the standard format for MS-DOS text
 781                 files.
 782         'cr
 783                 The end of a line is marked externally using ASCII CR.
 784                 This is the standard format for Macintosh text files.
 785         t
 786                 Automatically detect the end-of-line type but do not
 787                 generate subsidiary coding systems.  (This value is
 788                 converted to nil when stored internally, and
 789                 `coding-system-property' will return nil.)
 790
 791 'disable-composition
 792      If non-nil, composition/decomposition for combining characters
 793      are disabled.
 794
 795 'post-read-conversion
 796      Function called after a file has been read in, to perform the
 797      decoding.  Called with two arguments, START and END, denoting
 798      a region of the current buffer to be decoded.
 799
 800 'pre-write-conversion
 801      Function called before a file is written out, to perform the
 802      encoding.  Called with two arguments, START and END, denoting
 803      a region of the current buffer to be encoded.
 804
 805
 806 The following additional properties are recognized if TYPE is 'iso2022:
 807
 808 'charset-g0
 809 'charset-g1
 810 'charset-g2
 811 'charset-g3
 812      The character set initially designated to the G0 - G3 registers.
 813      The value should be one of
 814
 815           -- A charset object (designate that character set)
 816           -- nil (do not ever use this register)
 817           -- t (no character set is initially designated to
 818                 the register, but may be later on; this automatically
 819                 sets the corresponding `force-g*-on-output' property)
 820
 821 'force-g0-on-output
 822 'force-g1-on-output
 823 'force-g2-on-output
 824 'force-g2-on-output
 825      If non-nil, send an explicit designation sequence on output before
 826      using the specified register.
 827
 828 'short
 829      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 830      "ESC $ B" on output in place of the full designation sequences
 831      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 832
 833 'no-ascii-eol
 834      If non-nil, don't designate ASCII to G0 at each end of line on output.
 835      Setting this to non-nil also suppresses other state-resetting that
 836      normally happens at the end of a line.
 837
 838 'no-ascii-cntl
 839      If non-nil, don't designate ASCII to G0 before control chars on output.
 840
 841 'seven
 842      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 843      environment.
 844
 845 'lock-shift
 846      If non-nil, use locking-shift (SO/SI) instead of single-shift
 847      or designation by escape sequence.
 848
 849 'no-iso6429
 850      If non-nil, don't use ISO6429's direction specification.
 851
 852 'escape-quoted
 853      If non-nil, literal control characters that are the same as
 854      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 855      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 856      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 857      so that they can be properly distinguished from an escape sequence.
 858      (Note that doing this results in a non-portable encoding.) This
 859      encoding flag is used for byte-compiled files.  Note that ESC
 860      is a good choice for a quoting character because there are no
 861      escape sequences whose second byte is a character from the Control-0
 862      or Control-1 character sets; this is explicitly disallowed by the
 863      ISO2022 standard.
 864
 865 'input-charset-conversion
 866      A list of conversion specifications, specifying conversion of
 867      characters in one charset to another when decoding is performed.
 868      Each specification is a list of two elements: the source charset,
 869      and the destination charset.
 870
 871 'output-charset-conversion
 872      A list of conversion specifications, specifying conversion of
 873      characters in one charset to another when encoding is performed.
 874      The form of each specification is the same as for
 875      'input-charset-conversion.
 876
 877
 878 The following additional properties are recognized (and required)
 879 if TYPE is 'ccl:
 880
 881 'decode
 882      CCL program used for decoding (converting to internal format).
 883
 884 'encode
 885      CCL program used for encoding (converting to external format).
 886 */
 887        (name, type, doc_string, props))
 888 {
 889   Lisp_Coding_System *codesys;
 890   Lisp_Object rest, key, value;
 891   enum coding_system_type ty;
 892   int need_to_setup_eol_systems = 1;
 893
 894   /* Convert type to constant */
 895   if (NILP (type) || EQ (type, Qundecided))
 896                                       { ty = CODESYS_AUTODETECT; }
 897 #ifdef MULE
 898   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 899   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 900   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 901   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 902   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 903   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 904 #endif
 905   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 906 #ifdef DEBUG_XEMACS
 907   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 908 #endif
 909   else
 910     signal_simple_error ("Invalid coding system type", type);
 911
 912   CHECK_SYMBOL (name);
 913
 914   codesys = allocate_coding_system (ty, name);
 915
 916   if (NILP (doc_string))
 917     doc_string = build_string ("");
 918   else
 919     CHECK_STRING (doc_string);
 920   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 921
 922   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 923     {
 924       if (EQ (key, Qmnemonic))
 925         {
 926           if (!NILP (value))
 927             CHECK_STRING (value);
 928           CODING_SYSTEM_MNEMONIC (codesys) = value;
 929         }
 930
 931       else if (EQ (key, Qeol_type))
 932         {
 933           need_to_setup_eol_systems = NILP (value);
 934           if (EQ (value, Qt))
 935             value = Qnil;
 936           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 937         }
 938
 939       else if (EQ (key, Qpost_read_conversion))
 940         CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 941       else if (EQ (key, Qpre_write_conversion))
 942         CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 943 #ifdef UTF2000
 944       else if (EQ (key, Qdisable_composition))
 945         CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
 946 #endif
 947 #ifdef MULE
 948       else if (ty == CODESYS_ISO2022)
 949         {
 950 #define FROB_INITIAL_CHARSET(charset_num) \
 951   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 952     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 953
 954           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 955           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 956           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 957           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 958
 959 #define FROB_FORCE_CHARSET(charset_num) \
 960   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 961
 962           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 963           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 964           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 965           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 966
 967 #define FROB_BOOLEAN_PROPERTY(prop) \
 968   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 969
 970           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 971           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 972           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 973           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 974           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 975           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 976           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 977
 978           else if (EQ (key, Qinput_charset_conversion))
 979             {
 980               codesys->iso2022.input_conv =
 981                 Dynarr_new (charset_conversion_spec);
 982               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 983                                               value);
 984             }
 985           else if (EQ (key, Qoutput_charset_conversion))
 986             {
 987               codesys->iso2022.output_conv =
 988                 Dynarr_new (charset_conversion_spec);
 989               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 990                                               value);
 991             }
 992           else
 993             signal_simple_error ("Unrecognized property", key);
 994         }
 995       else if (EQ (type, Qccl))
 996         {
 997           Lisp_Object sym;
 998           struct ccl_program test_ccl;
 999           Extbyte *suffix;
1000
1001           /* Check key first.  */
1002           if (EQ (key, Qdecode))
1003             suffix = "-ccl-decode";
1004           else if (EQ (key, Qencode))
1005             suffix = "-ccl-encode";
1006           else
1007             signal_simple_error ("Unrecognized property", key);
1008
1009           /* If value is vector, register it as a ccl program
1010              associated with an newly created symbol for
1011              backward compatibility.  */
1012           if (VECTORP (value))
1013             {
1014               sym = Fintern (concat2 (Fsymbol_name (name),
1015                                       build_string (suffix)),
1016                              Qnil);
1017               Fregister_ccl_program (sym, value);
1018             }
1019           else
1020             {
1021               CHECK_SYMBOL (value);
1022               sym = value;
1023             }
1024           /* check if the given ccl programs are valid.  */
1025           if (setup_ccl_program (&test_ccl, sym) < 0)
1026             signal_simple_error ("Invalid CCL program", value);
1027
1028           if (EQ (key, Qdecode))
1029             CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1030           else if (EQ (key, Qencode))
1031             CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1032
1033         }
1034 #endif /* MULE */
1035       else
1036         signal_simple_error ("Unrecognized property", key);
1037     }
1038
1039   if (need_to_setup_eol_systems)
1040     setup_eol_coding_systems (codesys);
1041
1042   {
1043     Lisp_Object codesys_obj;
1044     XSETCODING_SYSTEM (codesys_obj, codesys);
1045     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1046     return codesys_obj;
1047   }
1048 }
1049
1050 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1051 Copy OLD-CODING-SYSTEM to NEW-NAME.
1052 If NEW-NAME does not name an existing coding system, a new one will
1053 be created.
1054 */
1055        (old_coding_system, new_name))
1056 {
1057   Lisp_Object new_coding_system;
1058   old_coding_system = Fget_coding_system (old_coding_system);
1059   new_coding_system = Ffind_coding_system (new_name);
1060   if (NILP (new_coding_system))
1061     {
1062       XSETCODING_SYSTEM (new_coding_system,
1063                          allocate_coding_system
1064                          (XCODING_SYSTEM_TYPE (old_coding_system),
1065                           new_name));
1066       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1067     }
1068
1069   {
1070     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1071     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1072     memcpy (((char *) to  ) + sizeof (to->header),
1073             ((char *) from) + sizeof (from->header),
1074             sizeof (*from) - sizeof (from->header));
1075     to->name = new_name;
1076   }
1077   return new_coding_system;
1078 }
1079
1080 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1081 Return t if OBJECT names a coding system, and is not a coding system alias.
1082 */
1083        (object))
1084 {
1085   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1086     ? Qt : Qnil;
1087 }
1088
1089 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1090 Return t if OBJECT is a coding system alias.
1091 All coding system aliases are created by `define-coding-system-alias'.
1092 */
1093        (object))
1094 {
1095   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1096     ? Qt : Qnil;
1097 }
1098
1099 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1100 Return the coding-system symbol for which symbol ALIAS is an alias.
1101 */
1102        (alias))
1103 {
1104   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1105   if (SYMBOLP (aliasee))
1106     return aliasee;
1107   else
1108     signal_simple_error ("Symbol is not a coding system alias", alias);
1109   return Qnil;          /* To keep the compiler happy */
1110 }
1111
1112 static Lisp_Object
1113 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1114 {
1115   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1116                   Qnil);
1117 }
1118
1119 /* A maphash function, for removing dangling coding system aliases. */
1120 static int
1121 dangling_coding_system_alias_p (Lisp_Object alias,
1122                                 Lisp_Object aliasee,
1123                                 void *dangling_aliases)
1124 {
1125   if (SYMBOLP (aliasee)
1126       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1127     {
1128       (*(int *) dangling_aliases)++;
1129       return 1;
1130     }
1131   else
1132     return 0;
1133 }
1134
1135 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1136 Define symbol ALIAS as an alias for coding system ALIASEE.
1137
1138 You can use this function to redefine an alias that has already been defined,
1139 but you cannot redefine a name which is the canonical name for a coding system.
1140 \(a canonical name of a coding system is what is returned when you call
1141 `coding-system-name' on a coding system).
1142
1143 ALIASEE itself can be an alias, which allows you to define nested aliases.
1144
1145 You are forbidden, however, from creating alias loops or `dangling' aliases.
1146 These will be detected, and an error will be signaled if you attempt to do so.
1147
1148 If ALIASEE is nil, then ALIAS will simply be undefined.
1149
1150 See also `coding-system-alias-p', `coding-system-aliasee',
1151 and `coding-system-canonical-name-p'.
1152 */
1153        (alias, aliasee))
1154 {
1155   Lisp_Object real_coding_system, probe;
1156
1157   CHECK_SYMBOL (alias);
1158
1159   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1160     signal_simple_error
1161       ("Symbol is the canonical name of a coding system and cannot be redefined",
1162        alias);
1163
1164   if (NILP (aliasee))
1165     {
1166       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1167       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1168       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1169
1170       Fremhash (alias, Vcoding_system_hash_table);
1171
1172       /* Undefine subsidiary aliases,
1173          presumably created by a previous call to this function */
1174       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1175           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1176           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1177         {
1178           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1179           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1180           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1181         }
1182
1183       /* Undefine dangling coding system aliases. */
1184       {
1185         int dangling_aliases;
1186
1187         do {
1188           dangling_aliases = 0;
1189           elisp_map_remhash (dangling_coding_system_alias_p,
1190                              Vcoding_system_hash_table,
1191                              &dangling_aliases);
1192         } while (dangling_aliases > 0);
1193       }
1194
1195       return Qnil;
1196     }
1197
1198   if (CODING_SYSTEMP (aliasee))
1199     aliasee = XCODING_SYSTEM_NAME (aliasee);
1200
1201   /* Checks that aliasee names a coding-system */
1202   real_coding_system = Fget_coding_system (aliasee);
1203
1204   /* Check for coding system alias loops */
1205   if (EQ (alias, aliasee))
1206     alias_loop: signal_simple_error_2
1207       ("Attempt to create a coding system alias loop", alias, aliasee);
1208
1209   for (probe = aliasee;
1210        SYMBOLP (probe);
1211        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1212     {
1213       if (EQ (probe, alias))
1214         goto alias_loop;
1215     }
1216
1217   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1218
1219   /* Set up aliases for subsidiaries.
1220      #### There must be a better way to handle subsidiary coding systems. */
1221   {
1222     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1223     int i;
1224     for (i = 0; i < countof (suffixes); i++)
1225       {
1226         Lisp_Object alias_subsidiary =
1227           append_suffix_to_symbol (alias, suffixes[i]);
1228         Lisp_Object aliasee_subsidiary =
1229           append_suffix_to_symbol (aliasee, suffixes[i]);
1230
1231         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1232           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1233       }
1234   }
1235   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1236      but it doesn't look intentional, so I'd rather return something
1237      meaningful or nothing at all. */
1238   return Qnil;
1239 }
1240
1241 static Lisp_Object
1242 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1243 {
1244   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1245   Lisp_Object new_coding_system;
1246
1247   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1248     return coding_system;
1249
1250   switch (type)
1251     {
1252     case EOL_AUTODETECT: return coding_system;
1253     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1254     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1255     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1256     default:       abort (); return Qnil;
1257     }
1258
1259   return NILP (new_coding_system) ? coding_system : new_coding_system;
1260 }
1261
1262 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1263 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1264 */
1265        (coding_system, eol_type))
1266 {
1267   coding_system = Fget_coding_system (coding_system);
1268
1269   return subsidiary_coding_system (coding_system,
1270                                    symbol_to_eol_type (eol_type));
1271 }
1272
1273 \f
1274 /************************************************************************/
1275 /*                         Coding system accessors                      */
1276 /************************************************************************/
1277
1278 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1279 Return the doc string for CODING-SYSTEM.
1280 */
1281        (coding_system))
1282 {
1283   coding_system = Fget_coding_system (coding_system);
1284   return XCODING_SYSTEM_DOC_STRING (coding_system);
1285 }
1286
1287 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1288 Return the type of CODING-SYSTEM.
1289 */
1290        (coding_system))
1291 {
1292   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1293     {
1294     default: abort ();
1295     case CODESYS_AUTODETECT:    return Qundecided;
1296 #ifdef MULE
1297     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1298     case CODESYS_ISO2022:       return Qiso2022;
1299     case CODESYS_BIG5:          return Qbig5;
1300     case CODESYS_UCS4:          return Qucs4;
1301     case CODESYS_UTF8:          return Qutf8;
1302     case CODESYS_CCL:           return Qccl;
1303 #endif
1304     case CODESYS_NO_CONVERSION: return Qno_conversion;
1305 #ifdef DEBUG_XEMACS
1306     case CODESYS_INTERNAL:      return Qinternal;
1307 #endif
1308     }
1309 }
1310
1311 #ifdef MULE
1312 static
1313 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1314 {
1315   Lisp_Object cs
1316     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1317
1318   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1319 }
1320
1321 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1322 Return initial charset of CODING-SYSTEM designated to GNUM.
1323 GNUM allows 0 .. 3.
1324 */
1325        (coding_system, gnum))
1326 {
1327   coding_system = Fget_coding_system (coding_system);
1328   CHECK_INT (gnum);
1329
1330   return coding_system_charset (coding_system, XINT (gnum));
1331 }
1332 #endif /* MULE */
1333
1334 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1335 Return the PROP property of CODING-SYSTEM.
1336 */
1337        (coding_system, prop))
1338 {
1339   int i, ok = 0;
1340   enum coding_system_type type;
1341
1342   coding_system = Fget_coding_system (coding_system);
1343   CHECK_SYMBOL (prop);
1344   type = XCODING_SYSTEM_TYPE (coding_system);
1345
1346   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1347     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1348       {
1349         ok = 1;
1350         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1351           {
1352           case CODESYS_PROP_ALL_OK:
1353             break;
1354 #ifdef MULE
1355           case CODESYS_PROP_ISO2022:
1356             if (type != CODESYS_ISO2022)
1357               signal_simple_error
1358                 ("Property only valid in ISO2022 coding systems",
1359                  prop);
1360             break;
1361
1362           case CODESYS_PROP_CCL:
1363             if (type != CODESYS_CCL)
1364               signal_simple_error
1365                 ("Property only valid in CCL coding systems",
1366                  prop);
1367             break;
1368 #endif /* MULE */
1369           default:
1370             abort ();
1371           }
1372       }
1373
1374   if (!ok)
1375     signal_simple_error ("Unrecognized property", prop);
1376
1377   if (EQ (prop, Qname))
1378     return XCODING_SYSTEM_NAME (coding_system);
1379   else if (EQ (prop, Qtype))
1380     return Fcoding_system_type (coding_system);
1381   else if (EQ (prop, Qdoc_string))
1382     return XCODING_SYSTEM_DOC_STRING (coding_system);
1383   else if (EQ (prop, Qmnemonic))
1384     return XCODING_SYSTEM_MNEMONIC (coding_system);
1385   else if (EQ (prop, Qeol_type))
1386     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1387   else if (EQ (prop, Qeol_lf))
1388     return XCODING_SYSTEM_EOL_LF (coding_system);
1389   else if (EQ (prop, Qeol_crlf))
1390     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1391   else if (EQ (prop, Qeol_cr))
1392     return XCODING_SYSTEM_EOL_CR (coding_system);
1393   else if (EQ (prop, Qpost_read_conversion))
1394     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1395   else if (EQ (prop, Qpre_write_conversion))
1396     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1397 #ifdef MULE
1398   else if (type == CODESYS_ISO2022)
1399     {
1400       if (EQ (prop, Qcharset_g0))
1401         return coding_system_charset (coding_system, 0);
1402       else if (EQ (prop, Qcharset_g1))
1403         return coding_system_charset (coding_system, 1);
1404       else if (EQ (prop, Qcharset_g2))
1405         return coding_system_charset (coding_system, 2);
1406       else if (EQ (prop, Qcharset_g3))
1407         return coding_system_charset (coding_system, 3);
1408
1409 #define FORCE_CHARSET(charset_num) \
1410   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1411    (coding_system, charset_num) ? Qt : Qnil)
1412
1413       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1414       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1415       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1416       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1417
1418 #define LISP_BOOLEAN(prop) \
1419   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1420
1421       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1422       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1423       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1424       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1425       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1426       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1427       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1428
1429       else if (EQ (prop, Qinput_charset_conversion))
1430         return
1431           unparse_charset_conversion_specs
1432             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1433       else if (EQ (prop, Qoutput_charset_conversion))
1434         return
1435           unparse_charset_conversion_specs
1436             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1437       else
1438         abort ();
1439     }
1440   else if (type == CODESYS_CCL)
1441     {
1442       if (EQ (prop, Qdecode))
1443         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1444       else if (EQ (prop, Qencode))
1445         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1446       else
1447         abort ();
1448     }
1449 #endif /* MULE */
1450   else
1451     abort ();
1452
1453   return Qnil; /* not reached */
1454 }
1455
1456 \f
1457 /************************************************************************/
1458 /*                       Coding category functions                      */
1459 /************************************************************************/
1460
1461 static int
1462 decode_coding_category (Lisp_Object symbol)
1463 {
1464   int i;
1465
1466   CHECK_SYMBOL (symbol);
1467   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1468     if (EQ (coding_category_symbol[i], symbol))
1469       return i;
1470
1471   signal_simple_error ("Unrecognized coding category", symbol);
1472   return 0; /* not reached */
1473 }
1474
1475 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1476 Return a list of all recognized coding categories.
1477 */
1478        ())
1479 {
1480   int i;
1481   Lisp_Object list = Qnil;
1482
1483   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1484     list = Fcons (coding_category_symbol[i], list);
1485   return list;
1486 }
1487
1488 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1489 Change the priority order of the coding categories.
1490 LIST should be list of coding categories, in descending order of
1491 priority.  Unspecified coding categories will be lower in priority
1492 than all specified ones, in the same relative order they were in
1493 previously.
1494 */
1495        (list))
1496 {
1497   int category_to_priority[CODING_CATEGORY_LAST + 1];
1498   int i, j;
1499   Lisp_Object rest;
1500
1501   /* First generate a list that maps coding categories to priorities. */
1502
1503   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1504     category_to_priority[i] = -1;
1505
1506   /* Highest priority comes from the specified list. */
1507   i = 0;
1508   EXTERNAL_LIST_LOOP (rest, list)
1509     {
1510       int cat = decode_coding_category (XCAR (rest));
1511
1512       if (category_to_priority[cat] >= 0)
1513         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1514       category_to_priority[cat] = i++;
1515     }
1516
1517   /* Now go through the existing categories by priority to retrieve
1518      the categories not yet specified and preserve their priority
1519      order. */
1520   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1521     {
1522       int cat = fcd->coding_category_by_priority[j];
1523       if (category_to_priority[cat] < 0)
1524         category_to_priority[cat] = i++;
1525     }
1526
1527   /* Now we need to construct the inverse of the mapping we just
1528      constructed. */
1529
1530   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1531     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1532
1533   /* Phew!  That was confusing. */
1534   return Qnil;
1535 }
1536
1537 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1538 Return a list of coding categories in descending order of priority.
1539 */
1540        ())
1541 {
1542   int i;
1543   Lisp_Object list = Qnil;
1544
1545   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1546     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1547                   list);
1548   return list;
1549 }
1550
1551 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1552 Change the coding system associated with a coding category.
1553 */
1554        (coding_category, coding_system))
1555 {
1556   int cat = decode_coding_category (coding_category);
1557
1558   coding_system = Fget_coding_system (coding_system);
1559   fcd->coding_category_system[cat] = coding_system;
1560   return Qnil;
1561 }
1562
1563 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1564 Return the coding system associated with a coding category.
1565 */
1566        (coding_category))
1567 {
1568   int cat = decode_coding_category (coding_category);
1569   Lisp_Object sys = fcd->coding_category_system[cat];
1570
1571   if (!NILP (sys))
1572     return XCODING_SYSTEM_NAME (sys);
1573   return Qnil;
1574 }
1575
1576 \f
1577 /************************************************************************/
1578 /*                     Detecting the encoding of data                   */
1579 /************************************************************************/
1580
1581 struct detection_state
1582 {
1583   eol_type_t eol_type;
1584   int seen_non_ascii;
1585   int mask;
1586 #ifdef MULE
1587   struct
1588     {
1589       int mask;
1590       int in_second_byte;
1591     }
1592   big5;
1593
1594   struct
1595     {
1596       int mask;
1597       int in_second_byte;
1598     }
1599   shift_jis;
1600
1601   struct
1602     {
1603       int mask;
1604       int in_byte;
1605   }
1606   ucs4;
1607
1608   struct
1609     {
1610       int mask;
1611       int in_byte;
1612     }
1613   utf8;
1614
1615   struct
1616     {
1617       int mask;
1618       int initted;
1619       struct iso2022_decoder iso;
1620       unsigned int flags;
1621       int high_byte_count;
1622       unsigned int saw_single_shift:1;
1623     }
1624   iso2022;
1625 #endif
1626   struct
1627     {
1628       int seen_anything;
1629       int just_saw_cr;
1630     }
1631   eol;
1632 };
1633
1634 static int
1635 acceptable_control_char_p (int c)
1636 {
1637   switch (c)
1638     {
1639       /* Allow and ignore control characters that you might
1640          reasonably see in a text file */
1641     case '\r':
1642     case '\n':
1643     case '\t':
1644     case  7: /* bell */
1645     case  8: /* backspace */
1646     case 11: /* vertical tab */
1647     case 12: /* form feed */
1648     case 26: /* MS-DOS C-z junk */
1649     case 31: /* '^_' -- for info */
1650       return 1;
1651     default:
1652       return 0;
1653     }
1654 }
1655
1656 static int
1657 mask_has_at_most_one_bit_p (int mask)
1658 {
1659   /* Perhaps the only thing useful you learn from intensive Microsoft
1660      technical interviews */
1661   return (mask & (mask - 1)) == 0;
1662 }
1663
1664 static eol_type_t
1665 detect_eol_type (struct detection_state *st, const Extbyte *src,
1666                  size_t n)
1667 {
1668   while (n--)
1669     {
1670       unsigned char c = *(unsigned char *)src++;
1671       if (c == '\n')
1672         {
1673           if (st->eol.just_saw_cr)
1674             return EOL_CRLF;
1675           else if (st->eol.seen_anything)
1676             return EOL_LF;
1677         }
1678       else if (st->eol.just_saw_cr)
1679         return EOL_CR;
1680       else if (c == '\r')
1681         st->eol.just_saw_cr = 1;
1682       else
1683         st->eol.just_saw_cr = 0;
1684       st->eol.seen_anything = 1;
1685     }
1686
1687   return EOL_AUTODETECT;
1688 }
1689
1690 /* Attempt to determine the encoding and EOL type of the given text.
1691    Before calling this function for the first type, you must initialize
1692    st->eol_type as appropriate and initialize st->mask to ~0.
1693
1694    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1695    not yet known.
1696
1697    st->mask holds the determined coding category mask, or ~0 if only
1698    ASCII has been seen so far.
1699
1700    Returns:
1701
1702    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1703         is present in st->mask
1704    1 == definitive answers are here for both st->eol_type and st->mask
1705 */
1706
1707 static int
1708 detect_coding_type (struct detection_state *st, const Extbyte *src,
1709                     size_t n, int just_do_eol)
1710 {
1711   if (st->eol_type == EOL_AUTODETECT)
1712     st->eol_type = detect_eol_type (st, src, n);
1713
1714   if (just_do_eol)
1715     return st->eol_type != EOL_AUTODETECT;
1716
1717   if (!st->seen_non_ascii)
1718     {
1719       for (; n; n--, src++)
1720         {
1721           unsigned char c = *(unsigned char *) src;
1722           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1723             {
1724               st->seen_non_ascii = 1;
1725 #ifdef MULE
1726               st->shift_jis.mask = ~0;
1727               st->big5.mask = ~0;
1728               st->ucs4.mask = ~0;
1729               st->utf8.mask = ~0;
1730               st->iso2022.mask = ~0;
1731 #endif
1732               break;
1733             }
1734         }
1735     }
1736
1737   if (!n)
1738     return 0;
1739 #ifdef MULE
1740   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1741     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1742   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1743     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1744   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1745     st->big5.mask = detect_coding_big5 (st, src, n);
1746   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1747     st->utf8.mask = detect_coding_utf8 (st, src, n);
1748   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1749     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1750
1751   st->mask
1752     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1753     | st->utf8.mask | st->ucs4.mask;
1754 #endif
1755   {
1756     int retval = mask_has_at_most_one_bit_p (st->mask);
1757     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1758     return retval && st->eol_type != EOL_AUTODETECT;
1759   }
1760 }
1761
1762 static Lisp_Object
1763 coding_system_from_mask (int mask)
1764 {
1765   if (mask == ~0)
1766     {
1767       /* If the file was entirely or basically ASCII, use the
1768          default value of `buffer-file-coding-system'. */
1769       Lisp_Object retval =
1770         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1771       if (!NILP (retval))
1772         {
1773           retval = Ffind_coding_system (retval);
1774           if (NILP (retval))
1775             {
1776               warn_when_safe
1777                 (Qbad_variable, Qwarning,
1778                  "Invalid `default-buffer-file-coding-system', set to nil");
1779               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1780             }
1781         }
1782       if (NILP (retval))
1783         retval = Fget_coding_system (Qraw_text);
1784       return retval;
1785     }
1786   else
1787     {
1788       int i;
1789       int cat = -1;
1790 #ifdef MULE
1791       mask = postprocess_iso2022_mask (mask);
1792 #endif
1793       /* Look through the coding categories by priority and find
1794          the first one that is allowed. */
1795       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1796         {
1797           cat = fcd->coding_category_by_priority[i];
1798           if ((mask & (1 << cat)) &&
1799               !NILP (fcd->coding_category_system[cat]))
1800             break;
1801         }
1802       if (cat >= 0)
1803         return fcd->coding_category_system[cat];
1804       else
1805         return Fget_coding_system (Qraw_text);
1806     }
1807 }
1808
1809 /* Given a seekable read stream and potential coding system and EOL type
1810    as specified, do any autodetection that is called for.  If the
1811    coding system and/or EOL type are not `autodetect', they will be left
1812    alone; but this function will never return an autodetect coding system
1813    or EOL type.
1814
1815    This function does not automatically fetch subsidiary coding systems;
1816    that should be unnecessary with the explicit eol-type argument. */
1817
1818 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1819
1820 void
1821 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1822                               eol_type_t *eol_type_in_out)
1823 {
1824   struct detection_state decst;
1825
1826   if (*eol_type_in_out == EOL_AUTODETECT)
1827     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1828
1829   xzero (decst);
1830   decst.eol_type = *eol_type_in_out;
1831   decst.mask = ~0;
1832
1833   /* If autodetection is called for, do it now. */
1834   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1835       || *eol_type_in_out == EOL_AUTODETECT)
1836     {
1837       Extbyte buf[4096];
1838       Lisp_Object coding_system = Qnil;
1839       Extbyte *p;
1840       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1841       Extbyte *scan_end;
1842
1843       /* Look for initial "-*-"; mode line prefix */
1844       for (p = buf,
1845              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1846            p <= scan_end
1847              && *p != '\n'
1848              && *p != '\r';
1849            p++)
1850         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1851           {
1852             Extbyte *local_vars_beg = p + 3;
1853             /* Look for final "-*-"; mode line suffix */
1854             for (p = local_vars_beg,
1855                    scan_end = buf + nread - LENGTH ("-*-");
1856                  p <= scan_end
1857                    && *p != '\n'
1858                    && *p != '\r';
1859                  p++)
1860               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1861                 {
1862                   Extbyte *suffix = p;
1863                   /* Look for "coding:" */
1864                   for (p = local_vars_beg,
1865                          scan_end = suffix - LENGTH ("coding:?");
1866                        p <= scan_end;
1867                        p++)
1868                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1869                         && (p == local_vars_beg
1870                             || (*(p-1) == ' '  ||
1871                                 *(p-1) == '\t' ||
1872                                 *(p-1) == ';')))
1873                       {
1874                         Extbyte save;
1875                         int n;
1876                         p += LENGTH ("coding:");
1877                         while (*p == ' ' || *p == '\t') p++;
1878
1879                         /* Get coding system name */
1880                         save = *suffix; *suffix = '\0';
1881                         /* Characters valid in a MIME charset name (rfc 1521),
1882                            and in a Lisp symbol name. */
1883                         n = strspn ( (char *) p,
1884                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1885                                     "abcdefghijklmnopqrstuvwxyz"
1886                                     "0123456789"
1887                                     "!$%&*+-.^_{|}~");
1888                         *suffix = save;
1889                         if (n > 0)
1890                           {
1891                             save = p[n]; p[n] = '\0';
1892                             coding_system =
1893                               Ffind_coding_system (intern ((char *) p));
1894                             p[n] = save;
1895                           }
1896                         break;
1897                       }
1898                   break;
1899                 }
1900             break;
1901           }
1902
1903       if (NILP (coding_system))
1904         do
1905           {
1906             if (detect_coding_type (&decst, buf, nread,
1907                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1908                                     != CODESYS_AUTODETECT))
1909               break;
1910             nread = Lstream_read (stream, buf, sizeof (buf));
1911             if (nread == 0)
1912               break;
1913           }
1914         while (1);
1915
1916       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1917                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1918         do
1919           {
1920             if (detect_coding_type (&decst, buf, nread, 1))
1921               break;
1922             nread = Lstream_read (stream, buf, sizeof (buf));
1923             if (!nread)
1924               break;
1925           }
1926         while (1);
1927
1928       *eol_type_in_out = decst.eol_type;
1929       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1930         {
1931           if (NILP (coding_system))
1932             *codesys_in_out = coding_system_from_mask (decst.mask);
1933           else
1934             *codesys_in_out = coding_system;
1935         }
1936     }
1937
1938   /* If we absolutely can't determine the EOL type, just assume LF. */
1939   if (*eol_type_in_out == EOL_AUTODETECT)
1940     *eol_type_in_out = EOL_LF;
1941
1942   Lstream_rewind (stream);
1943 }
1944
1945 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1946 Detect coding system of the text in the region between START and END.
1947 Return a list of possible coding systems ordered by priority.
1948 If only ASCII characters are found, return 'undecided or one of
1949 its subsidiary coding systems according to a detected end-of-line
1950 type.  Optional arg BUFFER defaults to the current buffer.
1951 */
1952        (start, end, buffer))
1953 {
1954   Lisp_Object val = Qnil;
1955   struct buffer *buf = decode_buffer (buffer, 0);
1956   Bufpos b, e;
1957   Lisp_Object instream, lb_instream;
1958   Lstream *istr, *lb_istr;
1959   struct detection_state decst;
1960   struct gcpro gcpro1, gcpro2;
1961
1962   get_buffer_range_char (buf, start, end, &b, &e, 0);
1963   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1964   lb_istr = XLSTREAM (lb_instream);
1965   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1966   istr = XLSTREAM (instream);
1967   GCPRO2 (instream, lb_instream);
1968   xzero (decst);
1969   decst.eol_type = EOL_AUTODETECT;
1970   decst.mask = ~0;
1971   while (1)
1972     {
1973       Extbyte random_buffer[4096];
1974       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1975
1976       if (!nread)
1977         break;
1978       if (detect_coding_type (&decst, random_buffer, nread, 0))
1979         break;
1980     }
1981
1982   if (decst.mask == ~0)
1983     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1984                                     decst.eol_type);
1985   else
1986     {
1987       int i;
1988
1989       val = Qnil;
1990 #ifdef MULE
1991       decst.mask = postprocess_iso2022_mask (decst.mask);
1992 #endif
1993       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1994         {
1995           int sys = fcd->coding_category_by_priority[i];
1996           if (decst.mask & (1 << sys))
1997             {
1998               Lisp_Object codesys = fcd->coding_category_system[sys];
1999               if (!NILP (codesys))
2000                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2001               val = Fcons (codesys, val);
2002             }
2003         }
2004     }
2005   Lstream_close (istr);
2006   UNGCPRO;
2007   Lstream_delete (istr);
2008   Lstream_delete (lb_istr);
2009   return val;
2010 }
2011
2012 \f
2013 /************************************************************************/
2014 /*           Converting to internal Mule format ("decoding")            */
2015 /************************************************************************/
2016
2017 /* A decoding stream is a stream used for decoding text (i.e.
2018    converting from some external format to internal format).
2019    The decoding-stream object keeps track of the actual coding
2020    stream, the stream that is at the other end, and data that
2021    needs to be persistent across the lifetime of the stream. */
2022
2023 /* Handle the EOL stuff related to just-read-in character C.
2024    EOL_TYPE is the EOL type of the coding stream.
2025    FLAGS is the current value of FLAGS in the coding stream, and may
2026    be modified by this macro.  (The macro only looks at the
2027    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2028    bytes are to be written.  You need to also define a local goto
2029    label "label_continue_loop" that is at the end of the main
2030    character-reading loop.
2031
2032    If C is a CR character, then this macro handles it entirely and
2033    jumps to label_continue_loop.  Otherwise, this macro does not add
2034    anything to DST, and continues normally.  You should continue
2035    processing C normally after this macro. */
2036
2037 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2038 do {                                                            \
2039   if (c == '\r')                                                \
2040     {                                                           \
2041       if (eol_type == EOL_CR)                                   \
2042         Dynarr_add (dst, '\n');                                 \
2043       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2044         Dynarr_add (dst, c);                                    \
2045       else                                                      \
2046         flags |= CODING_STATE_CR;                               \
2047       goto label_continue_loop;                                 \
2048     }                                                           \
2049   else if (flags & CODING_STATE_CR)                             \
2050     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2051       if (c != '\n')                                            \
2052         Dynarr_add (dst, '\r');                                 \
2053       flags &= ~CODING_STATE_CR;                                \
2054     }                                                           \
2055 } while (0)
2056
2057 /* C should be a binary character in the range 0 - 255; convert
2058    to internal format and add to Dynarr DST. */
2059
2060 #ifdef UTF2000
2061 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2062 do {                                            \
2063   if (BYTE_ASCII_P (c))                         \
2064     Dynarr_add (dst, c);                        \
2065   else                                          \
2066     {                                           \
2067       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2068       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2069     }                                           \
2070 } while (0)
2071
2072 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2073 INLINE_HEADER void
2074 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2075 {
2076   if ( c <= 0x7f )
2077     {
2078       Dynarr_add (dst, c);
2079     }
2080   else if ( c <= 0x7ff )
2081     {
2082       Dynarr_add (dst, (c >> 6) | 0xc0);
2083       Dynarr_add (dst, (c & 0x3f) | 0x80);
2084     }
2085   else if ( c <= 0xffff )
2086     {
2087       Dynarr_add (dst,  (c >> 12) | 0xe0);
2088       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2089       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2090     }
2091   else if ( c <= 0x1fffff )
2092     {
2093       Dynarr_add (dst,  (c >> 18) | 0xf0);
2094       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2095       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2096       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2097     }
2098   else if ( c <= 0x3ffffff )
2099     {
2100       Dynarr_add (dst,  (c >> 24) | 0xf8);
2101       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2102       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2103       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2104       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2105     }
2106   else
2107     {
2108       Dynarr_add (dst,  (c >> 30) | 0xfc);
2109       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2110       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2111       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2112       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2113       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2114     }
2115 }
2116 #else
2117 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2118 do {                                            \
2119   if (BYTE_ASCII_P (c))                         \
2120     Dynarr_add (dst, c);                        \
2121   else if (BYTE_C1_P (c))                       \
2122     {                                           \
2123       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2124       Dynarr_add (dst, c + 0x20);               \
2125     }                                           \
2126   else                                          \
2127     {                                           \
2128       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2129       Dynarr_add (dst, c);                      \
2130     }                                           \
2131 } while (0)
2132 #endif
2133
2134 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2135 do {                                    \
2136   if (ch)                               \
2137     {                                   \
2138       DECODE_ADD_BINARY_CHAR (ch, dst); \
2139       ch = 0;                           \
2140     }                                   \
2141 } while (0)
2142
2143 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2144 do {                                    \
2145   if (flags & CODING_STATE_END)         \
2146     {                                   \
2147       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2148       if (flags & CODING_STATE_CR)      \
2149         Dynarr_add (dst, '\r');         \
2150     }                                   \
2151 } while (0)
2152
2153 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2154
2155 struct decoding_stream
2156 {
2157   /* Coding system that governs the conversion. */
2158   Lisp_Coding_System *codesys;
2159
2160   /* Stream that we read the encoded data from or
2161      write the decoded data to. */
2162   Lstream *other_end;
2163
2164   /* If we are reading, then we can return only a fixed amount of
2165      data, so if the conversion resulted in too much data, we store it
2166      here for retrieval the next time around. */
2167   unsigned_char_dynarr *runoff;
2168
2169   /* FLAGS holds flags indicating the current state of the decoding.
2170      Some of these flags are dependent on the coding system. */
2171   unsigned int flags;
2172
2173   /* CPOS holds a partially built-up code-point of character. */
2174   unsigned int cpos;
2175
2176   /* EOL_TYPE specifies the type of end-of-line conversion that
2177      currently applies.  We need to keep this separate from the
2178      EOL type stored in CODESYS because the latter might indicate
2179      automatic EOL-type detection while the former will always
2180      indicate a particular EOL type. */
2181   eol_type_t eol_type;
2182 #ifdef MULE
2183   /* Additional ISO2022 information.  We define the structure above
2184      because it's also needed by the detection routines. */
2185   struct iso2022_decoder iso2022;
2186
2187   /* Additional information (the state of the running CCL program)
2188      used by the CCL decoder. */
2189   struct ccl_program ccl;
2190
2191   /* counter for UTF-8 or UCS-4 */
2192   unsigned char counter;
2193 #endif
2194 #ifdef UTF2000
2195   unsigned combined_char_count;
2196   Emchar combined_chars[16];
2197   Lisp_Object combining_table;
2198 #endif
2199   struct detection_state decst;
2200 };
2201
2202 #ifdef UTF2000
2203 extern Lisp_Object Vcharacter_composition_table;
2204
2205 INLINE_HEADER void
2206 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2207 INLINE_HEADER void
2208 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2209 {
2210   unsigned i;
2211
2212   for (i = 0; i < str->combined_char_count; i++)
2213     DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2214   str->combined_char_count = 0;
2215   str->combining_table = Qnil;
2216 }
2217
2218 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2219                       unsigned_char_dynarr* dst);
2220 void
2221 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2222                  Emchar character, unsigned_char_dynarr* dst)
2223 {
2224   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2225     DECODE_ADD_UCS_CHAR (character, dst);
2226   else if (!CHAR_ID_TABLE_P (str->combining_table))
2227     {
2228       Lisp_Object ret
2229         = get_char_id_table (character, Vcharacter_composition_table);
2230
2231       if (NILP (ret))
2232         DECODE_ADD_UCS_CHAR (character, dst);
2233       else
2234         {
2235           str->combined_chars[0] = character;
2236           str->combined_char_count = 1;
2237           str->combining_table = ret;
2238         }
2239     }
2240   else
2241     {
2242       Lisp_Object ret
2243         = get_char_id_table (character, str->combining_table);
2244
2245       if (CHARP (ret))
2246         {
2247           Emchar char2 = XCHARVAL (ret);
2248           ret = get_char_id_table (char2, Vcharacter_composition_table);
2249           if (NILP (ret))
2250             {
2251               DECODE_ADD_UCS_CHAR (char2, dst);
2252               str->combined_char_count = 0;
2253               str->combining_table = Qnil;
2254             }
2255           else
2256             {
2257               str->combined_chars[0] = char2;
2258               str->combined_char_count = 1;
2259               str->combining_table = ret;
2260             }
2261         }
2262       else if (CHAR_ID_TABLE_P (ret))
2263         {
2264           str->combined_chars[str->combined_char_count++] = character;
2265           str->combining_table = ret;
2266         }
2267       else
2268         {
2269           COMPOSE_FLUSH_CHARS (str, dst);
2270           DECODE_ADD_UCS_CHAR (character, dst);
2271         }
2272     }
2273 }
2274 #else /* not UTF2000 */
2275 #define COMPOSE_FLUSH_CHARS(str, dst)
2276 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2277 #endif /* UTF2000 */
2278
2279 static ssize_t decoding_reader (Lstream *stream,
2280                                 unsigned char *data, size_t size);
2281 static ssize_t decoding_writer (Lstream *stream,
2282                                 const unsigned char *data, size_t size);
2283 static int decoding_rewinder   (Lstream *stream);
2284 static int decoding_seekable_p (Lstream *stream);
2285 static int decoding_flusher    (Lstream *stream);
2286 static int decoding_closer     (Lstream *stream);
2287
2288 static Lisp_Object decoding_marker (Lisp_Object stream);
2289
2290 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2291                                sizeof (struct decoding_stream));
2292
2293 static Lisp_Object
2294 decoding_marker (Lisp_Object stream)
2295 {
2296   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2297   Lisp_Object str_obj;
2298
2299   /* We do not need to mark the coding systems or charsets stored
2300      within the stream because they are stored in a global list
2301      and automatically marked. */
2302
2303   XSETLSTREAM (str_obj, str);
2304   mark_object (str_obj);
2305   if (str->imp->marker)
2306     return (str->imp->marker) (str_obj);
2307   else
2308     return Qnil;
2309 }
2310
2311 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2312    so we read data from the other end, decode it, and store it into DATA. */
2313
2314 static ssize_t
2315 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2316 {
2317   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2318   unsigned char *orig_data = data;
2319   ssize_t read_size;
2320   int error_occurred = 0;
2321
2322   /* We need to interface to mule_decode(), which expects to take some
2323      amount of data and store the result into a Dynarr.  We have
2324      mule_decode() store into str->runoff, and take data from there
2325      as necessary. */
2326
2327   /* We loop until we have enough data, reading chunks from the other
2328      end and decoding it. */
2329   while (1)
2330     {
2331       /* Take data from the runoff if we can.  Make sure to take at
2332          most SIZE bytes, and delete the data from the runoff. */
2333       if (Dynarr_length (str->runoff) > 0)
2334         {
2335           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2336           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2337           Dynarr_delete_many (str->runoff, 0, chunk);
2338           data += chunk;
2339           size -= chunk;
2340         }
2341
2342       if (size == 0)
2343         break; /* No more room for data */
2344
2345       if (str->flags & CODING_STATE_END)
2346         /* This means that on the previous iteration, we hit the EOF on
2347            the other end.  We loop once more so that mule_decode() can
2348            output any final stuff it may be holding, or any "go back
2349            to a sane state" escape sequences. (This latter makes sense
2350            during encoding.) */
2351         break;
2352
2353       /* Exhausted the runoff, so get some more.  DATA has at least
2354          SIZE bytes left of storage in it, so it's OK to read directly
2355          into it.  (We'll be overwriting above, after we've decoded it
2356          into the runoff.) */
2357       read_size = Lstream_read (str->other_end, data, size);
2358       if (read_size < 0)
2359         {
2360           error_occurred = 1;
2361           break;
2362         }
2363       if (read_size == 0)
2364         /* There might be some more end data produced in the translation.
2365            See the comment above. */
2366         str->flags |= CODING_STATE_END;
2367       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2368     }
2369
2370   if (data - orig_data == 0)
2371     return error_occurred ? -1 : 0;
2372   else
2373     return data - orig_data;
2374 }
2375
2376 static ssize_t
2377 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2378 {
2379   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2380   ssize_t retval;
2381
2382   /* Decode all our data into the runoff, and then attempt to write
2383      it all out to the other end.  Remove whatever chunk we succeeded
2384      in writing. */
2385   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2386   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2387                           Dynarr_length (str->runoff));
2388   if (retval > 0)
2389     Dynarr_delete_many (str->runoff, 0, retval);
2390   /* Do NOT return retval.  The return value indicates how much
2391      of the incoming data was written, not how many bytes were
2392      written. */
2393   return size;
2394 }
2395
2396 static void
2397 reset_decoding_stream (struct decoding_stream *str)
2398 {
2399 #ifdef MULE
2400   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2401     {
2402       Lisp_Object coding_system;
2403       XSETCODING_SYSTEM (coding_system, str->codesys);
2404       reset_iso2022 (coding_system, &str->iso2022);
2405     }
2406   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2407     {
2408       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2409     }
2410   str->counter = 0;
2411 #endif /* MULE */
2412 #ifdef UTF2000
2413   str->combined_char_count = 0;
2414   str->combining_table = Qnil;
2415 #endif
2416   str->flags = str->cpos = 0;
2417 }
2418
2419 static int
2420 decoding_rewinder (Lstream *stream)
2421 {
2422   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2423   reset_decoding_stream (str);
2424   Dynarr_reset (str->runoff);
2425   return Lstream_rewind (str->other_end);
2426 }
2427
2428 static int
2429 decoding_seekable_p (Lstream *stream)
2430 {
2431   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2432   return Lstream_seekable_p (str->other_end);
2433 }
2434
2435 static int
2436 decoding_flusher (Lstream *stream)
2437 {
2438   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2439   return Lstream_flush (str->other_end);
2440 }
2441
2442 static int
2443 decoding_closer (Lstream *stream)
2444 {
2445   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2446   if (stream->flags & LSTREAM_FL_WRITE)
2447     {
2448       str->flags |= CODING_STATE_END;
2449       decoding_writer (stream, 0, 0);
2450     }
2451   Dynarr_free (str->runoff);
2452 #ifdef MULE
2453 #ifdef ENABLE_COMPOSITE_CHARS
2454   if (str->iso2022.composite_chars)
2455     Dynarr_free (str->iso2022.composite_chars);
2456 #endif
2457 #endif
2458   return Lstream_close (str->other_end);
2459 }
2460
2461 Lisp_Object
2462 decoding_stream_coding_system (Lstream *stream)
2463 {
2464   Lisp_Object coding_system;
2465   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2466
2467   XSETCODING_SYSTEM (coding_system, str->codesys);
2468   return subsidiary_coding_system (coding_system, str->eol_type);
2469 }
2470
2471 void
2472 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2473 {
2474   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2475   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2476   str->codesys = cs;
2477   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2478     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2479   reset_decoding_stream (str);
2480 }
2481
2482 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2483    stream for writing, no automatic code detection will be performed.
2484    The reason for this is that automatic code detection requires a
2485    seekable input.  Things will also fail if you open a decoding
2486    stream for reading using a non-fully-specified coding system and
2487    a non-seekable input stream. */
2488
2489 static Lisp_Object
2490 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2491                         const char *mode)
2492 {
2493   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2494   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2495   Lisp_Object obj;
2496
2497   xzero (*str);
2498   str->other_end = stream;
2499   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2500   str->eol_type = EOL_AUTODETECT;
2501   if (!strcmp (mode, "r")
2502       && Lstream_seekable_p (stream))
2503     /* We can determine the coding system now. */
2504     determine_real_coding_system (stream, &codesys, &str->eol_type);
2505   set_decoding_stream_coding_system (lstr, codesys);
2506   str->decst.eol_type = str->eol_type;
2507   str->decst.mask = ~0;
2508   XSETLSTREAM (obj, lstr);
2509   return obj;
2510 }
2511
2512 Lisp_Object
2513 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2514 {
2515   return make_decoding_stream_1 (stream, codesys, "r");
2516 }
2517
2518 Lisp_Object
2519 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2520 {
2521   return make_decoding_stream_1 (stream, codesys, "w");
2522 }
2523
2524 /* Note: the decode_coding_* functions all take the same
2525    arguments as mule_decode(), which is to say some SRC data of
2526    size N, which is to be stored into dynamic array DST.
2527    DECODING is the stream within which the decoding is
2528    taking place, but no data is actually read from or
2529    written to that stream; that is handled in decoding_reader()
2530    or decoding_writer().  This allows the same functions to
2531    be used for both reading and writing. */
2532
2533 static void
2534 mule_decode (Lstream *decoding, const Extbyte *src,
2535              unsigned_char_dynarr *dst, size_t n)
2536 {
2537   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2538
2539   /* If necessary, do encoding-detection now.  We do this when
2540      we're a writing stream or a non-seekable reading stream,
2541      meaning that we can't just process the whole input,
2542      rewind, and start over. */
2543
2544   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2545       str->eol_type == EOL_AUTODETECT)
2546     {
2547       Lisp_Object codesys;
2548
2549       XSETCODING_SYSTEM (codesys, str->codesys);
2550       detect_coding_type (&str->decst, src, n,
2551                           CODING_SYSTEM_TYPE (str->codesys) !=
2552                           CODESYS_AUTODETECT);
2553       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2554           str->decst.mask != ~0)
2555         /* #### This is cheesy.  What we really ought to do is
2556            buffer up a certain amount of data so as to get a
2557            less random result. */
2558         codesys = coding_system_from_mask (str->decst.mask);
2559       str->eol_type = str->decst.eol_type;
2560       if (XCODING_SYSTEM (codesys) != str->codesys)
2561         {
2562           /* Preserve the CODING_STATE_END flag in case it was set.
2563              If we erase it, bad things might happen. */
2564           int was_end = str->flags & CODING_STATE_END;
2565           set_decoding_stream_coding_system (decoding, codesys);
2566           if (was_end)
2567             str->flags |= CODING_STATE_END;
2568         }
2569     }
2570
2571   switch (CODING_SYSTEM_TYPE (str->codesys))
2572     {
2573 #ifdef DEBUG_XEMACS
2574     case CODESYS_INTERNAL:
2575       Dynarr_add_many (dst, src, n);
2576       break;
2577 #endif
2578     case CODESYS_AUTODETECT:
2579       /* If we got this far and still haven't decided on the coding
2580          system, then do no conversion. */
2581     case CODESYS_NO_CONVERSION:
2582       decode_coding_no_conversion (decoding, src, dst, n);
2583       break;
2584 #ifdef MULE
2585     case CODESYS_SHIFT_JIS:
2586       decode_coding_sjis (decoding, src, dst, n);
2587       break;
2588     case CODESYS_BIG5:
2589       decode_coding_big5 (decoding, src, dst, n);
2590       break;
2591     case CODESYS_UCS4:
2592       decode_coding_ucs4 (decoding, src, dst, n);
2593       break;
2594     case CODESYS_UTF8:
2595       decode_coding_utf8 (decoding, src, dst, n);
2596       break;
2597     case CODESYS_CCL:
2598       str->ccl.last_block = str->flags & CODING_STATE_END;
2599       /* When applying ccl program to stream, MUST NOT set NULL
2600          pointer to src.  */
2601       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2602                   dst, n, 0, CCL_MODE_DECODING);
2603       break;
2604     case CODESYS_ISO2022:
2605       decode_coding_iso2022 (decoding, src, dst, n);
2606       break;
2607 #endif /* MULE */
2608     default:
2609       abort ();
2610     }
2611 }
2612
2613 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2614 Decode the text between START and END which is encoded in CODING-SYSTEM.
2615 This is useful if you've read in encoded text from a file without decoding
2616 it (e.g. you read in a JIS-formatted file but used the `binary' or
2617 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2618 Return length of decoded text.
2619 BUFFER defaults to the current buffer if unspecified.
2620 */
2621        (start, end, coding_system, buffer))
2622 {
2623   Bufpos b, e;
2624   struct buffer *buf = decode_buffer (buffer, 0);
2625   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2626   Lstream *istr, *ostr;
2627   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2628
2629   get_buffer_range_char (buf, start, end, &b, &e, 0);
2630
2631   barf_if_buffer_read_only (buf, b, e);
2632
2633   coding_system = Fget_coding_system (coding_system);
2634   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2635   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2636   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2637                                               coding_system);
2638   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2639                                            Fget_coding_system (Qbinary));
2640   istr = XLSTREAM (instream);
2641   ostr = XLSTREAM (outstream);
2642   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2643
2644   /* The chain of streams looks like this:
2645
2646      [BUFFER] <----- send through
2647                      ------> [ENCODE AS BINARY]
2648                              ------> [DECODE AS SPECIFIED]
2649                                      ------> [BUFFER]
2650    */
2651
2652   while (1)
2653     {
2654       char tempbuf[1024]; /* some random amount */
2655       Bufpos newpos, even_newer_pos;
2656       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2657       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2658
2659       if (!size_in_bytes)
2660         break;
2661       newpos = lisp_buffer_stream_startpos (istr);
2662       Lstream_write (ostr, tempbuf, size_in_bytes);
2663       even_newer_pos = lisp_buffer_stream_startpos (istr);
2664       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2665                            even_newer_pos, 0);
2666     }
2667   Lstream_close (istr);
2668   Lstream_close (ostr);
2669   UNGCPRO;
2670   Lstream_delete (istr);
2671   Lstream_delete (ostr);
2672   Lstream_delete (XLSTREAM (de_outstream));
2673   Lstream_delete (XLSTREAM (lb_outstream));
2674   return Qnil;
2675 }
2676
2677 \f
2678 /************************************************************************/
2679 /*           Converting to an external encoding ("encoding")            */
2680 /************************************************************************/
2681
2682 /* An encoding stream is an output stream.  When you create the
2683    stream, you specify the coding system that governs the encoding
2684    and another stream that the resulting encoded data is to be
2685    sent to, and then start sending data to it. */
2686
2687 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2688
2689 struct encoding_stream
2690 {
2691   /* Coding system that governs the conversion. */
2692   Lisp_Coding_System *codesys;
2693
2694   /* Stream that we read the encoded data from or
2695      write the decoded data to. */
2696   Lstream *other_end;
2697
2698   /* If we are reading, then we can return only a fixed amount of
2699      data, so if the conversion resulted in too much data, we store it
2700      here for retrieval the next time around. */
2701   unsigned_char_dynarr *runoff;
2702
2703   /* FLAGS holds flags indicating the current state of the encoding.
2704      Some of these flags are dependent on the coding system. */
2705   unsigned int flags;
2706
2707   /* CH holds a partially built-up character.  Since we only deal
2708      with one- and two-byte characters at the moment, we only use
2709      this to store the first byte of a two-byte character. */
2710   unsigned int ch;
2711 #ifdef MULE
2712   /* Additional information used by the ISO2022 encoder. */
2713   struct
2714     {
2715       /* CHARSET holds the character sets currently assigned to the G0
2716          through G3 registers.  It is initialized from the array
2717          INITIAL_CHARSET in CODESYS. */
2718       Lisp_Object charset[4];
2719
2720       /* Which registers are currently invoked into the left (GL) and
2721          right (GR) halves of the 8-bit encoding space? */
2722       int register_left, register_right;
2723
2724       /* Whether we need to explicitly designate the charset in the
2725          G? register before using it.  It is initialized from the
2726          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2727       unsigned char force_charset_on_output[4];
2728
2729       /* Other state variables that need to be preserved across
2730          invocations. */
2731       Lisp_Object current_charset;
2732       int current_half;
2733       int current_char_boundary;
2734     } iso2022;
2735
2736   void (*encode_char) (struct encoding_stream *str, Emchar c,
2737                        unsigned_char_dynarr *dst, unsigned int *flags);
2738   void (*finish) (struct encoding_stream *str,
2739                   unsigned_char_dynarr *dst, unsigned int *flags);
2740
2741   /* Additional information (the state of the running CCL program)
2742      used by the CCL encoder. */
2743   struct ccl_program ccl;
2744 #endif /* MULE */
2745 };
2746
2747 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2748 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2749                                 size_t size);
2750 static int encoding_rewinder   (Lstream *stream);
2751 static int encoding_seekable_p (Lstream *stream);
2752 static int encoding_flusher    (Lstream *stream);
2753 static int encoding_closer     (Lstream *stream);
2754
2755 static Lisp_Object encoding_marker (Lisp_Object stream);
2756
2757 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2758                                sizeof (struct encoding_stream));
2759
2760 static Lisp_Object
2761 encoding_marker (Lisp_Object stream)
2762 {
2763   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2764   Lisp_Object str_obj;
2765
2766   /* We do not need to mark the coding systems or charsets stored
2767      within the stream because they are stored in a global list
2768      and automatically marked. */
2769
2770   XSETLSTREAM (str_obj, str);
2771   mark_object (str_obj);
2772   if (str->imp->marker)
2773     return (str->imp->marker) (str_obj);
2774   else
2775     return Qnil;
2776 }
2777
2778 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2779    so we read data from the other end, encode it, and store it into DATA. */
2780
2781 static ssize_t
2782 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2783 {
2784   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2785   unsigned char *orig_data = data;
2786   ssize_t read_size;
2787   int error_occurred = 0;
2788
2789   /* We need to interface to mule_encode(), which expects to take some
2790      amount of data and store the result into a Dynarr.  We have
2791      mule_encode() store into str->runoff, and take data from there
2792      as necessary. */
2793
2794   /* We loop until we have enough data, reading chunks from the other
2795      end and encoding it. */
2796   while (1)
2797     {
2798       /* Take data from the runoff if we can.  Make sure to take at
2799          most SIZE bytes, and delete the data from the runoff. */
2800       if (Dynarr_length (str->runoff) > 0)
2801         {
2802           int chunk = min ((int) size, Dynarr_length (str->runoff));
2803           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2804           Dynarr_delete_many (str->runoff, 0, chunk);
2805           data += chunk;
2806           size -= chunk;
2807         }
2808
2809       if (size == 0)
2810         break; /* No more room for data */
2811
2812       if (str->flags & CODING_STATE_END)
2813         /* This means that on the previous iteration, we hit the EOF on
2814            the other end.  We loop once more so that mule_encode() can
2815            output any final stuff it may be holding, or any "go back
2816            to a sane state" escape sequences. (This latter makes sense
2817            during encoding.) */
2818         break;
2819
2820       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2821          left of storage in it, so it's OK to read directly into it.
2822          (We'll be overwriting above, after we've encoded it into the
2823          runoff.) */
2824       read_size = Lstream_read (str->other_end, data, size);
2825       if (read_size < 0)
2826         {
2827           error_occurred = 1;
2828           break;
2829         }
2830       if (read_size == 0)
2831         /* There might be some more end data produced in the translation.
2832            See the comment above. */
2833         str->flags |= CODING_STATE_END;
2834       mule_encode (stream, data, str->runoff, read_size);
2835     }
2836
2837   if (data == orig_data)
2838     return error_occurred ? -1 : 0;
2839   else
2840     return data - orig_data;
2841 }
2842
2843 static ssize_t
2844 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2845 {
2846   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2847   ssize_t retval;
2848
2849   /* Encode all our data into the runoff, and then attempt to write
2850      it all out to the other end.  Remove whatever chunk we succeeded
2851      in writing. */
2852   mule_encode (stream, data, str->runoff, size);
2853   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2854                           Dynarr_length (str->runoff));
2855   if (retval > 0)
2856     Dynarr_delete_many (str->runoff, 0, retval);
2857   /* Do NOT return retval.  The return value indicates how much
2858      of the incoming data was written, not how many bytes were
2859      written. */
2860   return size;
2861 }
2862
2863 static void
2864 reset_encoding_stream (struct encoding_stream *str)
2865 {
2866 #ifdef MULE
2867   switch (CODING_SYSTEM_TYPE (str->codesys))
2868     {
2869     case CODESYS_ISO2022:
2870       {
2871         int i;
2872
2873         str->encode_char = &char_encode_iso2022;
2874         str->finish = &char_finish_iso2022;
2875         for (i = 0; i < 4; i++)
2876           {
2877             str->iso2022.charset[i] =
2878               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2879             str->iso2022.force_charset_on_output[i] =
2880               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2881           }
2882         str->iso2022.register_left = 0;
2883         str->iso2022.register_right = 1;
2884         str->iso2022.current_charset = Qnil;
2885         str->iso2022.current_half = 0;
2886         break;
2887       }
2888     case CODESYS_CCL:
2889       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2890       break;
2891     case CODESYS_UTF8:
2892       str->encode_char = &char_encode_utf8;
2893       str->finish = &char_finish_utf8;
2894       break;
2895     case CODESYS_UCS4:
2896       str->encode_char = &char_encode_ucs4;
2897       str->finish = &char_finish_ucs4;
2898       break;
2899     case CODESYS_SHIFT_JIS:
2900       str->encode_char = &char_encode_shift_jis;
2901       str->finish = &char_finish_shift_jis;
2902       break;
2903     default:
2904       break;
2905     }
2906 #endif /* MULE */
2907   str->iso2022.current_char_boundary = 0;
2908   str->flags = str->ch = 0;
2909 }
2910
2911 static int
2912 encoding_rewinder (Lstream *stream)
2913 {
2914   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2915   reset_encoding_stream (str);
2916   Dynarr_reset (str->runoff);
2917   return Lstream_rewind (str->other_end);
2918 }
2919
2920 static int
2921 encoding_seekable_p (Lstream *stream)
2922 {
2923   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2924   return Lstream_seekable_p (str->other_end);
2925 }
2926
2927 static int
2928 encoding_flusher (Lstream *stream)
2929 {
2930   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2931   return Lstream_flush (str->other_end);
2932 }
2933
2934 static int
2935 encoding_closer (Lstream *stream)
2936 {
2937   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2938   if (stream->flags & LSTREAM_FL_WRITE)
2939     {
2940       str->flags |= CODING_STATE_END;
2941       encoding_writer (stream, 0, 0);
2942     }
2943   Dynarr_free (str->runoff);
2944   return Lstream_close (str->other_end);
2945 }
2946
2947 Lisp_Object
2948 encoding_stream_coding_system (Lstream *stream)
2949 {
2950   Lisp_Object coding_system;
2951   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2952
2953   XSETCODING_SYSTEM (coding_system, str->codesys);
2954   return coding_system;
2955 }
2956
2957 void
2958 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2959 {
2960   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2961   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2962   str->codesys = cs;
2963   reset_encoding_stream (str);
2964 }
2965
2966 static Lisp_Object
2967 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2968                         const char *mode)
2969 {
2970   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2971   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2972   Lisp_Object obj;
2973
2974   xzero (*str);
2975   str->runoff = Dynarr_new (unsigned_char);
2976   str->other_end = stream;
2977   set_encoding_stream_coding_system (lstr, codesys);
2978   XSETLSTREAM (obj, lstr);
2979   return obj;
2980 }
2981
2982 Lisp_Object
2983 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2984 {
2985   return make_encoding_stream_1 (stream, codesys, "r");
2986 }
2987
2988 Lisp_Object
2989 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2990 {
2991   return make_encoding_stream_1 (stream, codesys, "w");
2992 }
2993
2994 /* Convert N bytes of internally-formatted data stored in SRC to an
2995    external format, according to the encoding stream ENCODING.
2996    Store the encoded data into DST. */
2997
2998 static void
2999 mule_encode (Lstream *encoding, const Bufbyte *src,
3000              unsigned_char_dynarr *dst, size_t n)
3001 {
3002   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3003
3004   switch (CODING_SYSTEM_TYPE (str->codesys))
3005     {
3006 #ifdef DEBUG_XEMACS
3007     case CODESYS_INTERNAL:
3008       Dynarr_add_many (dst, src, n);
3009       break;
3010 #endif
3011     case CODESYS_AUTODETECT:
3012       /* If we got this far and still haven't decided on the coding
3013          system, then do no conversion. */
3014     case CODESYS_NO_CONVERSION:
3015       encode_coding_no_conversion (encoding, src, dst, n);
3016       break;
3017 #ifdef MULE
3018     case CODESYS_BIG5:
3019       encode_coding_big5 (encoding, src, dst, n);
3020       break;
3021     case CODESYS_CCL:
3022       str->ccl.last_block = str->flags & CODING_STATE_END;
3023       /* When applying ccl program to stream, MUST NOT set NULL
3024          pointer to src.  */
3025       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3026                   dst, n, 0, CCL_MODE_ENCODING);
3027       break;
3028 #endif /* MULE */
3029     default:
3030       text_encode_generic (encoding, src, dst, n);
3031     }
3032 }
3033
3034 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3035 Encode the text between START and END using CODING-SYSTEM.
3036 This will, for example, convert Japanese characters into stuff such as
3037 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3038 text.  BUFFER defaults to the current buffer if unspecified.
3039 */
3040        (start, end, coding_system, buffer))
3041 {
3042   Bufpos b, e;
3043   struct buffer *buf = decode_buffer (buffer, 0);
3044   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3045   Lstream *istr, *ostr;
3046   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3047
3048   get_buffer_range_char (buf, start, end, &b, &e, 0);
3049
3050   barf_if_buffer_read_only (buf, b, e);
3051
3052   coding_system = Fget_coding_system (coding_system);
3053   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3054   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3055   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3056                                               Fget_coding_system (Qbinary));
3057   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3058                                            coding_system);
3059   istr = XLSTREAM (instream);
3060   ostr = XLSTREAM (outstream);
3061   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3062   /* The chain of streams looks like this:
3063
3064      [BUFFER] <----- send through
3065                      ------> [ENCODE AS SPECIFIED]
3066                              ------> [DECODE AS BINARY]
3067                                      ------> [BUFFER]
3068    */
3069   while (1)
3070     {
3071       char tempbuf[1024]; /* some random amount */
3072       Bufpos newpos, even_newer_pos;
3073       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3074       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3075
3076       if (!size_in_bytes)
3077         break;
3078       newpos = lisp_buffer_stream_startpos (istr);
3079       Lstream_write (ostr, tempbuf, size_in_bytes);
3080       even_newer_pos = lisp_buffer_stream_startpos (istr);
3081       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3082                            even_newer_pos, 0);
3083     }
3084
3085   {
3086     Charcount retlen =
3087       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3088     Lstream_close (istr);
3089     Lstream_close (ostr);
3090     UNGCPRO;
3091     Lstream_delete (istr);
3092     Lstream_delete (ostr);
3093     Lstream_delete (XLSTREAM (de_outstream));
3094     Lstream_delete (XLSTREAM (lb_outstream));
3095     return make_int (retlen);
3096   }
3097 }
3098
3099 #ifdef MULE
3100 \f
3101 static void
3102 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3103                      unsigned_char_dynarr *dst, size_t n)
3104 {
3105   unsigned char c;
3106   unsigned char char_boundary;
3107   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3108   unsigned int flags          = str->flags;
3109   Emchar ch                   = str->ch;
3110
3111   char_boundary = str->iso2022.current_char_boundary;
3112
3113   while (n--)
3114     {
3115       c = *src++;
3116
3117       if (char_boundary == 0)
3118         {
3119           if (c >= 0xfc)
3120             {
3121               ch = c & 0x01;
3122               char_boundary = 5;
3123             }
3124           else if (c >= 0xf8)
3125             {
3126               ch = c & 0x03;
3127               char_boundary = 4;
3128             }
3129           else if (c >= 0xf0)
3130             {
3131               ch = c & 0x07;
3132               char_boundary = 3;
3133             }
3134           else if (c >= 0xe0)
3135             {
3136               ch = c & 0x0f;
3137               char_boundary = 2;
3138             }
3139           else if (c >= 0xc0)
3140             {
3141               ch = c & 0x1f;
3142               char_boundary = 1;
3143             }
3144           else
3145             (*str->encode_char) (str, c, dst, &flags);
3146         }
3147       else if (char_boundary == 1)
3148         {
3149           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3150           ch =0;
3151           char_boundary = 0;
3152         }
3153       else
3154         {
3155           ch = (ch << 6) | (c & 0x3f);
3156           char_boundary--;
3157         }
3158     }
3159
3160   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3161     {
3162       (*str->finish) (str, dst, &flags);
3163     }
3164
3165   str->flags = flags;
3166   str->ch    = ch;
3167   str->iso2022.current_char_boundary = char_boundary;
3168 }
3169
3170 \f
3171 /************************************************************************/
3172 /*                          Shift-JIS methods                           */
3173 /************************************************************************/
3174
3175 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3176    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3177    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3178    encoded by "position-code + 0x80".  A character of JISX0208
3179    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3180    position-codes are divided and shifted so that it fit in the range
3181    below.
3182
3183    --- CODE RANGE of Shift-JIS ---
3184    (character set)      (range)
3185    ASCII                0x00 .. 0x7F
3186    JISX0201-Kana        0xA0 .. 0xDF
3187    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3188             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3189    -------------------------------
3190
3191 */
3192
3193 /* Is this the first byte of a Shift-JIS two-byte char? */
3194
3195 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3196   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3197
3198 /* Is this the second byte of a Shift-JIS two-byte char? */
3199
3200 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3201   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3202
3203 #define BYTE_SJIS_KATAKANA_P(c) \
3204   ((c) >= 0xA1 && (c) <= 0xDF)
3205
3206 static int
3207 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3208 {
3209   while (n--)
3210     {
3211       unsigned char c = *(unsigned char *)src++;
3212       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3213         return 0;
3214       if (st->shift_jis.in_second_byte)
3215         {
3216           st->shift_jis.in_second_byte = 0;
3217           if (c < 0x40)
3218             return 0;
3219         }
3220       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3221         st->shift_jis.in_second_byte = 1;
3222     }
3223   return CODING_CATEGORY_SHIFT_JIS_MASK;
3224 }
3225
3226 /* Convert Shift-JIS data to internal format. */
3227
3228 static void
3229 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3230                     unsigned_char_dynarr *dst, size_t n)
3231 {
3232   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3233   unsigned int flags  = str->flags;
3234   unsigned int cpos   = str->cpos;
3235   eol_type_t eol_type = str->eol_type;
3236
3237   while (n--)
3238     {
3239       unsigned char c = *(unsigned char *)src++;
3240
3241       if (cpos)
3242         {
3243           /* Previous character was first byte of Shift-JIS Kanji char. */
3244           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3245             {
3246               unsigned char e1, e2;
3247
3248               DECODE_SJIS (cpos, c, e1, e2);
3249 #ifdef UTF2000
3250               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3251                                             e1 & 0x7F,
3252                                             e2 & 0x7F), dst);
3253 #else
3254               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3255               Dynarr_add (dst, e1);
3256               Dynarr_add (dst, e2);
3257 #endif
3258             }
3259           else
3260             {
3261               DECODE_ADD_BINARY_CHAR (cpos, dst);
3262               DECODE_ADD_BINARY_CHAR (c, dst);
3263             }
3264           cpos = 0;
3265         }
3266       else
3267         {
3268           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3269           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3270             cpos = c;
3271           else if (BYTE_SJIS_KATAKANA_P (c))
3272             {
3273 #ifdef UTF2000
3274               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3275                                             c & 0x7F, 0), dst);
3276 #else
3277               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3278               Dynarr_add (dst, c);
3279 #endif
3280             }
3281 #ifdef UTF2000
3282           else if (c > 32)
3283             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3284                                           c, 0), dst);
3285 #endif
3286           else
3287             DECODE_ADD_BINARY_CHAR (c, dst);
3288         }
3289     label_continue_loop:;
3290     }
3291
3292   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3293
3294   str->flags = flags;
3295   str->cpos  = cpos;
3296 }
3297
3298 /* Convert internal character representation to Shift_JIS. */
3299
3300 void
3301 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3302                        unsigned_char_dynarr *dst, unsigned int *flags)
3303 {
3304   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3305
3306   if (ch == '\n')
3307     {
3308       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3309         Dynarr_add (dst, '\r');
3310       if (eol_type != EOL_CR)
3311         Dynarr_add (dst, ch);
3312     }
3313   else
3314     {
3315       unsigned int s1, s2;
3316 #ifdef UTF2000
3317       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3318
3319       if (code_point >= 0)
3320         Dynarr_add (dst, code_point);
3321       else if ((code_point
3322                 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3323                >= 0)
3324         {
3325           ENCODE_SJIS ((code_point >> 8) | 0x80,
3326                        (code_point & 0xFF) | 0x80, s1, s2);
3327           Dynarr_add (dst, s1);
3328           Dynarr_add (dst, s2);
3329         }
3330       else if ((code_point
3331                 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3332                >= 0)
3333         Dynarr_add (dst, code_point | 0x80);
3334       else if ((code_point
3335                 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3336                >= 0)
3337         {
3338           ENCODE_SJIS ((code_point >> 8) | 0x80,
3339                        (code_point & 0xFF) | 0x80, s1, s2);
3340           Dynarr_add (dst, s1);
3341           Dynarr_add (dst, s2);
3342         }
3343       else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3344                >= 0)
3345         Dynarr_add (dst, code_point);
3346       else
3347         Dynarr_add (dst, '?');
3348 #else
3349       Lisp_Object charset;
3350       unsigned int c1, c2;
3351
3352       BREAKUP_CHAR (ch, charset, c1, c2);
3353
3354       if (EQ(charset, Vcharset_katakana_jisx0201))
3355         {
3356           Dynarr_add (dst, c1 | 0x80);
3357         }
3358       else if (c2 == 0)
3359         {
3360           Dynarr_add (dst, c1);
3361         }
3362       else if (EQ(charset, Vcharset_japanese_jisx0208))
3363         {
3364           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3365           Dynarr_add (dst, s1);
3366           Dynarr_add (dst, s2);
3367         }
3368       else
3369         Dynarr_add (dst, '?');
3370 #endif
3371     }
3372 }
3373
3374 void
3375 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3376                        unsigned int *flags)
3377 {
3378 }
3379
3380 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3381 Decode a JISX0208 character of Shift-JIS coding-system.
3382 CODE is the character code in Shift-JIS as a cons of type bytes.
3383 Return the corresponding character.
3384 */
3385        (code))
3386 {
3387   unsigned char c1, c2, s1, s2;
3388
3389   CHECK_CONS (code);
3390   CHECK_INT (XCAR (code));
3391   CHECK_INT (XCDR (code));
3392   s1 = XINT (XCAR (code));
3393   s2 = XINT (XCDR (code));
3394   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3395       BYTE_SJIS_TWO_BYTE_2_P (s2))
3396     {
3397       DECODE_SJIS (s1, s2, c1, c2);
3398       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3399                                    c1 & 0x7F, c2 & 0x7F));
3400     }
3401   else
3402     return Qnil;
3403 }
3404
3405 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3406 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3407 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3408 */
3409        (character))
3410 {
3411   Lisp_Object charset;
3412   int c1, c2, s1, s2;
3413
3414   CHECK_CHAR_COERCE_INT (character);
3415   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3416   if (EQ (charset, Vcharset_japanese_jisx0208))
3417     {
3418       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3419       return Fcons (make_int (s1), make_int (s2));
3420     }
3421   else
3422     return Qnil;
3423 }
3424
3425 \f
3426 /************************************************************************/
3427 /*                            Big5 methods                              */
3428 /************************************************************************/
3429
3430 /* BIG5 is a coding system encoding two character sets: ASCII and
3431    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3432    character set and is encoded in two-byte.
3433
3434    --- CODE RANGE of BIG5 ---
3435    (character set)      (range)
3436    ASCII                0x00 .. 0x7F
3437    Big5 (1st byte)      0xA1 .. 0xFE
3438         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3439    --------------------------
3440
3441    Since the number of characters in Big5 is larger than maximum
3442    characters in Emacs' charset (96x96), it can't be handled as one
3443    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3444    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3445    contains frequently used characters and the latter contains less
3446    frequently used characters.  */
3447
3448 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3449   ((c) >= 0xA1 && (c) <= 0xFE)
3450
3451 /* Is this the second byte of a Shift-JIS two-byte char? */
3452
3453 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3454   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3455
3456 /* Number of Big5 characters which have the same code in 1st byte.  */
3457
3458 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3459
3460 /* Code conversion macros.  These are macros because they are used in
3461    inner loops during code conversion.
3462
3463    Note that temporary variables in macros introduce the classic
3464    dynamic-scoping problems with variable names.  We use capital-
3465    lettered variables in the assumption that XEmacs does not use
3466    capital letters in variables except in a very formalized way
3467    (e.g. Qstring). */
3468
3469 /* Convert Big5 code (b1, b2) into its internal string representation
3470    (lb, c1, c2). */
3471
3472 /* There is a much simpler way to split the Big5 charset into two.
3473    For the moment I'm going to leave the algorithm as-is because it
3474    claims to separate out the most-used characters into a single
3475    charset, which perhaps will lead to optimizations in various
3476    places.
3477
3478    The way the algorithm works is something like this:
3479
3480    Big5 can be viewed as a 94x157 charset, where the row is
3481    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3482    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3483    the split between low and high column numbers is apparently
3484    meaningless; ascending rows produce less and less frequent chars.
3485    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3486    the first charset, and the upper half (0xC9 .. 0xFE) to the
3487    second.  To do the conversion, we convert the character into
3488    a single number where 0 .. 156 is the first row, 157 .. 313
3489    is the second, etc.  That way, the characters are ordered by
3490    decreasing frequency.  Then we just chop the space in two
3491    and coerce the result into a 94x94 space.
3492    */
3493
3494 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3495 {                                                                       \
3496   int B1 = b1, B2 = b2;                                                 \
3497   unsigned int I                                                        \
3498     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3499                                                                         \
3500   if (B1 < 0xC9)                                                        \
3501     {                                                                   \
3502       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3503     }                                                                   \
3504   else                                                                  \
3505     {                                                                   \
3506       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3507       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3508     }                                                                   \
3509   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3510   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3511 } while (0)
3512
3513 /* Convert the internal string representation of a Big5 character
3514    (lb, c1, c2) into Big5 code (b1, b2). */
3515
3516 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3517 {                                                                       \
3518   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3519                                                                         \
3520   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3521     {                                                                   \
3522       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3523     }                                                                   \
3524   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3525   b2 = I % BIG5_SAME_ROW;                                               \
3526   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3527 } while (0)
3528
3529 static int
3530 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3531 {
3532   while (n--)
3533     {
3534       unsigned char c = *(unsigned char *)src++;
3535       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3536           (c >= 0x80 && c <= 0xA0))
3537         return 0;
3538       if (st->big5.in_second_byte)
3539         {
3540           st->big5.in_second_byte = 0;
3541           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3542             return 0;
3543         }
3544       else if (c >= 0xA1)
3545         st->big5.in_second_byte = 1;
3546     }
3547   return CODING_CATEGORY_BIG5_MASK;
3548 }
3549
3550 /* Convert Big5 data to internal format. */
3551
3552 static void
3553 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3554                     unsigned_char_dynarr *dst, size_t n)
3555 {
3556   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3557   unsigned int flags  = str->flags;
3558   unsigned int cpos   = str->cpos;
3559   eol_type_t eol_type = str->eol_type;
3560
3561   while (n--)
3562     {
3563       unsigned char c = *(unsigned char *)src++;
3564       if (cpos)
3565         {
3566           /* Previous character was first byte of Big5 char. */
3567           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3568             {
3569 #ifdef UTF2000
3570               DECODE_ADD_UCS_CHAR
3571                 (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c),
3572                  dst);
3573 #else
3574               unsigned char b1, b2, b3;
3575               DECODE_BIG5 (cpos, c, b1, b2, b3);
3576               Dynarr_add (dst, b1);
3577               Dynarr_add (dst, b2);
3578               Dynarr_add (dst, b3);
3579 #endif
3580             }
3581           else
3582             {
3583               DECODE_ADD_BINARY_CHAR (cpos, dst);
3584               DECODE_ADD_BINARY_CHAR (c, dst);
3585             }
3586           cpos = 0;
3587         }
3588       else
3589         {
3590           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3591           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3592             cpos = c;
3593           else
3594             DECODE_ADD_BINARY_CHAR (c, dst);
3595         }
3596     label_continue_loop:;
3597     }
3598
3599   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3600
3601   str->flags = flags;
3602   str->cpos  = cpos;
3603 }
3604
3605 /* Convert internally-formatted data to Big5. */
3606
3607 static void
3608 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3609                     unsigned_char_dynarr *dst, size_t n)
3610 {
3611 #ifndef UTF2000
3612   unsigned char c;
3613   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3614   unsigned int flags  = str->flags;
3615   unsigned int ch     = str->ch;
3616   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3617
3618   while (n--)
3619     {
3620       c = *src++;
3621       if (c == '\n')
3622         {
3623           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3624             Dynarr_add (dst, '\r');
3625           if (eol_type != EOL_CR)
3626             Dynarr_add (dst, '\n');
3627         }
3628       else if (BYTE_ASCII_P (c))
3629         {
3630           /* ASCII. */
3631           Dynarr_add (dst, c);
3632         }
3633       else if (BUFBYTE_LEADING_BYTE_P (c))
3634         {
3635           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3636               c == LEADING_BYTE_CHINESE_BIG5_2)
3637             {
3638               /* A recognized leading byte. */
3639               ch = c;
3640               continue; /* not done with this character. */
3641             }
3642           /* otherwise just ignore this character. */
3643         }
3644       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3645                ch == LEADING_BYTE_CHINESE_BIG5_2)
3646         {
3647           /* Previous char was a recognized leading byte. */
3648           ch = (ch << 8) | c;
3649           continue; /* not done with this character. */
3650         }
3651       else if (ch)
3652         {
3653           /* Encountering second byte of a Big5 character. */
3654           unsigned char b1, b2;
3655
3656           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3657           Dynarr_add (dst, b1);
3658           Dynarr_add (dst, b2);
3659         }
3660
3661       ch = 0;
3662     }
3663
3664   str->flags = flags;
3665   str->ch    = ch;
3666 #endif
3667 }
3668
3669
3670 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3671 Decode a Big5 character CODE of BIG5 coding-system.
3672 CODE is the character code in BIG5, a cons of two integers.
3673 Return the corresponding character.
3674 */
3675        (code))
3676 {
3677   unsigned char c1, c2, b1, b2;
3678
3679   CHECK_CONS (code);
3680   CHECK_INT (XCAR (code));
3681   CHECK_INT (XCDR (code));
3682   b1 = XINT (XCAR (code));
3683   b2 = XINT (XCDR (code));
3684   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3685       BYTE_BIG5_TWO_BYTE_2_P (b2))
3686     {
3687       Charset_ID leading_byte;
3688       Lisp_Object charset;
3689       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3690       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3691       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3692     }
3693   else
3694     return Qnil;
3695 }
3696
3697 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3698 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3699 Return the corresponding character code in Big5.
3700 */
3701        (character))
3702 {
3703   Lisp_Object charset;
3704   int c1, c2, b1, b2;
3705
3706   CHECK_CHAR_COERCE_INT (character);
3707   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3708   if (EQ (charset, Vcharset_chinese_big5_1) ||
3709       EQ (charset, Vcharset_chinese_big5_2))
3710     {
3711       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3712                    b1, b2);
3713       return Fcons (make_int (b1), make_int (b2));
3714     }
3715   else
3716     return Qnil;
3717 }
3718
3719 \f
3720 /************************************************************************/
3721 /*                           UCS-4 methods                              */
3722 /************************************************************************/
3723
3724 static int
3725 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3726 {
3727   while (n--)
3728     {
3729       unsigned char c = *(unsigned char *)src++;
3730       switch (st->ucs4.in_byte)
3731         {
3732         case 0:
3733           if (c >= 128)
3734             return 0;
3735           else
3736             st->ucs4.in_byte++;
3737           break;
3738         case 3:
3739           st->ucs4.in_byte = 0;
3740           break;
3741         default:
3742           st->ucs4.in_byte++;
3743         }
3744     }
3745   return CODING_CATEGORY_UCS4_MASK;
3746 }
3747
3748 static void
3749 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3750                     unsigned_char_dynarr *dst, size_t n)
3751 {
3752   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3753   unsigned int flags = str->flags;
3754   unsigned int cpos  = str->cpos;
3755   unsigned char counter = str->counter;
3756
3757   while (n--)
3758     {
3759       unsigned char c = *(unsigned char *)src++;
3760       switch (counter)
3761         {
3762         case 0:
3763           cpos = c;
3764           counter = 3;
3765           break;
3766         case 1:
3767           DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3768           cpos = 0;
3769           counter = 0;
3770           break;
3771         default:
3772           cpos = ( cpos << 8 ) | c;
3773           counter--;
3774         }
3775     }
3776   if (counter & CODING_STATE_END)
3777     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3778
3779   str->flags    = flags;
3780   str->cpos     = cpos;
3781   str->counter  = counter;
3782 }
3783
3784 void
3785 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3786                   unsigned_char_dynarr *dst, unsigned int *flags)
3787 {
3788   Dynarr_add (dst, ch >> 24);
3789   Dynarr_add (dst, ch >> 16);
3790   Dynarr_add (dst, ch >>  8);
3791   Dynarr_add (dst, ch      );
3792 }
3793
3794 void
3795 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3796                   unsigned int *flags)
3797 {
3798 }
3799
3800 \f
3801 /************************************************************************/
3802 /*                           UTF-8 methods                              */
3803 /************************************************************************/
3804
3805 static int
3806 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3807 {
3808   while (n--)
3809     {
3810       unsigned char c = *(unsigned char *)src++;
3811       switch (st->utf8.in_byte)
3812         {
3813         case 0:
3814           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3815             return 0;
3816           else if (c >= 0xfc)
3817             st->utf8.in_byte = 5;
3818           else if (c >= 0xf8)
3819             st->utf8.in_byte = 4;
3820           else if (c >= 0xf0)
3821             st->utf8.in_byte = 3;
3822           else if (c >= 0xe0)
3823             st->utf8.in_byte = 2;
3824           else if (c >= 0xc0)
3825             st->utf8.in_byte = 1;
3826           else if (c >= 0x80)
3827             return 0;
3828           break;
3829         default:
3830           if ((c & 0xc0) != 0x80)
3831             return 0;
3832           else
3833             st->utf8.in_byte--;
3834         }
3835     }
3836   return CODING_CATEGORY_UTF8_MASK;
3837 }
3838
3839 static void
3840 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3841                     unsigned_char_dynarr *dst, size_t n)
3842 {
3843   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3844   unsigned int flags    = str->flags;
3845   unsigned int cpos     = str->cpos;
3846   eol_type_t eol_type   = str->eol_type;
3847   unsigned char counter = str->counter;
3848
3849   while (n--)
3850     {
3851       unsigned char c = *(unsigned char *)src++;
3852       if (counter == 0)
3853         {
3854           if ( c < 0xC0 )
3855             {
3856               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3857               DECODE_ADD_UCS_CHAR (c, dst);
3858             }
3859           else if ( c < 0xE0 )
3860             {
3861               cpos = c & 0x1f;
3862               counter = 1;
3863             }
3864           else if ( c < 0xF0 )
3865             {
3866               cpos = c & 0x0f;
3867               counter = 2;
3868             }
3869           else if ( c < 0xF8 )
3870             {
3871               cpos = c & 0x07;
3872               counter = 3;
3873             }
3874           else if ( c < 0xFC )
3875             {
3876               cpos = c & 0x03;
3877               counter = 4;
3878             }
3879           else
3880             {
3881               cpos = c & 0x01;
3882               counter = 5;
3883             }
3884         }
3885       else if ( (c & 0xC0) == 0x80 )
3886         {
3887           cpos = ( cpos << 6 ) | ( c & 0x3f );
3888           if (counter == 1)
3889             {
3890               DECODE_ADD_UCS_CHAR (cpos, dst);
3891               cpos = 0;
3892               counter = 0;
3893             }
3894           else
3895             counter--;
3896         }
3897       else
3898         {
3899           if (counter == 5)
3900             DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
3901           else if (counter == 4)
3902             {
3903               if (cpos < (1 << 6))
3904                 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
3905               else
3906                 {
3907                   DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
3908                   DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3909                 }
3910             }
3911           else if (counter == 3)
3912             {
3913               if (cpos < (1 << 6))
3914                 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
3915               else if (cpos < (1 << 12))
3916                 {
3917                   DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
3918                   DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3919                 }
3920               else
3921                 {
3922                   DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
3923                   DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3924                   DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3925                 }
3926             }
3927           else if (counter == 2)
3928             {
3929               if (cpos < (1 << 6))
3930                 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
3931               else if (cpos < (1 << 12))
3932                 {
3933                   DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
3934                   DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3935                 }
3936               else if (cpos < (1 << 18))
3937                 {
3938                   DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
3939                   DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3940                   DECODE_ADD_BINARY_CHAR ( ( (cpos      &0x3F)|0x80), dst);
3941                 }
3942               else
3943                 {
3944                   DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
3945                   DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3946                   DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3947                   DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3948                 }
3949             }
3950           else
3951             {
3952               if (cpos < (1 << 6))
3953                 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
3954               else if (cpos < (1 << 12))
3955                 {
3956                   DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
3957                   DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3958                 }
3959               else if (cpos < (1 << 18))
3960                 {
3961                   DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
3962                   DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3963                   DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3964                 }
3965               else if (cpos < (1 << 24))
3966                 {
3967                   DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
3968                   DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3969                   DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3970                   DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3971                 }
3972               else
3973                 {
3974                   DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
3975                   DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
3976                   DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3977                   DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3978                   DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3979                 }
3980             }
3981           DECODE_ADD_BINARY_CHAR (c, dst);
3982           cpos = 0;
3983           counter = 0;
3984         }
3985     label_continue_loop:;
3986     }
3987
3988   if (flags & CODING_STATE_END)
3989     if (counter > 0)
3990       {
3991         DECODE_ADD_BINARY_CHAR (cpos, dst);
3992         cpos = 0;
3993         counter = 0;
3994       }
3995   str->flags    = flags;
3996   str->cpos     = cpos;
3997   str->counter  = counter;
3998 }
3999
4000 void
4001 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4002                   unsigned_char_dynarr *dst, unsigned int *flags)
4003 {
4004   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4005
4006   if (ch == '\n')
4007     {
4008       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4009         Dynarr_add (dst, '\r');
4010       if (eol_type != EOL_CR)
4011         Dynarr_add (dst, ch);
4012     }
4013   else if (ch <= 0x7f)
4014     {
4015       Dynarr_add (dst, ch);
4016     }
4017   else if (ch <= 0x7ff)
4018     {
4019       Dynarr_add (dst, (ch >> 6) | 0xc0);
4020       Dynarr_add (dst, (ch & 0x3f) | 0x80);
4021     }
4022   else if (ch <= 0xffff)
4023     {
4024       Dynarr_add (dst,  (ch >> 12) | 0xe0);
4025       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4026       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4027     }
4028   else if (ch <= 0x1fffff)
4029     {
4030       Dynarr_add (dst,  (ch >> 18) | 0xf0);
4031       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4032       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4033       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4034     }
4035   else if (ch <= 0x3ffffff)
4036     {
4037       Dynarr_add (dst,  (ch >> 24) | 0xf8);
4038       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4039       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4040       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4041       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4042     }
4043   else
4044     {
4045       Dynarr_add (dst,  (ch >> 30) | 0xfc);
4046       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
4047       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4048       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4049       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4050       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4051     }
4052 }
4053
4054 void
4055 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4056                   unsigned int *flags)
4057 {
4058 }
4059
4060 \f
4061 /************************************************************************/
4062 /*                           ISO2022 methods                            */
4063 /************************************************************************/
4064
4065 /* The following note describes the coding system ISO2022 briefly.
4066    Since the intention of this note is to help understand the
4067    functions in this file, some parts are NOT ACCURATE or OVERLY
4068    SIMPLIFIED.  For thorough understanding, please refer to the
4069    original document of ISO2022.
4070
4071    ISO2022 provides many mechanisms to encode several character sets
4072    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4073    is encoded using bytes less than 128.  This may make the encoded
4074    text a little bit longer, but the text passes more easily through
4075    several gateways, some of which strip off MSB (Most Signigant Bit).
4076
4077    There are two kinds of character sets: control character set and
4078    graphic character set.  The former contains control characters such
4079    as `newline' and `escape' to provide control functions (control
4080    functions are also provided by escape sequences).  The latter
4081    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4082    two control character sets and many graphic character sets.
4083
4084    Graphic character sets are classified into one of the following
4085    four classes, according to the number of bytes (DIMENSION) and
4086    number of characters in one dimension (CHARS) of the set:
4087    - DIMENSION1_CHARS94
4088    - DIMENSION1_CHARS96
4089    - DIMENSION2_CHARS94
4090    - DIMENSION2_CHARS96
4091
4092    In addition, each character set is assigned an identification tag,
4093    unique for each set, called "final character" (denoted as <F>
4094    hereafter).  The <F> of each character set is decided by ECMA(*)
4095    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4096    (0x30..0x3F are for private use only).
4097
4098    Note (*): ECMA = European Computer Manufacturers Association
4099
4100    Here are examples of graphic character set [NAME(<F>)]:
4101         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4102         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4103         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4104         o DIMENSION2_CHARS96 -- none for the moment
4105
4106    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4107         C0 [0x00..0x1F] -- control character plane 0
4108         GL [0x20..0x7F] -- graphic character plane 0
4109         C1 [0x80..0x9F] -- control character plane 1
4110         GR [0xA0..0xFF] -- graphic character plane 1
4111
4112    A control character set is directly designated and invoked to C0 or
4113    C1 by an escape sequence.  The most common case is that:
4114    - ISO646's  control character set is designated/invoked to C0, and
4115    - ISO6429's control character set is designated/invoked to C1,
4116    and usually these designations/invocations are omitted in encoded
4117    text.  In a 7-bit environment, only C0 can be used, and a control
4118    character for C1 is encoded by an appropriate escape sequence to
4119    fit into the environment.  All control characters for C1 are
4120    defined to have corresponding escape sequences.
4121
4122    A graphic character set is at first designated to one of four
4123    graphic registers (G0 through G3), then these graphic registers are
4124    invoked to GL or GR.  These designations and invocations can be
4125    done independently.  The most common case is that G0 is invoked to
4126    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4127    these invocations and designations are omitted in encoded text.
4128    In a 7-bit environment, only GL can be used.
4129
4130    When a graphic character set of CHARS94 is invoked to GL, codes
4131    0x20 and 0x7F of the GL area work as control characters SPACE and
4132    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4133    be used.
4134
4135    There are two ways of invocation: locking-shift and single-shift.
4136    With locking-shift, the invocation lasts until the next different
4137    invocation, whereas with single-shift, the invocation affects the
4138    following character only and doesn't affect the locking-shift
4139    state.  Invocations are done by the following control characters or
4140    escape sequences:
4141
4142    ----------------------------------------------------------------------
4143    abbrev  function                  cntrl escape seq   description
4144    ----------------------------------------------------------------------
4145    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4146    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4147    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4148    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4149    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4150    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4151    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4152    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4153    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4154    ----------------------------------------------------------------------
4155    (*) These are not used by any known coding system.
4156
4157    Control characters for these functions are defined by macros
4158    ISO_CODE_XXX in `coding.h'.
4159
4160    Designations are done by the following escape sequences:
4161    ----------------------------------------------------------------------
4162    escape sequence      description
4163    ----------------------------------------------------------------------
4164    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4165    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4166    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4167    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4168    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4169    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4170    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4171    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4172    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4173    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4174    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4175    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4176    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4177    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4178    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4179    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4180    ----------------------------------------------------------------------
4181
4182    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4183    of dimension 1, chars 94, and final character <F>, etc...
4184
4185    Note (*): Although these designations are not allowed in ISO2022,
4186    Emacs accepts them on decoding, and produces them on encoding
4187    CHARS96 character sets in a coding system which is characterized as
4188    7-bit environment, non-locking-shift, and non-single-shift.
4189
4190    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4191    '(' can be omitted.  We refer to this as "short-form" hereafter.
4192
4193    Now you may notice that there are a lot of ways for encoding the
4194    same multilingual text in ISO2022.  Actually, there exist many
4195    coding systems such as Compound Text (used in X11's inter client
4196    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4197    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4198    localized platforms), and all of these are variants of ISO2022.
4199
4200    In addition to the above, Emacs handles two more kinds of escape
4201    sequences: ISO6429's direction specification and Emacs' private
4202    sequence for specifying character composition.
4203
4204    ISO6429's direction specification takes the following form:
4205         o CSI ']'      -- end of the current direction
4206         o CSI '0' ']'  -- end of the current direction
4207         o CSI '1' ']'  -- start of left-to-right text
4208         o CSI '2' ']'  -- start of right-to-left text
4209    The control character CSI (0x9B: control sequence introducer) is
4210    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4211
4212    Character composition specification takes the following form:
4213         o ESC '0' -- start character composition
4214         o ESC '1' -- end character composition
4215    Since these are not standard escape sequences of any ISO standard,
4216    their use with these meanings is restricted to Emacs only.  */
4217
4218 static void
4219 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4220 {
4221   int i;
4222
4223   for (i = 0; i < 4; i++)
4224     {
4225       if (!NILP (coding_system))
4226         iso->charset[i] =
4227           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4228       else
4229         iso->charset[i] = Qt;
4230       iso->invalid_designated[i] = 0;
4231     }
4232   iso->esc = ISO_ESC_NOTHING;
4233   iso->esc_bytes_index = 0;
4234   iso->register_left = 0;
4235   iso->register_right = 1;
4236   iso->switched_dir_and_no_valid_charset_yet = 0;
4237   iso->invalid_switch_dir = 0;
4238   iso->output_direction_sequence = 0;
4239   iso->output_literally = 0;
4240 #ifdef ENABLE_COMPOSITE_CHARS
4241   if (iso->composite_chars)
4242     Dynarr_reset (iso->composite_chars);
4243 #endif
4244 }
4245
4246 static int
4247 fit_to_be_escape_quoted (unsigned char c)
4248 {
4249   switch (c)
4250     {
4251     case ISO_CODE_ESC:
4252     case ISO_CODE_CSI:
4253     case ISO_CODE_SS2:
4254     case ISO_CODE_SS3:
4255     case ISO_CODE_SO:
4256     case ISO_CODE_SI:
4257       return 1;
4258
4259     default:
4260       return 0;
4261     }
4262 }
4263
4264 /* Parse one byte of an ISO2022 escape sequence.
4265    If the result is an invalid escape sequence, return 0 and
4266    do not change anything in STR.  Otherwise, if the result is
4267    an incomplete escape sequence, update ISO2022.ESC and
4268    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4269    all the state variables (but not ISO2022.ESC_BYTES) and
4270    return 1.
4271
4272    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4273    or invocation of an invalid character set and treat that as
4274    an unrecognized escape sequence. */
4275
4276 static int
4277 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4278                    unsigned char c, unsigned int *flags,
4279                    int check_invalid_charsets)
4280 {
4281   /* (1) If we're at the end of a designation sequence, CS is the
4282      charset being designated and REG is the register to designate
4283      it to.
4284
4285      (2) If we're at the end of a locking-shift sequence, REG is
4286      the register to invoke and HALF (0 == left, 1 == right) is
4287      the half to invoke it into.
4288
4289      (3) If we're at the end of a single-shift sequence, REG is
4290      the register to invoke. */
4291   Lisp_Object cs = Qnil;
4292   int reg, half;
4293
4294   /* NOTE: This code does goto's all over the fucking place.
4295      The reason for this is that we're basically implementing
4296      a state machine here, and hierarchical languages like C
4297      don't really provide a clean way of doing this. */
4298
4299   if (! (*flags & CODING_STATE_ESCAPE))
4300     /* At beginning of escape sequence; we need to reset our
4301        escape-state variables. */
4302     iso->esc = ISO_ESC_NOTHING;
4303
4304   iso->output_literally = 0;
4305   iso->output_direction_sequence = 0;
4306
4307   switch (iso->esc)
4308     {
4309     case ISO_ESC_NOTHING:
4310       iso->esc_bytes_index = 0;
4311       switch (c)
4312         {
4313         case ISO_CODE_ESC:      /* Start escape sequence */
4314           *flags |= CODING_STATE_ESCAPE;
4315           iso->esc = ISO_ESC;
4316           goto not_done;
4317
4318         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4319           *flags |= CODING_STATE_ESCAPE;
4320           iso->esc = ISO_ESC_5_11;
4321           goto not_done;
4322
4323         case ISO_CODE_SO:       /* locking shift 1 */
4324           reg = 1; half = 0;
4325           goto locking_shift;
4326         case ISO_CODE_SI:       /* locking shift 0 */
4327           reg = 0; half = 0;
4328           goto locking_shift;
4329
4330         case ISO_CODE_SS2:      /* single shift */
4331           reg = 2;
4332           goto single_shift;
4333         case ISO_CODE_SS3:      /* single shift */
4334           reg = 3;
4335           goto single_shift;
4336
4337         default:                        /* Other control characters */
4338           return 0;
4339         }
4340
4341     case ISO_ESC:
4342       switch (c)
4343         {
4344           /**** single shift ****/
4345
4346         case 'N':       /* single shift 2 */
4347           reg = 2;
4348           goto single_shift;
4349         case 'O':       /* single shift 3 */
4350           reg = 3;
4351           goto single_shift;
4352
4353           /**** locking shift ****/
4354
4355         case '~':       /* locking shift 1 right */
4356           reg = 1; half = 1;
4357           goto locking_shift;
4358         case 'n':       /* locking shift 2 */
4359           reg = 2; half = 0;
4360           goto locking_shift;
4361         case '}':       /* locking shift 2 right */
4362           reg = 2; half = 1;
4363           goto locking_shift;
4364         case 'o':       /* locking shift 3 */
4365           reg = 3; half = 0;
4366           goto locking_shift;
4367         case '|':       /* locking shift 3 right */
4368           reg = 3; half = 1;
4369           goto locking_shift;
4370
4371 #ifdef ENABLE_COMPOSITE_CHARS
4372           /**** composite ****/
4373
4374         case '0':
4375           iso->esc = ISO_ESC_START_COMPOSITE;
4376           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4377             CODING_STATE_COMPOSITE;
4378           return 1;
4379
4380         case '1':
4381           iso->esc = ISO_ESC_END_COMPOSITE;
4382           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4383             ~CODING_STATE_COMPOSITE;
4384           return 1;
4385 #endif /* ENABLE_COMPOSITE_CHARS */
4386
4387           /**** directionality ****/
4388
4389         case '[':
4390           iso->esc = ISO_ESC_5_11;
4391           goto not_done;
4392
4393           /**** designation ****/
4394
4395         case '$':       /* multibyte charset prefix */
4396           iso->esc = ISO_ESC_2_4;
4397           goto not_done;
4398
4399         default:
4400           if (0x28 <= c && c <= 0x2F)
4401             {
4402               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4403               goto not_done;
4404             }
4405
4406           /* This function is called with CODESYS equal to nil when
4407              doing coding-system detection. */
4408           if (!NILP (codesys)
4409               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4410               && fit_to_be_escape_quoted (c))
4411             {
4412               iso->esc = ISO_ESC_LITERAL;
4413               *flags &= CODING_STATE_ISO2022_LOCK;
4414               return 1;
4415             }
4416
4417           /* bzzzt! */
4418           return 0;
4419         }
4420
4421
4422
4423       /**** directionality ****/
4424
4425     case ISO_ESC_5_11:          /* ISO6429 direction control */
4426       if (c == ']')
4427         {
4428           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4429           goto directionality;
4430         }
4431       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4432       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4433       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4434       else               return 0;
4435       goto not_done;
4436
4437     case ISO_ESC_5_11_0:
4438       if (c == ']')
4439         {
4440           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4441           goto directionality;
4442         }
4443       return 0;
4444
4445     case ISO_ESC_5_11_1:
4446       if (c == ']')
4447         {
4448           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4449           goto directionality;
4450         }
4451       return 0;
4452
4453     case ISO_ESC_5_11_2:
4454       if (c == ']')
4455         {
4456           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4457           goto directionality;
4458         }
4459       return 0;
4460
4461     directionality:
4462       iso->esc = ISO_ESC_DIRECTIONALITY;
4463       /* Various junk here to attempt to preserve the direction sequences
4464          literally in the text if they would otherwise be swallowed due
4465          to invalid designations that don't show up as actual charset
4466          changes in the text. */
4467       if (iso->invalid_switch_dir)
4468         {
4469           /* We already inserted a direction switch literally into the
4470              text.  We assume (#### this may not be right) that the
4471              next direction switch is the one going the other way,
4472              and we need to output that literally as well. */
4473           iso->output_literally = 1;
4474           iso->invalid_switch_dir = 0;
4475         }
4476       else
4477         {
4478           int jj;
4479
4480           /* If we are in the thrall of an invalid designation,
4481            then stick the directionality sequence literally into the
4482            output stream so it ends up in the original text again. */
4483           for (jj = 0; jj < 4; jj++)
4484             if (iso->invalid_designated[jj])
4485               break;
4486           if (jj < 4)
4487             {
4488               iso->output_literally = 1;
4489               iso->invalid_switch_dir = 1;
4490             }
4491           else
4492             /* Indicate that we haven't yet seen a valid designation,
4493                so that if a switch-dir is directly followed by an
4494                invalid designation, both get inserted literally. */
4495             iso->switched_dir_and_no_valid_charset_yet = 1;
4496         }
4497       return 1;
4498
4499
4500       /**** designation ****/
4501
4502     case ISO_ESC_2_4:
4503       if (0x28 <= c && c <= 0x2F)
4504         {
4505           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4506           goto not_done;
4507         }
4508       if (0x40 <= c && c <= 0x42)
4509         {
4510           /* 94^n-set */
4511           cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4512                                       *flags & CODING_STATE_R2L ?
4513                                       CHARSET_RIGHT_TO_LEFT :
4514                                       CHARSET_LEFT_TO_RIGHT);
4515           reg = 0;
4516           goto designated;
4517         }
4518       return 0;
4519
4520     default:
4521       {
4522         int chars = 0;
4523         int single = 0;
4524
4525         if (c < '0' || c > '~')
4526           return 0; /* bad final byte */
4527
4528         if (iso->esc >= ISO_ESC_2_8 &&
4529             iso->esc <= ISO_ESC_2_15)
4530           {
4531             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4532             single = 1; /* single-byte */
4533             reg = (iso->esc - ISO_ESC_2_8) & 3;
4534           }
4535         else if (iso->esc >= ISO_ESC_2_4_8 &&
4536                  iso->esc <= ISO_ESC_2_4_15)
4537           {
4538             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4539             single = -1; /* multi-byte */
4540             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4541           }
4542         else
4543           {
4544             /* Can this ever be reached? -slb */
4545             abort();
4546           }
4547
4548         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4549                                     *flags & CODING_STATE_R2L ?
4550                                     CHARSET_RIGHT_TO_LEFT :
4551                                     CHARSET_LEFT_TO_RIGHT);
4552         goto designated;
4553       }
4554     }
4555
4556  not_done:
4557   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4558   return -1;
4559
4560  single_shift:
4561   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4562     /* can't invoke something that ain't there. */
4563     return 0;
4564   iso->esc = ISO_ESC_SINGLE_SHIFT;
4565   *flags &= CODING_STATE_ISO2022_LOCK;
4566   if (reg == 2)
4567     *flags |= CODING_STATE_SS2;
4568   else
4569     *flags |= CODING_STATE_SS3;
4570   return 1;
4571
4572  locking_shift:
4573   if (check_invalid_charsets &&
4574       !CHARSETP (iso->charset[reg]))
4575     /* can't invoke something that ain't there. */
4576     return 0;
4577   if (half)
4578     iso->register_right = reg;
4579   else
4580     iso->register_left = reg;
4581   *flags &= CODING_STATE_ISO2022_LOCK;
4582   iso->esc = ISO_ESC_LOCKING_SHIFT;
4583   return 1;
4584
4585  designated:
4586   if (NILP (cs) && check_invalid_charsets)
4587     {
4588       iso->invalid_designated[reg] = 1;
4589       iso->charset[reg] = Vcharset_ascii;
4590       iso->esc = ISO_ESC_DESIGNATE;
4591       *flags &= CODING_STATE_ISO2022_LOCK;
4592       iso->output_literally = 1;
4593       if (iso->switched_dir_and_no_valid_charset_yet)
4594         {
4595           /* We encountered a switch-direction followed by an
4596              invalid designation.  Ensure that the switch-direction
4597              gets outputted; otherwise it will probably get eaten
4598              when the text is written out again. */
4599           iso->switched_dir_and_no_valid_charset_yet = 0;
4600           iso->output_direction_sequence = 1;
4601           /* And make sure that the switch-dir going the other
4602              way gets outputted, as well. */
4603           iso->invalid_switch_dir = 1;
4604         }
4605       return 1;
4606     }
4607   /* This function is called with CODESYS equal to nil when
4608      doing coding-system detection. */
4609   if (!NILP (codesys))
4610     {
4611       charset_conversion_spec_dynarr *dyn =
4612         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4613
4614       if (dyn)
4615         {
4616           int i;
4617
4618           for (i = 0; i < Dynarr_length (dyn); i++)
4619             {
4620               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4621               if (EQ (cs, spec->from_charset))
4622                 cs = spec->to_charset;
4623             }
4624         }
4625     }
4626
4627   iso->charset[reg] = cs;
4628   iso->esc = ISO_ESC_DESIGNATE;
4629   *flags &= CODING_STATE_ISO2022_LOCK;
4630   if (iso->invalid_designated[reg])
4631     {
4632       iso->invalid_designated[reg] = 0;
4633       iso->output_literally = 1;
4634     }
4635   if (iso->switched_dir_and_no_valid_charset_yet)
4636     iso->switched_dir_and_no_valid_charset_yet = 0;
4637   return 1;
4638 }
4639
4640 static int
4641 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4642 {
4643   int mask;
4644
4645   /* #### There are serious deficiencies in the recognition mechanism
4646      here.  This needs to be much smarter if it's going to cut it.
4647      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4648      it should be detected as Latin-1.
4649      All the ISO2022 stuff in this file should be synced up with the
4650      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4651      Perhaps we should wait till R2L works in FSF Emacs? */
4652
4653   if (!st->iso2022.initted)
4654     {
4655       reset_iso2022 (Qnil, &st->iso2022.iso);
4656       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4657                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4658                           CODING_CATEGORY_ISO_8_1_MASK |
4659                           CODING_CATEGORY_ISO_8_2_MASK |
4660                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4661       st->iso2022.flags = 0;
4662       st->iso2022.high_byte_count = 0;
4663       st->iso2022.saw_single_shift = 0;
4664       st->iso2022.initted = 1;
4665     }
4666
4667   mask = st->iso2022.mask;
4668
4669   while (n--)
4670     {
4671       unsigned char c = *(unsigned char *)src++;
4672       if (c >= 0xA0)
4673         {
4674           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4675           st->iso2022.high_byte_count++;
4676         }
4677       else
4678         {
4679           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4680             {
4681               if (st->iso2022.high_byte_count & 1)
4682                 /* odd number of high bytes; assume not iso-8-2 */
4683                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4684             }
4685           st->iso2022.high_byte_count = 0;
4686           st->iso2022.saw_single_shift = 0;
4687           if (c > 0x80)
4688             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4689         }
4690       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4691           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4692         { /* control chars */
4693           switch (c)
4694             {
4695               /* Allow and ignore control characters that you might
4696                  reasonably see in a text file */
4697             case '\r':
4698             case '\n':
4699             case '\t':
4700             case  7: /* bell */
4701             case  8: /* backspace */
4702             case 11: /* vertical tab */
4703             case 12: /* form feed */
4704             case 26: /* MS-DOS C-z junk */
4705             case 31: /* '^_' -- for info */
4706               goto label_continue_loop;
4707
4708             default:
4709               break;
4710             }
4711         }
4712
4713       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4714           || BYTE_C1_P (c))
4715         {
4716           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4717                                  &st->iso2022.flags, 0))
4718             {
4719               switch (st->iso2022.iso.esc)
4720                 {
4721                 case ISO_ESC_DESIGNATE:
4722                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4723                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4724                   break;
4725                 case ISO_ESC_LOCKING_SHIFT:
4726                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4727                   goto ran_out_of_chars;
4728                 case ISO_ESC_SINGLE_SHIFT:
4729                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4730                   st->iso2022.saw_single_shift = 1;
4731                   break;
4732                 default:
4733                   break;
4734                 }
4735             }
4736           else
4737             {
4738               mask = 0;
4739               goto ran_out_of_chars;
4740             }
4741         }
4742     label_continue_loop:;
4743     }
4744
4745  ran_out_of_chars:
4746
4747   return mask;
4748 }
4749
4750 static int
4751 postprocess_iso2022_mask (int mask)
4752 {
4753   /* #### kind of cheesy */
4754   /* If seven-bit ISO is allowed, then assume that the encoding is
4755      entirely seven-bit and turn off the eight-bit ones. */
4756   if (mask & CODING_CATEGORY_ISO_7_MASK)
4757     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4758                CODING_CATEGORY_ISO_8_1_MASK |
4759                CODING_CATEGORY_ISO_8_2_MASK);
4760   return mask;
4761 }
4762
4763 /* If FLAGS is a null pointer or specifies right-to-left motion,
4764    output a switch-dir-to-left-to-right sequence to DST.
4765    Also update FLAGS if it is not a null pointer.
4766    If INTERNAL_P is set, we are outputting in internal format and
4767    need to handle the CSI differently. */
4768
4769 static void
4770 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4771                                  unsigned_char_dynarr *dst,
4772                                  unsigned int *flags,
4773                                  int internal_p)
4774 {
4775   if (!flags || (*flags & CODING_STATE_R2L))
4776     {
4777       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4778         {
4779           Dynarr_add (dst, ISO_CODE_ESC);
4780           Dynarr_add (dst, '[');
4781         }
4782       else if (internal_p)
4783         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4784       else
4785         Dynarr_add (dst, ISO_CODE_CSI);
4786       Dynarr_add (dst, '0');
4787       Dynarr_add (dst, ']');
4788       if (flags)
4789         *flags &= ~CODING_STATE_R2L;
4790     }
4791 }
4792
4793 /* If FLAGS is a null pointer or specifies a direction different from
4794    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4795    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4796    sequence to DST.  Also update FLAGS if it is not a null pointer.
4797    If INTERNAL_P is set, we are outputting in internal format and
4798    need to handle the CSI differently. */
4799
4800 static void
4801 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4802                           unsigned_char_dynarr *dst, unsigned int *flags,
4803                           int internal_p)
4804 {
4805   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4806       direction == CHARSET_LEFT_TO_RIGHT)
4807     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4808   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4809            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4810            direction == CHARSET_RIGHT_TO_LEFT)
4811     {
4812       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4813         {
4814           Dynarr_add (dst, ISO_CODE_ESC);
4815           Dynarr_add (dst, '[');
4816         }
4817       else if (internal_p)
4818         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4819       else
4820         Dynarr_add (dst, ISO_CODE_CSI);
4821       Dynarr_add (dst, '2');
4822       Dynarr_add (dst, ']');
4823       if (flags)
4824         *flags |= CODING_STATE_R2L;
4825     }
4826 }
4827
4828 /* Convert ISO2022-format data to internal format. */
4829
4830 static void
4831 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4832                        unsigned_char_dynarr *dst, size_t n)
4833 {
4834   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4835   unsigned int flags    = str->flags;
4836   unsigned int cpos     = str->cpos;
4837   unsigned char counter = str->counter;
4838   eol_type_t eol_type   = str->eol_type;
4839 #ifdef ENABLE_COMPOSITE_CHARS
4840   unsigned_char_dynarr *real_dst = dst;
4841 #endif
4842   Lisp_Object coding_system;
4843
4844   XSETCODING_SYSTEM (coding_system, str->codesys);
4845
4846 #ifdef ENABLE_COMPOSITE_CHARS
4847   if (flags & CODING_STATE_COMPOSITE)
4848     dst = str->iso2022.composite_chars;
4849 #endif /* ENABLE_COMPOSITE_CHARS */
4850
4851   while (n--)
4852     {
4853       unsigned char c = *(unsigned char *)src++;
4854       if (flags & CODING_STATE_ESCAPE)
4855         {       /* Within ESC sequence */
4856           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4857                                           c, &flags, 1);
4858
4859           if (retval)
4860             {
4861               switch (str->iso2022.esc)
4862                 {
4863 #ifdef ENABLE_COMPOSITE_CHARS
4864                 case ISO_ESC_START_COMPOSITE:
4865                   if (str->iso2022.composite_chars)
4866                     Dynarr_reset (str->iso2022.composite_chars);
4867                   else
4868                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4869                   dst = str->iso2022.composite_chars;
4870                   break;
4871                 case ISO_ESC_END_COMPOSITE:
4872                   {
4873                     Bufbyte comstr[MAX_EMCHAR_LEN];
4874                     Bytecount len;
4875                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4876                                                          Dynarr_length (dst));
4877                     dst = real_dst;
4878                     len = set_charptr_emchar (comstr, emch);
4879                     Dynarr_add_many (dst, comstr, len);
4880                     break;
4881                   }
4882 #endif /* ENABLE_COMPOSITE_CHARS */
4883
4884                 case ISO_ESC_LITERAL:
4885                   COMPOSE_FLUSH_CHARS (str, dst);
4886                   DECODE_ADD_BINARY_CHAR (c, dst);
4887                   break;
4888
4889                 default:
4890                   /* Everything else handled already */
4891                   break;
4892                 }
4893             }
4894
4895           /* Attempted error recovery. */
4896           if (str->iso2022.output_direction_sequence)
4897             ensure_correct_direction (flags & CODING_STATE_R2L ?
4898                                       CHARSET_RIGHT_TO_LEFT :
4899                                       CHARSET_LEFT_TO_RIGHT,
4900                                       str->codesys, dst, 0, 1);
4901           /* More error recovery. */
4902           if (!retval || str->iso2022.output_literally)
4903             {
4904               /* Output the (possibly invalid) sequence */
4905               int i;
4906               COMPOSE_FLUSH_CHARS (str, dst);
4907               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4908                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4909               flags &= CODING_STATE_ISO2022_LOCK;
4910               if (!retval)
4911                 n++, src--;/* Repeat the loop with the same character. */
4912               else
4913                 {
4914                   /* No sense in reprocessing the final byte of the
4915                      escape sequence; it could mess things up anyway.
4916                      Just add it now. */
4917                   COMPOSE_FLUSH_CHARS (str, dst);
4918                   DECODE_ADD_BINARY_CHAR (c, dst);
4919                 }
4920             }
4921           cpos = 0;
4922           counter = 0;
4923         }
4924       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4925         { /* Control characters */
4926
4927           /***** Error-handling *****/
4928
4929           /* If we were in the middle of a character, dump out the
4930              partial character. */
4931           if (counter)
4932             {
4933               COMPOSE_FLUSH_CHARS (str, dst);
4934               while (counter > 0)
4935                 {
4936                   counter--;
4937                   DECODE_ADD_BINARY_CHAR
4938                     ((unsigned char)(cpos >> (counter * 8)), dst);
4939                 }
4940               cpos = 0;
4941             }
4942
4943           /* If we just saw a single-shift character, dump it out.
4944              This may dump out the wrong sort of single-shift character,
4945              but least it will give an indication that something went
4946              wrong. */
4947           if (flags & CODING_STATE_SS2)
4948             {
4949               COMPOSE_FLUSH_CHARS (str, dst);
4950               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4951               flags &= ~CODING_STATE_SS2;
4952             }
4953           if (flags & CODING_STATE_SS3)
4954             {
4955               COMPOSE_FLUSH_CHARS (str, dst);
4956               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4957               flags &= ~CODING_STATE_SS3;
4958             }
4959
4960           /***** Now handle the control characters. *****/
4961
4962           /* Handle CR/LF */
4963 #ifdef UTF2000
4964           if (c == '\r')
4965             {
4966               COMPOSE_FLUSH_CHARS (str, dst);
4967               if (eol_type == EOL_CR)
4968                 Dynarr_add (dst, '\n');
4969               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4970                 Dynarr_add (dst, c);
4971               else
4972                 flags |= CODING_STATE_CR;
4973               goto label_continue_loop;
4974             }
4975           else if (flags & CODING_STATE_CR)
4976             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4977               if (c != '\n')
4978                 Dynarr_add (dst, '\r');
4979               flags &= ~CODING_STATE_CR;
4980             }
4981 #else
4982           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4983 #endif
4984
4985           flags &= CODING_STATE_ISO2022_LOCK;
4986
4987           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4988             {
4989               COMPOSE_FLUSH_CHARS (str, dst);
4990               DECODE_ADD_BINARY_CHAR (c, dst);
4991             }
4992         }
4993       else
4994         {                       /* Graphic characters */
4995           Lisp_Object charset;
4996 #ifndef UTF2000
4997           Charset_ID lb;
4998 #endif
4999           int reg;
5000
5001 #ifdef UTF2000
5002           if (c == '\r')
5003             {
5004               COMPOSE_FLUSH_CHARS (str, dst);
5005               if (eol_type == EOL_CR)
5006                 Dynarr_add (dst, '\n');
5007               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5008                 Dynarr_add (dst, c);
5009               else
5010                 flags |= CODING_STATE_CR;
5011               goto label_continue_loop;
5012             }
5013           else if (flags & CODING_STATE_CR)
5014             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5015               if (c != '\n')
5016                 Dynarr_add (dst, '\r');
5017               flags &= ~CODING_STATE_CR;
5018             }
5019 #else
5020           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5021 #endif
5022
5023           /* Now determine the charset. */
5024           reg = ((flags & CODING_STATE_SS2) ? 2
5025                  : (flags & CODING_STATE_SS3) ? 3
5026                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5027                  : str->iso2022.register_left);
5028           charset = str->iso2022.charset[reg];
5029
5030           /* Error checking: */
5031           if (! CHARSETP (charset)
5032               || str->iso2022.invalid_designated[reg]
5033               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5034                   && XCHARSET_CHARS (charset) == 94))
5035             /* Mrmph.  We are trying to invoke a register that has no
5036                or an invalid charset in it, or trying to add a character
5037                outside the range of the charset.  Insert that char literally
5038                to preserve it for the output. */
5039             {
5040               COMPOSE_FLUSH_CHARS (str, dst);
5041               while (counter > 0)
5042                 {
5043                   counter--;
5044                   DECODE_ADD_BINARY_CHAR
5045                     ((unsigned char)(cpos >> (counter * 8)), dst);
5046                 }
5047               cpos = 0;
5048               DECODE_ADD_BINARY_CHAR (c, dst);
5049             }
5050
5051           else
5052             {
5053               /* Things are probably hunky-dorey. */
5054
5055               /* Fetch reverse charset, maybe. */
5056               if (((flags & CODING_STATE_R2L) &&
5057                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5058                   ||
5059                   (!(flags & CODING_STATE_R2L) &&
5060                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5061                 {
5062                   Lisp_Object new_charset =
5063                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5064                   if (!NILP (new_charset))
5065                     charset = new_charset;
5066                 }
5067
5068 #ifdef UTF2000
5069               counter++;
5070               if (XCHARSET_DIMENSION (charset) == counter)
5071                 {
5072                   COMPOSE_ADD_CHAR (str,
5073                                     DECODE_CHAR (charset,
5074                                                  ((cpos & 0x7F7F7F) << 8)
5075                                                  | (c & 0x7F)),
5076                                     dst);
5077                   cpos = 0;
5078                   counter = 0;
5079                 }
5080               else
5081                 cpos = (cpos << 8) | c;
5082 #else
5083               lb = XCHARSET_LEADING_BYTE (charset);
5084               switch (XCHARSET_REP_BYTES (charset))
5085                 {
5086                 case 1: /* ASCII */
5087                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5088                   Dynarr_add (dst, c & 0x7F);
5089                   break;
5090
5091                 case 2: /* one-byte official */
5092                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5093                   Dynarr_add (dst, lb);
5094                   Dynarr_add (dst, c | 0x80);
5095                   break;
5096
5097                 case 3: /* one-byte private or two-byte official */
5098                   if (XCHARSET_PRIVATE_P (charset))
5099                     {
5100                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5101                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5102                       Dynarr_add (dst, lb);
5103                       Dynarr_add (dst, c | 0x80);
5104                     }
5105                   else
5106                     {
5107                       if (ch)
5108                         {
5109                           Dynarr_add (dst, lb);
5110                           Dynarr_add (dst, ch | 0x80);
5111                           Dynarr_add (dst, c | 0x80);
5112                           ch = 0;
5113                         }
5114                       else
5115                         ch = c;
5116                     }
5117                   break;
5118
5119                 default:        /* two-byte private */
5120                   if (ch)
5121                     {
5122                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5123                       Dynarr_add (dst, lb);
5124                       Dynarr_add (dst, ch | 0x80);
5125                       Dynarr_add (dst, c | 0x80);
5126                       ch = 0;
5127                     }
5128                   else
5129                     ch = c;
5130                 }
5131 #endif
5132             }
5133
5134           if (!cpos)
5135             flags &= CODING_STATE_ISO2022_LOCK;
5136         }
5137
5138     label_continue_loop:;
5139     }
5140
5141   if (flags & CODING_STATE_END)
5142     {
5143       COMPOSE_FLUSH_CHARS (str, dst);
5144       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5145     }
5146   str->flags   = flags;
5147   str->cpos    = cpos;
5148   str->counter = counter;
5149 }
5150
5151
5152 /***** ISO2022 encoder *****/
5153
5154 /* Designate CHARSET into register REG. */
5155
5156 static void
5157 iso2022_designate (Lisp_Object charset, unsigned char reg,
5158                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5159 {
5160   static const char inter94[] = "()*+";
5161   static const char inter96[] = ",-./";
5162   unsigned short chars;
5163   unsigned char dimension;
5164   unsigned char final;
5165   Lisp_Object old_charset = str->iso2022.charset[reg];
5166
5167   str->iso2022.charset[reg] = charset;
5168   if (!CHARSETP (charset))
5169     /* charset might be an initial nil or t. */
5170     return;
5171   chars = XCHARSET_CHARS (charset);
5172   dimension = XCHARSET_DIMENSION (charset);
5173   final = XCHARSET_FINAL (charset);
5174   if (!str->iso2022.force_charset_on_output[reg] &&
5175       CHARSETP (old_charset) &&
5176       XCHARSET_CHARS (old_charset) == chars &&
5177       XCHARSET_DIMENSION (old_charset) == dimension &&
5178       XCHARSET_FINAL (old_charset) == final)
5179     return;
5180
5181   str->iso2022.force_charset_on_output[reg] = 0;
5182
5183   {
5184     charset_conversion_spec_dynarr *dyn =
5185       str->codesys->iso2022.output_conv;
5186
5187     if (dyn)
5188       {
5189         int i;
5190
5191         for (i = 0; i < Dynarr_length (dyn); i++)
5192           {
5193             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5194             if (EQ (charset, spec->from_charset))
5195                 charset = spec->to_charset;
5196           }
5197       }
5198   }
5199
5200   Dynarr_add (dst, ISO_CODE_ESC);
5201   switch (chars)
5202     {
5203     case 94:
5204       if (dimension == 1)
5205         Dynarr_add (dst, inter94[reg]);
5206       else
5207         {
5208           Dynarr_add (dst, '$');
5209           if (reg != 0
5210               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5211               || final < '@'
5212               || final > 'B')
5213             Dynarr_add (dst, inter94[reg]);
5214         }
5215       break;
5216     case 96:
5217       if (dimension == 1)
5218         Dynarr_add (dst, inter96[reg]);
5219       else
5220         {
5221           Dynarr_add (dst, '$');
5222           Dynarr_add (dst, inter96[reg]);
5223         }
5224       break;
5225     }
5226   Dynarr_add (dst, final);
5227 }
5228
5229 static void
5230 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5231 {
5232   if (str->iso2022.register_left != 0)
5233     {
5234       Dynarr_add (dst, ISO_CODE_SI);
5235       str->iso2022.register_left = 0;
5236     }
5237 }
5238
5239 static void
5240 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5241 {
5242   if (str->iso2022.register_left != 1)
5243     {
5244       Dynarr_add (dst, ISO_CODE_SO);
5245       str->iso2022.register_left = 1;
5246     }
5247 }
5248
5249 void
5250 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5251                      unsigned_char_dynarr *dst, unsigned int *flags)
5252 {
5253   unsigned char charmask;
5254   Lisp_Coding_System* codesys = str->codesys;
5255   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5256   int i;
5257   Lisp_Object charset = str->iso2022.current_charset;
5258   int half = str->iso2022.current_half;
5259   int code_point = -1;
5260
5261   if (ch <= 0x7F)
5262     {
5263       restore_left_to_right_direction (codesys, dst, flags, 0);
5264
5265       /* Make sure G0 contains ASCII */
5266       if ((ch > ' ' && ch < ISO_CODE_DEL)
5267           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5268         {
5269           ensure_normal_shift (str, dst);
5270           iso2022_designate (Vcharset_ascii, 0, str, dst);
5271         }
5272
5273       /* If necessary, restore everything to the default state
5274          at end-of-line */
5275       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5276         {
5277           restore_left_to_right_direction (codesys, dst, flags, 0);
5278
5279           ensure_normal_shift (str, dst);
5280
5281           for (i = 0; i < 4; i++)
5282             {
5283               Lisp_Object initial_charset =
5284                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5285               iso2022_designate (initial_charset, i, str, dst);
5286             }
5287         }
5288       if (ch == '\n')
5289         {
5290           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5291             Dynarr_add (dst, '\r');
5292           if (eol_type != EOL_CR)
5293             Dynarr_add (dst, ch);
5294         }
5295       else
5296         {
5297           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5298               && fit_to_be_escape_quoted (ch))
5299             Dynarr_add (dst, ISO_CODE_ESC);
5300           Dynarr_add (dst, ch);
5301         }
5302     }
5303   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5304     {
5305       charmask = (half == 0 ? 0x00 : 0x80);
5306
5307       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5308           && fit_to_be_escape_quoted (ch))
5309         Dynarr_add (dst, ISO_CODE_ESC);
5310       /* you asked for it ... */
5311       Dynarr_add (dst, ch);
5312     }
5313   else
5314     {
5315       int reg;
5316
5317       /* Now determine which register to use. */
5318       reg = -1;
5319       for (i = 0; i < 4; i++)
5320         {
5321           if ((CHARSETP (charset = str->iso2022.charset[i])
5322                && ((code_point = charset_code_point (charset, ch)) >= 0))
5323               ||
5324               (CHARSETP
5325                (charset
5326                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5327                && ((code_point = charset_code_point (charset, ch)) >= 0)))
5328             {
5329               reg = i;
5330               break;
5331             }
5332         }
5333       if (reg == -1)
5334         {
5335           Lisp_Object original_default_coded_charset_priority_list
5336             = Vdefault_coded_charset_priority_list;
5337
5338           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5339             {
5340               code_point = ENCODE_CHAR (ch, charset);
5341               if (XCHARSET_FINAL (charset))
5342                 goto found;
5343               Vdefault_coded_charset_priority_list
5344                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5345                                Vdefault_coded_charset_priority_list));
5346             }
5347           code_point = ENCODE_CHAR (ch, charset);
5348           if (!XCHARSET_FINAL (charset))
5349             {
5350               charset = Vcharset_ascii;
5351               code_point = '~';
5352             }
5353         found:
5354           Vdefault_coded_charset_priority_list
5355             = original_default_coded_charset_priority_list;
5356         }
5357       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5358                                 codesys, dst, flags, 0);
5359
5360       if (reg == -1)
5361         {
5362           if (XCHARSET_GRAPHIC (charset) != 0)
5363             {
5364               if (!NILP (str->iso2022.charset[1]) &&
5365                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5366                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5367                 reg = 1;
5368               else if (!NILP (str->iso2022.charset[2]))
5369                 reg = 2;
5370               else if (!NILP (str->iso2022.charset[3]))
5371                 reg = 3;
5372               else
5373                 reg = 0;
5374             }
5375           else
5376             reg = 0;
5377         }
5378
5379       iso2022_designate (charset, reg, str, dst);
5380
5381       /* Now invoke that register. */
5382       switch (reg)
5383         {
5384         case 0:
5385           ensure_normal_shift (str, dst);
5386           half = 0;
5387           break;
5388         case 1:
5389           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5390             {
5391               ensure_shift_out (str, dst);
5392               half = 0;
5393             }
5394           else
5395             half = 1;
5396           break;
5397         case 2:
5398           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5399             {
5400               Dynarr_add (dst, ISO_CODE_ESC);
5401               Dynarr_add (dst, 'N');
5402               half = 0;
5403             }
5404           else
5405             {
5406               Dynarr_add (dst, ISO_CODE_SS2);
5407               half = 1;
5408             }
5409           break;
5410         case 3:
5411           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5412             {
5413               Dynarr_add (dst, ISO_CODE_ESC);
5414               Dynarr_add (dst, 'O');
5415               half = 0;
5416             }
5417           else
5418             {
5419               Dynarr_add (dst, ISO_CODE_SS3);
5420               half = 1;
5421             }
5422           break;
5423         default:
5424           abort ();
5425         }
5426
5427       charmask = (half == 0 ? 0x00 : 0x80);
5428
5429       switch (XCHARSET_DIMENSION (charset))
5430         {
5431         case 1:
5432           Dynarr_add (dst, (code_point & 0xFF) | charmask);
5433           break;
5434         case 2:
5435           Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5436           Dynarr_add (dst, ( code_point       & 0xFF) | charmask);
5437           break;
5438         case 3:
5439           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5440           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5441           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5442           break;
5443         case 4:
5444           Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5445           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5446           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5447           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5448           break;
5449         default:
5450           abort ();
5451         }
5452     }
5453   str->iso2022.current_charset = charset;
5454   str->iso2022.current_half = half;
5455 }
5456
5457 void
5458 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5459                      unsigned int *flags)
5460 {
5461   Lisp_Coding_System* codesys = str->codesys;
5462   int i;
5463
5464   restore_left_to_right_direction (codesys, dst, flags, 0);
5465   ensure_normal_shift (str, dst);
5466   for (i = 0; i < 4; i++)
5467     {
5468       Lisp_Object initial_charset
5469         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5470       iso2022_designate (initial_charset, i, str, dst);
5471     }
5472 }
5473 #endif /* MULE */
5474 \f
5475 /************************************************************************/
5476 /*                     No-conversion methods                            */
5477 /************************************************************************/
5478
5479 /* This is used when reading in "binary" files -- i.e. files that may
5480    contain all 256 possible byte values and that are not to be
5481    interpreted as being in any particular decoding. */
5482 static void
5483 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5484                              unsigned_char_dynarr *dst, size_t n)
5485 {
5486   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5487   unsigned int flags  = str->flags;
5488   unsigned int cpos   = str->cpos;
5489   eol_type_t eol_type = str->eol_type;
5490
5491   while (n--)
5492     {
5493       unsigned char c = *(unsigned char *)src++;
5494
5495       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5496       DECODE_ADD_BINARY_CHAR (c, dst);
5497     label_continue_loop:;
5498     }
5499
5500   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5501
5502   str->flags = flags;
5503   str->cpos  = cpos;
5504 }
5505
5506 static void
5507 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5508                              unsigned_char_dynarr *dst, size_t n)
5509 {
5510   unsigned char c;
5511   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5512   unsigned int flags  = str->flags;
5513   unsigned int ch     = str->ch;
5514   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5515 #ifdef UTF2000
5516   unsigned char char_boundary = str->iso2022.current_char_boundary;
5517 #endif
5518
5519   while (n--)
5520     {
5521       c = *src++;
5522 #ifdef UTF2000
5523       if (char_boundary == 0)
5524         if ( c >= 0xfc )
5525           {
5526             ch = c & 0x01;
5527             char_boundary = 5;
5528           }
5529         else if ( c >= 0xf8 )
5530           {
5531             ch = c & 0x03;
5532             char_boundary = 4;
5533           }
5534         else if ( c >= 0xf0 )
5535           {
5536             ch = c & 0x07;
5537             char_boundary = 3;
5538           }
5539         else if ( c >= 0xe0 )
5540           {
5541             ch = c & 0x0f;
5542             char_boundary = 2;
5543           }
5544         else if ( c >= 0xc0 )
5545           {
5546             ch = c & 0x1f;
5547             char_boundary = 1;
5548           }
5549         else
5550           {
5551             ch = 0;
5552             if (c == '\n')
5553               {
5554                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5555                   Dynarr_add (dst, '\r');
5556                 if (eol_type != EOL_CR)
5557                   Dynarr_add (dst, c);
5558               }
5559             else
5560               Dynarr_add (dst, c);
5561             char_boundary = 0;
5562           }
5563       else if (char_boundary == 1)
5564         {
5565           ch = ( ch << 6 ) | ( c & 0x3f );
5566           Dynarr_add (dst, ch & 0xff);
5567           char_boundary = 0;
5568         }
5569       else
5570         {
5571           ch = ( ch << 6 ) | ( c & 0x3f );
5572           char_boundary--;
5573         }
5574 #else /* not UTF2000 */
5575       if (c == '\n')
5576         {
5577           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5578             Dynarr_add (dst, '\r');
5579           if (eol_type != EOL_CR)
5580             Dynarr_add (dst, '\n');
5581           ch = 0;
5582         }
5583       else if (BYTE_ASCII_P (c))
5584         {
5585           assert (ch == 0);
5586           Dynarr_add (dst, c);
5587         }
5588       else if (BUFBYTE_LEADING_BYTE_P (c))
5589         {
5590           assert (ch == 0);
5591           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5592               c == LEADING_BYTE_CONTROL_1)
5593             ch = c;
5594           else
5595             Dynarr_add (dst, '~'); /* untranslatable character */
5596         }
5597       else
5598         {
5599           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5600             Dynarr_add (dst, c);
5601           else if (ch == LEADING_BYTE_CONTROL_1)
5602             {
5603               assert (c < 0xC0);
5604               Dynarr_add (dst, c - 0x20);
5605             }
5606           /* else it should be the second or third byte of an
5607              untranslatable character, so ignore it */
5608           ch = 0;
5609         }
5610 #endif /* not UTF2000 */
5611     }
5612
5613   str->flags = flags;
5614   str->ch    = ch;
5615 #ifdef UTF2000
5616   str->iso2022.current_char_boundary = char_boundary;
5617 #endif
5618 }
5619
5620 \f
5621
5622 /************************************************************************/
5623 /*                             Initialization                           */
5624 /************************************************************************/
5625
5626 void
5627 syms_of_file_coding (void)
5628 {
5629   INIT_LRECORD_IMPLEMENTATION (coding_system);
5630
5631   deferror (&Qcoding_system_error, "coding-system-error",
5632             "Coding-system error", Qio_error);
5633
5634   DEFSUBR (Fcoding_system_p);
5635   DEFSUBR (Ffind_coding_system);
5636   DEFSUBR (Fget_coding_system);
5637   DEFSUBR (Fcoding_system_list);
5638   DEFSUBR (Fcoding_system_name);
5639   DEFSUBR (Fmake_coding_system);
5640   DEFSUBR (Fcopy_coding_system);
5641   DEFSUBR (Fcoding_system_canonical_name_p);
5642   DEFSUBR (Fcoding_system_alias_p);
5643   DEFSUBR (Fcoding_system_aliasee);
5644   DEFSUBR (Fdefine_coding_system_alias);
5645   DEFSUBR (Fsubsidiary_coding_system);
5646
5647   DEFSUBR (Fcoding_system_type);
5648   DEFSUBR (Fcoding_system_doc_string);
5649 #ifdef MULE
5650   DEFSUBR (Fcoding_system_charset);
5651 #endif
5652   DEFSUBR (Fcoding_system_property);
5653
5654   DEFSUBR (Fcoding_category_list);
5655   DEFSUBR (Fset_coding_priority_list);
5656   DEFSUBR (Fcoding_priority_list);
5657   DEFSUBR (Fset_coding_category_system);
5658   DEFSUBR (Fcoding_category_system);
5659
5660   DEFSUBR (Fdetect_coding_region);
5661   DEFSUBR (Fdecode_coding_region);
5662   DEFSUBR (Fencode_coding_region);
5663 #ifdef MULE
5664   DEFSUBR (Fdecode_shift_jis_char);
5665   DEFSUBR (Fencode_shift_jis_char);
5666   DEFSUBR (Fdecode_big5_char);
5667   DEFSUBR (Fencode_big5_char);
5668 #endif /* MULE */
5669   defsymbol (&Qcoding_systemp, "coding-system-p");
5670   defsymbol (&Qno_conversion, "no-conversion");
5671   defsymbol (&Qraw_text, "raw-text");
5672 #ifdef MULE
5673   defsymbol (&Qbig5, "big5");
5674   defsymbol (&Qshift_jis, "shift-jis");
5675   defsymbol (&Qucs4, "ucs-4");
5676   defsymbol (&Qutf8, "utf-8");
5677   defsymbol (&Qccl, "ccl");
5678   defsymbol (&Qiso2022, "iso2022");
5679 #endif /* MULE */
5680   defsymbol (&Qmnemonic, "mnemonic");
5681   defsymbol (&Qeol_type, "eol-type");
5682   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5683   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5684
5685   defsymbol (&Qcr, "cr");
5686   defsymbol (&Qlf, "lf");
5687   defsymbol (&Qcrlf, "crlf");
5688   defsymbol (&Qeol_cr, "eol-cr");
5689   defsymbol (&Qeol_lf, "eol-lf");
5690   defsymbol (&Qeol_crlf, "eol-crlf");
5691 #ifdef MULE
5692   defsymbol (&Qcharset_g0, "charset-g0");
5693   defsymbol (&Qcharset_g1, "charset-g1");
5694   defsymbol (&Qcharset_g2, "charset-g2");
5695   defsymbol (&Qcharset_g3, "charset-g3");
5696   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5697   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5698   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5699   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5700   defsymbol (&Qno_iso6429, "no-iso6429");
5701   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5702   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5703
5704   defsymbol (&Qshort, "short");
5705   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5706   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5707   defsymbol (&Qseven, "seven");
5708   defsymbol (&Qlock_shift, "lock-shift");
5709   defsymbol (&Qescape_quoted, "escape-quoted");
5710 #endif /* MULE */
5711 #ifdef UTF2000
5712   defsymbol (&Qdisable_composition, "disable-composition");
5713 #endif
5714   defsymbol (&Qencode, "encode");
5715   defsymbol (&Qdecode, "decode");
5716
5717 #ifdef MULE
5718   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5719              "shift-jis");
5720   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5721              "big5");
5722   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5723              "ucs-4");
5724   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5725              "utf-8");
5726   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5727              "iso-7");
5728   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5729              "iso-8-designate");
5730   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5731              "iso-8-1");
5732   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5733              "iso-8-2");
5734   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5735              "iso-lock-shift");
5736 #endif /* MULE */
5737   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5738              "no-conversion");
5739 }
5740
5741 void
5742 lstream_type_create_file_coding (void)
5743 {
5744   LSTREAM_HAS_METHOD (decoding, reader);
5745   LSTREAM_HAS_METHOD (decoding, writer);
5746   LSTREAM_HAS_METHOD (decoding, rewinder);
5747   LSTREAM_HAS_METHOD (decoding, seekable_p);
5748   LSTREAM_HAS_METHOD (decoding, flusher);
5749   LSTREAM_HAS_METHOD (decoding, closer);
5750   LSTREAM_HAS_METHOD (decoding, marker);
5751
5752   LSTREAM_HAS_METHOD (encoding, reader);
5753   LSTREAM_HAS_METHOD (encoding, writer);
5754   LSTREAM_HAS_METHOD (encoding, rewinder);
5755   LSTREAM_HAS_METHOD (encoding, seekable_p);
5756   LSTREAM_HAS_METHOD (encoding, flusher);
5757   LSTREAM_HAS_METHOD (encoding, closer);
5758   LSTREAM_HAS_METHOD (encoding, marker);
5759 }
5760
5761 void
5762 vars_of_file_coding (void)
5763 {
5764   int i;
5765
5766   fcd = xnew (struct file_coding_dump);
5767   dumpstruct (&fcd, &fcd_description);
5768
5769   /* Initialize to something reasonable ... */
5770   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5771     {
5772       fcd->coding_category_system[i] = Qnil;
5773       fcd->coding_category_by_priority[i] = i;
5774     }
5775
5776   Fprovide (intern ("file-coding"));
5777
5778   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5779 Coding system used for TTY keyboard input.
5780 Not used under a windowing system.
5781 */ );
5782   Vkeyboard_coding_system = Qnil;
5783
5784   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5785 Coding system used for TTY display output.
5786 Not used under a windowing system.
5787 */ );
5788   Vterminal_coding_system = Qnil;
5789
5790   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5791 Overriding coding system used when reading from a file or process.
5792 You should bind this variable with `let', but do not set it globally.
5793 If this is non-nil, it specifies the coding system that will be used
5794 to decode input on read operations, such as from a file or process.
5795 It overrides `buffer-file-coding-system-for-read',
5796 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5797 this one for permanent changes to the environment.  */ );
5798   Vcoding_system_for_read = Qnil;
5799
5800   DEFVAR_LISP ("coding-system-for-write",
5801                &Vcoding_system_for_write /*
5802 Overriding coding system used when writing to a file or process.
5803 You should bind this variable with `let', but do not set it globally.
5804 If this is non-nil, it specifies the coding system that will be used
5805 to encode output for write operations, such as to a file or process.
5806 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5807 Use those variables instead of this one for permanent changes to the
5808 environment.  */ );
5809   Vcoding_system_for_write = Qnil;
5810
5811   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5812 Coding system used to convert pathnames when accessing files.
5813 */ );
5814   Vfile_name_coding_system = Qnil;
5815
5816   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5817 Non-nil means the buffer contents are regarded as multi-byte form
5818 of characters, not a binary code.  This affects the display, file I/O,
5819 and behaviors of various editing commands.
5820
5821 Setting this to nil does not do anything.
5822 */ );
5823   enable_multibyte_characters = 1;
5824 }
5825
5826 void
5827 complex_vars_of_file_coding (void)
5828 {
5829   staticpro (&Vcoding_system_hash_table);
5830   Vcoding_system_hash_table =
5831     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5832
5833   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5834   dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5835
5836 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5837 {                                               \
5838   struct codesys_prop csp;                      \
5839   csp.sym = (Sym);                              \
5840   csp.prop_type = (Prop_Type);                  \
5841   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5842 } while (0)
5843
5844   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5845   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5846   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5847   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5848   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5849   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5850   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5851 #ifdef MULE
5852   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5853   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5854   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5855   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5856   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5857   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5858   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5859   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5860   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5861   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5862   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5863   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5864   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5865   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5866   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5867   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5868   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5869
5870   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5871   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5872 #endif /* MULE */
5873   /* Need to create this here or we're really screwed. */
5874   Fmake_coding_system
5875     (Qraw_text, Qno_conversion,
5876      build_string ("Raw text, which means it converts only line-break-codes."),
5877      list2 (Qmnemonic, build_string ("Raw")));
5878
5879   Fmake_coding_system
5880     (Qbinary, Qno_conversion,
5881      build_string ("Binary, which means it does not convert anything."),
5882      list4 (Qeol_type, Qlf,
5883             Qmnemonic, build_string ("Binary")));
5884
5885 #ifdef UTF2000
5886   Fmake_coding_system
5887     (Qutf8, Qutf8,
5888      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5889      list2 (Qmnemonic, build_string ("UTF8")));
5890 #endif
5891
5892   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5893
5894   Fdefine_coding_system_alias (Qfile_name, Qbinary);
5895
5896   Fdefine_coding_system_alias (Qterminal, Qbinary);
5897   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5898
5899   /* Need this for bootstrapping */
5900   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5901     Fget_coding_system (Qraw_text);
5902
5903 #ifdef UTF2000
5904   fcd->coding_category_system[CODING_CATEGORY_UTF8]
5905    = Fget_coding_system (Qutf8);
5906 #endif
5907
5908 #if defined(MULE) && !defined(UTF2000)
5909   {
5910     unsigned int i;
5911
5912     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5913       fcd->ucs_to_mule_table[i] = Qnil;
5914   }
5915   staticpro (&mule_to_ucs_table);
5916   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5917 #endif /* defined(MULE) && !defined(UTF2000) */
5918 }