src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #ifdef MULE
  34 #include "mule-ccl.h"
  35 #include "chartab.h"
  36 #endif
  37 #include "file-coding.h"
  38
  39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  40
  41 Lisp_Object Vkeyboard_coding_system;
  42 Lisp_Object Vterminal_coding_system;
  43 Lisp_Object Vcoding_system_for_read;
  44 Lisp_Object Vcoding_system_for_write;
  45 Lisp_Object Vfile_name_coding_system;
  46
  47 /* Table of symbols identifying each coding category. */
  48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  49
  50 /* Coding system currently associated with each coding category. */
  51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  52
  53 /* Table of all coding categories in decreasing order of priority.
  54    This describes a permutation of the possible coding categories. */
  55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  56
  57 Lisp_Object Qcoding_system_p;
  58
  59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  60 /* Qinternal in general.c */
  61
  62 Lisp_Object Qmnemonic, Qeol_type;
  63 Lisp_Object Qcr, Qcrlf, Qlf;
  64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  65 Lisp_Object Qpost_read_conversion;
  66 Lisp_Object Qpre_write_conversion;
  67
  68 #ifdef MULE
  69 Lisp_Object Qucs4, Qutf8;
  70 Lisp_Object Qbig5, Qshift_jis;
  71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  74 Lisp_Object Qno_iso6429;
  75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  76 Lisp_Object Qctext, Qescape_quoted;
  77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  78 #endif
  79 Lisp_Object Qencode, Qdecode;
  80
  81 Lisp_Object Vcoding_system_hash_table;
  82
  83 int enable_multibyte_characters;
  84
  85 #ifdef MULE
  86 /* Additional information used by the ISO2022 decoder and detector. */
  87 struct iso2022_decoder
  88 {
  89   /* CHARSET holds the character sets currently assigned to the G0
  90      through G3 variables.  It is initialized from the array
  91      INITIAL_CHARSET in CODESYS. */
  92   Lisp_Object charset[4];
  93
  94   /* Which registers are currently invoked into the left (GL) and
  95      right (GR) halves of the 8-bit encoding space? */
  96   int register_left, register_right;
  97
  98   /* ISO_ESC holds a value indicating part of an escape sequence
  99      that has already been seen. */
 100   enum iso_esc_flag esc;
 101
 102   /* This records the bytes we've seen so far in an escape sequence,
 103      in case the sequence is invalid (we spit out the bytes unchanged). */
 104   unsigned char esc_bytes[8];
 105
 106   /* Index for next byte to store in ISO escape sequence. */
 107   int esc_bytes_index;
 108
 109 #ifdef ENABLE_COMPOSITE_CHARS
 110   /* Stuff seen so far when composing a string. */
 111   unsigned_char_dynarr *composite_chars;
 112 #endif
 113
 114   /* If we saw an invalid designation sequence for a particular
 115      register, we flag it here and switch to ASCII.  The next time we
 116      see a valid designation for this register, we turn off the flag
 117      and do the designation normally, but pretend the sequence was
 118      invalid.  The effect of all this is that (most of the time) the
 119      escape sequences for both the switch to the unknown charset, and
 120      the switch back to the known charset, get inserted literally into
 121      the buffer and saved out as such.  The hope is that we can
 122      preserve the escape sequences so that the resulting written out
 123      file makes sense.  If we don't do any of this, the designation
 124      to the invalid charset will be preserved but that switch back
 125      to the known charset will probably get eaten because it was
 126      the same charset that was already present in the register. */
 127   unsigned char invalid_designated[4];
 128
 129   /* We try to do similar things as above for direction-switching
 130      sequences.  If we encountered a direction switch while an
 131      invalid designation was present, or an invalid designation
 132      just after a direction switch (i.e. no valid designation
 133      encountered yet), we insert the direction-switch escape
 134      sequence literally into the output stream, and later on
 135      insert the corresponding direction-restoring escape sequence
 136      literally also. */
 137   unsigned int switched_dir_and_no_valid_charset_yet :1;
 138   unsigned int invalid_switch_dir :1;
 139
 140   /* Tells the decoder to output the escape sequence literally
 141      even though it was valid.  Used in the games we play to
 142      avoid lossage when we encounter invalid designations. */
 143   unsigned int output_literally :1;
 144   /* We encountered a direction switch followed by an invalid
 145      designation.  We didn't output the direction switch
 146      literally because we didn't know about the invalid designation;
 147      but we have to do so now. */
 148   unsigned int output_direction_sequence :1;
 149 };
 150 #endif /* MULE */
 151 EXFUN (Fcopy_coding_system, 2);
 152 #ifdef MULE
 153 struct detection_state;
 154 static int detect_coding_sjis (struct detection_state *st,
 155                                CONST unsigned char *src,
 156                                unsigned int n);
 157 static void decode_coding_sjis (Lstream *decoding,
 158                                 CONST unsigned char *src,
 159                                 unsigned_char_dynarr *dst,
 160                                 unsigned int n);
 161 static void encode_coding_sjis (Lstream *encoding,
 162                                 CONST unsigned char *src,
 163                                 unsigned_char_dynarr *dst,
 164                                 unsigned int n);
 165 static int detect_coding_big5 (struct detection_state *st,
 166                                CONST unsigned char *src,
 167                                unsigned int n);
 168 static void decode_coding_big5 (Lstream *decoding,
 169                                 CONST unsigned char *src,
 170                                 unsigned_char_dynarr *dst, unsigned int n);
 171 static void encode_coding_big5 (Lstream *encoding,
 172                                 CONST unsigned char *src,
 173                                 unsigned_char_dynarr *dst, unsigned int n);
 174 static int detect_coding_ucs4 (struct detection_state *st,
 175                                CONST unsigned char *src,
 176                                unsigned int n);
 177 static void decode_coding_ucs4 (Lstream *decoding,
 178                                 CONST unsigned char *src,
 179                                 unsigned_char_dynarr *dst, unsigned int n);
 180 static void encode_coding_ucs4 (Lstream *encoding,
 181                                 CONST unsigned char *src,
 182                                 unsigned_char_dynarr *dst, unsigned int n);
 183 static int detect_coding_utf8 (struct detection_state *st,
 184                                CONST unsigned char *src,
 185                                unsigned int n);
 186 static void decode_coding_utf8 (Lstream *decoding,
 187                                 CONST unsigned char *src,
 188                                 unsigned_char_dynarr *dst, unsigned int n);
 189 static void encode_coding_utf8 (Lstream *encoding,
 190                                 CONST unsigned char *src,
 191                                 unsigned_char_dynarr *dst, unsigned int n);
 192 static int postprocess_iso2022_mask (int mask);
 193 static void reset_iso2022 (Lisp_Object coding_system,
 194                            struct iso2022_decoder *iso);
 195 static int detect_coding_iso2022 (struct detection_state *st,
 196                                   CONST unsigned char *src,
 197                                   unsigned int n);
 198 static void decode_coding_iso2022 (Lstream *decoding,
 199                                    CONST unsigned char *src,
 200                                    unsigned_char_dynarr *dst, unsigned int n);
 201 static void encode_coding_iso2022 (Lstream *encoding,
 202                                    CONST unsigned char *src,
 203                                    unsigned_char_dynarr *dst, unsigned int n);
 204 #endif /* MULE */
 205 static void decode_coding_no_conversion (Lstream *decoding,
 206                                          CONST unsigned char *src,
 207                                          unsigned_char_dynarr *dst,
 208                                          unsigned int n);
 209 static void encode_coding_no_conversion (Lstream *encoding,
 210                                          CONST unsigned char *src,
 211                                          unsigned_char_dynarr *dst,
 212                                          unsigned int n);
 213 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 214                          unsigned_char_dynarr *dst, unsigned int n);
 215 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 216                          unsigned_char_dynarr *dst, unsigned int n);
 217
 218 typedef struct codesys_prop codesys_prop;
 219 struct codesys_prop
 220 {
 221   Lisp_Object sym;
 222   int prop_type;
 223 };
 224
 225 typedef struct
 226 {
 227   Dynarr_declare (codesys_prop);
 228 } codesys_prop_dynarr;
 229
 230 codesys_prop_dynarr *the_codesys_prop_dynarr;
 231
 232 enum codesys_prop_enum
 233 {
 234   CODESYS_PROP_ALL_OK,
 235   CODESYS_PROP_ISO2022,
 236   CODESYS_PROP_CCL
 237 };
 238
 239 \f
 240 /************************************************************************/
 241 /*                       Coding system functions                        */
 242 /************************************************************************/
 243
 244 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 245 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 246 static void finalize_coding_system (void *header, int for_disksave);
 247
 248 #ifdef MULE
 249 static const struct lrecord_description ccs_description_1[] = {
 250   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 251   { XD_END }
 252 };
 253
 254 static const struct struct_description ccs_description = {
 255   sizeof(charset_conversion_spec),
 256   ccs_description_1
 257 };
 258
 259 static const struct lrecord_description ccsd_description_1[] = {
 260   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 261   { XD_END }
 262 };
 263
 264 static const struct struct_description ccsd_description = {
 265   sizeof(charset_conversion_spec_dynarr),
 266   ccsd_description_1
 267 };
 268 #endif
 269
 270 static const struct lrecord_description coding_system_description[] = {
 271   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 272   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 273   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 274 #ifdef MULE
 275   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 276   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 277   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 278   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 279 #endif
 280   { XD_END }
 281 };
 282
 283 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 284                                mark_coding_system, print_coding_system,
 285                                finalize_coding_system,
 286                                0, 0, coding_system_description,
 287                                struct Lisp_Coding_System);
 288
 289 static Lisp_Object
 290 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 291 {
 292   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 293
 294   markobj (CODING_SYSTEM_NAME (codesys));
 295   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 296   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 297   markobj (CODING_SYSTEM_EOL_LF (codesys));
 298   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 299   markobj (CODING_SYSTEM_EOL_CR (codesys));
 300
 301   switch (CODING_SYSTEM_TYPE (codesys))
 302     {
 303 #ifdef MULE
 304       int i;
 305     case CODESYS_ISO2022:
 306       for (i = 0; i < 4; i++)
 307         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 308       if (codesys->iso2022.input_conv)
 309         {
 310           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 311             {
 312               struct charset_conversion_spec *ccs =
 313                 Dynarr_atp (codesys->iso2022.input_conv, i);
 314               markobj (ccs->from_charset);
 315               markobj (ccs->to_charset);
 316             }
 317         }
 318       if (codesys->iso2022.output_conv)
 319         {
 320           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 321             {
 322               struct charset_conversion_spec *ccs =
 323                 Dynarr_atp (codesys->iso2022.output_conv, i);
 324               markobj (ccs->from_charset);
 325               markobj (ccs->to_charset);
 326             }
 327         }
 328       break;
 329
 330     case CODESYS_CCL:
 331       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 332       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 333       break;
 334 #endif /* MULE */
 335     default:
 336       break;
 337     }
 338
 339   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 340   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 341 }
 342
 343 static void
 344 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 345                      int escapeflag)
 346 {
 347   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 348   if (print_readably)
 349     error ("printing unreadable object #<coding_system 0x%x>",
 350            c->header.uid);
 351
 352   write_c_string ("#<coding_system ", printcharfun);
 353   print_internal (c->name, printcharfun, 1);
 354   write_c_string (">", printcharfun);
 355 }
 356
 357 static void
 358 finalize_coding_system (void *header, int for_disksave)
 359 {
 360   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 361   /* Since coding systems never go away, this function is not
 362      necessary.  But it would be necessary if we changed things
 363      so that coding systems could go away. */
 364   if (!for_disksave) /* see comment in lstream.c */
 365     {
 366       switch (CODING_SYSTEM_TYPE (c))
 367         {
 368 #ifdef MULE
 369         case CODESYS_ISO2022:
 370           if (c->iso2022.input_conv)
 371             {
 372               Dynarr_free (c->iso2022.input_conv);
 373               c->iso2022.input_conv = 0;
 374             }
 375           if (c->iso2022.output_conv)
 376             {
 377               Dynarr_free (c->iso2022.output_conv);
 378               c->iso2022.output_conv = 0;
 379             }
 380           break;
 381 #endif /* MULE */
 382         default:
 383           break;
 384         }
 385     }
 386 }
 387
 388 static enum eol_type
 389 symbol_to_eol_type (Lisp_Object symbol)
 390 {
 391   CHECK_SYMBOL (symbol);
 392   if (NILP (symbol))      return EOL_AUTODETECT;
 393   if (EQ (symbol, Qlf))   return EOL_LF;
 394   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 395   if (EQ (symbol, Qcr))   return EOL_CR;
 396
 397   signal_simple_error ("Unrecognized eol type", symbol);
 398   return EOL_AUTODETECT; /* not reached */
 399 }
 400
 401 static Lisp_Object
 402 eol_type_to_symbol (enum eol_type type)
 403 {
 404   switch (type)
 405     {
 406     default: abort ();
 407     case EOL_LF:         return Qlf;
 408     case EOL_CRLF:       return Qcrlf;
 409     case EOL_CR:         return Qcr;
 410     case EOL_AUTODETECT: return Qnil;
 411     }
 412 }
 413
 414 static void
 415 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 416 {
 417   Lisp_Object codesys_obj;
 418   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 419   char *codesys_name = (char *) alloca (len + 7);
 420   int mlen = -1;
 421   char *codesys_mnemonic=0;
 422
 423   Lisp_Object codesys_name_sym, sub_codesys_obj;
 424
 425   /* kludge */
 426
 427   XSETCODING_SYSTEM (codesys_obj, codesys);
 428
 429   memcpy (codesys_name,
 430           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 431
 432   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 433     {
 434       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 435       codesys_mnemonic = (char *) alloca (mlen + 7);
 436       memcpy (codesys_mnemonic,
 437               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 438     }
 439
 440 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 441   strcpy (codesys_name + len, "-" op_sys);                                      \
 442   if (mlen != -1)                                                               \
 443     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 444   codesys_name_sym = intern (codesys_name);                                     \
 445   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 446   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 447   if (mlen != -1)                                                               \
 448     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 449       build_string (codesys_mnemonic);                                          \
 450   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 451 } while (0)
 452
 453   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 454   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 455   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 456 }
 457
 458 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 459 Return t if OBJECT is a coding system.
 460 A coding system is an object that defines how text containing multiple
 461 character sets is encoded into a stream of (typically 8-bit) bytes.
 462 The coding system is used to decode the stream into a series of
 463 characters (which may be from multiple charsets) when the text is read
 464 from a file or process, and is used to encode the text back into the
 465 same format when it is written out to a file or process.
 466
 467 For example, many ISO2022-compliant coding systems (such as Compound
 468 Text, which is used for inter-client data under the X Window System)
 469 use escape sequences to switch between different charsets -- Japanese
 470 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 471 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 472 `make-coding-system' for more information.
 473
 474 Coding systems are normally identified using a symbol, and the
 475 symbol is accepted in place of the actual coding system object whenever
 476 a coding system is called for. (This is similar to how faces work.)
 477 */
 478        (object))
 479 {
 480   return CODING_SYSTEMP (object) ? Qt : Qnil;
 481 }
 482
 483 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 484 Retrieve the coding system of the given name.
 485
 486 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 487 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 488 If there is no such coding system, nil is returned.  Otherwise the
 489 associated coding system object is returned.
 490 */
 491        (coding_system_or_name))
 492 {
 493   if (CODING_SYSTEMP (coding_system_or_name))
 494     return coding_system_or_name;
 495
 496   if (NILP (coding_system_or_name))
 497     coding_system_or_name = Qbinary;
 498   else
 499     CHECK_SYMBOL (coding_system_or_name);
 500
 501   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 502 }
 503
 504 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 505 Retrieve the coding system of the given name.
 506 Same as `find-coding-system' except that if there is no such
 507 coding system, an error is signaled instead of returning nil.
 508 */
 509        (name))
 510 {
 511   Lisp_Object coding_system = Ffind_coding_system (name);
 512
 513   if (NILP (coding_system))
 514     signal_simple_error ("No such coding system", name);
 515   return coding_system;
 516 }
 517
 518 /* We store the coding systems in hash tables with the names as the key and the
 519    actual coding system object as the value.  Occasionally we need to use them
 520    in a list format.  These routines provide us with that. */
 521 struct coding_system_list_closure
 522 {
 523   Lisp_Object *coding_system_list;
 524 };
 525
 526 static int
 527 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 528                                   void *coding_system_list_closure)
 529 {
 530   /* This function can GC */
 531   struct coding_system_list_closure *cscl =
 532     (struct coding_system_list_closure *) coding_system_list_closure;
 533   Lisp_Object *coding_system_list = cscl->coding_system_list;
 534
 535   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 536                                *coding_system_list);
 537   return 0;
 538 }
 539
 540 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 541 Return a list of the names of all defined coding systems.
 542 */
 543        ())
 544 {
 545   Lisp_Object coding_system_list = Qnil;
 546   struct gcpro gcpro1;
 547   struct coding_system_list_closure coding_system_list_closure;
 548
 549   GCPRO1 (coding_system_list);
 550   coding_system_list_closure.coding_system_list = &coding_system_list;
 551   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 552                  &coding_system_list_closure);
 553   UNGCPRO;
 554
 555   return coding_system_list;
 556 }
 557
 558 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 559 Return the name of the given coding system.
 560 */
 561        (coding_system))
 562 {
 563   coding_system = Fget_coding_system (coding_system);
 564   return XCODING_SYSTEM_NAME (coding_system);
 565 }
 566
 567 static Lisp_Coding_System *
 568 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 569 {
 570   Lisp_Coding_System *codesys =
 571     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 572
 573   zero_lcrecord (codesys);
 574   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 575   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 576   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 577   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 578   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 579   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 580   CODING_SYSTEM_TYPE     (codesys) = type;
 581   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 582 #ifdef MULE
 583   if (type == CODESYS_ISO2022)
 584     {
 585       int i;
 586       for (i = 0; i < 4; i++)
 587         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 588     }
 589   else if (type == CODESYS_CCL)
 590     {
 591       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 592       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 593     }
 594 #endif /* MULE */
 595   CODING_SYSTEM_NAME (codesys) = name;
 596
 597   return codesys;
 598 }
 599
 600 #ifdef MULE
 601 /* Given a list of charset conversion specs as specified in a Lisp
 602    program, parse it into STORE_HERE. */
 603
 604 static void
 605 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 606                                 Lisp_Object spec_list)
 607 {
 608   Lisp_Object rest;
 609
 610   EXTERNAL_LIST_LOOP (rest, spec_list)
 611     {
 612       Lisp_Object car = XCAR (rest);
 613       Lisp_Object from, to;
 614       struct charset_conversion_spec spec;
 615
 616       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 617         signal_simple_error ("Invalid charset conversion spec", car);
 618       from = Fget_charset (XCAR (car));
 619       to = Fget_charset (XCAR (XCDR (car)));
 620       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 621         signal_simple_error_2
 622           ("Attempted conversion between different charset types",
 623            from, to);
 624       spec.from_charset = from;
 625       spec.to_charset = to;
 626
 627       Dynarr_add (store_here, spec);
 628     }
 629 }
 630
 631 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 632    specs, return the equivalent as the Lisp programmer would see it.
 633
 634    If LOAD_HERE is 0, return Qnil. */
 635
 636 static Lisp_Object
 637 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 638 {
 639   int i;
 640   Lisp_Object result;
 641
 642   if (!load_here)
 643     return Qnil;
 644   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 645     {
 646       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 647       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 648     }
 649
 650   return Fnreverse (result);
 651 }
 652
 653 #endif /* MULE */
 654
 655 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 656 Register symbol NAME as a coding system.
 657
 658 TYPE describes the conversion method used and should be one of
 659
 660 nil or 'undecided
 661      Automatic conversion.  XEmacs attempts to detect the coding system
 662      used in the file.
 663 'no-conversion
 664      No conversion.  Use this for binary files and such.  On output,
 665      graphic characters that are not in ASCII or Latin-1 will be
 666      replaced by a ?. (For a no-conversion-encoded buffer, these
 667      characters will only be present if you explicitly insert them.)
 668 'shift-jis
 669      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 670 'ucs-4
 671      ISO 10646 UCS-4 encoding.
 672 'utf-8
 673      ISO 10646 UTF-8 encoding.
 674 'iso2022
 675      Any ISO2022-compliant encoding.  Among other things, this includes
 676      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 677      standard Unix encoding for Japanese and other languages), and
 678      Compound Text (the encoding used in X11).  You can specify more
 679      specific information about the conversion with the FLAGS argument.
 680 'big5
 681      Big5 (the encoding commonly used for Taiwanese).
 682 'ccl
 683      The conversion is performed using a user-written pseudo-code
 684      program.  CCL (Code Conversion Language) is the name of this
 685      pseudo-code.
 686 'internal
 687      Write out or read in the raw contents of the memory representing
 688      the buffer's text.  This is primarily useful for debugging
 689      purposes, and is only enabled when XEmacs has been compiled with
 690      DEBUG_XEMACS defined (via the --debug configure option).
 691      WARNING: Reading in a file using 'internal conversion can result
 692      in an internal inconsistency in the memory representing a
 693      buffer's text, which will produce unpredictable results and may
 694      cause XEmacs to crash.  Under normal circumstances you should
 695      never use 'internal conversion.
 696
 697 DOC-STRING is a string describing the coding system.
 698
 699 PROPS is a property list, describing the specific nature of the
 700 character set.  Recognized properties are:
 701
 702 'mnemonic
 703      String to be displayed in the modeline when this coding system is
 704      active.
 705
 706 'eol-type
 707      End-of-line conversion to be used.  It should be one of
 708
 709         nil
 710                 Automatically detect the end-of-line type (LF, CRLF,
 711                 or CR).  Also generate subsidiary coding systems named
 712                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 713                 identical to this coding system but have an EOL-TYPE
 714                 value of 'lf, 'crlf, and 'cr, respectively.
 715         'lf
 716                 The end of a line is marked externally using ASCII LF.
 717                 Since this is also the way that XEmacs represents an
 718                 end-of-line internally, specifying this option results
 719                 in no end-of-line conversion.  This is the standard
 720                 format for Unix text files.
 721         'crlf
 722                 The end of a line is marked externally using ASCII
 723                 CRLF.  This is the standard format for MS-DOS text
 724                 files.
 725         'cr
 726                 The end of a line is marked externally using ASCII CR.
 727                 This is the standard format for Macintosh text files.
 728         t
 729                 Automatically detect the end-of-line type but do not
 730                 generate subsidiary coding systems.  (This value is
 731                 converted to nil when stored internally, and
 732                 `coding-system-property' will return nil.)
 733
 734 'post-read-conversion
 735      Function called after a file has been read in, to perform the
 736      decoding.  Called with two arguments, BEG and END, denoting
 737      a region of the current buffer to be decoded.
 738
 739 'pre-write-conversion
 740      Function called before a file is written out, to perform the
 741      encoding.  Called with two arguments, BEG and END, denoting
 742      a region of the current buffer to be encoded.
 743
 744
 745 The following additional properties are recognized if TYPE is 'iso2022:
 746
 747 'charset-g0
 748 'charset-g1
 749 'charset-g2
 750 'charset-g3
 751      The character set initially designated to the G0 - G3 registers.
 752      The value should be one of
 753
 754           -- A charset object (designate that character set)
 755           -- nil (do not ever use this register)
 756           -- t (no character set is initially designated to
 757                 the register, but may be later on; this automatically
 758                 sets the corresponding `force-g*-on-output' property)
 759
 760 'force-g0-on-output
 761 'force-g1-on-output
 762 'force-g2-on-output
 763 'force-g2-on-output
 764      If non-nil, send an explicit designation sequence on output before
 765      using the specified register.
 766
 767 'short
 768      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 769      "ESC $ B" on output in place of the full designation sequences
 770      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 771
 772 'no-ascii-eol
 773      If non-nil, don't designate ASCII to G0 at each end of line on output.
 774      Setting this to non-nil also suppresses other state-resetting that
 775      normally happens at the end of a line.
 776
 777 'no-ascii-cntl
 778      If non-nil, don't designate ASCII to G0 before control chars on output.
 779
 780 'seven
 781      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 782      environment.
 783
 784 'lock-shift
 785      If non-nil, use locking-shift (SO/SI) instead of single-shift
 786      or designation by escape sequence.
 787
 788 'no-iso6429
 789      If non-nil, don't use ISO6429's direction specification.
 790
 791 'escape-quoted
 792      If non-nil, literal control characters that are the same as
 793      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 794      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 795      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 796      so that they can be properly distinguished from an escape sequence.
 797      (Note that doing this results in a non-portable encoding.) This
 798      encoding flag is used for byte-compiled files.  Note that ESC
 799      is a good choice for a quoting character because there are no
 800      escape sequences whose second byte is a character from the Control-0
 801      or Control-1 character sets; this is explicitly disallowed by the
 802      ISO2022 standard.
 803
 804 'input-charset-conversion
 805      A list of conversion specifications, specifying conversion of
 806      characters in one charset to another when decoding is performed.
 807      Each specification is a list of two elements: the source charset,
 808      and the destination charset.
 809
 810 'output-charset-conversion
 811      A list of conversion specifications, specifying conversion of
 812      characters in one charset to another when encoding is performed.
 813      The form of each specification is the same as for
 814      'input-charset-conversion.
 815
 816
 817 The following additional properties are recognized (and required)
 818 if TYPE is 'ccl:
 819
 820 'decode
 821      CCL program used for decoding (converting to internal format).
 822
 823 'encode
 824      CCL program used for encoding (converting to external format).
 825 */
 826        (name, type, doc_string, props))
 827 {
 828   Lisp_Coding_System *codesys;
 829   Lisp_Object rest, key, value;
 830   enum coding_system_type ty;
 831   int need_to_setup_eol_systems = 1;
 832
 833   /* Convert type to constant */
 834   if (NILP (type) || EQ (type, Qundecided))
 835                                       { ty = CODESYS_AUTODETECT; }
 836 #ifdef MULE
 837   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 838   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 839   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 840   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 841   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 842   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 843 #endif
 844   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 845 #ifdef DEBUG_XEMACS
 846   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 847 #endif
 848   else
 849     signal_simple_error ("Invalid coding system type", type);
 850
 851   CHECK_SYMBOL (name);
 852
 853   codesys = allocate_coding_system (ty, name);
 854
 855   if (NILP (doc_string))
 856     doc_string = build_string ("");
 857   else
 858     CHECK_STRING (doc_string);
 859   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 860
 861 #ifdef UTF2000
 862   if (ty == CODESYS_NO_CONVERSION)
 863     codesys->fixed.size = 1;
 864 #endif
 865   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 866     {
 867       if (EQ (key, Qmnemonic))
 868         {
 869           if (!NILP (value))
 870             CHECK_STRING (value);
 871           CODING_SYSTEM_MNEMONIC (codesys) = value;
 872         }
 873
 874       else if (EQ (key, Qeol_type))
 875         {
 876           need_to_setup_eol_systems = NILP (value);
 877           if (EQ (value, Qt))
 878             value = Qnil;
 879           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 880         }
 881
 882       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 883       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 884 #ifdef MULE
 885       else if (ty == CODESYS_ISO2022)
 886         {
 887 #define FROB_INITIAL_CHARSET(charset_num) \
 888   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 889     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 890
 891           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 892           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 893           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 894           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 895
 896 #define FROB_FORCE_CHARSET(charset_num) \
 897   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 898
 899           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 900           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 901           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 902           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 903
 904 #define FROB_BOOLEAN_PROPERTY(prop) \
 905   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 906
 907           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 908           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 909           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 910           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 911           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 912           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 913           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 914
 915           else if (EQ (key, Qinput_charset_conversion))
 916             {
 917               codesys->iso2022.input_conv =
 918                 Dynarr_new (charset_conversion_spec);
 919               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 920                                               value);
 921             }
 922           else if (EQ (key, Qoutput_charset_conversion))
 923             {
 924               codesys->iso2022.output_conv =
 925                 Dynarr_new (charset_conversion_spec);
 926               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 927                                               value);
 928             }
 929           else
 930             signal_simple_error ("Unrecognized property", key);
 931         }
 932       else if (EQ (type, Qccl))
 933         {
 934           if (EQ (key, Qdecode))
 935             {
 936               CHECK_VECTOR (value);
 937               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 938             }
 939           else if (EQ (key, Qencode))
 940             {
 941               CHECK_VECTOR (value);
 942               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 943             }
 944           else
 945             signal_simple_error ("Unrecognized property", key);
 946         }
 947 #endif /* MULE */
 948       else
 949         signal_simple_error ("Unrecognized property", key);
 950     }
 951
 952   if (need_to_setup_eol_systems)
 953     setup_eol_coding_systems (codesys);
 954
 955   {
 956     Lisp_Object codesys_obj;
 957     XSETCODING_SYSTEM (codesys_obj, codesys);
 958     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 959     return codesys_obj;
 960   }
 961 }
 962
 963 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 964 Copy OLD-CODING-SYSTEM to NEW-NAME.
 965 If NEW-NAME does not name an existing coding system, a new one will
 966 be created.
 967 */
 968        (old_coding_system, new_name))
 969 {
 970   Lisp_Object new_coding_system;
 971   old_coding_system = Fget_coding_system (old_coding_system);
 972   new_coding_system = Ffind_coding_system (new_name);
 973   if (NILP (new_coding_system))
 974     {
 975       XSETCODING_SYSTEM (new_coding_system,
 976                          allocate_coding_system
 977                          (XCODING_SYSTEM_TYPE (old_coding_system),
 978                           new_name));
 979       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 980     }
 981
 982   {
 983     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 984     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 985     memcpy (((char *) to  ) + sizeof (to->header),
 986             ((char *) from) + sizeof (from->header),
 987             sizeof (*from) - sizeof (from->header));
 988     to->name = new_name;
 989   }
 990   return new_coding_system;
 991 }
 992
 993 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
 994 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
 995 */
 996        (alias, coding_system))
 997 {
 998   CHECK_SYMBOL (alias);
 999   if (!NILP (Ffind_coding_system (alias)))
1000     signal_simple_error ("Symbol already names a coding system", alias);
1001   coding_system = Fget_coding_system (coding_system);
1002   Fputhash (alias, coding_system, Vcoding_system_hash_table);
1003
1004   /* Set up aliases for subsidiaries. */
1005   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1006     {
1007       Lisp_Object str;
1008       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1009 #define FROB(type, name)                                                        \
1010       do {                                                                      \
1011         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1012         if (!NILP (subsidiary))                                                 \
1013           Fdefine_coding_system_alias                                           \
1014             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1015       } while (0)
1016       FROB (LF,   "-unix");
1017       FROB (CRLF, "-dos");
1018       FROB (CR,   "-mac");
1019 #undef FROB
1020     }
1021   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1022      but it doesn't look intentional, so I'd rather return something
1023      meaningful or nothing at all. */
1024   return Qnil;
1025 }
1026
1027 static Lisp_Object
1028 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1029 {
1030   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1031   Lisp_Object new_coding_system;
1032
1033   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1034     return coding_system;
1035
1036   switch (type)
1037     {
1038     case EOL_AUTODETECT: return coding_system;
1039     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1040     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1041     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1042     default:       abort ();
1043     }
1044
1045   return NILP (new_coding_system) ? coding_system : new_coding_system;
1046 }
1047
1048 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1049 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1050 */
1051        (coding_system, eol_type))
1052 {
1053   coding_system = Fget_coding_system (coding_system);
1054
1055   return subsidiary_coding_system (coding_system,
1056                                    symbol_to_eol_type (eol_type));
1057 }
1058
1059 \f
1060 /************************************************************************/
1061 /*                         Coding system accessors                      */
1062 /************************************************************************/
1063
1064 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1065 Return the doc string for CODING-SYSTEM.
1066 */
1067        (coding_system))
1068 {
1069   coding_system = Fget_coding_system (coding_system);
1070   return XCODING_SYSTEM_DOC_STRING (coding_system);
1071 }
1072
1073 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1074 Return the type of CODING-SYSTEM.
1075 */
1076        (coding_system))
1077 {
1078   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1079     {
1080     default: abort ();
1081     case CODESYS_AUTODETECT:    return Qundecided;
1082 #ifdef MULE
1083     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1084     case CODESYS_ISO2022:       return Qiso2022;
1085     case CODESYS_BIG5:          return Qbig5;
1086     case CODESYS_UCS4:          return Qucs4;
1087     case CODESYS_UTF8:          return Qutf8;
1088     case CODESYS_CCL:           return Qccl;
1089 #endif
1090     case CODESYS_NO_CONVERSION: return Qno_conversion;
1091 #ifdef DEBUG_XEMACS
1092     case CODESYS_INTERNAL:      return Qinternal;
1093 #endif
1094     }
1095 }
1096
1097 #ifdef MULE
1098 static
1099 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1100 {
1101   Lisp_Object cs
1102     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1103
1104   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1105 }
1106
1107 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1108 Return initial charset of CODING-SYSTEM designated to GNUM.
1109 GNUM allows 0 .. 3.
1110 */
1111        (coding_system, gnum))
1112 {
1113   coding_system = Fget_coding_system (coding_system);
1114   CHECK_INT (gnum);
1115
1116   return coding_system_charset (coding_system, XINT (gnum));
1117 }
1118 #endif /* MULE */
1119
1120 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1121 Return the PROP property of CODING-SYSTEM.
1122 */
1123        (coding_system, prop))
1124 {
1125   int i, ok = 0;
1126   enum coding_system_type type;
1127
1128   coding_system = Fget_coding_system (coding_system);
1129   CHECK_SYMBOL (prop);
1130   type = XCODING_SYSTEM_TYPE (coding_system);
1131
1132   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1133     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1134       {
1135         ok = 1;
1136         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1137           {
1138           case CODESYS_PROP_ALL_OK:
1139             break;
1140 #ifdef MULE
1141           case CODESYS_PROP_ISO2022:
1142             if (type != CODESYS_ISO2022)
1143               signal_simple_error
1144                 ("Property only valid in ISO2022 coding systems",
1145                  prop);
1146             break;
1147
1148           case CODESYS_PROP_CCL:
1149             if (type != CODESYS_CCL)
1150               signal_simple_error
1151                 ("Property only valid in CCL coding systems",
1152                  prop);
1153             break;
1154 #endif /* MULE */
1155           default:
1156             abort ();
1157           }
1158       }
1159
1160   if (!ok)
1161     signal_simple_error ("Unrecognized property", prop);
1162
1163   if (EQ (prop, Qname))
1164     return XCODING_SYSTEM_NAME (coding_system);
1165   else if (EQ (prop, Qtype))
1166     return Fcoding_system_type (coding_system);
1167   else if (EQ (prop, Qdoc_string))
1168     return XCODING_SYSTEM_DOC_STRING (coding_system);
1169   else if (EQ (prop, Qmnemonic))
1170     return XCODING_SYSTEM_MNEMONIC (coding_system);
1171   else if (EQ (prop, Qeol_type))
1172     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1173   else if (EQ (prop, Qeol_lf))
1174     return XCODING_SYSTEM_EOL_LF (coding_system);
1175   else if (EQ (prop, Qeol_crlf))
1176     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1177   else if (EQ (prop, Qeol_cr))
1178     return XCODING_SYSTEM_EOL_CR (coding_system);
1179   else if (EQ (prop, Qpost_read_conversion))
1180     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1181   else if (EQ (prop, Qpre_write_conversion))
1182     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1183 #ifdef MULE
1184   else if (type == CODESYS_ISO2022)
1185     {
1186       if (EQ (prop, Qcharset_g0))
1187         return coding_system_charset (coding_system, 0);
1188       else if (EQ (prop, Qcharset_g1))
1189         return coding_system_charset (coding_system, 1);
1190       else if (EQ (prop, Qcharset_g2))
1191         return coding_system_charset (coding_system, 2);
1192       else if (EQ (prop, Qcharset_g3))
1193         return coding_system_charset (coding_system, 3);
1194
1195 #define FORCE_CHARSET(charset_num) \
1196   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1197    (coding_system, charset_num) ? Qt : Qnil)
1198
1199       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1200       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1201       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1202       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1203
1204 #define LISP_BOOLEAN(prop) \
1205   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1206
1207       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1208       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1209       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1210       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1211       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1212       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1213       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1214
1215       else if (EQ (prop, Qinput_charset_conversion))
1216         return
1217           unparse_charset_conversion_specs
1218             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1219       else if (EQ (prop, Qoutput_charset_conversion))
1220         return
1221           unparse_charset_conversion_specs
1222             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1223       else
1224         abort ();
1225     }
1226   else if (type == CODESYS_CCL)
1227     {
1228       if (EQ (prop, Qdecode))
1229         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1230       else if (EQ (prop, Qencode))
1231         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1232       else
1233         abort ();
1234     }
1235 #endif /* MULE */
1236   else
1237     abort ();
1238
1239   return Qnil; /* not reached */
1240 }
1241
1242 \f
1243 /************************************************************************/
1244 /*                       Coding category functions                      */
1245 /************************************************************************/
1246
1247 static int
1248 decode_coding_category (Lisp_Object symbol)
1249 {
1250   int i;
1251
1252   CHECK_SYMBOL (symbol);
1253   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1254     if (EQ (coding_category_symbol[i], symbol))
1255       return i;
1256
1257   signal_simple_error ("Unrecognized coding category", symbol);
1258   return 0; /* not reached */
1259 }
1260
1261 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1262 Return a list of all recognized coding categories.
1263 */
1264        ())
1265 {
1266   int i;
1267   Lisp_Object list = Qnil;
1268
1269   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1270     list = Fcons (coding_category_symbol[i], list);
1271   return list;
1272 }
1273
1274 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1275 Change the priority order of the coding categories.
1276 LIST should be list of coding categories, in descending order of
1277 priority.  Unspecified coding categories will be lower in priority
1278 than all specified ones, in the same relative order they were in
1279 previously.
1280 */
1281        (list))
1282 {
1283   int category_to_priority[CODING_CATEGORY_LAST + 1];
1284   int i, j;
1285   Lisp_Object rest;
1286
1287   /* First generate a list that maps coding categories to priorities. */
1288
1289   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1290     category_to_priority[i] = -1;
1291
1292   /* Highest priority comes from the specified list. */
1293   i = 0;
1294   EXTERNAL_LIST_LOOP (rest, list)
1295     {
1296       int cat = decode_coding_category (XCAR (rest));
1297
1298       if (category_to_priority[cat] >= 0)
1299         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1300       category_to_priority[cat] = i++;
1301     }
1302
1303   /* Now go through the existing categories by priority to retrieve
1304      the categories not yet specified and preserve their priority
1305      order. */
1306   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1307     {
1308       int cat = coding_category_by_priority[j];
1309       if (category_to_priority[cat] < 0)
1310         category_to_priority[cat] = i++;
1311     }
1312
1313   /* Now we need to construct the inverse of the mapping we just
1314      constructed. */
1315
1316   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1317     coding_category_by_priority[category_to_priority[i]] = i;
1318
1319   /* Phew!  That was confusing. */
1320   return Qnil;
1321 }
1322
1323 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1324 Return a list of coding categories in descending order of priority.
1325 */
1326        ())
1327 {
1328   int i;
1329   Lisp_Object list = Qnil;
1330
1331   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1332     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1333                   list);
1334   return list;
1335 }
1336
1337 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1338 Change the coding system associated with a coding category.
1339 */
1340        (coding_category, coding_system))
1341 {
1342   int cat = decode_coding_category (coding_category);
1343
1344   coding_system = Fget_coding_system (coding_system);
1345   coding_category_system[cat] = coding_system;
1346   return Qnil;
1347 }
1348
1349 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1350 Return the coding system associated with a coding category.
1351 */
1352        (coding_category))
1353 {
1354   int cat = decode_coding_category (coding_category);
1355   Lisp_Object sys = coding_category_system[cat];
1356
1357   if (!NILP (sys))
1358     return XCODING_SYSTEM_NAME (sys);
1359   return Qnil;
1360 }
1361
1362 \f
1363 /************************************************************************/
1364 /*                     Detecting the encoding of data                   */
1365 /************************************************************************/
1366
1367 struct detection_state
1368 {
1369   enum eol_type eol_type;
1370   int seen_non_ascii;
1371   int mask;
1372 #ifdef MULE
1373   struct
1374     {
1375       int mask;
1376       int in_second_byte;
1377     }
1378   big5;
1379
1380   struct
1381     {
1382       int mask;
1383       int in_second_byte;
1384     }
1385   shift_jis;
1386
1387   struct
1388     {
1389       int mask;
1390       int in_byte;
1391   }
1392   ucs4;
1393
1394   struct
1395     {
1396       int mask;
1397       int in_byte;
1398     }
1399   utf8;
1400
1401   struct
1402     {
1403       int mask;
1404       int initted;
1405       struct iso2022_decoder iso;
1406       unsigned int flags;
1407       int high_byte_count;
1408       unsigned int saw_single_shift:1;
1409     }
1410   iso2022;
1411 #endif
1412   struct
1413     {
1414       int seen_anything;
1415       int just_saw_cr;
1416     }
1417   eol;
1418 };
1419
1420 static int
1421 acceptable_control_char_p (int c)
1422 {
1423   switch (c)
1424     {
1425       /* Allow and ignore control characters that you might
1426          reasonably see in a text file */
1427     case '\r':
1428     case '\n':
1429     case '\t':
1430     case  7: /* bell */
1431     case  8: /* backspace */
1432     case 11: /* vertical tab */
1433     case 12: /* form feed */
1434     case 26: /* MS-DOS C-z junk */
1435     case 31: /* '^_' -- for info */
1436       return 1;
1437     default:
1438       return 0;
1439     }
1440 }
1441
1442 static int
1443 mask_has_at_most_one_bit_p (int mask)
1444 {
1445   /* Perhaps the only thing useful you learn from intensive Microsoft
1446      technical interviews */
1447   return (mask & (mask - 1)) == 0;
1448 }
1449
1450 static enum eol_type
1451 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1452                  unsigned int n)
1453 {
1454   int c;
1455
1456   while (n--)
1457     {
1458       c = *src++;
1459       if (c == '\r')
1460         st->eol.just_saw_cr = 1;
1461       else
1462         {
1463           if (c == '\n')
1464             {
1465               if (st->eol.just_saw_cr)
1466                 return EOL_CRLF;
1467               else if (st->eol.seen_anything)
1468                 return EOL_LF;
1469             }
1470           else if (st->eol.just_saw_cr)
1471             return EOL_CR;
1472           st->eol.just_saw_cr = 0;
1473         }
1474       st->eol.seen_anything = 1;
1475     }
1476
1477   return EOL_AUTODETECT;
1478 }
1479
1480 /* Attempt to determine the encoding and EOL type of the given text.
1481    Before calling this function for the first type, you must initialize
1482    st->eol_type as appropriate and initialize st->mask to ~0.
1483
1484    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1485    not yet known.
1486
1487    st->mask holds the determined coding category mask, or ~0 if only
1488    ASCII has been seen so far.
1489
1490    Returns:
1491
1492    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1493         is present in st->mask
1494    1 == definitive answers are here for both st->eol_type and st->mask
1495 */
1496
1497 static int
1498 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1499                     unsigned int n, int just_do_eol)
1500 {
1501   int c;
1502
1503   if (st->eol_type == EOL_AUTODETECT)
1504     st->eol_type = detect_eol_type (st, src, n);
1505
1506   if (just_do_eol)
1507     return st->eol_type != EOL_AUTODETECT;
1508
1509   if (!st->seen_non_ascii)
1510     {
1511       for (; n; n--, src++)
1512         {
1513           c = *src;
1514           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1515             {
1516               st->seen_non_ascii = 1;
1517 #ifdef MULE
1518               st->shift_jis.mask = ~0;
1519               st->big5.mask = ~0;
1520               st->ucs4.mask = ~0;
1521               st->utf8.mask = ~0;
1522               st->iso2022.mask = ~0;
1523 #endif
1524               break;
1525             }
1526         }
1527     }
1528
1529   if (!n)
1530     return 0;
1531 #ifdef MULE
1532   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1533     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1534   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1535     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1536   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1537     st->big5.mask = detect_coding_big5 (st, src, n);
1538   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1539     st->utf8.mask = detect_coding_utf8 (st, src, n);
1540   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1541     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1542
1543   st->mask
1544     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1545     | st->utf8.mask | st->ucs4.mask;
1546 #endif
1547   {
1548     int retval = mask_has_at_most_one_bit_p (st->mask);
1549     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1550     return retval && st->eol_type != EOL_AUTODETECT;
1551   }
1552 }
1553
1554 static Lisp_Object
1555 coding_system_from_mask (int mask)
1556 {
1557   if (mask == ~0)
1558     {
1559       /* If the file was entirely or basically ASCII, use the
1560          default value of `buffer-file-coding-system'. */
1561       Lisp_Object retval =
1562         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1563       if (!NILP (retval))
1564         {
1565           retval = Ffind_coding_system (retval);
1566           if (NILP (retval))
1567             {
1568               warn_when_safe
1569                 (Qbad_variable, Qwarning,
1570                  "Invalid `default-buffer-file-coding-system', set to nil");
1571               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1572             }
1573         }
1574       if (NILP (retval))
1575         retval = Fget_coding_system (Qraw_text);
1576       return retval;
1577     }
1578   else
1579     {
1580       int i;
1581       int cat = -1;
1582 #ifdef MULE
1583       mask = postprocess_iso2022_mask (mask);
1584 #endif
1585       /* Look through the coding categories by priority and find
1586          the first one that is allowed. */
1587       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1588         {
1589           cat = coding_category_by_priority[i];
1590           if ((mask & (1 << cat)) &&
1591               !NILP (coding_category_system[cat]))
1592             break;
1593         }
1594       if (cat >= 0)
1595         return coding_category_system[cat];
1596       else
1597         return Fget_coding_system (Qraw_text);
1598     }
1599 }
1600
1601 /* Given a seekable read stream and potential coding system and EOL type
1602    as specified, do any autodetection that is called for.  If the
1603    coding system and/or EOL type are not autodetect, they will be left
1604    alone; but this function will never return an autodetect coding system
1605    or EOL type.
1606
1607    This function does not automatically fetch subsidiary coding systems;
1608    that should be unnecessary with the explicit eol-type argument. */
1609
1610 void
1611 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1612                               enum eol_type *eol_type_in_out)
1613 {
1614   struct detection_state decst;
1615
1616   if (*eol_type_in_out == EOL_AUTODETECT)
1617     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1618
1619   xzero (decst);
1620   decst.eol_type = *eol_type_in_out;
1621   decst.mask = ~0;
1622
1623   /* If autodetection is called for, do it now. */
1624   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1625       *eol_type_in_out == EOL_AUTODETECT)
1626     {
1627       unsigned char random_buffer[4096];
1628       int nread;
1629       Lisp_Object coding_system = Qnil;
1630
1631       nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1632       if (nread)
1633         {
1634           unsigned char *cp = random_buffer;
1635
1636           while (cp < random_buffer + nread)
1637             {
1638               if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1639                   (*cp++ == 'o') && (cp < random_buffer + nread) &&
1640                   (*cp++ == 'd') && (cp < random_buffer + nread) &&
1641                   (*cp++ == 'i') && (cp < random_buffer + nread) &&
1642                   (*cp++ == 'n') && (cp < random_buffer + nread) &&
1643                   (*cp++ == 'g') && (cp < random_buffer + nread) &&
1644                   (*cp++ == ':') && (cp < random_buffer + nread))
1645                 {
1646                   unsigned char coding_system_name[4096 - 6];
1647                   unsigned char *np = coding_system_name;
1648
1649                   while ( (cp < random_buffer + nread)
1650                           && ((*cp == ' ') || (*cp == '\t')) )
1651                     {
1652                       cp++;
1653                     }
1654                   while ( (cp < random_buffer + nread) &&
1655                           (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1656                     {
1657                       *np++ = *cp++;
1658                     }
1659                   *np = 0;
1660                   coding_system
1661                     = Ffind_coding_system (intern (coding_system_name));
1662                   break;
1663                 }
1664             }
1665           if (EQ(coding_system, Qnil))
1666             do{
1667               if (detect_coding_type (&decst, random_buffer, nread,
1668                                       XCODING_SYSTEM_TYPE (*codesys_in_out)
1669                                       != CODESYS_AUTODETECT))
1670                 break;
1671               nread = Lstream_read (stream,
1672                                     random_buffer, sizeof (random_buffer));
1673               if (!nread)
1674                 break;
1675             } while(1);
1676         }
1677       *eol_type_in_out = decst.eol_type;
1678       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1679         {
1680           if (EQ(coding_system, Qnil))
1681             *codesys_in_out = coding_system_from_mask (decst.mask);
1682           else
1683             *codesys_in_out = coding_system;
1684         }
1685     }
1686   /* If we absolutely can't determine the EOL type, just assume LF. */
1687   if (*eol_type_in_out == EOL_AUTODETECT)
1688     *eol_type_in_out = EOL_LF;
1689
1690   Lstream_rewind (stream);
1691 }
1692
1693 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1694 Detect coding system of the text in the region between START and END.
1695 Returned a list of possible coding systems ordered by priority.
1696 If only ASCII characters are found, it returns 'undecided or one of
1697 its subsidiary coding systems according to a detected end-of-line
1698 type.  Optional arg BUFFER defaults to the current buffer.
1699 */
1700        (start, end, buffer))
1701 {
1702   Lisp_Object val = Qnil;
1703   struct buffer *buf = decode_buffer (buffer, 0);
1704   Bufpos b, e;
1705   Lisp_Object instream, lb_instream;
1706   Lstream *istr, *lb_istr;
1707   struct detection_state decst;
1708   struct gcpro gcpro1, gcpro2;
1709
1710   get_buffer_range_char (buf, start, end, &b, &e, 0);
1711   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1712   lb_istr = XLSTREAM (lb_instream);
1713   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1714   istr = XLSTREAM (instream);
1715   GCPRO2 (instream, lb_instream);
1716   xzero (decst);
1717   decst.eol_type = EOL_AUTODETECT;
1718   decst.mask = ~0;
1719   while (1)
1720     {
1721       unsigned char random_buffer[4096];
1722       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1723
1724       if (!nread)
1725         break;
1726       if (detect_coding_type (&decst, random_buffer, nread, 0))
1727         break;
1728     }
1729
1730   if (decst.mask == ~0)
1731     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1732                                     decst.eol_type);
1733   else
1734     {
1735       int i;
1736
1737       val = Qnil;
1738 #ifdef MULE
1739       decst.mask = postprocess_iso2022_mask (decst.mask);
1740 #endif
1741       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1742         {
1743           int sys = coding_category_by_priority[i];
1744           if (decst.mask & (1 << sys))
1745             {
1746               Lisp_Object codesys = coding_category_system[sys];
1747               if (!NILP (codesys))
1748                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1749               val = Fcons (codesys, val);
1750             }
1751         }
1752     }
1753   Lstream_close (istr);
1754   UNGCPRO;
1755   Lstream_delete (istr);
1756   Lstream_delete (lb_istr);
1757   return val;
1758 }
1759
1760 \f
1761 /************************************************************************/
1762 /*           Converting to internal Mule format ("decoding")            */
1763 /************************************************************************/
1764
1765 /* A decoding stream is a stream used for decoding text (i.e.
1766    converting from some external format to internal format).
1767    The decoding-stream object keeps track of the actual coding
1768    stream, the stream that is at the other end, and data that
1769    needs to be persistent across the lifetime of the stream. */
1770
1771 /* Handle the EOL stuff related to just-read-in character C.
1772    EOL_TYPE is the EOL type of the coding stream.
1773    FLAGS is the current value of FLAGS in the coding stream, and may
1774    be modified by this macro.  (The macro only looks at the
1775    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1776    bytes are to be written.  You need to also define a local goto
1777    label "label_continue_loop" that is at the end of the main
1778    character-reading loop.
1779
1780    If C is a CR character, then this macro handles it entirely and
1781    jumps to label_continue_loop.  Otherwise, this macro does not add
1782    anything to DST, and continues normally.  You should continue
1783    processing C normally after this macro. */
1784
1785 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1786 do {                                                            \
1787   if (c == '\r')                                                \
1788     {                                                           \
1789       if (eol_type == EOL_CR)                                   \
1790         Dynarr_add (dst, '\n');                                 \
1791       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1792         Dynarr_add (dst, c);                                    \
1793       else                                                      \
1794         flags |= CODING_STATE_CR;                               \
1795       goto label_continue_loop;                                 \
1796     }                                                           \
1797   else if (flags & CODING_STATE_CR)                             \
1798     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1799       if (c != '\n')                                            \
1800         Dynarr_add (dst, '\r');                                 \
1801       flags &= ~CODING_STATE_CR;                                \
1802     }                                                           \
1803 } while (0)
1804
1805 /* C should be a binary character in the range 0 - 255; convert
1806    to internal format and add to Dynarr DST. */
1807
1808 #ifdef UTF2000
1809 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1810 do {                                            \
1811   if (BYTE_ASCII_P (c))                         \
1812     Dynarr_add (dst, c);                        \
1813   else                                          \
1814     {                                           \
1815       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1816       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1817     }                                           \
1818 } while (0)
1819
1820 INLINE void
1821 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1822 {
1823   if ( c <= 0x7f )
1824     {
1825       Dynarr_add (dst, c);
1826     }
1827   else if ( c <= 0x7ff )
1828     {
1829       Dynarr_add (dst, (c >> 6) | 0xc0);
1830       Dynarr_add (dst, (c & 0x3f) | 0x80);
1831     }
1832   else if ( c <= 0xffff )
1833     {
1834       Dynarr_add (dst,  (c >> 12) | 0xe0);
1835       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1836       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1837     }
1838   else if ( c <= 0x1fffff )
1839     {
1840       Dynarr_add (dst,  (c >> 18) | 0xf0);
1841       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1842       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1843       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1844     }
1845   else if ( c <= 0x3ffffff )
1846     {
1847       Dynarr_add (dst,  (c >> 24) | 0xf8);
1848       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1849       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1850       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1851       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1852     }
1853   else
1854     {
1855       Dynarr_add (dst,  (c >> 30) | 0xfc);
1856       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1857       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1858       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1859       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1860       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1861     }
1862 }
1863 #else
1864 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1865 do {                                            \
1866   if (BYTE_ASCII_P (c))                         \
1867     Dynarr_add (dst, c);                        \
1868   else if (BYTE_C1_P (c))                       \
1869     {                                           \
1870       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1871       Dynarr_add (dst, c + 0x20);               \
1872     }                                           \
1873   else                                          \
1874     {                                           \
1875       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1876       Dynarr_add (dst, c);                      \
1877     }                                           \
1878 } while (0)
1879 #endif
1880
1881 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1882 do {                                    \
1883   if (ch)                               \
1884     {                                   \
1885       DECODE_ADD_BINARY_CHAR (ch, dst); \
1886       ch = 0;                           \
1887     }                                   \
1888 } while (0)
1889
1890 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1891 do {                                    \
1892   if (flags & CODING_STATE_END)         \
1893     {                                   \
1894       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1895       if (flags & CODING_STATE_CR)      \
1896         Dynarr_add (dst, '\r');         \
1897     }                                   \
1898 } while (0)
1899
1900 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1901
1902 struct decoding_stream
1903 {
1904   /* Coding system that governs the conversion. */
1905   Lisp_Coding_System *codesys;
1906
1907   /* Stream that we read the encoded data from or
1908      write the decoded data to. */
1909   Lstream *other_end;
1910
1911   /* If we are reading, then we can return only a fixed amount of
1912      data, so if the conversion resulted in too much data, we store it
1913      here for retrieval the next time around. */
1914   unsigned_char_dynarr *runoff;
1915
1916   /* FLAGS holds flags indicating the current state of the decoding.
1917      Some of these flags are dependent on the coding system. */
1918   unsigned int flags;
1919
1920   /* CH holds a partially built-up character.  Since we only deal
1921      with one- and two-byte characters at the moment, we only use
1922      this to store the first byte of a two-byte character. */
1923   unsigned int ch;
1924
1925   /* EOL_TYPE specifies the type of end-of-line conversion that
1926      currently applies.  We need to keep this separate from the
1927      EOL type stored in CODESYS because the latter might indicate
1928      automatic EOL-type detection while the former will always
1929      indicate a particular EOL type. */
1930   enum eol_type eol_type;
1931 #ifdef MULE
1932   /* Additional ISO2022 information.  We define the structure above
1933      because it's also needed by the detection routines. */
1934   struct iso2022_decoder iso2022;
1935
1936   /* Additional information (the state of the running CCL program)
1937      used by the CCL decoder. */
1938   struct ccl_program ccl;
1939
1940   /* counter for UTF-8 or UCS-4 */
1941   unsigned char counter;
1942 #endif
1943   struct detection_state decst;
1944 };
1945
1946 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1947 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1948 static int decoding_rewinder   (Lstream *stream);
1949 static int decoding_seekable_p (Lstream *stream);
1950 static int decoding_flusher    (Lstream *stream);
1951 static int decoding_closer     (Lstream *stream);
1952
1953 static Lisp_Object decoding_marker (Lisp_Object stream,
1954                                     void (*markobj) (Lisp_Object));
1955
1956 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1957                                sizeof (struct decoding_stream));
1958
1959 static Lisp_Object
1960 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1961 {
1962   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1963   Lisp_Object str_obj;
1964
1965   /* We do not need to mark the coding systems or charsets stored
1966      within the stream because they are stored in a global list
1967      and automatically marked. */
1968
1969   XSETLSTREAM (str_obj, str);
1970   markobj (str_obj);
1971   if (str->imp->marker)
1972     return (str->imp->marker) (str_obj, markobj);
1973   else
1974     return Qnil;
1975 }
1976
1977 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1978    so we read data from the other end, decode it, and store it into DATA. */
1979
1980 static int
1981 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1982 {
1983   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1984   unsigned char *orig_data = data;
1985   int read_size;
1986   int error_occurred = 0;
1987
1988   /* We need to interface to mule_decode(), which expects to take some
1989      amount of data and store the result into a Dynarr.  We have
1990      mule_decode() store into str->runoff, and take data from there
1991      as necessary. */
1992
1993   /* We loop until we have enough data, reading chunks from the other
1994      end and decoding it. */
1995   while (1)
1996     {
1997       /* Take data from the runoff if we can.  Make sure to take at
1998          most SIZE bytes, and delete the data from the runoff. */
1999       if (Dynarr_length (str->runoff) > 0)
2000         {
2001           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2002           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2003           Dynarr_delete_many (str->runoff, 0, chunk);
2004           data += chunk;
2005           size -= chunk;
2006         }
2007
2008       if (size == 0)
2009         break; /* No more room for data */
2010
2011       if (str->flags & CODING_STATE_END)
2012         /* This means that on the previous iteration, we hit the EOF on
2013            the other end.  We loop once more so that mule_decode() can
2014            output any final stuff it may be holding, or any "go back
2015            to a sane state" escape sequences. (This latter makes sense
2016            during encoding.) */
2017         break;
2018
2019       /* Exhausted the runoff, so get some more.  DATA has at least
2020          SIZE bytes left of storage in it, so it's OK to read directly
2021          into it.  (We'll be overwriting above, after we've decoded it
2022          into the runoff.) */
2023       read_size = Lstream_read (str->other_end, data, size);
2024       if (read_size < 0)
2025         {
2026           error_occurred = 1;
2027           break;
2028         }
2029       if (read_size == 0)
2030         /* There might be some more end data produced in the translation.
2031            See the comment above. */
2032         str->flags |= CODING_STATE_END;
2033       mule_decode (stream, data, str->runoff, read_size);
2034     }
2035
2036   if (data - orig_data == 0)
2037     return error_occurred ? -1 : 0;
2038   else
2039     return data - orig_data;
2040 }
2041
2042 static int
2043 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2044 {
2045   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2046   int retval;
2047
2048   /* Decode all our data into the runoff, and then attempt to write
2049      it all out to the other end.  Remove whatever chunk we succeeded
2050      in writing. */
2051   mule_decode (stream, data, str->runoff, size);
2052   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2053                           Dynarr_length (str->runoff));
2054   if (retval > 0)
2055     Dynarr_delete_many (str->runoff, 0, retval);
2056   /* Do NOT return retval.  The return value indicates how much
2057      of the incoming data was written, not how many bytes were
2058      written. */
2059   return size;
2060 }
2061
2062 static void
2063 reset_decoding_stream (struct decoding_stream *str)
2064 {
2065 #ifdef MULE
2066   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2067     {
2068       Lisp_Object coding_system;
2069       XSETCODING_SYSTEM (coding_system, str->codesys);
2070       reset_iso2022 (coding_system, &str->iso2022);
2071     }
2072   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2073     {
2074       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2075     }
2076   str->counter = 0;
2077 #endif /* MULE */
2078   str->flags = str->ch = 0;
2079 }
2080
2081 static int
2082 decoding_rewinder (Lstream *stream)
2083 {
2084   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2085   reset_decoding_stream (str);
2086   Dynarr_reset (str->runoff);
2087   return Lstream_rewind (str->other_end);
2088 }
2089
2090 static int
2091 decoding_seekable_p (Lstream *stream)
2092 {
2093   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2094   return Lstream_seekable_p (str->other_end);
2095 }
2096
2097 static int
2098 decoding_flusher (Lstream *stream)
2099 {
2100   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2101   return Lstream_flush (str->other_end);
2102 }
2103
2104 static int
2105 decoding_closer (Lstream *stream)
2106 {
2107   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2108   if (stream->flags & LSTREAM_FL_WRITE)
2109     {
2110       str->flags |= CODING_STATE_END;
2111       decoding_writer (stream, 0, 0);
2112     }
2113   Dynarr_free (str->runoff);
2114 #ifdef MULE
2115 #ifdef ENABLE_COMPOSITE_CHARS
2116   if (str->iso2022.composite_chars)
2117     Dynarr_free (str->iso2022.composite_chars);
2118 #endif
2119 #endif
2120   return Lstream_close (str->other_end);
2121 }
2122
2123 Lisp_Object
2124 decoding_stream_coding_system (Lstream *stream)
2125 {
2126   Lisp_Object coding_system;
2127   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2128
2129   XSETCODING_SYSTEM (coding_system, str->codesys);
2130   return subsidiary_coding_system (coding_system, str->eol_type);
2131 }
2132
2133 void
2134 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2135 {
2136   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2137   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2138   str->codesys = cs;
2139   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2140     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2141   reset_decoding_stream (str);
2142 }
2143
2144 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2145    stream for writing, no automatic code detection will be performed.
2146    The reason for this is that automatic code detection requires a
2147    seekable input.  Things will also fail if you open a decoding
2148    stream for reading using a non-fully-specified coding system and
2149    a non-seekable input stream. */
2150
2151 static Lisp_Object
2152 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2153                         CONST char *mode)
2154 {
2155   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2156   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2157   Lisp_Object obj;
2158
2159   xzero (*str);
2160   str->other_end = stream;
2161   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2162   str->eol_type = EOL_AUTODETECT;
2163   if (!strcmp (mode, "r")
2164       && Lstream_seekable_p (stream))
2165     /* We can determine the coding system now. */
2166     determine_real_coding_system (stream, &codesys, &str->eol_type);
2167   set_decoding_stream_coding_system (lstr, codesys);
2168   str->decst.eol_type = str->eol_type;
2169   str->decst.mask = ~0;
2170   XSETLSTREAM (obj, lstr);
2171   return obj;
2172 }
2173
2174 Lisp_Object
2175 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2176 {
2177   return make_decoding_stream_1 (stream, codesys, "r");
2178 }
2179
2180 Lisp_Object
2181 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2182 {
2183   return make_decoding_stream_1 (stream, codesys, "w");
2184 }
2185
2186 /* Note: the decode_coding_* functions all take the same
2187    arguments as mule_decode(), which is to say some SRC data of
2188    size N, which is to be stored into dynamic array DST.
2189    DECODING is the stream within which the decoding is
2190    taking place, but no data is actually read from or
2191    written to that stream; that is handled in decoding_reader()
2192    or decoding_writer().  This allows the same functions to
2193    be used for both reading and writing. */
2194
2195 static void
2196 mule_decode (Lstream *decoding, CONST unsigned char *src,
2197              unsigned_char_dynarr *dst, unsigned int n)
2198 {
2199   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2200
2201   /* If necessary, do encoding-detection now.  We do this when
2202      we're a writing stream or a non-seekable reading stream,
2203      meaning that we can't just process the whole input,
2204      rewind, and start over. */
2205
2206   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2207       str->eol_type == EOL_AUTODETECT)
2208     {
2209       Lisp_Object codesys;
2210
2211       XSETCODING_SYSTEM (codesys, str->codesys);
2212       detect_coding_type (&str->decst, src, n,
2213                           CODING_SYSTEM_TYPE (str->codesys) !=
2214                           CODESYS_AUTODETECT);
2215       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2216           str->decst.mask != ~0)
2217         /* #### This is cheesy.  What we really ought to do is
2218            buffer up a certain amount of data so as to get a
2219            less random result. */
2220         codesys = coding_system_from_mask (str->decst.mask);
2221       str->eol_type = str->decst.eol_type;
2222       if (XCODING_SYSTEM (codesys) != str->codesys)
2223         {
2224           /* Preserve the CODING_STATE_END flag in case it was set.
2225              If we erase it, bad things might happen. */
2226           int was_end = str->flags & CODING_STATE_END;
2227           set_decoding_stream_coding_system (decoding, codesys);
2228           if (was_end)
2229             str->flags |= CODING_STATE_END;
2230         }
2231     }
2232
2233   switch (CODING_SYSTEM_TYPE (str->codesys))
2234     {
2235 #ifdef DEBUG_XEMACS
2236     case CODESYS_INTERNAL:
2237       Dynarr_add_many (dst, src, n);
2238       break;
2239 #endif
2240     case CODESYS_AUTODETECT:
2241       /* If we got this far and still haven't decided on the coding
2242          system, then do no conversion. */
2243     case CODESYS_NO_CONVERSION:
2244       decode_coding_no_conversion (decoding, src, dst, n);
2245       break;
2246 #ifdef MULE
2247     case CODESYS_SHIFT_JIS:
2248       decode_coding_sjis (decoding, src, dst, n);
2249       break;
2250     case CODESYS_BIG5:
2251       decode_coding_big5 (decoding, src, dst, n);
2252       break;
2253     case CODESYS_UCS4:
2254       decode_coding_ucs4 (decoding, src, dst, n);
2255       break;
2256     case CODESYS_UTF8:
2257       decode_coding_utf8 (decoding, src, dst, n);
2258       break;
2259     case CODESYS_CCL:
2260       str->ccl.last_block = str->flags & CODING_STATE_END;
2261       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2262       break;
2263     case CODESYS_ISO2022:
2264       decode_coding_iso2022 (decoding, src, dst, n);
2265       break;
2266 #endif /* MULE */
2267     default:
2268       abort ();
2269     }
2270 }
2271
2272 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2273 Decode the text between START and END which is encoded in CODING-SYSTEM.
2274 This is useful if you've read in encoded text from a file without decoding
2275 it (e.g. you read in a JIS-formatted file but used the `binary' or
2276 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2277 Return length of decoded text.
2278 BUFFER defaults to the current buffer if unspecified.
2279 */
2280        (start, end, coding_system, buffer))
2281 {
2282   Bufpos b, e;
2283   struct buffer *buf = decode_buffer (buffer, 0);
2284   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2285   Lstream *istr, *ostr;
2286   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2287
2288   get_buffer_range_char (buf, start, end, &b, &e, 0);
2289
2290   barf_if_buffer_read_only (buf, b, e);
2291
2292   coding_system = Fget_coding_system (coding_system);
2293   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2294   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2295   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2296                                               coding_system);
2297   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2298                                            Fget_coding_system (Qbinary));
2299   istr = XLSTREAM (instream);
2300   ostr = XLSTREAM (outstream);
2301   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2302
2303   /* The chain of streams looks like this:
2304
2305      [BUFFER] <----- send through
2306                      ------> [ENCODE AS BINARY]
2307                              ------> [DECODE AS SPECIFIED]
2308                                      ------> [BUFFER]
2309    */
2310
2311   while (1)
2312     {
2313       char tempbuf[1024]; /* some random amount */
2314       Bufpos newpos, even_newer_pos;
2315       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2316       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2317
2318       if (!size_in_bytes)
2319         break;
2320       newpos = lisp_buffer_stream_startpos (istr);
2321       Lstream_write (ostr, tempbuf, size_in_bytes);
2322       even_newer_pos = lisp_buffer_stream_startpos (istr);
2323       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2324                            even_newer_pos, 0);
2325     }
2326   Lstream_close (istr);
2327   Lstream_close (ostr);
2328   UNGCPRO;
2329   Lstream_delete (istr);
2330   Lstream_delete (ostr);
2331   Lstream_delete (XLSTREAM (de_outstream));
2332   Lstream_delete (XLSTREAM (lb_outstream));
2333   return Qnil;
2334 }
2335
2336 \f
2337 /************************************************************************/
2338 /*           Converting to an external encoding ("encoding")            */
2339 /************************************************************************/
2340
2341 /* An encoding stream is an output stream.  When you create the
2342    stream, you specify the coding system that governs the encoding
2343    and another stream that the resulting encoded data is to be
2344    sent to, and then start sending data to it. */
2345
2346 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2347
2348 struct encoding_stream
2349 {
2350   /* Coding system that governs the conversion. */
2351   Lisp_Coding_System *codesys;
2352
2353   /* Stream that we read the encoded data from or
2354      write the decoded data to. */
2355   Lstream *other_end;
2356
2357   /* If we are reading, then we can return only a fixed amount of
2358      data, so if the conversion resulted in too much data, we store it
2359      here for retrieval the next time around. */
2360   unsigned_char_dynarr *runoff;
2361
2362   /* FLAGS holds flags indicating the current state of the encoding.
2363      Some of these flags are dependent on the coding system. */
2364   unsigned int flags;
2365
2366   /* CH holds a partially built-up character.  Since we only deal
2367      with one- and two-byte characters at the moment, we only use
2368      this to store the first byte of a two-byte character. */
2369   unsigned int ch;
2370 #ifdef MULE
2371   /* Additional information used by the ISO2022 encoder. */
2372   struct
2373     {
2374       /* CHARSET holds the character sets currently assigned to the G0
2375          through G3 registers.  It is initialized from the array
2376          INITIAL_CHARSET in CODESYS. */
2377       Lisp_Object charset[4];
2378
2379       /* Which registers are currently invoked into the left (GL) and
2380          right (GR) halves of the 8-bit encoding space? */
2381       int register_left, register_right;
2382
2383       /* Whether we need to explicitly designate the charset in the
2384          G? register before using it.  It is initialized from the
2385          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2386       unsigned char force_charset_on_output[4];
2387
2388       /* Other state variables that need to be preserved across
2389          invocations. */
2390       Lisp_Object current_charset;
2391       int current_half;
2392       int current_char_boundary;
2393     } iso2022;
2394
2395   /* Additional information (the state of the running CCL program)
2396      used by the CCL encoder. */
2397   struct ccl_program ccl;
2398 #endif /* MULE */
2399 };
2400
2401 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2402 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2403                             size_t size);
2404 static int encoding_rewinder   (Lstream *stream);
2405 static int encoding_seekable_p (Lstream *stream);
2406 static int encoding_flusher    (Lstream *stream);
2407 static int encoding_closer     (Lstream *stream);
2408
2409 static Lisp_Object encoding_marker (Lisp_Object stream,
2410                                     void (*markobj) (Lisp_Object));
2411
2412 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2413                                sizeof (struct encoding_stream));
2414
2415 static Lisp_Object
2416 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2417 {
2418   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2419   Lisp_Object str_obj;
2420
2421   /* We do not need to mark the coding systems or charsets stored
2422      within the stream because they are stored in a global list
2423      and automatically marked. */
2424
2425   XSETLSTREAM (str_obj, str);
2426   markobj (str_obj);
2427   if (str->imp->marker)
2428     return (str->imp->marker) (str_obj, markobj);
2429   else
2430     return Qnil;
2431 }
2432
2433 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2434    so we read data from the other end, encode it, and store it into DATA. */
2435
2436 static int
2437 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2438 {
2439   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2440   unsigned char *orig_data = data;
2441   int read_size;
2442   int error_occurred = 0;
2443
2444   /* We need to interface to mule_encode(), which expects to take some
2445      amount of data and store the result into a Dynarr.  We have
2446      mule_encode() store into str->runoff, and take data from there
2447      as necessary. */
2448
2449   /* We loop until we have enough data, reading chunks from the other
2450      end and encoding it. */
2451   while (1)
2452     {
2453       /* Take data from the runoff if we can.  Make sure to take at
2454          most SIZE bytes, and delete the data from the runoff. */
2455       if (Dynarr_length (str->runoff) > 0)
2456         {
2457           int chunk = min ((int) size, Dynarr_length (str->runoff));
2458           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2459           Dynarr_delete_many (str->runoff, 0, chunk);
2460           data += chunk;
2461           size -= chunk;
2462         }
2463
2464       if (size == 0)
2465         break; /* No more room for data */
2466
2467       if (str->flags & CODING_STATE_END)
2468         /* This means that on the previous iteration, we hit the EOF on
2469            the other end.  We loop once more so that mule_encode() can
2470            output any final stuff it may be holding, or any "go back
2471            to a sane state" escape sequences. (This latter makes sense
2472            during encoding.) */
2473         break;
2474
2475       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2476          left of storage in it, so it's OK to read directly into it.
2477          (We'll be overwriting above, after we've encoded it into the
2478          runoff.) */
2479       read_size = Lstream_read (str->other_end, data, size);
2480       if (read_size < 0)
2481         {
2482           error_occurred = 1;
2483           break;
2484         }
2485       if (read_size == 0)
2486         /* There might be some more end data produced in the translation.
2487            See the comment above. */
2488         str->flags |= CODING_STATE_END;
2489       mule_encode (stream, data, str->runoff, read_size);
2490     }
2491
2492   if (data == orig_data)
2493     return error_occurred ? -1 : 0;
2494   else
2495     return data - orig_data;
2496 }
2497
2498 static int
2499 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2500 {
2501   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2502   int retval;
2503
2504   /* Encode all our data into the runoff, and then attempt to write
2505      it all out to the other end.  Remove whatever chunk we succeeded
2506      in writing. */
2507   mule_encode (stream, data, str->runoff, size);
2508   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2509                           Dynarr_length (str->runoff));
2510   if (retval > 0)
2511     Dynarr_delete_many (str->runoff, 0, retval);
2512   /* Do NOT return retval.  The return value indicates how much
2513      of the incoming data was written, not how many bytes were
2514      written. */
2515   return size;
2516 }
2517
2518 static void
2519 reset_encoding_stream (struct encoding_stream *str)
2520 {
2521 #ifdef MULE
2522   switch (CODING_SYSTEM_TYPE (str->codesys))
2523     {
2524     case CODESYS_ISO2022:
2525       {
2526         int i;
2527
2528         for (i = 0; i < 4; i++)
2529           {
2530             str->iso2022.charset[i] =
2531               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2532             str->iso2022.force_charset_on_output[i] =
2533               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2534           }
2535         str->iso2022.register_left = 0;
2536         str->iso2022.register_right = 1;
2537         str->iso2022.current_charset = Qnil;
2538         str->iso2022.current_half = 0;
2539 #ifdef UTF2000
2540         str->iso2022.current_char_boundary = 0;
2541 #else
2542         str->iso2022.current_char_boundary = 1;
2543 #endif
2544         break;
2545       }
2546     case CODESYS_CCL:
2547       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2548       break;
2549     default:
2550       break;
2551     }
2552 #endif /* MULE */
2553
2554   str->flags = str->ch = 0;
2555 }
2556
2557 static int
2558 encoding_rewinder (Lstream *stream)
2559 {
2560   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2561   reset_encoding_stream (str);
2562   Dynarr_reset (str->runoff);
2563   return Lstream_rewind (str->other_end);
2564 }
2565
2566 static int
2567 encoding_seekable_p (Lstream *stream)
2568 {
2569   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2570   return Lstream_seekable_p (str->other_end);
2571 }
2572
2573 static int
2574 encoding_flusher (Lstream *stream)
2575 {
2576   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2577   return Lstream_flush (str->other_end);
2578 }
2579
2580 static int
2581 encoding_closer (Lstream *stream)
2582 {
2583   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2584   if (stream->flags & LSTREAM_FL_WRITE)
2585     {
2586       str->flags |= CODING_STATE_END;
2587       encoding_writer (stream, 0, 0);
2588     }
2589   Dynarr_free (str->runoff);
2590   return Lstream_close (str->other_end);
2591 }
2592
2593 Lisp_Object
2594 encoding_stream_coding_system (Lstream *stream)
2595 {
2596   Lisp_Object coding_system;
2597   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2598
2599   XSETCODING_SYSTEM (coding_system, str->codesys);
2600   return coding_system;
2601 }
2602
2603 void
2604 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2605 {
2606   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2607   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2608   str->codesys = cs;
2609   reset_encoding_stream (str);
2610 }
2611
2612 static Lisp_Object
2613 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2614                         CONST char *mode)
2615 {
2616   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2617   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2618   Lisp_Object obj;
2619
2620   xzero (*str);
2621   str->runoff = Dynarr_new (unsigned_char);
2622   str->other_end = stream;
2623   set_encoding_stream_coding_system (lstr, codesys);
2624   XSETLSTREAM (obj, lstr);
2625   return obj;
2626 }
2627
2628 Lisp_Object
2629 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2630 {
2631   return make_encoding_stream_1 (stream, codesys, "r");
2632 }
2633
2634 Lisp_Object
2635 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2636 {
2637   return make_encoding_stream_1 (stream, codesys, "w");
2638 }
2639
2640 /* Convert N bytes of internally-formatted data stored in SRC to an
2641    external format, according to the encoding stream ENCODING.
2642    Store the encoded data into DST. */
2643
2644 static void
2645 mule_encode (Lstream *encoding, CONST unsigned char *src,
2646              unsigned_char_dynarr *dst, unsigned int n)
2647 {
2648   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2649
2650   switch (CODING_SYSTEM_TYPE (str->codesys))
2651     {
2652 #ifdef DEBUG_XEMACS
2653     case CODESYS_INTERNAL:
2654       Dynarr_add_many (dst, src, n);
2655       break;
2656 #endif
2657     case CODESYS_AUTODETECT:
2658       /* If we got this far and still haven't decided on the coding
2659          system, then do no conversion. */
2660     case CODESYS_NO_CONVERSION:
2661       encode_coding_no_conversion (encoding, src, dst, n);
2662       break;
2663 #ifdef MULE
2664     case CODESYS_SHIFT_JIS:
2665       encode_coding_sjis (encoding, src, dst, n);
2666       break;
2667     case CODESYS_BIG5:
2668       encode_coding_big5 (encoding, src, dst, n);
2669       break;
2670     case CODESYS_UCS4:
2671       encode_coding_ucs4 (encoding, src, dst, n);
2672       break;
2673     case CODESYS_UTF8:
2674       encode_coding_utf8 (encoding, src, dst, n);
2675       break;
2676     case CODESYS_CCL:
2677       str->ccl.last_block = str->flags & CODING_STATE_END;
2678       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2679       break;
2680     case CODESYS_ISO2022:
2681       encode_coding_iso2022 (encoding, src, dst, n);
2682       break;
2683 #endif /* MULE */
2684     default:
2685       abort ();
2686     }
2687 }
2688
2689 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2690 Encode the text between START and END using CODING-SYSTEM.
2691 This will, for example, convert Japanese characters into stuff such as
2692 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2693 text.  BUFFER defaults to the current buffer if unspecified.
2694 */
2695        (start, end, coding_system, buffer))
2696 {
2697   Bufpos b, e;
2698   struct buffer *buf = decode_buffer (buffer, 0);
2699   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2700   Lstream *istr, *ostr;
2701   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2702
2703   get_buffer_range_char (buf, start, end, &b, &e, 0);
2704
2705   barf_if_buffer_read_only (buf, b, e);
2706
2707   coding_system = Fget_coding_system (coding_system);
2708   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2709   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2710   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2711                                               Fget_coding_system (Qbinary));
2712   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2713                                            coding_system);
2714   istr = XLSTREAM (instream);
2715   ostr = XLSTREAM (outstream);
2716   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2717   /* The chain of streams looks like this:
2718
2719      [BUFFER] <----- send through
2720                      ------> [ENCODE AS SPECIFIED]
2721                              ------> [DECODE AS BINARY]
2722                                      ------> [BUFFER]
2723    */
2724   while (1)
2725     {
2726       char tempbuf[1024]; /* some random amount */
2727       Bufpos newpos, even_newer_pos;
2728       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2729       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2730
2731       if (!size_in_bytes)
2732         break;
2733       newpos = lisp_buffer_stream_startpos (istr);
2734       Lstream_write (ostr, tempbuf, size_in_bytes);
2735       even_newer_pos = lisp_buffer_stream_startpos (istr);
2736       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2737                            even_newer_pos, 0);
2738     }
2739
2740   {
2741     Charcount retlen =
2742       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2743     Lstream_close (istr);
2744     Lstream_close (ostr);
2745     UNGCPRO;
2746     Lstream_delete (istr);
2747     Lstream_delete (ostr);
2748     Lstream_delete (XLSTREAM (de_outstream));
2749     Lstream_delete (XLSTREAM (lb_outstream));
2750     return make_int (retlen);
2751   }
2752 }
2753
2754 #ifdef MULE
2755 \f
2756 /************************************************************************/
2757 /*                          Shift-JIS methods                           */
2758 /************************************************************************/
2759
2760 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2761    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2762    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2763    encoded by "position-code + 0x80".  A character of JISX0208
2764    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2765    position-codes are divided and shifted so that it fit in the range
2766    below.
2767
2768    --- CODE RANGE of Shift-JIS ---
2769    (character set)      (range)
2770    ASCII                0x00 .. 0x7F
2771    JISX0201-Kana        0xA0 .. 0xDF
2772    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2773             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2774    -------------------------------
2775
2776 */
2777
2778 /* Is this the first byte of a Shift-JIS two-byte char? */
2779
2780 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2781   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2782
2783 /* Is this the second byte of a Shift-JIS two-byte char? */
2784
2785 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2786   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2787
2788 #define BYTE_SJIS_KATAKANA_P(c) \
2789   ((c) >= 0xA1 && (c) <= 0xDF)
2790
2791 static int
2792 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2793                     unsigned int n)
2794 {
2795   int c;
2796
2797   while (n--)
2798     {
2799       c = *src++;
2800       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2801         return 0;
2802       if (st->shift_jis.in_second_byte)
2803         {
2804           st->shift_jis.in_second_byte = 0;
2805           if (c < 0x40)
2806             return 0;
2807         }
2808       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2809         st->shift_jis.in_second_byte = 1;
2810     }
2811   return CODING_CATEGORY_SHIFT_JIS_MASK;
2812 }
2813
2814 /* Convert Shift-JIS data to internal format. */
2815
2816 static void
2817 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2818                     unsigned_char_dynarr *dst, unsigned int n)
2819 {
2820   unsigned char c;
2821   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2822   unsigned int flags  = str->flags;
2823   unsigned int ch     = str->ch;
2824   eol_type_t eol_type = str->eol_type;
2825
2826   while (n--)
2827     {
2828       c = *src++;
2829
2830       if (ch)
2831         {
2832           /* Previous character was first byte of Shift-JIS Kanji char. */
2833           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2834             {
2835               unsigned char e1, e2;
2836
2837               DECODE_SJIS (ch, c, e1, e2);
2838 #ifdef UTF2000
2839               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2840                                             e1 & 0x7F,
2841                                             e2 & 0x7F), dst);
2842 #else
2843               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2844               Dynarr_add (dst, e1);
2845               Dynarr_add (dst, e2);
2846 #endif
2847             }
2848           else
2849             {
2850               DECODE_ADD_BINARY_CHAR (ch, dst);
2851               DECODE_ADD_BINARY_CHAR (c, dst);
2852             }
2853           ch = 0;
2854         }
2855       else
2856         {
2857           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2858           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2859             ch = c;
2860           else if (BYTE_SJIS_KATAKANA_P (c))
2861             {
2862 #ifdef UTF2000
2863               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2864                                             c & 0x7F, 0), dst);
2865 #else
2866               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2867               Dynarr_add (dst, c);
2868 #endif
2869             }
2870           else
2871             DECODE_ADD_BINARY_CHAR (c, dst);
2872         }
2873     label_continue_loop:;
2874     }
2875
2876   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2877
2878   str->flags = flags;
2879   str->ch    = ch;
2880 }
2881
2882 /* Convert internally-formatted data to Shift-JIS. */
2883
2884 static void
2885 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2886                     unsigned_char_dynarr *dst, unsigned int n)
2887 {
2888   unsigned char c;
2889   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2890   unsigned int flags  = str->flags;
2891   unsigned int ch     = str->ch;
2892   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2893 #ifdef UTF2000
2894   unsigned char char_boundary = str->iso2022.current_char_boundary;
2895 #endif
2896
2897   while (n--)
2898     {
2899       c = *src++;
2900 #ifdef UTF2000
2901       switch (char_boundary)
2902         {
2903         case 0:
2904           if ( c >= 0xfc )
2905             {
2906               ch = c & 0x01;
2907               char_boundary = 5;
2908             }
2909           else if ( c >= 0xf8 )
2910             {
2911               ch = c & 0x03;
2912               char_boundary = 4;
2913             }
2914           else if ( c >= 0xf0 )
2915             {
2916               ch = c & 0x07;
2917               char_boundary = 3;
2918             }
2919           else if ( c >= 0xe0 )
2920             {
2921               ch = c & 0x0f;
2922               char_boundary = 2;
2923             }
2924           else if ( c >= 0xc0 )
2925             {
2926               ch = c & 0x1f;
2927               char_boundary = 1;
2928             }
2929           else
2930             {
2931               ch = 0;
2932               if (c == '\n')
2933                 {
2934                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2935                     Dynarr_add (dst, '\r');
2936                   if (eol_type != EOL_CR)
2937                     Dynarr_add (dst, c);
2938                 }
2939               else
2940                 Dynarr_add (dst, c);
2941               char_boundary = 0;
2942             }
2943           break;
2944         case 1:
2945           ch = ( ch << 6 ) | ( c & 0x3f );
2946           {
2947             Lisp_Object charset;
2948             unsigned int c1, c2, s1, s2;
2949
2950             BREAKUP_CHAR (ch, charset, c1, c2);
2951             if (EQ(charset, Vcharset_katakana_jisx0201))
2952               {
2953                 Dynarr_add (dst, c1 | 0x80);
2954               }
2955             else if (EQ(charset, Vcharset_japanese_jisx0208))
2956               {
2957                 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2958                 Dynarr_add (dst, s1);
2959                 Dynarr_add (dst, s2);
2960               }
2961           }
2962           char_boundary = 0;
2963           break;
2964         default:
2965           ch = ( ch << 6 ) | ( c & 0x3f );
2966           char_boundary--;
2967         }
2968 #else
2969       if (c == '\n')
2970         {
2971           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2972             Dynarr_add (dst, '\r');
2973           if (eol_type != EOL_CR)
2974             Dynarr_add (dst, '\n');
2975           ch = 0;
2976         }
2977       else if (BYTE_ASCII_P (c))
2978         {
2979           Dynarr_add (dst, c);
2980           ch = 0;
2981         }
2982       else if (BUFBYTE_LEADING_BYTE_P (c))
2983         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2984               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2985               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2986       else if (ch)
2987         {
2988           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2989             {
2990               Dynarr_add (dst, c);
2991               ch = 0;
2992             }
2993           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2994                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2995             ch = c;
2996           else
2997             {
2998               unsigned char j1, j2;
2999               ENCODE_SJIS (ch, c, j1, j2);
3000               Dynarr_add (dst, j1);
3001               Dynarr_add (dst, j2);
3002               ch = 0;
3003             }
3004         }
3005 #endif
3006     }
3007
3008   str->flags = flags;
3009   str->ch    = ch;
3010 #ifdef UTF2000
3011   str->iso2022.current_char_boundary = char_boundary;
3012 #endif
3013 }
3014
3015 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3016 Decode a JISX0208 character of Shift-JIS coding-system.
3017 CODE is the character code in Shift-JIS as a cons of type bytes.
3018 Return the corresponding character.
3019 */
3020        (code))
3021 {
3022   unsigned char c1, c2, s1, s2;
3023
3024   CHECK_CONS (code);
3025   CHECK_INT (XCAR (code));
3026   CHECK_INT (XCDR (code));
3027   s1 = XINT (XCAR (code));
3028   s2 = XINT (XCDR (code));
3029   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3030       BYTE_SJIS_TWO_BYTE_2_P (s2))
3031     {
3032       DECODE_SJIS (s1, s2, c1, c2);
3033       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3034                                    c1 & 0x7F, c2 & 0x7F));
3035     }
3036   else
3037     return Qnil;
3038 }
3039
3040 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3041 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3042 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3043 */
3044        (ch))
3045 {
3046   Lisp_Object charset;
3047   int c1, c2, s1, s2;
3048
3049   CHECK_CHAR_COERCE_INT (ch);
3050   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3051   if (EQ (charset, Vcharset_japanese_jisx0208))
3052     {
3053       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3054       return Fcons (make_int (s1), make_int (s2));
3055     }
3056   else
3057     return Qnil;
3058 }
3059
3060 \f
3061 /************************************************************************/
3062 /*                            Big5 methods                              */
3063 /************************************************************************/
3064
3065 /* BIG5 is a coding system encoding two character sets: ASCII and
3066    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3067    character set and is encoded in two-byte.
3068
3069    --- CODE RANGE of BIG5 ---
3070    (character set)      (range)
3071    ASCII                0x00 .. 0x7F
3072    Big5 (1st byte)      0xA1 .. 0xFE
3073         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3074    --------------------------
3075
3076    Since the number of characters in Big5 is larger than maximum
3077    characters in Emacs' charset (96x96), it can't be handled as one
3078    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3079    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3080    contains frequently used characters and the latter contains less
3081    frequently used characters.  */
3082
3083 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3084   ((c) >= 0xA1 && (c) <= 0xFE)
3085
3086 /* Is this the second byte of a Shift-JIS two-byte char? */
3087
3088 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3089   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3090
3091 /* Number of Big5 characters which have the same code in 1st byte.  */
3092
3093 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3094
3095 /* Code conversion macros.  These are macros because they are used in
3096    inner loops during code conversion.
3097
3098    Note that temporary variables in macros introduce the classic
3099    dynamic-scoping problems with variable names.  We use capital-
3100    lettered variables in the assumption that XEmacs does not use
3101    capital letters in variables except in a very formalized way
3102    (e.g. Qstring). */
3103
3104 /* Convert Big5 code (b1, b2) into its internal string representation
3105    (lb, c1, c2). */
3106
3107 /* There is a much simpler way to split the Big5 charset into two.
3108    For the moment I'm going to leave the algorithm as-is because it
3109    claims to separate out the most-used characters into a single
3110    charset, which perhaps will lead to optimizations in various
3111    places.
3112
3113    The way the algorithm works is something like this:
3114
3115    Big5 can be viewed as a 94x157 charset, where the row is
3116    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3117    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3118    the split between low and high column numbers is apparently
3119    meaningless; ascending rows produce less and less frequent chars.
3120    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3121    the first charset, and the upper half (0xC9 .. 0xFE) to the
3122    second.  To do the conversion, we convert the character into
3123    a single number where 0 .. 156 is the first row, 157 .. 313
3124    is the second, etc.  That way, the characters are ordered by
3125    decreasing frequency.  Then we just chop the space in two
3126    and coerce the result into a 94x94 space.
3127    */
3128
3129 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3130 {                                                                       \
3131   int B1 = b1, B2 = b2;                                                 \
3132   unsigned int I                                                        \
3133     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3134                                                                         \
3135   if (B1 < 0xC9)                                                        \
3136     {                                                                   \
3137       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3138     }                                                                   \
3139   else                                                                  \
3140     {                                                                   \
3141       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3142       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3143     }                                                                   \
3144   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3145   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3146 } while (0)
3147
3148 /* Convert the internal string representation of a Big5 character
3149    (lb, c1, c2) into Big5 code (b1, b2). */
3150
3151 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3152 {                                                                       \
3153   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3154                                                                         \
3155   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3156     {                                                                   \
3157       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3158     }                                                                   \
3159   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3160   b2 = I % BIG5_SAME_ROW;                                               \
3161   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3162 } while (0)
3163
3164 static int
3165 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3166                     unsigned int n)
3167 {
3168   int c;
3169
3170   while (n--)
3171     {
3172       c = *src++;
3173       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3174           (c >= 0x80 && c <= 0xA0))
3175         return 0;
3176       if (st->big5.in_second_byte)
3177         {
3178           st->big5.in_second_byte = 0;
3179           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3180             return 0;
3181         }
3182       else if (c >= 0xA1)
3183         st->big5.in_second_byte = 1;
3184     }
3185   return CODING_CATEGORY_BIG5_MASK;
3186 }
3187
3188 /* Convert Big5 data to internal format. */
3189
3190 static void
3191 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3192                     unsigned_char_dynarr *dst, unsigned int n)
3193 {
3194   unsigned char c;
3195   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3196   unsigned int flags  = str->flags;
3197   unsigned int ch     = str->ch;
3198   eol_type_t eol_type = str->eol_type;
3199
3200   while (n--)
3201     {
3202       c = *src++;
3203       if (ch)
3204         {
3205           /* Previous character was first byte of Big5 char. */
3206           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3207             {
3208               unsigned char b1, b2, b3;
3209               DECODE_BIG5 (ch, c, b1, b2, b3);
3210               Dynarr_add (dst, b1);
3211               Dynarr_add (dst, b2);
3212               Dynarr_add (dst, b3);
3213             }
3214           else
3215             {
3216               DECODE_ADD_BINARY_CHAR (ch, dst);
3217               DECODE_ADD_BINARY_CHAR (c, dst);
3218             }
3219           ch = 0;
3220         }
3221       else
3222         {
3223           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3224           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3225             ch = c;
3226           else
3227             DECODE_ADD_BINARY_CHAR (c, dst);
3228         }
3229     label_continue_loop:;
3230     }
3231
3232   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3233
3234   str->flags = flags;
3235   str->ch    = ch;
3236 }
3237
3238 /* Convert internally-formatted data to Big5. */
3239
3240 static void
3241 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3242                     unsigned_char_dynarr *dst, unsigned int n)
3243 {
3244 #ifndef UTF2000
3245   unsigned char c;
3246   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3247   unsigned int flags  = str->flags;
3248   unsigned int ch     = str->ch;
3249   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3250
3251   while (n--)
3252     {
3253       c = *src++;
3254       if (c == '\n')
3255         {
3256           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3257             Dynarr_add (dst, '\r');
3258           if (eol_type != EOL_CR)
3259             Dynarr_add (dst, '\n');
3260         }
3261       else if (BYTE_ASCII_P (c))
3262         {
3263           /* ASCII. */
3264           Dynarr_add (dst, c);
3265         }
3266       else if (BUFBYTE_LEADING_BYTE_P (c))
3267         {
3268           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3269               c == LEADING_BYTE_CHINESE_BIG5_2)
3270             {
3271               /* A recognized leading byte. */
3272               ch = c;
3273               continue; /* not done with this character. */
3274             }
3275           /* otherwise just ignore this character. */
3276         }
3277       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3278                ch == LEADING_BYTE_CHINESE_BIG5_2)
3279         {
3280           /* Previous char was a recognized leading byte. */
3281           ch = (ch << 8) | c;
3282           continue; /* not done with this character. */
3283         }
3284       else if (ch)
3285         {
3286           /* Encountering second byte of a Big5 character. */
3287           unsigned char b1, b2;
3288
3289           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3290           Dynarr_add (dst, b1);
3291           Dynarr_add (dst, b2);
3292         }
3293
3294       ch = 0;
3295     }
3296
3297   str->flags = flags;
3298   str->ch    = ch;
3299 #endif
3300 }
3301
3302
3303 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3304 Decode a Big5 character CODE of BIG5 coding-system.
3305 CODE is the character code in BIG5, a cons of two integers.
3306 Return the corresponding character.
3307 */
3308        (code))
3309 {
3310   unsigned char c1, c2, b1, b2;
3311
3312   CHECK_CONS (code);
3313   CHECK_INT (XCAR (code));
3314   CHECK_INT (XCDR (code));
3315   b1 = XINT (XCAR (code));
3316   b2 = XINT (XCDR (code));
3317   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3318       BYTE_BIG5_TWO_BYTE_2_P (b2))
3319     {
3320       Charset_ID leading_byte;
3321       Lisp_Object charset;
3322       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3323       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3324       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3325     }
3326   else
3327     return Qnil;
3328 }
3329
3330 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3331 Encode the Big5 character CH to BIG5 coding-system.
3332 Return the corresponding character code in Big5.
3333 */
3334        (ch))
3335 {
3336   Lisp_Object charset;
3337   int c1, c2, b1, b2;
3338
3339   CHECK_CHAR_COERCE_INT (ch);
3340   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3341   if (EQ (charset, Vcharset_chinese_big5_1) ||
3342       EQ (charset, Vcharset_chinese_big5_2))
3343     {
3344       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3345                    b1, b2);
3346       return Fcons (make_int (b1), make_int (b2));
3347     }
3348   else
3349     return Qnil;
3350 }
3351
3352 \f
3353 /************************************************************************/
3354 /*                           UCS-4 methods                              */
3355 /*                                                                      */
3356 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3357 /*                                                                      */
3358 /************************************************************************/
3359
3360 Lisp_Object ucs_to_mule_table[65536];
3361 Lisp_Object mule_to_ucs_table;
3362
3363 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3364 Map UCS-4 code CODE to Mule character CHARACTER.
3365
3366 Return T on success, NIL on failure.
3367 */
3368        (code, character))
3369 {
3370   unsigned int c;
3371
3372   CHECK_CHAR (character);
3373   CHECK_INT (code);
3374   c = XINT (code);
3375
3376   if (c < sizeof (ucs_to_mule_table))
3377     {
3378       ucs_to_mule_table[c] = character;
3379       return Qt;
3380     }
3381   else
3382     return Qnil;
3383 }
3384
3385 static Lisp_Object
3386 ucs_to_char (unsigned long code)
3387 {
3388   if (code < sizeof (ucs_to_mule_table))
3389     {
3390       return ucs_to_mule_table[code];
3391     }
3392   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3393     {
3394       unsigned int c;
3395
3396       code -= 0xe00000;
3397       c = code % (94 * 94);
3398       return make_char
3399         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3400                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3401                      CHARSET_LEFT_TO_RIGHT),
3402                     c / 94 + 33, c % 94 + 33));
3403     }
3404   else
3405     return Qnil;
3406 }
3407
3408 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3409 Return Mule character corresponding to UCS code CODE (a positive integer).
3410 */
3411        (code))
3412 {
3413   CHECK_NATNUM (code);
3414   return ucs_to_char (XINT (code));
3415 }
3416
3417 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3418 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3419 */
3420        (character, code))
3421 {
3422   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3423           Fset_char_ucs is more restrictive on index arg, but should
3424           check code arg in a char_table method. */
3425   CHECK_CHAR (character);
3426   CHECK_NATNUM (code);
3427   return Fput_char_table (character, code, mule_to_ucs_table);
3428 }
3429
3430 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3431 Return the UCS code (a positive integer) corresponding to CHARACTER.
3432 */
3433        (character))
3434 {
3435   return Fget_char_table (character, mule_to_ucs_table);
3436 }
3437
3438 #ifdef UTF2000
3439 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3440 #else
3441 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3442    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3443    is not found, instead.
3444    #### do something more appropriate (use blob?)
3445         Danger, Will Robinson!  Data loss.  Should we signal user? */
3446 static void
3447 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3448 {
3449   Lisp_Object chr = ucs_to_char (ch);
3450
3451   if (! NILP (chr))
3452     {
3453       Bufbyte work[MAX_EMCHAR_LEN];
3454       int len;
3455
3456       ch = XCHAR (chr);
3457       len = (ch < 128) ?
3458         simple_set_charptr_emchar (work, ch) :
3459         non_ascii_set_charptr_emchar (work, ch);
3460       Dynarr_add_many (dst, work, len);
3461     }
3462   else
3463     {
3464       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3465       Dynarr_add (dst, 34 + 128);
3466       Dynarr_add (dst, 46 + 128);
3467     }
3468 }
3469 #endif
3470
3471 static unsigned long
3472 mule_char_to_ucs4 (Lisp_Object charset,
3473                    unsigned char h, unsigned char l)
3474 {
3475   Lisp_Object code
3476     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3477                        mule_to_ucs_table);
3478
3479   if (INTP (code))
3480     {
3481       return XINT (code);
3482     }
3483   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3484             (XCHARSET_CHARS (charset) == 94) )
3485     {
3486       unsigned char final = XCHARSET_FINAL (charset);
3487
3488       if ( ('@' <= final) && (final < 0x7f) )
3489         {
3490           return 0xe00000 + (final - '@') * 94 * 94
3491             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3492         }
3493       else
3494         {
3495           return '?';
3496         }
3497     }
3498   else
3499     {
3500       return '?';
3501     }
3502 }
3503
3504 static void
3505 encode_ucs4 (Lisp_Object charset,
3506              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3507 {
3508   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3509   Dynarr_add (dst,  code >> 24);
3510   Dynarr_add (dst, (code >> 16) & 255);
3511   Dynarr_add (dst, (code >>  8) & 255);
3512   Dynarr_add (dst,  code        & 255);
3513 }
3514
3515 static int
3516 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3517                     unsigned int n)
3518 {
3519   while (n--)
3520     {
3521       int c = *src++;
3522       switch (st->ucs4.in_byte)
3523         {
3524         case 0:
3525           if (c >= 128)
3526             return 0;
3527           else
3528             st->ucs4.in_byte++;
3529           break;
3530         case 3:
3531           st->ucs4.in_byte = 0;
3532           break;
3533         default:
3534           st->ucs4.in_byte++;
3535         }
3536     }
3537   return CODING_CATEGORY_UCS4_MASK;
3538 }
3539
3540 static void
3541 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3542                     unsigned_char_dynarr *dst, unsigned int n)
3543 {
3544   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3545   unsigned int flags = str->flags;
3546   unsigned int ch    = str->ch;
3547   unsigned char counter = str->counter;
3548
3549   while (n--)
3550     {
3551       unsigned char c = *src++;
3552       switch (counter)
3553         {
3554         case 0:
3555           ch = c;
3556           counter = 3;
3557           break;
3558         case 1:
3559           decode_ucs4 ( ( ch << 8 ) | c, dst);
3560           ch = 0;
3561           counter = 0;
3562           break;
3563         default:
3564           ch = ( ch << 8 ) | c;
3565           counter--;
3566         }
3567     }
3568   if (counter & CODING_STATE_END)
3569     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3570
3571   str->flags = flags;
3572   str->ch    = ch;
3573   str->counter = counter;
3574 }
3575
3576 static void
3577 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3578                     unsigned_char_dynarr *dst, unsigned int n)
3579 {
3580 #ifndef UTF2000
3581   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3582   unsigned int flags = str->flags;
3583   unsigned int ch = str->ch;
3584   unsigned char char_boundary = str->iso2022.current_char_boundary;
3585   Lisp_Object charset = str->iso2022.current_charset;
3586
3587 #ifdef ENABLE_COMPOSITE_CHARS
3588   /* flags for handling composite chars.  We do a little switcharoo
3589      on the source while we're outputting the composite char. */
3590   unsigned int saved_n = 0;
3591   CONST unsigned char *saved_src = NULL;
3592   int in_composite = 0;
3593
3594  back_to_square_n:
3595 #endif
3596
3597   while (n--)
3598     {
3599       unsigned char c = *src++;
3600
3601       if (BYTE_ASCII_P (c))
3602         {               /* Processing ASCII character */
3603           ch = 0;
3604           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3605           char_boundary = 1;
3606         }
3607       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3608         { /* Processing Leading Byte */
3609           ch = 0;
3610           charset = CHARSET_BY_LEADING_BYTE (c);
3611           if (LEADING_BYTE_PREFIX_P(c))
3612             ch = c;
3613           char_boundary = 0;
3614         }
3615       else
3616         {                       /* Processing Non-ASCII character */
3617           char_boundary = 1;
3618           if (EQ (charset, Vcharset_control_1))
3619             {
3620               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3621             }
3622           else
3623             {
3624               switch (XCHARSET_REP_BYTES (charset))
3625                 {
3626                 case 2:
3627                   encode_ucs4 (charset, c, 0, dst);
3628                   break;
3629                 case 3:
3630                   if (XCHARSET_PRIVATE_P (charset))
3631                     {
3632                       encode_ucs4 (charset, c, 0, dst);
3633                       ch = 0;
3634                     }
3635                   else if (ch)
3636                     {
3637 #ifdef ENABLE_COMPOSITE_CHARS
3638                       if (EQ (charset, Vcharset_composite))
3639                         {
3640                           if (in_composite)
3641                             {
3642                               /* #### Bother! We don't know how to
3643                                  handle this yet. */
3644                               Dynarr_add (dst, 0);
3645                               Dynarr_add (dst, 0);
3646                               Dynarr_add (dst, 0);
3647                               Dynarr_add (dst, '~');
3648                             }
3649                           else
3650                             {
3651                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3652                                                        ch & 0x7F, c & 0x7F);
3653                               Lisp_Object lstr = composite_char_string (emch);
3654                               saved_n = n;
3655                               saved_src = src;
3656                               in_composite = 1;
3657                               src = XSTRING_DATA   (lstr);
3658                               n   = XSTRING_LENGTH (lstr);
3659                             }
3660                         }
3661                       else
3662 #endif /* ENABLE_COMPOSITE_CHARS */
3663                         {
3664                           encode_ucs4(charset, ch, c, dst);
3665                         }
3666                       ch = 0;
3667                     }
3668                   else
3669                     {
3670                       ch = c;
3671                       char_boundary = 0;
3672                     }
3673                   break;
3674                 case 4:
3675                   if (ch)
3676                     {
3677                       encode_ucs4 (charset, ch, c, dst);
3678                       ch = 0;
3679                     }
3680                   else
3681                     {
3682                       ch = c;
3683                       char_boundary = 0;
3684                     }
3685                   break;
3686                 default:
3687                   abort ();
3688                 }
3689             }
3690         }
3691     }
3692
3693 #ifdef ENABLE_COMPOSITE_CHARS
3694   if (in_composite)
3695     {
3696       n = saved_n;
3697       src = saved_src;
3698       in_composite = 0;
3699       goto back_to_square_n; /* Wheeeeeeeee ..... */
3700     }
3701 #endif /* ENABLE_COMPOSITE_CHARS */
3702
3703   str->flags = flags;
3704   str->ch = ch;
3705   str->iso2022.current_char_boundary = char_boundary;
3706   str->iso2022.current_charset = charset;
3707
3708   /* Verbum caro factum est! */
3709 #endif
3710 }
3711
3712 \f
3713 /************************************************************************/
3714 /*                           UTF-8 methods                              */
3715 /************************************************************************/
3716
3717 static int
3718 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3719                     unsigned int n)
3720 {
3721   while (n--)
3722     {
3723       unsigned char c = *src++;
3724       switch (st->utf8.in_byte)
3725         {
3726         case 0:
3727           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3728             return 0;
3729           else if (c >= 0xfc)
3730             st->utf8.in_byte = 5;
3731           else if (c >= 0xf8)
3732             st->utf8.in_byte = 4;
3733           else if (c >= 0xf0)
3734             st->utf8.in_byte = 3;
3735           else if (c >= 0xe0)
3736             st->utf8.in_byte = 2;
3737           else if (c >= 0xc0)
3738             st->utf8.in_byte = 1;
3739           else if (c >= 0x80)
3740             return 0;
3741           break;
3742         default:
3743           if ((c & 0xc0) != 0x80)
3744             return 0;
3745           else
3746             st->utf8.in_byte--;
3747         }
3748     }
3749   return CODING_CATEGORY_UTF8_MASK;
3750 }
3751
3752 static void
3753 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3754                     unsigned_char_dynarr *dst, unsigned int n)
3755 {
3756   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3757   unsigned int flags  = str->flags;
3758   unsigned int ch     = str->ch;
3759   eol_type_t eol_type = str->eol_type;
3760   unsigned char counter = str->counter;
3761
3762   while (n--)
3763     {
3764       unsigned char c = *src++;
3765       switch (counter)
3766         {
3767         case 0:
3768           if ( c >= 0xfc )
3769             {
3770               ch = c & 0x01;
3771               counter = 5;
3772             }
3773           else if ( c >= 0xf8 )
3774             {
3775               ch = c & 0x03;
3776               counter = 4;
3777             }
3778           else if ( c >= 0xf0 )
3779             {
3780               ch = c & 0x07;
3781               counter = 3;
3782             }
3783           else if ( c >= 0xe0 )
3784             {
3785               ch = c & 0x0f;
3786               counter = 2;
3787             }
3788           else if ( c >= 0xc0 )
3789             {
3790               ch = c & 0x1f;
3791               counter = 1;
3792             }
3793           else
3794             {
3795               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3796               decode_ucs4 (c, dst);
3797             }
3798           break;
3799         case 1:
3800           ch = ( ch << 6 ) | ( c & 0x3f );
3801           decode_ucs4 (ch, dst);
3802           ch = 0;
3803           counter = 0;
3804           break;
3805         default:
3806           ch = ( ch << 6 ) | ( c & 0x3f );
3807           counter--;
3808         }
3809     label_continue_loop:;
3810     }
3811
3812   if (flags & CODING_STATE_END)
3813     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3814
3815   str->flags = flags;
3816   str->ch    = ch;
3817   str->counter = counter;
3818 }
3819
3820 #ifndef UTF2000
3821 static void
3822 encode_utf8 (Lisp_Object charset,
3823              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3824 {
3825   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3826   if ( code <= 0x7f )
3827     {
3828       Dynarr_add (dst, code);
3829     }
3830   else if ( code <= 0x7ff )
3831     {
3832       Dynarr_add (dst, (code >> 6) | 0xc0);
3833       Dynarr_add (dst, (code & 0x3f) | 0x80);
3834     }
3835   else if ( code <= 0xffff )
3836     {
3837       Dynarr_add (dst,  (code >> 12) | 0xe0);
3838       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3839       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3840     }
3841   else if ( code <= 0x1fffff )
3842     {
3843       Dynarr_add (dst,  (code >> 18) | 0xf0);
3844       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3845       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3846       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3847     }
3848   else if ( code <= 0x3ffffff )
3849     {
3850       Dynarr_add (dst,  (code >> 24) | 0xf8);
3851       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3852       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3853       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3854       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3855     }
3856   else
3857     {
3858       Dynarr_add (dst,  (code >> 30) | 0xfc);
3859       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3860       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3861       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3862       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3863       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3864     }
3865 }
3866 #endif
3867
3868 static void
3869 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3870                     unsigned_char_dynarr *dst, unsigned int n)
3871 {
3872   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3873   unsigned int flags  = str->flags;
3874   unsigned int ch     = str->ch;
3875   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3876   unsigned char char_boundary = str->iso2022.current_char_boundary;
3877 #ifdef UTF2000
3878
3879   while (n--)
3880     {
3881       unsigned char c = *src++;
3882       switch (char_boundary)
3883         {
3884         case 0:
3885           if ( c >= 0xfc )
3886             {
3887               Dynarr_add (dst, c);
3888               char_boundary = 5;
3889             }
3890           else if ( c >= 0xf8 )
3891             {
3892               Dynarr_add (dst, c);
3893               char_boundary = 4;
3894             }
3895           else if ( c >= 0xf0 )
3896             {
3897               Dynarr_add (dst, c);
3898               char_boundary = 3;
3899             }
3900           else if ( c >= 0xe0 )
3901             {
3902               Dynarr_add (dst, c);
3903               char_boundary = 2;
3904             }
3905           else if ( c >= 0xc0 )
3906             {
3907               Dynarr_add (dst, c);
3908               char_boundary = 1;
3909             }
3910           else
3911             {
3912               if (c == '\n')
3913                 {
3914                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3915                     Dynarr_add (dst, '\r');
3916                   if (eol_type != EOL_CR)
3917                     Dynarr_add (dst, c);
3918                 }
3919               else
3920                 Dynarr_add (dst, c);
3921               char_boundary = 0;
3922             }
3923           break;
3924         case 1:
3925           Dynarr_add (dst, c);
3926           char_boundary = 0;
3927           break;
3928         default:
3929           Dynarr_add (dst, c);
3930           char_boundary--;
3931         }
3932     }
3933 #else /* not UTF2000 */
3934   Lisp_Object charset = str->iso2022.current_charset;
3935
3936 #ifdef ENABLE_COMPOSITE_CHARS
3937   /* flags for handling composite chars.  We do a little switcharoo
3938      on the source while we're outputting the composite char. */
3939   unsigned int saved_n = 0;
3940   CONST unsigned char *saved_src = NULL;
3941   int in_composite = 0;
3942
3943  back_to_square_n:
3944 #endif /* ENABLE_COMPOSITE_CHARS */
3945
3946   while (n--)
3947     {
3948       unsigned char c = *src++;
3949
3950       if (BYTE_ASCII_P (c))
3951         {               /* Processing ASCII character */
3952           ch = 0;
3953           if (c == '\n')
3954             {
3955               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3956                 Dynarr_add (dst, '\r');
3957               if (eol_type != EOL_CR)
3958                 Dynarr_add (dst, c);
3959             }
3960           else
3961             encode_utf8 (Vcharset_ascii, c, 0, dst);
3962           char_boundary = 1;
3963         }
3964       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3965         { /* Processing Leading Byte */
3966           ch = 0;
3967           charset = CHARSET_BY_LEADING_BYTE (c);
3968           if (LEADING_BYTE_PREFIX_P(c))
3969             ch = c;
3970           char_boundary = 0;
3971         }
3972       else
3973         {                       /* Processing Non-ASCII character */
3974           char_boundary = 1;
3975           if (EQ (charset, Vcharset_control_1))
3976             {
3977               encode_utf8 (Vcharset_control_1, c, 0, dst);
3978             }
3979           else
3980             {
3981               switch (XCHARSET_REP_BYTES (charset))
3982                 {
3983                 case 2:
3984                   encode_utf8 (charset, c, 0, dst);
3985                   break;
3986                 case 3:
3987                   if (XCHARSET_PRIVATE_P (charset))
3988                     {
3989                       encode_utf8 (charset, c, 0, dst);
3990                       ch = 0;
3991                     }
3992                   else if (ch)
3993                     {
3994 #ifdef ENABLE_COMPOSITE_CHARS
3995                       if (EQ (charset, Vcharset_composite))
3996                         {
3997                           if (in_composite)
3998                             {
3999                               /* #### Bother! We don't know how to
4000                                  handle this yet. */
4001                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
4002                             }
4003                           else
4004                             {
4005                               Emchar emch = MAKE_CHAR (Vcharset_composite,
4006                                                        ch & 0x7F, c & 0x7F);
4007                               Lisp_Object lstr = composite_char_string (emch);
4008                               saved_n = n;
4009                               saved_src = src;
4010                               in_composite = 1;
4011                               src = XSTRING_DATA   (lstr);
4012                               n   = XSTRING_LENGTH (lstr);
4013                             }
4014                         }
4015                       else
4016 #endif /* ENABLE_COMPOSITE_CHARS */
4017                         {
4018                           encode_utf8 (charset, ch, c, dst);
4019                         }
4020                       ch = 0;
4021                     }
4022                   else
4023                     {
4024                       ch = c;
4025                       char_boundary = 0;
4026                     }
4027                   break;
4028                 case 4:
4029                   if (ch)
4030                     {
4031                       encode_utf8 (charset, ch, c, dst);
4032                       ch = 0;
4033                     }
4034                   else
4035                     {
4036                       ch = c;
4037                       char_boundary = 0;
4038                     }
4039                   break;
4040                 default:
4041                   abort ();
4042                 }
4043             }
4044         }
4045     }
4046
4047 #ifdef ENABLE_COMPOSITE_CHARS
4048   if (in_composite)
4049     {
4050       n = saved_n;
4051       src = saved_src;
4052       in_composite = 0;
4053       goto back_to_square_n; /* Wheeeeeeeee ..... */
4054     }
4055 #endif
4056
4057 #endif /* not UTF2000 */
4058   str->flags = flags;
4059   str->ch    = ch;
4060   str->iso2022.current_char_boundary = char_boundary;
4061 #ifndef UTF2000
4062   str->iso2022.current_charset = charset;
4063 #endif
4064
4065   /* Verbum caro factum est! */
4066 }
4067
4068 \f
4069 /************************************************************************/
4070 /*                           ISO2022 methods                            */
4071 /************************************************************************/
4072
4073 /* The following note describes the coding system ISO2022 briefly.
4074    Since the intention of this note is to help understand the
4075    functions in this file, some parts are NOT ACCURATE or OVERLY
4076    SIMPLIFIED.  For thorough understanding, please refer to the
4077    original document of ISO2022.
4078
4079    ISO2022 provides many mechanisms to encode several character sets
4080    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4081    is encoded using bytes less than 128.  This may make the encoded
4082    text a little bit longer, but the text passes more easily through
4083    several gateways, some of which strip off MSB (Most Signigant Bit).
4084
4085    There are two kinds of character sets: control character set and
4086    graphic character set.  The former contains control characters such
4087    as `newline' and `escape' to provide control functions (control
4088    functions are also provided by escape sequences).  The latter
4089    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4090    two control character sets and many graphic character sets.
4091
4092    Graphic character sets are classified into one of the following
4093    four classes, according to the number of bytes (DIMENSION) and
4094    number of characters in one dimension (CHARS) of the set:
4095    - DIMENSION1_CHARS94
4096    - DIMENSION1_CHARS96
4097    - DIMENSION2_CHARS94
4098    - DIMENSION2_CHARS96
4099
4100    In addition, each character set is assigned an identification tag,
4101    unique for each set, called "final character" (denoted as <F>
4102    hereafter).  The <F> of each character set is decided by ECMA(*)
4103    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4104    (0x30..0x3F are for private use only).
4105
4106    Note (*): ECMA = European Computer Manufacturers Association
4107
4108    Here are examples of graphic character set [NAME(<F>)]:
4109         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4110         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4111         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4112         o DIMENSION2_CHARS96 -- none for the moment
4113
4114    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4115         C0 [0x00..0x1F] -- control character plane 0
4116         GL [0x20..0x7F] -- graphic character plane 0
4117         C1 [0x80..0x9F] -- control character plane 1
4118         GR [0xA0..0xFF] -- graphic character plane 1
4119
4120    A control character set is directly designated and invoked to C0 or
4121    C1 by an escape sequence.  The most common case is that:
4122    - ISO646's  control character set is designated/invoked to C0, and
4123    - ISO6429's control character set is designated/invoked to C1,
4124    and usually these designations/invocations are omitted in encoded
4125    text.  In a 7-bit environment, only C0 can be used, and a control
4126    character for C1 is encoded by an appropriate escape sequence to
4127    fit into the environment.  All control characters for C1 are
4128    defined to have corresponding escape sequences.
4129
4130    A graphic character set is at first designated to one of four
4131    graphic registers (G0 through G3), then these graphic registers are
4132    invoked to GL or GR.  These designations and invocations can be
4133    done independently.  The most common case is that G0 is invoked to
4134    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4135    these invocations and designations are omitted in encoded text.
4136    In a 7-bit environment, only GL can be used.
4137
4138    When a graphic character set of CHARS94 is invoked to GL, codes
4139    0x20 and 0x7F of the GL area work as control characters SPACE and
4140    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4141    be used.
4142
4143    There are two ways of invocation: locking-shift and single-shift.
4144    With locking-shift, the invocation lasts until the next different
4145    invocation, whereas with single-shift, the invocation affects the
4146    following character only and doesn't affect the locking-shift
4147    state.  Invocations are done by the following control characters or
4148    escape sequences:
4149
4150    ----------------------------------------------------------------------
4151    abbrev  function                  cntrl escape seq   description
4152    ----------------------------------------------------------------------
4153    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4154    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4155    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4156    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4157    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4158    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4159    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4160    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4161    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4162    ----------------------------------------------------------------------
4163    (*) These are not used by any known coding system.
4164
4165    Control characters for these functions are defined by macros
4166    ISO_CODE_XXX in `coding.h'.
4167
4168    Designations are done by the following escape sequences:
4169    ----------------------------------------------------------------------
4170    escape sequence      description
4171    ----------------------------------------------------------------------
4172    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4173    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4174    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4175    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4176    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4177    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4178    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4179    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4180    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4181    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4182    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4183    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4184    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4185    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4186    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4187    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4188    ----------------------------------------------------------------------
4189
4190    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4191    of dimension 1, chars 94, and final character <F>, etc...
4192
4193    Note (*): Although these designations are not allowed in ISO2022,
4194    Emacs accepts them on decoding, and produces them on encoding
4195    CHARS96 character sets in a coding system which is characterized as
4196    7-bit environment, non-locking-shift, and non-single-shift.
4197
4198    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4199    '(' can be omitted.  We refer to this as "short-form" hereafter.
4200
4201    Now you may notice that there are a lot of ways for encoding the
4202    same multilingual text in ISO2022.  Actually, there exist many
4203    coding systems such as Compound Text (used in X11's inter client
4204    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4205    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4206    localized platforms), and all of these are variants of ISO2022.
4207
4208    In addition to the above, Emacs handles two more kinds of escape
4209    sequences: ISO6429's direction specification and Emacs' private
4210    sequence for specifying character composition.
4211
4212    ISO6429's direction specification takes the following form:
4213         o CSI ']'      -- end of the current direction
4214         o CSI '0' ']'  -- end of the current direction
4215         o CSI '1' ']'  -- start of left-to-right text
4216         o CSI '2' ']'  -- start of right-to-left text
4217    The control character CSI (0x9B: control sequence introducer) is
4218    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4219
4220    Character composition specification takes the following form:
4221         o ESC '0' -- start character composition
4222         o ESC '1' -- end character composition
4223    Since these are not standard escape sequences of any ISO standard,
4224    their use with these meanings is restricted to Emacs only.  */
4225
4226 static void
4227 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4228 {
4229   int i;
4230
4231   for (i = 0; i < 4; i++)
4232     {
4233       if (!NILP (coding_system))
4234         iso->charset[i] =
4235           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4236       else
4237         iso->charset[i] = Qt;
4238       iso->invalid_designated[i] = 0;
4239     }
4240   iso->esc = ISO_ESC_NOTHING;
4241   iso->esc_bytes_index = 0;
4242   iso->register_left = 0;
4243   iso->register_right = 1;
4244   iso->switched_dir_and_no_valid_charset_yet = 0;
4245   iso->invalid_switch_dir = 0;
4246   iso->output_direction_sequence = 0;
4247   iso->output_literally = 0;
4248 #ifdef ENABLE_COMPOSITE_CHARS
4249   if (iso->composite_chars)
4250     Dynarr_reset (iso->composite_chars);
4251 #endif
4252 }
4253
4254 static int
4255 fit_to_be_escape_quoted (unsigned char c)
4256 {
4257   switch (c)
4258     {
4259     case ISO_CODE_ESC:
4260     case ISO_CODE_CSI:
4261     case ISO_CODE_SS2:
4262     case ISO_CODE_SS3:
4263     case ISO_CODE_SO:
4264     case ISO_CODE_SI:
4265       return 1;
4266
4267     default:
4268       return 0;
4269     }
4270 }
4271
4272 /* Parse one byte of an ISO2022 escape sequence.
4273    If the result is an invalid escape sequence, return 0 and
4274    do not change anything in STR.  Otherwise, if the result is
4275    an incomplete escape sequence, update ISO2022.ESC and
4276    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4277    all the state variables (but not ISO2022.ESC_BYTES) and
4278    return 1.
4279
4280    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4281    or invocation of an invalid character set and treat that as
4282    an unrecognized escape sequence. */
4283
4284 static int
4285 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4286                    unsigned char c, unsigned int *flags,
4287                    int check_invalid_charsets)
4288 {
4289   /* (1) If we're at the end of a designation sequence, CS is the
4290      charset being designated and REG is the register to designate
4291      it to.
4292
4293      (2) If we're at the end of a locking-shift sequence, REG is
4294      the register to invoke and HALF (0 == left, 1 == right) is
4295      the half to invoke it into.
4296
4297      (3) If we're at the end of a single-shift sequence, REG is
4298      the register to invoke. */
4299   Lisp_Object cs = Qnil;
4300   int reg, half;
4301
4302   /* NOTE: This code does goto's all over the fucking place.
4303      The reason for this is that we're basically implementing
4304      a state machine here, and hierarchical languages like C
4305      don't really provide a clean way of doing this. */
4306
4307   if (! (*flags & CODING_STATE_ESCAPE))
4308     /* At beginning of escape sequence; we need to reset our
4309        escape-state variables. */
4310     iso->esc = ISO_ESC_NOTHING;
4311
4312   iso->output_literally = 0;
4313   iso->output_direction_sequence = 0;
4314
4315   switch (iso->esc)
4316     {
4317     case ISO_ESC_NOTHING:
4318       iso->esc_bytes_index = 0;
4319       switch (c)
4320         {
4321         case ISO_CODE_ESC:      /* Start escape sequence */
4322           *flags |= CODING_STATE_ESCAPE;
4323           iso->esc = ISO_ESC;
4324           goto not_done;
4325
4326         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4327           *flags |= CODING_STATE_ESCAPE;
4328           iso->esc = ISO_ESC_5_11;
4329           goto not_done;
4330
4331         case ISO_CODE_SO:       /* locking shift 1 */
4332           reg = 1; half = 0;
4333           goto locking_shift;
4334         case ISO_CODE_SI:       /* locking shift 0 */
4335           reg = 0; half = 0;
4336           goto locking_shift;
4337
4338         case ISO_CODE_SS2:      /* single shift */
4339           reg = 2;
4340           goto single_shift;
4341         case ISO_CODE_SS3:      /* single shift */
4342           reg = 3;
4343           goto single_shift;
4344
4345         default:                        /* Other control characters */
4346           return 0;
4347         }
4348
4349     case ISO_ESC:
4350       switch (c)
4351         {
4352           /**** single shift ****/
4353
4354         case 'N':       /* single shift 2 */
4355           reg = 2;
4356           goto single_shift;
4357         case 'O':       /* single shift 3 */
4358           reg = 3;
4359           goto single_shift;
4360
4361           /**** locking shift ****/
4362
4363         case '~':       /* locking shift 1 right */
4364           reg = 1; half = 1;
4365           goto locking_shift;
4366         case 'n':       /* locking shift 2 */
4367           reg = 2; half = 0;
4368           goto locking_shift;
4369         case '}':       /* locking shift 2 right */
4370           reg = 2; half = 1;
4371           goto locking_shift;
4372         case 'o':       /* locking shift 3 */
4373           reg = 3; half = 0;
4374           goto locking_shift;
4375         case '|':       /* locking shift 3 right */
4376           reg = 3; half = 1;
4377           goto locking_shift;
4378
4379 #ifdef ENABLE_COMPOSITE_CHARS
4380           /**** composite ****/
4381
4382         case '0':
4383           iso->esc = ISO_ESC_START_COMPOSITE;
4384           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4385             CODING_STATE_COMPOSITE;
4386           return 1;
4387
4388         case '1':
4389           iso->esc = ISO_ESC_END_COMPOSITE;
4390           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4391             ~CODING_STATE_COMPOSITE;
4392           return 1;
4393 #endif /* ENABLE_COMPOSITE_CHARS */
4394
4395           /**** directionality ****/
4396
4397         case '[':
4398           iso->esc = ISO_ESC_5_11;
4399           goto not_done;
4400
4401           /**** designation ****/
4402
4403         case '$':       /* multibyte charset prefix */
4404           iso->esc = ISO_ESC_2_4;
4405           goto not_done;
4406
4407         default:
4408           if (0x28 <= c && c <= 0x2F)
4409             {
4410               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4411               goto not_done;
4412             }
4413
4414           /* This function is called with CODESYS equal to nil when
4415              doing coding-system detection. */
4416           if (!NILP (codesys)
4417               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4418               && fit_to_be_escape_quoted (c))
4419             {
4420               iso->esc = ISO_ESC_LITERAL;
4421               *flags &= CODING_STATE_ISO2022_LOCK;
4422               return 1;
4423             }
4424
4425           /* bzzzt! */
4426           return 0;
4427         }
4428
4429
4430
4431       /**** directionality ****/
4432
4433     case ISO_ESC_5_11:          /* ISO6429 direction control */
4434       if (c == ']')
4435         {
4436           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4437           goto directionality;
4438         }
4439       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4440       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4441       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4442       else               return 0;
4443       goto not_done;
4444
4445     case ISO_ESC_5_11_0:
4446       if (c == ']')
4447         {
4448           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4449           goto directionality;
4450         }
4451       return 0;
4452
4453     case ISO_ESC_5_11_1:
4454       if (c == ']')
4455         {
4456           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4457           goto directionality;
4458         }
4459       return 0;
4460
4461     case ISO_ESC_5_11_2:
4462       if (c == ']')
4463         {
4464           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4465           goto directionality;
4466         }
4467       return 0;
4468
4469     directionality:
4470       iso->esc = ISO_ESC_DIRECTIONALITY;
4471       /* Various junk here to attempt to preserve the direction sequences
4472          literally in the text if they would otherwise be swallowed due
4473          to invalid designations that don't show up as actual charset
4474          changes in the text. */
4475       if (iso->invalid_switch_dir)
4476         {
4477           /* We already inserted a direction switch literally into the
4478              text.  We assume (#### this may not be right) that the
4479              next direction switch is the one going the other way,
4480              and we need to output that literally as well. */
4481           iso->output_literally = 1;
4482           iso->invalid_switch_dir = 0;
4483         }
4484       else
4485         {
4486           int jj;
4487
4488           /* If we are in the thrall of an invalid designation,
4489            then stick the directionality sequence literally into the
4490            output stream so it ends up in the original text again. */
4491           for (jj = 0; jj < 4; jj++)
4492             if (iso->invalid_designated[jj])
4493               break;
4494           if (jj < 4)
4495             {
4496               iso->output_literally = 1;
4497               iso->invalid_switch_dir = 1;
4498             }
4499           else
4500             /* Indicate that we haven't yet seen a valid designation,
4501                so that if a switch-dir is directly followed by an
4502                invalid designation, both get inserted literally. */
4503             iso->switched_dir_and_no_valid_charset_yet = 1;
4504         }
4505       return 1;
4506
4507
4508       /**** designation ****/
4509
4510     case ISO_ESC_2_4:
4511       if (0x28 <= c && c <= 0x2F)
4512         {
4513           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4514           goto not_done;
4515         }
4516       if (0x40 <= c && c <= 0x42)
4517         {
4518           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4519                                       *flags & CODING_STATE_R2L ?
4520                                       CHARSET_RIGHT_TO_LEFT :
4521                                       CHARSET_LEFT_TO_RIGHT);
4522           reg = 0;
4523           goto designated;
4524         }
4525       return 0;
4526
4527     default:
4528       {
4529         int type =-1;
4530
4531         if (c < '0' || c > '~')
4532           return 0; /* bad final byte */
4533
4534         if (iso->esc >= ISO_ESC_2_8 &&
4535             iso->esc <= ISO_ESC_2_15)
4536           {
4537             type = ((iso->esc >= ISO_ESC_2_12) ?
4538                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4539             reg = (iso->esc - ISO_ESC_2_8) & 3;
4540           }
4541         else if (iso->esc >= ISO_ESC_2_4_8 &&
4542                  iso->esc <= ISO_ESC_2_4_15)
4543           {
4544             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4545                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4546             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4547           }
4548         else
4549           {
4550             /* Can this ever be reached? -slb */
4551             abort();
4552           }
4553
4554         cs = CHARSET_BY_ATTRIBUTES (type, c,
4555                                     *flags & CODING_STATE_R2L ?
4556                                     CHARSET_RIGHT_TO_LEFT :
4557                                     CHARSET_LEFT_TO_RIGHT);
4558         goto designated;
4559       }
4560     }
4561
4562  not_done:
4563   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4564   return -1;
4565
4566  single_shift:
4567   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4568     /* can't invoke something that ain't there. */
4569     return 0;
4570   iso->esc = ISO_ESC_SINGLE_SHIFT;
4571   *flags &= CODING_STATE_ISO2022_LOCK;
4572   if (reg == 2)
4573     *flags |= CODING_STATE_SS2;
4574   else
4575     *flags |= CODING_STATE_SS3;
4576   return 1;
4577
4578  locking_shift:
4579   if (check_invalid_charsets &&
4580       !CHARSETP (iso->charset[reg]))
4581     /* can't invoke something that ain't there. */
4582     return 0;
4583   if (half)
4584     iso->register_right = reg;
4585   else
4586     iso->register_left = reg;
4587   *flags &= CODING_STATE_ISO2022_LOCK;
4588   iso->esc = ISO_ESC_LOCKING_SHIFT;
4589   return 1;
4590
4591  designated:
4592   if (NILP (cs) && check_invalid_charsets)
4593     {
4594       iso->invalid_designated[reg] = 1;
4595       iso->charset[reg] = Vcharset_ascii;
4596       iso->esc = ISO_ESC_DESIGNATE;
4597       *flags &= CODING_STATE_ISO2022_LOCK;
4598       iso->output_literally = 1;
4599       if (iso->switched_dir_and_no_valid_charset_yet)
4600         {
4601           /* We encountered a switch-direction followed by an
4602              invalid designation.  Ensure that the switch-direction
4603              gets outputted; otherwise it will probably get eaten
4604              when the text is written out again. */
4605           iso->switched_dir_and_no_valid_charset_yet = 0;
4606           iso->output_direction_sequence = 1;
4607           /* And make sure that the switch-dir going the other
4608              way gets outputted, as well. */
4609           iso->invalid_switch_dir = 1;
4610         }
4611       return 1;
4612     }
4613   /* This function is called with CODESYS equal to nil when
4614      doing coding-system detection. */
4615   if (!NILP (codesys))
4616     {
4617       charset_conversion_spec_dynarr *dyn =
4618         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4619
4620       if (dyn)
4621         {
4622           int i;
4623
4624           for (i = 0; i < Dynarr_length (dyn); i++)
4625             {
4626               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4627               if (EQ (cs, spec->from_charset))
4628                 cs = spec->to_charset;
4629             }
4630         }
4631     }
4632
4633   iso->charset[reg] = cs;
4634   iso->esc = ISO_ESC_DESIGNATE;
4635   *flags &= CODING_STATE_ISO2022_LOCK;
4636   if (iso->invalid_designated[reg])
4637     {
4638       iso->invalid_designated[reg] = 0;
4639       iso->output_literally = 1;
4640     }
4641   if (iso->switched_dir_and_no_valid_charset_yet)
4642     iso->switched_dir_and_no_valid_charset_yet = 0;
4643   return 1;
4644 }
4645
4646 static int
4647 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4648                        unsigned int n)
4649 {
4650   int mask;
4651
4652   /* #### There are serious deficiencies in the recognition mechanism
4653      here.  This needs to be much smarter if it's going to cut it.
4654      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4655      it should be detected as Latin-1.
4656      All the ISO2022 stuff in this file should be synced up with the
4657      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4658      Perhaps we should wait till R2L works in FSF Emacs? */
4659
4660   if (!st->iso2022.initted)
4661     {
4662       reset_iso2022 (Qnil, &st->iso2022.iso);
4663       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4664                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4665                           CODING_CATEGORY_ISO_8_1_MASK |
4666                           CODING_CATEGORY_ISO_8_2_MASK |
4667                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4668       st->iso2022.flags = 0;
4669       st->iso2022.high_byte_count = 0;
4670       st->iso2022.saw_single_shift = 0;
4671       st->iso2022.initted = 1;
4672     }
4673
4674   mask = st->iso2022.mask;
4675
4676   while (n--)
4677     {
4678       int c = *src++;
4679       if (c >= 0xA0)
4680         {
4681           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4682           st->iso2022.high_byte_count++;
4683         }
4684       else
4685         {
4686           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4687             {
4688               if (st->iso2022.high_byte_count & 1)
4689                 /* odd number of high bytes; assume not iso-8-2 */
4690                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4691             }
4692           st->iso2022.high_byte_count = 0;
4693           st->iso2022.saw_single_shift = 0;
4694           if (c > 0x80)
4695             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4696         }
4697       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4698           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4699         { /* control chars */
4700           switch (c)
4701             {
4702               /* Allow and ignore control characters that you might
4703                  reasonably see in a text file */
4704             case '\r':
4705             case '\n':
4706             case '\t':
4707             case  7: /* bell */
4708             case  8: /* backspace */
4709             case 11: /* vertical tab */
4710             case 12: /* form feed */
4711             case 26: /* MS-DOS C-z junk */
4712             case 31: /* '^_' -- for info */
4713               goto label_continue_loop;
4714
4715             default:
4716               break;
4717             }
4718         }
4719
4720       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4721           || BYTE_C1_P (c))
4722         {
4723           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4724                                  &st->iso2022.flags, 0))
4725             {
4726               switch (st->iso2022.iso.esc)
4727                 {
4728                 case ISO_ESC_DESIGNATE:
4729                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4730                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4731                   break;
4732                 case ISO_ESC_LOCKING_SHIFT:
4733                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4734                   goto ran_out_of_chars;
4735                 case ISO_ESC_SINGLE_SHIFT:
4736                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4737                   st->iso2022.saw_single_shift = 1;
4738                   break;
4739                 default:
4740                   break;
4741                 }
4742             }
4743           else
4744             {
4745               mask = 0;
4746               goto ran_out_of_chars;
4747             }
4748         }
4749     label_continue_loop:;
4750     }
4751
4752  ran_out_of_chars:
4753
4754   return mask;
4755 }
4756
4757 static int
4758 postprocess_iso2022_mask (int mask)
4759 {
4760   /* #### kind of cheesy */
4761   /* If seven-bit ISO is allowed, then assume that the encoding is
4762      entirely seven-bit and turn off the eight-bit ones. */
4763   if (mask & CODING_CATEGORY_ISO_7_MASK)
4764     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4765                CODING_CATEGORY_ISO_8_1_MASK |
4766                CODING_CATEGORY_ISO_8_2_MASK);
4767   return mask;
4768 }
4769
4770 /* If FLAGS is a null pointer or specifies right-to-left motion,
4771    output a switch-dir-to-left-to-right sequence to DST.
4772    Also update FLAGS if it is not a null pointer.
4773    If INTERNAL_P is set, we are outputting in internal format and
4774    need to handle the CSI differently. */
4775
4776 static void
4777 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4778                                  unsigned_char_dynarr *dst,
4779                                  unsigned int *flags,
4780                                  int internal_p)
4781 {
4782   if (!flags || (*flags & CODING_STATE_R2L))
4783     {
4784       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4785         {
4786           Dynarr_add (dst, ISO_CODE_ESC);
4787           Dynarr_add (dst, '[');
4788         }
4789       else if (internal_p)
4790         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4791       else
4792         Dynarr_add (dst, ISO_CODE_CSI);
4793       Dynarr_add (dst, '0');
4794       Dynarr_add (dst, ']');
4795       if (flags)
4796         *flags &= ~CODING_STATE_R2L;
4797     }
4798 }
4799
4800 /* If FLAGS is a null pointer or specifies a direction different from
4801    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4802    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4803    sequence to DST.  Also update FLAGS if it is not a null pointer.
4804    If INTERNAL_P is set, we are outputting in internal format and
4805    need to handle the CSI differently. */
4806
4807 static void
4808 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4809                           unsigned_char_dynarr *dst, unsigned int *flags,
4810                           int internal_p)
4811 {
4812   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4813       direction == CHARSET_LEFT_TO_RIGHT)
4814     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4815   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4816            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4817            direction == CHARSET_RIGHT_TO_LEFT)
4818     {
4819       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4820         {
4821           Dynarr_add (dst, ISO_CODE_ESC);
4822           Dynarr_add (dst, '[');
4823         }
4824       else if (internal_p)
4825         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4826       else
4827         Dynarr_add (dst, ISO_CODE_CSI);
4828       Dynarr_add (dst, '2');
4829       Dynarr_add (dst, ']');
4830       if (flags)
4831         *flags |= CODING_STATE_R2L;
4832     }
4833 }
4834
4835 /* Convert ISO2022-format data to internal format. */
4836
4837 static void
4838 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4839                        unsigned_char_dynarr *dst, unsigned int n)
4840 {
4841   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4842   unsigned int flags  = str->flags;
4843   unsigned int ch     = str->ch;
4844   eol_type_t eol_type = str->eol_type;
4845 #ifdef ENABLE_COMPOSITE_CHARS
4846   unsigned_char_dynarr *real_dst = dst;
4847 #endif
4848   Lisp_Object coding_system;
4849
4850   XSETCODING_SYSTEM (coding_system, str->codesys);
4851
4852 #ifdef ENABLE_COMPOSITE_CHARS
4853   if (flags & CODING_STATE_COMPOSITE)
4854     dst = str->iso2022.composite_chars;
4855 #endif /* ENABLE_COMPOSITE_CHARS */
4856
4857   while (n--)
4858     {
4859       unsigned char c = *src++;
4860       if (flags & CODING_STATE_ESCAPE)
4861         {       /* Within ESC sequence */
4862           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4863                                           c, &flags, 1);
4864
4865           if (retval)
4866             {
4867               switch (str->iso2022.esc)
4868                 {
4869 #ifdef ENABLE_COMPOSITE_CHARS
4870                 case ISO_ESC_START_COMPOSITE:
4871                   if (str->iso2022.composite_chars)
4872                     Dynarr_reset (str->iso2022.composite_chars);
4873                   else
4874                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4875                   dst = str->iso2022.composite_chars;
4876                   break;
4877                 case ISO_ESC_END_COMPOSITE:
4878                   {
4879                     Bufbyte comstr[MAX_EMCHAR_LEN];
4880                     Bytecount len;
4881                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4882                                                          Dynarr_length (dst));
4883                     dst = real_dst;
4884                     len = set_charptr_emchar (comstr, emch);
4885                     Dynarr_add_many (dst, comstr, len);
4886                     break;
4887                   }
4888 #endif /* ENABLE_COMPOSITE_CHARS */
4889
4890                 case ISO_ESC_LITERAL:
4891                   DECODE_ADD_BINARY_CHAR (c, dst);
4892                   break;
4893
4894                 default:
4895                   /* Everything else handled already */
4896                   break;
4897                 }
4898             }
4899
4900           /* Attempted error recovery. */
4901           if (str->iso2022.output_direction_sequence)
4902             ensure_correct_direction (flags & CODING_STATE_R2L ?
4903                                       CHARSET_RIGHT_TO_LEFT :
4904                                       CHARSET_LEFT_TO_RIGHT,
4905                                       str->codesys, dst, 0, 1);
4906           /* More error recovery. */
4907           if (!retval || str->iso2022.output_literally)
4908             {
4909               /* Output the (possibly invalid) sequence */
4910               int i;
4911               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4912                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4913               flags &= CODING_STATE_ISO2022_LOCK;
4914               if (!retval)
4915                 n++, src--;/* Repeat the loop with the same character. */
4916               else
4917                 {
4918                   /* No sense in reprocessing the final byte of the
4919                      escape sequence; it could mess things up anyway.
4920                      Just add it now. */
4921                   DECODE_ADD_BINARY_CHAR (c, dst);
4922                 }
4923             }
4924           ch = 0;
4925         }
4926       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4927         { /* Control characters */
4928
4929           /***** Error-handling *****/
4930
4931           /* If we were in the middle of a character, dump out the
4932              partial character. */
4933           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4934
4935           /* If we just saw a single-shift character, dump it out.
4936              This may dump out the wrong sort of single-shift character,
4937              but least it will give an indication that something went
4938              wrong. */
4939           if (flags & CODING_STATE_SS2)
4940             {
4941               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4942               flags &= ~CODING_STATE_SS2;
4943             }
4944           if (flags & CODING_STATE_SS3)
4945             {
4946               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4947               flags &= ~CODING_STATE_SS3;
4948             }
4949
4950           /***** Now handle the control characters. *****/
4951
4952           /* Handle CR/LF */
4953           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4954
4955           flags &= CODING_STATE_ISO2022_LOCK;
4956
4957           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4958             DECODE_ADD_BINARY_CHAR (c, dst);
4959         }
4960       else
4961         {                       /* Graphic characters */
4962           Lisp_Object charset;
4963 #ifndef UTF2000
4964           Charset_ID lb;
4965 #endif
4966           int reg;
4967
4968           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4969
4970           /* Now determine the charset. */
4971           reg = ((flags & CODING_STATE_SS2) ? 2
4972                  : (flags & CODING_STATE_SS3) ? 3
4973                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4974                  : str->iso2022.register_left);
4975           charset = str->iso2022.charset[reg];
4976
4977           /* Error checking: */
4978           if (! CHARSETP (charset)
4979               || str->iso2022.invalid_designated[reg]
4980               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4981                   && XCHARSET_CHARS (charset) == 94))
4982             /* Mrmph.  We are trying to invoke a register that has no
4983                or an invalid charset in it, or trying to add a character
4984                outside the range of the charset.  Insert that char literally
4985                to preserve it for the output. */
4986             {
4987               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4988               DECODE_ADD_BINARY_CHAR (c, dst);
4989             }
4990
4991           else
4992             {
4993               /* Things are probably hunky-dorey. */
4994
4995               /* Fetch reverse charset, maybe. */
4996               if (((flags & CODING_STATE_R2L) &&
4997                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4998                   ||
4999                   (!(flags & CODING_STATE_R2L) &&
5000                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5001                 {
5002                   Lisp_Object new_charset =
5003                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5004                   if (!NILP (new_charset))
5005                     charset = new_charset;
5006                 }
5007
5008 #ifdef UTF2000
5009               if (XCHARSET_DIMENSION (charset) == 1)
5010                 {
5011                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5012                   DECODE_ADD_UCS_CHAR
5013                     (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5014                 }
5015               else if (ch)
5016                 {
5017                   DECODE_ADD_UCS_CHAR
5018                     (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5019                   ch = 0;
5020                 }
5021               else
5022                 ch = c;
5023 #else
5024               lb = XCHARSET_LEADING_BYTE (charset);
5025               switch (XCHARSET_REP_BYTES (charset))
5026                 {
5027                 case 1: /* ASCII */
5028                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5029                   Dynarr_add (dst, c & 0x7F);
5030                   break;
5031
5032                 case 2: /* one-byte official */
5033                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5034                   Dynarr_add (dst, lb);
5035                   Dynarr_add (dst, c | 0x80);
5036                   break;
5037
5038                 case 3: /* one-byte private or two-byte official */
5039                   if (XCHARSET_PRIVATE_P (charset))
5040                     {
5041                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5042                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5043                       Dynarr_add (dst, lb);
5044                       Dynarr_add (dst, c | 0x80);
5045                     }
5046                   else
5047                     {
5048                       if (ch)
5049                         {
5050                           Dynarr_add (dst, lb);
5051                           Dynarr_add (dst, ch | 0x80);
5052                           Dynarr_add (dst, c | 0x80);
5053                           ch = 0;
5054                         }
5055                       else
5056                         ch = c;
5057                     }
5058                   break;
5059
5060                 default:        /* two-byte private */
5061                   if (ch)
5062                     {
5063                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5064                       Dynarr_add (dst, lb);
5065                       Dynarr_add (dst, ch | 0x80);
5066                       Dynarr_add (dst, c | 0x80);
5067                       ch = 0;
5068                     }
5069                   else
5070                     ch = c;
5071                 }
5072 #endif
5073             }
5074
5075           if (!ch)
5076             flags &= CODING_STATE_ISO2022_LOCK;
5077         }
5078
5079     label_continue_loop:;
5080     }
5081
5082   if (flags & CODING_STATE_END)
5083     DECODE_OUTPUT_PARTIAL_CHAR (ch);
5084
5085   str->flags = flags;
5086   str->ch    = ch;
5087 }
5088
5089
5090 /***** ISO2022 encoder *****/
5091
5092 /* Designate CHARSET into register REG. */
5093
5094 static void
5095 iso2022_designate (Lisp_Object charset, unsigned char reg,
5096                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5097 {
5098   static CONST char inter94[] = "()*+";
5099   static CONST char inter96[] = ",-./";
5100   unsigned int type;
5101   unsigned char final;
5102   Lisp_Object old_charset = str->iso2022.charset[reg];
5103
5104   str->iso2022.charset[reg] = charset;
5105   if (!CHARSETP (charset))
5106     /* charset might be an initial nil or t. */
5107     return;
5108   type = XCHARSET_TYPE (charset);
5109   final = XCHARSET_FINAL (charset);
5110   if (!str->iso2022.force_charset_on_output[reg] &&
5111       CHARSETP (old_charset) &&
5112       XCHARSET_TYPE (old_charset) == type &&
5113       XCHARSET_FINAL (old_charset) == final)
5114     return;
5115
5116   str->iso2022.force_charset_on_output[reg] = 0;
5117
5118   {
5119     charset_conversion_spec_dynarr *dyn =
5120       str->codesys->iso2022.output_conv;
5121
5122     if (dyn)
5123       {
5124         int i;
5125
5126         for (i = 0; i < Dynarr_length (dyn); i++)
5127           {
5128             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5129             if (EQ (charset, spec->from_charset))
5130                 charset = spec->to_charset;
5131           }
5132       }
5133   }
5134
5135   Dynarr_add (dst, ISO_CODE_ESC);
5136   switch (type)
5137     {
5138     case CHARSET_TYPE_94:
5139       Dynarr_add (dst, inter94[reg]);
5140       break;
5141     case CHARSET_TYPE_96:
5142       Dynarr_add (dst, inter96[reg]);
5143       break;
5144     case CHARSET_TYPE_94X94:
5145       Dynarr_add (dst, '$');
5146       if (reg != 0
5147           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5148           || final < '@'
5149           || final > 'B')
5150         Dynarr_add (dst, inter94[reg]);
5151       break;
5152     case CHARSET_TYPE_96X96:
5153       Dynarr_add (dst, '$');
5154       Dynarr_add (dst, inter96[reg]);
5155       break;
5156     }
5157   Dynarr_add (dst, final);
5158 }
5159
5160 static void
5161 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5162 {
5163   if (str->iso2022.register_left != 0)
5164     {
5165       Dynarr_add (dst, ISO_CODE_SI);
5166       str->iso2022.register_left = 0;
5167     }
5168 }
5169
5170 static void
5171 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5172 {
5173   if (str->iso2022.register_left != 1)
5174     {
5175       Dynarr_add (dst, ISO_CODE_SO);
5176       str->iso2022.register_left = 1;
5177     }
5178 }
5179
5180 /* Convert internally-formatted data to ISO2022 format. */
5181
5182 static void
5183 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5184                        unsigned_char_dynarr *dst, unsigned int n)
5185 {
5186   unsigned char charmask, c;
5187   unsigned char char_boundary;
5188   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5189   unsigned int flags          = str->flags;
5190   Emchar ch                   = str->ch;
5191   Lisp_Coding_System *codesys = str->codesys;
5192   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5193   int i;
5194   Lisp_Object charset;
5195   int half;
5196 #ifdef UTF2000
5197   unsigned int byte1, byte2;
5198 #endif
5199
5200 #ifdef ENABLE_COMPOSITE_CHARS
5201   /* flags for handling composite chars.  We do a little switcharoo
5202      on the source while we're outputting the composite char. */
5203   unsigned int saved_n = 0;
5204   CONST unsigned char *saved_src = NULL;
5205   int in_composite = 0;
5206 #endif /* ENABLE_COMPOSITE_CHARS */
5207
5208   char_boundary = str->iso2022.current_char_boundary;
5209   charset = str->iso2022.current_charset;
5210   half = str->iso2022.current_half;
5211
5212 #ifdef ENABLE_COMPOSITE_CHARS
5213  back_to_square_n:
5214 #endif
5215 #ifdef UTF2000
5216   while (n--)
5217     {
5218       c = *src++;
5219
5220       switch (char_boundary)
5221         {
5222         case 0:
5223           if ( c >= 0xfc )
5224             {
5225               ch = c & 0x01;
5226               char_boundary = 5;
5227             }
5228           else if ( c >= 0xf8 )
5229             {
5230               ch = c & 0x03;
5231               char_boundary = 4;
5232             }
5233           else if ( c >= 0xf0 )
5234             {
5235               ch = c & 0x07;
5236               char_boundary = 3;
5237             }
5238           else if ( c >= 0xe0 )
5239             {
5240               ch = c & 0x0f;
5241               char_boundary = 2;
5242             }
5243           else if ( c >= 0xc0 )
5244             {
5245               ch = c & 0x1f;
5246               char_boundary = 1;
5247             }
5248           else
5249             {
5250               ch = 0;
5251
5252               restore_left_to_right_direction (codesys, dst, &flags, 0);
5253
5254               /* Make sure G0 contains ASCII */
5255               if ((c > ' ' && c < ISO_CODE_DEL) ||
5256                   !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5257                 {
5258                   ensure_normal_shift (str, dst);
5259                   iso2022_designate (Vcharset_ascii, 0, str, dst);
5260                 }
5261
5262               /* If necessary, restore everything to the default state
5263                  at end-of-line */
5264               if (c == '\n' &&
5265                   !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5266                 {
5267                   restore_left_to_right_direction (codesys, dst, &flags, 0);
5268
5269                   ensure_normal_shift (str, dst);
5270
5271                   for (i = 0; i < 4; i++)
5272                     {
5273                       Lisp_Object initial_charset =
5274                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5275                       iso2022_designate (initial_charset, i, str, dst);
5276                     }
5277                 }
5278               if (c == '\n')
5279                 {
5280                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5281                     Dynarr_add (dst, '\r');
5282                   if (eol_type != EOL_CR)
5283                     Dynarr_add (dst, c);
5284                 }
5285               else
5286                 {
5287                   if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5288                       && fit_to_be_escape_quoted (c))
5289                     Dynarr_add (dst, ISO_CODE_ESC);
5290                   Dynarr_add (dst, c);
5291                 }
5292               char_boundary = 0;
5293             }
5294           break;
5295         case 1:
5296           ch = ( ch << 6 ) | ( c & 0x3f );
5297
5298           char_boundary = 0;
5299           if ( (0x80 <= ch) && (ch <= 0x9f) )
5300             {
5301               charmask = (half == 0 ? 0x00 : 0x80);
5302
5303               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5304                   && fit_to_be_escape_quoted (ch))
5305                 Dynarr_add (dst, ISO_CODE_ESC);
5306               /* you asked for it ... */
5307               Dynarr_add (dst, ch);
5308             }
5309           else
5310             {
5311               int reg;
5312
5313               BREAKUP_CHAR (ch, charset, byte1, byte2);
5314               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5315                                         codesys, dst, &flags, 0);
5316
5317               /* Now determine which register to use. */
5318               reg = -1;
5319               for (i = 0; i < 4; i++)
5320                 {
5321                   if (EQ (charset, str->iso2022.charset[i]) ||
5322                       EQ (charset,
5323                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5324                     {
5325                       reg = i;
5326                       break;
5327                     }
5328                 }
5329
5330               if (reg == -1)
5331                 {
5332                   if (XCHARSET_GRAPHIC (charset) != 0)
5333                     {
5334                       if (!NILP (str->iso2022.charset[1]) &&
5335                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5336                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5337                         reg = 1;
5338                       else if (!NILP (str->iso2022.charset[2]))
5339                         reg = 2;
5340                       else if (!NILP (str->iso2022.charset[3]))
5341                         reg = 3;
5342                       else
5343                         reg = 0;
5344                     }
5345                   else
5346                     reg = 0;
5347                 }
5348
5349               iso2022_designate (charset, reg, str, dst);
5350
5351               /* Now invoke that register. */
5352               switch (reg)
5353                 {
5354                 case 0:
5355                   ensure_normal_shift (str, dst);
5356                   half = 0;
5357                   break;
5358
5359                 case 1:
5360                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5361                     {
5362                       ensure_shift_out (str, dst);
5363                       half = 0;
5364                     }
5365                   else
5366                     half = 1;
5367                   break;
5368
5369                 case 2:
5370                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5371                     {
5372                       Dynarr_add (dst, ISO_CODE_ESC);
5373                       Dynarr_add (dst, 'N');
5374                       half = 0;
5375                     }
5376                   else
5377                     {
5378                       Dynarr_add (dst, ISO_CODE_SS2);
5379                       half = 1;
5380                     }
5381                   break;
5382
5383                 case 3:
5384                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5385                     {
5386                       Dynarr_add (dst, ISO_CODE_ESC);
5387                       Dynarr_add (dst, 'O');
5388                       half = 0;
5389                     }
5390                   else
5391                     {
5392                       Dynarr_add (dst, ISO_CODE_SS3);
5393                       half = 1;
5394                     }
5395                   break;
5396
5397                 default:
5398                   abort ();
5399                 }
5400
5401               charmask = (half == 0 ? 0x00 : 0x80);
5402
5403               switch (XCHARSET_DIMENSION (charset))
5404                 {
5405                 case 1:
5406                   Dynarr_add (dst, byte1 | charmask);
5407                   break;
5408                 case 2:
5409                   Dynarr_add (dst, byte1 | charmask);
5410                   Dynarr_add (dst, byte2 | charmask);
5411                   break;
5412                 default:
5413                   abort ();
5414                 }
5415             }
5416           ch =0;
5417           break;
5418         default:
5419           ch = ( ch << 6 ) | ( c & 0x3f );
5420           char_boundary--;
5421         }
5422     }
5423 #else /* not UTF2000 */
5424
5425   while (n--)
5426     {
5427       c = *src++;
5428
5429       if (BYTE_ASCII_P (c))
5430         {               /* Processing ASCII character */
5431           ch = 0;
5432
5433           restore_left_to_right_direction (codesys, dst, &flags, 0);
5434
5435           /* Make sure G0 contains ASCII */
5436           if ((c > ' ' && c < ISO_CODE_DEL) ||
5437               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5438             {
5439               ensure_normal_shift (str, dst);
5440               iso2022_designate (Vcharset_ascii, 0, str, dst);
5441             }
5442
5443           /* If necessary, restore everything to the default state
5444              at end-of-line */
5445           if (c == '\n' &&
5446               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5447             {
5448               restore_left_to_right_direction (codesys, dst, &flags, 0);
5449
5450               ensure_normal_shift (str, dst);
5451
5452               for (i = 0; i < 4; i++)
5453                 {
5454                   Lisp_Object initial_charset =
5455                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5456                   iso2022_designate (initial_charset, i, str, dst);
5457                 }
5458             }
5459           if (c == '\n')
5460             {
5461               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5462                 Dynarr_add (dst, '\r');
5463               if (eol_type != EOL_CR)
5464                 Dynarr_add (dst, c);
5465             }
5466           else
5467             {
5468               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5469                   && fit_to_be_escape_quoted (c))
5470                 Dynarr_add (dst, ISO_CODE_ESC);
5471               Dynarr_add (dst, c);
5472             }
5473           char_boundary = 1;
5474         }
5475
5476       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5477         { /* Processing Leading Byte */
5478           ch = 0;
5479           charset = CHARSET_BY_LEADING_BYTE (c);
5480           if (LEADING_BYTE_PREFIX_P(c))
5481             ch = c;
5482           else if (!EQ (charset, Vcharset_control_1)
5483 #ifdef ENABLE_COMPOSITE_CHARS
5484                    && !EQ (charset, Vcharset_composite)
5485 #endif
5486                    )
5487             {
5488               int reg;
5489
5490               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5491                                         codesys, dst, &flags, 0);
5492
5493               /* Now determine which register to use. */
5494               reg = -1;
5495               for (i = 0; i < 4; i++)
5496                 {
5497                   if (EQ (charset, str->iso2022.charset[i]) ||
5498                       EQ (charset,
5499                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5500                     {
5501                       reg = i;
5502                       break;
5503                     }
5504                 }
5505
5506               if (reg == -1)
5507                 {
5508                   if (XCHARSET_GRAPHIC (charset) != 0)
5509                     {
5510                       if (!NILP (str->iso2022.charset[1]) &&
5511                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5512                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5513                         reg = 1;
5514                       else if (!NILP (str->iso2022.charset[2]))
5515                         reg = 2;
5516                       else if (!NILP (str->iso2022.charset[3]))
5517                         reg = 3;
5518                       else
5519                         reg = 0;
5520                     }
5521                   else
5522                     reg = 0;
5523                 }
5524
5525               iso2022_designate (charset, reg, str, dst);
5526
5527               /* Now invoke that register. */
5528               switch (reg)
5529                 {
5530                 case 0:
5531                   ensure_normal_shift (str, dst);
5532                   half = 0;
5533                   break;
5534
5535                 case 1:
5536                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5537                     {
5538                       ensure_shift_out (str, dst);
5539                       half = 0;
5540                     }
5541                   else
5542                     half = 1;
5543                   break;
5544
5545                 case 2:
5546                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5547                     {
5548                       Dynarr_add (dst, ISO_CODE_ESC);
5549                       Dynarr_add (dst, 'N');
5550                       half = 0;
5551                     }
5552                   else
5553                     {
5554                       Dynarr_add (dst, ISO_CODE_SS2);
5555                       half = 1;
5556                     }
5557                   break;
5558
5559                 case 3:
5560                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5561                     {
5562                       Dynarr_add (dst, ISO_CODE_ESC);
5563                       Dynarr_add (dst, 'O');
5564                       half = 0;
5565                     }
5566                   else
5567                     {
5568                       Dynarr_add (dst, ISO_CODE_SS3);
5569                       half = 1;
5570                     }
5571                   break;
5572
5573                 default:
5574                   abort ();
5575                 }
5576             }
5577           char_boundary = 0;
5578         }
5579       else
5580         {                       /* Processing Non-ASCII character */
5581           charmask = (half == 0 ? 0x7F : 0xFF);
5582           char_boundary = 1;
5583           if (EQ (charset, Vcharset_control_1))
5584             {
5585               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5586                   && fit_to_be_escape_quoted (c))
5587                 Dynarr_add (dst, ISO_CODE_ESC);
5588               /* you asked for it ... */
5589               Dynarr_add (dst, c - 0x20);
5590             }
5591           else
5592             {
5593               switch (XCHARSET_REP_BYTES (charset))
5594                 {
5595                 case 2:
5596                   Dynarr_add (dst, c & charmask);
5597                   break;
5598                 case 3:
5599                   if (XCHARSET_PRIVATE_P (charset))
5600                     {
5601                       Dynarr_add (dst, c & charmask);
5602                       ch = 0;
5603                     }
5604                   else if (ch)
5605                     {
5606 #ifdef ENABLE_COMPOSITE_CHARS
5607                       if (EQ (charset, Vcharset_composite))
5608                         {
5609                           if (in_composite)
5610                             {
5611                               /* #### Bother! We don't know how to
5612                                  handle this yet. */
5613                               Dynarr_add (dst, '~');
5614                             }
5615                           else
5616                             {
5617                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5618                                                        ch & 0x7F, c & 0x7F);
5619                               Lisp_Object lstr = composite_char_string (emch);
5620                               saved_n = n;
5621                               saved_src = src;
5622                               in_composite = 1;
5623                               src = XSTRING_DATA   (lstr);
5624                               n   = XSTRING_LENGTH (lstr);
5625                               Dynarr_add (dst, ISO_CODE_ESC);
5626                               Dynarr_add (dst, '0'); /* start composing */
5627                             }
5628                         }
5629                       else
5630 #endif /* ENABLE_COMPOSITE_CHARS */
5631                         {
5632                           Dynarr_add (dst, ch & charmask);
5633                           Dynarr_add (dst, c & charmask);
5634                         }
5635                       ch = 0;
5636                     }
5637                   else
5638                     {
5639                       ch = c;
5640                       char_boundary = 0;
5641                     }
5642                   break;
5643                 case 4:
5644                   if (ch)
5645                     {
5646                       Dynarr_add (dst, ch & charmask);
5647                       Dynarr_add (dst, c & charmask);
5648                       ch = 0;
5649                     }
5650                   else
5651                     {
5652                       ch = c;
5653                       char_boundary = 0;
5654                     }
5655                   break;
5656                 default:
5657                   abort ();
5658                 }
5659             }
5660         }
5661     }
5662 #endif /* not UTF2000 */
5663
5664 #ifdef ENABLE_COMPOSITE_CHARS
5665   if (in_composite)
5666     {
5667       n = saved_n;
5668       src = saved_src;
5669       in_composite = 0;
5670       Dynarr_add (dst, ISO_CODE_ESC);
5671       Dynarr_add (dst, '1'); /* end composing */
5672       goto back_to_square_n; /* Wheeeeeeeee ..... */
5673     }
5674 #endif /* ENABLE_COMPOSITE_CHARS */
5675
5676 #ifdef UTF2000
5677   if ( (char_boundary == 0) && flags & CODING_STATE_END)
5678 #else
5679   if (char_boundary && flags & CODING_STATE_END)
5680 #endif
5681     {
5682       restore_left_to_right_direction (codesys, dst, &flags, 0);
5683       ensure_normal_shift (str, dst);
5684       for (i = 0; i < 4; i++)
5685         {
5686           Lisp_Object initial_charset =
5687             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5688           iso2022_designate (initial_charset, i, str, dst);
5689         }
5690     }
5691
5692   str->flags = flags;
5693   str->ch    = ch;
5694   str->iso2022.current_char_boundary = char_boundary;
5695   str->iso2022.current_charset = charset;
5696   str->iso2022.current_half = half;
5697
5698   /* Verbum caro factum est! */
5699 }
5700 #endif /* MULE */
5701 \f
5702 /************************************************************************/
5703 /*                     No-conversion methods                            */
5704 /************************************************************************/
5705
5706 /* This is used when reading in "binary" files -- i.e. files that may
5707    contain all 256 possible byte values and that are not to be
5708    interpreted as being in any particular decoding. */
5709 static void
5710 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5711                              unsigned_char_dynarr *dst, unsigned int n)
5712 {
5713   unsigned char c;
5714   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5715   unsigned int flags  = str->flags;
5716   unsigned int ch     = str->ch;
5717   eol_type_t eol_type = str->eol_type;
5718
5719   while (n--)
5720     {
5721       c = *src++;
5722
5723       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5724       DECODE_ADD_BINARY_CHAR (c, dst);
5725     label_continue_loop:;
5726     }
5727
5728   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5729
5730   str->flags = flags;
5731   str->ch    = ch;
5732 }
5733
5734 static void
5735 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5736                              unsigned_char_dynarr *dst, unsigned int n)
5737 {
5738   unsigned char c;
5739   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5740   unsigned int flags  = str->flags;
5741   unsigned int ch     = str->ch;
5742   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5743 #ifdef UTF2000
5744   unsigned char char_boundary = str->iso2022.current_char_boundary;
5745 #endif
5746
5747   while (n--)
5748     {
5749       c = *src++;
5750 #ifdef UTF2000
5751       switch (char_boundary)
5752         {
5753         case 0:
5754           if ( c >= 0xfc )
5755             {
5756               ch = c & 0x01;
5757               char_boundary = 5;
5758             }
5759           else if ( c >= 0xf8 )
5760             {
5761               ch = c & 0x03;
5762               char_boundary = 4;
5763             }
5764           else if ( c >= 0xf0 )
5765             {
5766               ch = c & 0x07;
5767               char_boundary = 3;
5768             }
5769           else if ( c >= 0xe0 )
5770             {
5771               ch = c & 0x0f;
5772               char_boundary = 2;
5773             }
5774           else if ( c >= 0xc0 )
5775             {
5776               ch = c & 0x1f;
5777               char_boundary = 1;
5778             }
5779           else
5780             {
5781               ch = 0;
5782
5783               if (c == '\n')
5784                 {
5785                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5786                     Dynarr_add (dst, '\r');
5787                   if (eol_type != EOL_CR)
5788                     Dynarr_add (dst, c);
5789                 }
5790               else
5791                 Dynarr_add (dst, c);
5792               char_boundary = 0;
5793             }
5794           break;
5795         case 1:
5796           ch = ( ch << 6 ) | ( c & 0x3f );
5797           switch ( str->codesys->fixed.size )
5798             {
5799             case 1:
5800               Dynarr_add (dst, ch & 0xff);
5801               break;
5802             case 2:
5803               Dynarr_add (dst, (ch >> 8) & 0xff);
5804               Dynarr_add (dst,  ch       & 0xff);
5805               break;
5806             case 3:
5807               Dynarr_add (dst, (ch >> 16) & 0xff);
5808               Dynarr_add (dst, (ch >>  8) & 0xff);
5809               Dynarr_add (dst,  ch        & 0xff);
5810               break;
5811             case 4:
5812               Dynarr_add (dst, (ch >> 24) & 0xff);
5813               Dynarr_add (dst, (ch >> 16) & 0xff);
5814               Dynarr_add (dst, (ch >>  8) & 0xff);
5815               Dynarr_add (dst,  ch        & 0xff);
5816               break;
5817             default:
5818               fprintf(stderr, "It seems %d bytes stream.\n",
5819                       str->codesys->fixed.size);
5820               abort ();
5821             }
5822           char_boundary = 0;
5823           break;
5824         default:
5825           ch = ( ch << 6 ) | ( c & 0x3f );
5826           char_boundary--;
5827         }
5828 #else /* not UTF2000 */
5829       if (c == '\n')
5830         {
5831           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5832             Dynarr_add (dst, '\r');
5833           if (eol_type != EOL_CR)
5834             Dynarr_add (dst, '\n');
5835           ch = 0;
5836         }
5837       else if (BYTE_ASCII_P (c))
5838         {
5839           assert (ch == 0);
5840           Dynarr_add (dst, c);
5841         }
5842       else if (BUFBYTE_LEADING_BYTE_P (c))
5843         {
5844           assert (ch == 0);
5845           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5846               c == LEADING_BYTE_CONTROL_1)
5847             ch = c;
5848           else
5849             Dynarr_add (dst, '~'); /* untranslatable character */
5850         }
5851       else
5852         {
5853           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5854             Dynarr_add (dst, c);
5855           else if (ch == LEADING_BYTE_CONTROL_1)
5856             {
5857               assert (c < 0xC0);
5858               Dynarr_add (dst, c - 0x20);
5859             }
5860           /* else it should be the second or third byte of an
5861              untranslatable character, so ignore it */
5862           ch = 0;
5863         }
5864 #endif /* not UTF2000 */
5865     }
5866
5867   str->flags = flags;
5868   str->ch    = ch;
5869 #ifdef UTF2000
5870   str->iso2022.current_char_boundary = char_boundary;
5871 #endif
5872 }
5873
5874 \f
5875 /************************************************************************/
5876 /*                   Simple internal/external functions                 */
5877 /************************************************************************/
5878
5879 static Extbyte_dynarr *conversion_out_dynarr;
5880 static Bufbyte_dynarr *conversion_in_dynarr;
5881
5882 /* Determine coding system from coding format */
5883
5884 /* #### not correct for all values of `fmt'! */
5885 static Lisp_Object
5886 external_data_format_to_coding_system (enum external_data_format fmt)
5887 {
5888   switch (fmt)
5889     {
5890     case FORMAT_FILENAME:
5891     case FORMAT_TERMINAL:
5892       if (EQ (Vfile_name_coding_system, Qnil) ||
5893           EQ (Vfile_name_coding_system, Qbinary))
5894         return Qnil;
5895       else
5896         return Fget_coding_system (Vfile_name_coding_system);
5897 #ifdef MULE
5898     case FORMAT_CTEXT:
5899       return Fget_coding_system (Qctext);
5900 #endif
5901     default:
5902       return Qnil;
5903     }
5904 }
5905
5906 Extbyte *
5907 convert_to_external_format (CONST Bufbyte *ptr,
5908                             Bytecount len,
5909                             Extcount *len_out,
5910                             enum external_data_format fmt)
5911 {
5912   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5913
5914   if (!conversion_out_dynarr)
5915     conversion_out_dynarr = Dynarr_new (Extbyte);
5916   else
5917     Dynarr_reset (conversion_out_dynarr);
5918
5919   if (NILP (coding_system))
5920     {
5921       CONST Bufbyte *end = ptr + len;
5922
5923       for (; ptr < end;)
5924         {
5925 #ifdef UTF2000
5926           Bufbyte c =
5927             (*ptr < 0xc0) ? *ptr :
5928             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5929 #else
5930           Bufbyte c =
5931             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5932             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5933             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5934             '~';
5935 #endif
5936           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5937           INC_CHARPTR (ptr);
5938         }
5939
5940 #ifdef ERROR_CHECK_BUFPOS
5941       assert (ptr == end);
5942 #endif
5943     }
5944   else
5945     {
5946       Lisp_Object instream, outstream, da_outstream;
5947       Lstream *istr, *ostr;
5948       struct gcpro gcpro1, gcpro2, gcpro3;
5949       char tempbuf[1024]; /* some random amount */
5950
5951       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5952       da_outstream = make_dynarr_output_stream
5953         ((unsigned_char_dynarr *) conversion_out_dynarr);
5954       outstream =
5955         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5956       istr = XLSTREAM (instream);
5957       ostr = XLSTREAM (outstream);
5958       GCPRO3 (instream, outstream, da_outstream);
5959       while (1)
5960         {
5961           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5962           if (!size_in_bytes)
5963             break;
5964           Lstream_write (ostr, tempbuf, size_in_bytes);
5965         }
5966       Lstream_close (istr);
5967       Lstream_close (ostr);
5968       UNGCPRO;
5969       Lstream_delete (istr);
5970       Lstream_delete (ostr);
5971       Lstream_delete (XLSTREAM (da_outstream));
5972     }
5973
5974   *len_out = Dynarr_length (conversion_out_dynarr);
5975   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5976   return Dynarr_atp (conversion_out_dynarr, 0);
5977 }
5978
5979 Bufbyte *
5980 convert_from_external_format (CONST Extbyte *ptr,
5981                               Extcount len,
5982                               Bytecount *len_out,
5983                               enum external_data_format fmt)
5984 {
5985   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5986
5987   if (!conversion_in_dynarr)
5988     conversion_in_dynarr = Dynarr_new (Bufbyte);
5989   else
5990     Dynarr_reset (conversion_in_dynarr);
5991
5992   if (NILP (coding_system))
5993     {
5994       CONST Extbyte *end = ptr + len;
5995       for (; ptr < end; ptr++)
5996         {
5997           Extbyte c = *ptr;
5998           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5999         }
6000     }
6001   else
6002     {
6003       Lisp_Object instream, outstream, da_outstream;
6004       Lstream *istr, *ostr;
6005       struct gcpro gcpro1, gcpro2, gcpro3;
6006       char tempbuf[1024]; /* some random amount */
6007
6008       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
6009       da_outstream = make_dynarr_output_stream
6010         ((unsigned_char_dynarr *) conversion_in_dynarr);
6011       outstream =
6012         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
6013       istr = XLSTREAM (instream);
6014       ostr = XLSTREAM (outstream);
6015       GCPRO3 (instream, outstream, da_outstream);
6016       while (1)
6017         {
6018           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
6019           if (!size_in_bytes)
6020             break;
6021           Lstream_write (ostr, tempbuf, size_in_bytes);
6022         }
6023       Lstream_close (istr);
6024       Lstream_close (ostr);
6025       UNGCPRO;
6026       Lstream_delete (istr);
6027       Lstream_delete (ostr);
6028       Lstream_delete (XLSTREAM (da_outstream));
6029     }
6030
6031   *len_out = Dynarr_length (conversion_in_dynarr);
6032   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
6033   return Dynarr_atp (conversion_in_dynarr, 0);
6034 }
6035
6036 \f
6037 /************************************************************************/
6038 /*                             Initialization                           */
6039 /************************************************************************/
6040
6041 void
6042 syms_of_file_coding (void)
6043 {
6044   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
6045   deferror (&Qcoding_system_error, "coding-system-error",
6046             "Coding-system error", Qio_error);
6047
6048   DEFSUBR (Fcoding_system_p);
6049   DEFSUBR (Ffind_coding_system);
6050   DEFSUBR (Fget_coding_system);
6051   DEFSUBR (Fcoding_system_list);
6052   DEFSUBR (Fcoding_system_name);
6053   DEFSUBR (Fmake_coding_system);
6054   DEFSUBR (Fcopy_coding_system);
6055   DEFSUBR (Fdefine_coding_system_alias);
6056   DEFSUBR (Fsubsidiary_coding_system);
6057
6058   DEFSUBR (Fcoding_system_type);
6059   DEFSUBR (Fcoding_system_doc_string);
6060 #ifdef MULE
6061   DEFSUBR (Fcoding_system_charset);
6062 #endif
6063   DEFSUBR (Fcoding_system_property);
6064
6065   DEFSUBR (Fcoding_category_list);
6066   DEFSUBR (Fset_coding_priority_list);
6067   DEFSUBR (Fcoding_priority_list);
6068   DEFSUBR (Fset_coding_category_system);
6069   DEFSUBR (Fcoding_category_system);
6070
6071   DEFSUBR (Fdetect_coding_region);
6072   DEFSUBR (Fdecode_coding_region);
6073   DEFSUBR (Fencode_coding_region);
6074 #ifdef MULE
6075   DEFSUBR (Fdecode_shift_jis_char);
6076   DEFSUBR (Fencode_shift_jis_char);
6077   DEFSUBR (Fdecode_big5_char);
6078   DEFSUBR (Fencode_big5_char);
6079   DEFSUBR (Fset_ucs_char);
6080   DEFSUBR (Fucs_char);
6081   DEFSUBR (Fset_char_ucs);
6082   DEFSUBR (Fchar_ucs);
6083 #endif /* MULE */
6084   defsymbol (&Qcoding_system_p, "coding-system-p");
6085   defsymbol (&Qno_conversion, "no-conversion");
6086   defsymbol (&Qraw_text, "raw-text");
6087 #ifdef MULE
6088   defsymbol (&Qbig5, "big5");
6089   defsymbol (&Qshift_jis, "shift-jis");
6090   defsymbol (&Qucs4, "ucs-4");
6091   defsymbol (&Qutf8, "utf-8");
6092   defsymbol (&Qccl, "ccl");
6093   defsymbol (&Qiso2022, "iso2022");
6094 #endif /* MULE */
6095   defsymbol (&Qmnemonic, "mnemonic");
6096   defsymbol (&Qeol_type, "eol-type");
6097   defsymbol (&Qpost_read_conversion, "post-read-conversion");
6098   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6099
6100   defsymbol (&Qcr, "cr");
6101   defsymbol (&Qlf, "lf");
6102   defsymbol (&Qcrlf, "crlf");
6103   defsymbol (&Qeol_cr, "eol-cr");
6104   defsymbol (&Qeol_lf, "eol-lf");
6105   defsymbol (&Qeol_crlf, "eol-crlf");
6106 #ifdef MULE
6107   defsymbol (&Qcharset_g0, "charset-g0");
6108   defsymbol (&Qcharset_g1, "charset-g1");
6109   defsymbol (&Qcharset_g2, "charset-g2");
6110   defsymbol (&Qcharset_g3, "charset-g3");
6111   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6112   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6113   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6114   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6115   defsymbol (&Qno_iso6429, "no-iso6429");
6116   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6117   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6118
6119   defsymbol (&Qshort, "short");
6120   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6121   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6122   defsymbol (&Qseven, "seven");
6123   defsymbol (&Qlock_shift, "lock-shift");
6124   defsymbol (&Qescape_quoted, "escape-quoted");
6125 #endif /* MULE */
6126   defsymbol (&Qencode, "encode");
6127   defsymbol (&Qdecode, "decode");
6128
6129 #ifdef MULE
6130   defsymbol (&Qctext, "ctext");
6131   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6132              "shift-jis");
6133   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6134              "big5");
6135   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6136              "ucs-4");
6137   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6138              "utf-8");
6139   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6140              "iso-7");
6141   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6142              "iso-8-designate");
6143   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6144              "iso-8-1");
6145   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6146              "iso-8-2");
6147   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6148              "iso-lock-shift");
6149 #endif /* MULE */
6150   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6151              "no-conversion");
6152 }
6153
6154 void
6155 lstream_type_create_file_coding (void)
6156 {
6157   LSTREAM_HAS_METHOD (decoding, reader);
6158   LSTREAM_HAS_METHOD (decoding, writer);
6159   LSTREAM_HAS_METHOD (decoding, rewinder);
6160   LSTREAM_HAS_METHOD (decoding, seekable_p);
6161   LSTREAM_HAS_METHOD (decoding, flusher);
6162   LSTREAM_HAS_METHOD (decoding, closer);
6163   LSTREAM_HAS_METHOD (decoding, marker);
6164
6165   LSTREAM_HAS_METHOD (encoding, reader);
6166   LSTREAM_HAS_METHOD (encoding, writer);
6167   LSTREAM_HAS_METHOD (encoding, rewinder);
6168   LSTREAM_HAS_METHOD (encoding, seekable_p);
6169   LSTREAM_HAS_METHOD (encoding, flusher);
6170   LSTREAM_HAS_METHOD (encoding, closer);
6171   LSTREAM_HAS_METHOD (encoding, marker);
6172 }
6173
6174 void
6175 vars_of_file_coding (void)
6176 {
6177   int i;
6178
6179   /* Initialize to something reasonable ... */
6180   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6181     {
6182       coding_category_system[i] = Qnil;
6183       coding_category_by_priority[i] = i;
6184     }
6185
6186   Fprovide (intern ("file-coding"));
6187
6188   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6189 Coding system used for TTY keyboard input.
6190 Not used under a windowing system.
6191 */ );
6192   Vkeyboard_coding_system = Qnil;
6193
6194   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6195 Coding system used for TTY display output.
6196 Not used under a windowing system.
6197 */ );
6198   Vterminal_coding_system = Qnil;
6199
6200   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6201 Overriding coding system used when writing a file or process.
6202 You should *bind* this, not set it.  If this is non-nil, it specifies
6203 the coding system that will be used when a file or process is read
6204 in, and overrides `buffer-file-coding-system-for-read',
6205 `insert-file-contents-pre-hook', etc.  Use those variables instead of
6206 this one for permanent changes to the environment.
6207 */ );
6208   Vcoding_system_for_read = Qnil;
6209
6210   DEFVAR_LISP ("coding-system-for-write",
6211                &Vcoding_system_for_write /*
6212 Overriding coding system used when writing a file or process.
6213 You should *bind* this, not set it.  If this is non-nil, it specifies
6214 the coding system that will be used when a file or process is wrote
6215 in, and overrides `buffer-file-coding-system',
6216 `write-region-pre-hook', etc.  Use those variables instead of this one
6217 for permanent changes to the environment.
6218 */ );
6219   Vcoding_system_for_write = Qnil;
6220
6221   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6222 Coding system used to convert pathnames when accessing files.
6223 */ );
6224   Vfile_name_coding_system = Qnil;
6225
6226   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6227 Non-nil means the buffer contents are regarded as multi-byte form
6228 of characters, not a binary code.  This affects the display, file I/O,
6229 and behaviors of various editing commands.
6230
6231 Setting this to nil does not do anything.
6232 */ );
6233   enable_multibyte_characters = 1;
6234 }
6235
6236 void
6237 complex_vars_of_file_coding (void)
6238 {
6239   staticpro (&Vcoding_system_hash_table);
6240   Vcoding_system_hash_table =
6241     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6242
6243   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6244
6245 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
6246 {                                               \
6247   struct codesys_prop csp;                      \
6248   csp.sym = (Sym);                              \
6249   csp.prop_type = (Prop_Type);                  \
6250   Dynarr_add (the_codesys_prop_dynarr, csp);    \
6251 } while (0)
6252
6253   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
6254   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
6255   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
6256   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
6257   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
6258   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
6259   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
6260 #ifdef MULE
6261   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6262   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6263   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6264   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6265   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6266   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6267   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6268   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6269   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6270   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6271   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6272   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6273   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6274   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6275   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6276   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6277   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6278
6279   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
6280   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
6281 #endif /* MULE */
6282   /* Need to create this here or we're really screwed. */
6283   Fmake_coding_system
6284     (Qraw_text, Qno_conversion,
6285      build_string ("Raw text, which means it converts only line-break-codes."),
6286      list2 (Qmnemonic, build_string ("Raw")));
6287
6288   Fmake_coding_system
6289     (Qbinary, Qno_conversion,
6290      build_string ("Binary, which means it does not convert anything."),
6291      list4 (Qeol_type, Qlf,
6292             Qmnemonic, build_string ("Binary")));
6293
6294 #ifdef UTF2000
6295   Fmake_coding_system
6296     (Qutf8, Qutf8,
6297      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6298      list2 (Qmnemonic, build_string ("UTF8")));
6299 #endif
6300
6301   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6302
6303   /* Need this for bootstrapping */
6304   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6305     Fget_coding_system (Qraw_text);
6306
6307 #ifdef UTF2000
6308   coding_category_system[CODING_CATEGORY_UTF8]
6309    = Fget_coding_system (Qutf8);
6310 #endif
6311
6312 #ifdef MULE
6313   {
6314     unsigned int i;
6315
6316     for (i = 0; i < 65536; i++)
6317       ucs_to_mule_table[i] = Qnil;
6318   }
6319   staticpro (&mule_to_ucs_table);
6320   mule_to_ucs_table = Fmake_char_table(Qgeneric);
6321 #endif /* MULE */
6322 }