src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28 #include <stddef.h>
  29
  30 #include "buffer.h"
  31 #include "elhash.h"
  32 #include "insdel.h"
  33 #include "lstream.h"
  34 #ifdef MULE
  35 #include "mule-ccl.h"
  36 #include "chartab.h"
  37 #endif
  38 #include "file-coding.h"
  39
  40 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  41
  42 Lisp_Object Vkeyboard_coding_system;
  43 Lisp_Object Vterminal_coding_system;
  44 Lisp_Object Vcoding_system_for_read;
  45 Lisp_Object Vcoding_system_for_write;
  46 Lisp_Object Vfile_name_coding_system;
  47
  48 /* Table of symbols identifying each coding category. */
  49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  50
  51 /* Coding system currently associated with each coding category. */
  52 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  53
  54 /* Table of all coding categories in decreasing order of priority.
  55    This describes a permutation of the possible coding categories. */
  56 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  57
  58 Lisp_Object Qcoding_system_p;
  59
  60 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  61 /* Qinternal in general.c */
  62
  63 Lisp_Object Qmnemonic, Qeol_type;
  64 Lisp_Object Qcr, Qcrlf, Qlf;
  65 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  66 Lisp_Object Qpost_read_conversion;
  67 Lisp_Object Qpre_write_conversion;
  68
  69 #ifdef MULE
  70 Lisp_Object Qucs4, Qutf8;
  71 Lisp_Object Qbig5, Qshift_jis;
  72 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  73 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  74 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  75 Lisp_Object Qno_iso6429;
  76 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  77 Lisp_Object Qctext, Qescape_quoted;
  78 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  79 #endif
  80 Lisp_Object Qencode, Qdecode;
  81
  82 Lisp_Object Vcoding_system_hash_table;
  83
  84 int enable_multibyte_characters;
  85
  86 #ifdef MULE
  87 /* Additional information used by the ISO2022 decoder and detector. */
  88 struct iso2022_decoder
  89 {
  90   /* CHARSET holds the character sets currently assigned to the G0
  91      through G3 variables.  It is initialized from the array
  92      INITIAL_CHARSET in CODESYS. */
  93   Lisp_Object charset[4];
  94
  95   /* Which registers are currently invoked into the left (GL) and
  96      right (GR) halves of the 8-bit encoding space? */
  97   int register_left, register_right;
  98
  99   /* ISO_ESC holds a value indicating part of an escape sequence
 100      that has already been seen. */
 101   enum iso_esc_flag esc;
 102
 103   /* This records the bytes we've seen so far in an escape sequence,
 104      in case the sequence is invalid (we spit out the bytes unchanged). */
 105   unsigned char esc_bytes[8];
 106
 107   /* Index for next byte to store in ISO escape sequence. */
 108   int esc_bytes_index;
 109
 110 #ifdef ENABLE_COMPOSITE_CHARS
 111   /* Stuff seen so far when composing a string. */
 112   unsigned_char_dynarr *composite_chars;
 113 #endif
 114
 115   /* If we saw an invalid designation sequence for a particular
 116      register, we flag it here and switch to ASCII.  The next time we
 117      see a valid designation for this register, we turn off the flag
 118      and do the designation normally, but pretend the sequence was
 119      invalid.  The effect of all this is that (most of the time) the
 120      escape sequences for both the switch to the unknown charset, and
 121      the switch back to the known charset, get inserted literally into
 122      the buffer and saved out as such.  The hope is that we can
 123      preserve the escape sequences so that the resulting written out
 124      file makes sense.  If we don't do any of this, the designation
 125      to the invalid charset will be preserved but that switch back
 126      to the known charset will probably get eaten because it was
 127      the same charset that was already present in the register. */
 128   unsigned char invalid_designated[4];
 129
 130   /* We try to do similar things as above for direction-switching
 131      sequences.  If we encountered a direction switch while an
 132      invalid designation was present, or an invalid designation
 133      just after a direction switch (i.e. no valid designation
 134      encountered yet), we insert the direction-switch escape
 135      sequence literally into the output stream, and later on
 136      insert the corresponding direction-restoring escape sequence
 137      literally also. */
 138   unsigned int switched_dir_and_no_valid_charset_yet :1;
 139   unsigned int invalid_switch_dir :1;
 140
 141   /* Tells the decoder to output the escape sequence literally
 142      even though it was valid.  Used in the games we play to
 143      avoid lossage when we encounter invalid designations. */
 144   unsigned int output_literally :1;
 145   /* We encountered a direction switch followed by an invalid
 146      designation.  We didn't output the direction switch
 147      literally because we didn't know about the invalid designation;
 148      but we have to do so now. */
 149   unsigned int output_direction_sequence :1;
 150 };
 151 #endif /* MULE */
 152 EXFUN (Fcopy_coding_system, 2);
 153 #ifdef MULE
 154 struct detection_state;
 155 static int detect_coding_sjis (struct detection_state *st,
 156                                CONST unsigned char *src,
 157                                unsigned int n);
 158 static void decode_coding_sjis (Lstream *decoding,
 159                                 CONST unsigned char *src,
 160                                 unsigned_char_dynarr *dst,
 161                                 unsigned int n);
 162 static void encode_coding_sjis (Lstream *encoding,
 163                                 CONST unsigned char *src,
 164                                 unsigned_char_dynarr *dst,
 165                                 unsigned int n);
 166 static int detect_coding_big5 (struct detection_state *st,
 167                                CONST unsigned char *src,
 168                                unsigned int n);
 169 static void decode_coding_big5 (Lstream *decoding,
 170                                 CONST unsigned char *src,
 171                                 unsigned_char_dynarr *dst, unsigned int n);
 172 static void encode_coding_big5 (Lstream *encoding,
 173                                 CONST unsigned char *src,
 174                                 unsigned_char_dynarr *dst, unsigned int n);
 175 static int detect_coding_ucs4 (struct detection_state *st,
 176                                CONST unsigned char *src,
 177                                unsigned int n);
 178 static void decode_coding_ucs4 (Lstream *decoding,
 179                                 CONST unsigned char *src,
 180                                 unsigned_char_dynarr *dst, unsigned int n);
 181 static void encode_coding_ucs4 (Lstream *encoding,
 182                                 CONST unsigned char *src,
 183                                 unsigned_char_dynarr *dst, unsigned int n);
 184 static int detect_coding_utf8 (struct detection_state *st,
 185                                CONST unsigned char *src,
 186                                unsigned int n);
 187 static void decode_coding_utf8 (Lstream *decoding,
 188                                 CONST unsigned char *src,
 189                                 unsigned_char_dynarr *dst, unsigned int n);
 190 static void encode_coding_utf8 (Lstream *encoding,
 191                                 CONST unsigned char *src,
 192                                 unsigned_char_dynarr *dst, unsigned int n);
 193 static int postprocess_iso2022_mask (int mask);
 194 static void reset_iso2022 (Lisp_Object coding_system,
 195                            struct iso2022_decoder *iso);
 196 static int detect_coding_iso2022 (struct detection_state *st,
 197                                   CONST unsigned char *src,
 198                                   unsigned int n);
 199 static void decode_coding_iso2022 (Lstream *decoding,
 200                                    CONST unsigned char *src,
 201                                    unsigned_char_dynarr *dst, unsigned int n);
 202 static void encode_coding_iso2022 (Lstream *encoding,
 203                                    CONST unsigned char *src,
 204                                    unsigned_char_dynarr *dst, unsigned int n);
 205 #endif /* MULE */
 206 static void decode_coding_no_conversion (Lstream *decoding,
 207                                          CONST unsigned char *src,
 208                                          unsigned_char_dynarr *dst,
 209                                          unsigned int n);
 210 static void encode_coding_no_conversion (Lstream *encoding,
 211                                          CONST unsigned char *src,
 212                                          unsigned_char_dynarr *dst,
 213                                          unsigned int n);
 214 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 215                          unsigned_char_dynarr *dst, unsigned int n);
 216 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 217                          unsigned_char_dynarr *dst, unsigned int n);
 218
 219 typedef struct codesys_prop codesys_prop;
 220 struct codesys_prop
 221 {
 222   Lisp_Object sym;
 223   int prop_type;
 224 };
 225
 226 typedef struct
 227 {
 228   Dynarr_declare (codesys_prop);
 229 } codesys_prop_dynarr;
 230
 231 codesys_prop_dynarr *the_codesys_prop_dynarr;
 232
 233 enum codesys_prop_enum
 234 {
 235   CODESYS_PROP_ALL_OK,
 236   CODESYS_PROP_ISO2022,
 237   CODESYS_PROP_CCL
 238 };
 239
 240 \f
 241 /************************************************************************/
 242 /*                       Coding system functions                        */
 243 /************************************************************************/
 244
 245 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 246 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 247 static void finalize_coding_system (void *header, int for_disksave);
 248
 249 #ifdef MULE
 250 static const struct lrecord_description ccs_description_1[] = {
 251   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 252   { XD_END }
 253 };
 254
 255 static const struct struct_description ccs_description = {
 256   sizeof(charset_conversion_spec),
 257   ccs_description_1
 258 };
 259
 260 static const struct lrecord_description ccsd_description_1[] = {
 261   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 262   { XD_END }
 263 };
 264
 265 static const struct struct_description ccsd_description = {
 266   sizeof(charset_conversion_spec_dynarr),
 267   ccsd_description_1
 268 };
 269 #endif
 270
 271 static const struct lrecord_description coding_system_description[] = {
 272   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 273   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 274   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 275 #ifdef MULE
 276   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 277   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 278   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 279   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 280 #endif
 281   { XD_END }
 282 };
 283
 284 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 285                                mark_coding_system, print_coding_system,
 286                                finalize_coding_system,
 287                                0, 0, coding_system_description,
 288                                struct Lisp_Coding_System);
 289
 290 static Lisp_Object
 291 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 292 {
 293   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 294
 295   markobj (CODING_SYSTEM_NAME (codesys));
 296   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 297   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 298   markobj (CODING_SYSTEM_EOL_LF (codesys));
 299   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 300   markobj (CODING_SYSTEM_EOL_CR (codesys));
 301
 302   switch (CODING_SYSTEM_TYPE (codesys))
 303     {
 304 #ifdef MULE
 305       int i;
 306     case CODESYS_ISO2022:
 307       for (i = 0; i < 4; i++)
 308         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 309       if (codesys->iso2022.input_conv)
 310         {
 311           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 312             {
 313               struct charset_conversion_spec *ccs =
 314                 Dynarr_atp (codesys->iso2022.input_conv, i);
 315               markobj (ccs->from_charset);
 316               markobj (ccs->to_charset);
 317             }
 318         }
 319       if (codesys->iso2022.output_conv)
 320         {
 321           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 322             {
 323               struct charset_conversion_spec *ccs =
 324                 Dynarr_atp (codesys->iso2022.output_conv, i);
 325               markobj (ccs->from_charset);
 326               markobj (ccs->to_charset);
 327             }
 328         }
 329       break;
 330
 331     case CODESYS_CCL:
 332       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 333       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 334       break;
 335 #endif /* MULE */
 336     default:
 337       break;
 338     }
 339
 340   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 341   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 342 }
 343
 344 static void
 345 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 346                      int escapeflag)
 347 {
 348   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 349   if (print_readably)
 350     error ("printing unreadable object #<coding_system 0x%x>",
 351            c->header.uid);
 352
 353   write_c_string ("#<coding_system ", printcharfun);
 354   print_internal (c->name, printcharfun, 1);
 355   write_c_string (">", printcharfun);
 356 }
 357
 358 static void
 359 finalize_coding_system (void *header, int for_disksave)
 360 {
 361   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 362   /* Since coding systems never go away, this function is not
 363      necessary.  But it would be necessary if we changed things
 364      so that coding systems could go away. */
 365   if (!for_disksave) /* see comment in lstream.c */
 366     {
 367       switch (CODING_SYSTEM_TYPE (c))
 368         {
 369 #ifdef MULE
 370         case CODESYS_ISO2022:
 371           if (c->iso2022.input_conv)
 372             {
 373               Dynarr_free (c->iso2022.input_conv);
 374               c->iso2022.input_conv = 0;
 375             }
 376           if (c->iso2022.output_conv)
 377             {
 378               Dynarr_free (c->iso2022.output_conv);
 379               c->iso2022.output_conv = 0;
 380             }
 381           break;
 382 #endif /* MULE */
 383         default:
 384           break;
 385         }
 386     }
 387 }
 388
 389 static enum eol_type
 390 symbol_to_eol_type (Lisp_Object symbol)
 391 {
 392   CHECK_SYMBOL (symbol);
 393   if (NILP (symbol))      return EOL_AUTODETECT;
 394   if (EQ (symbol, Qlf))   return EOL_LF;
 395   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 396   if (EQ (symbol, Qcr))   return EOL_CR;
 397
 398   signal_simple_error ("Unrecognized eol type", symbol);
 399   return EOL_AUTODETECT; /* not reached */
 400 }
 401
 402 static Lisp_Object
 403 eol_type_to_symbol (enum eol_type type)
 404 {
 405   switch (type)
 406     {
 407     default: abort ();
 408     case EOL_LF:         return Qlf;
 409     case EOL_CRLF:       return Qcrlf;
 410     case EOL_CR:         return Qcr;
 411     case EOL_AUTODETECT: return Qnil;
 412     }
 413 }
 414
 415 static void
 416 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 417 {
 418   Lisp_Object codesys_obj;
 419   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 420   char *codesys_name = (char *) alloca (len + 7);
 421   int mlen = -1;
 422   char *codesys_mnemonic=0;
 423
 424   Lisp_Object codesys_name_sym, sub_codesys_obj;
 425
 426   /* kludge */
 427
 428   XSETCODING_SYSTEM (codesys_obj, codesys);
 429
 430   memcpy (codesys_name,
 431           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 432
 433   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 434     {
 435       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 436       codesys_mnemonic = (char *) alloca (mlen + 7);
 437       memcpy (codesys_mnemonic,
 438               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 439     }
 440
 441 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 442   strcpy (codesys_name + len, "-" op_sys);                                      \
 443   if (mlen != -1)                                                               \
 444     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 445   codesys_name_sym = intern (codesys_name);                                     \
 446   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 447   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 448   if (mlen != -1)                                                               \
 449     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 450       build_string (codesys_mnemonic);                                          \
 451   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 452 } while (0)
 453
 454   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 455   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 456   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 457 }
 458
 459 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 460 Return t if OBJECT is a coding system.
 461 A coding system is an object that defines how text containing multiple
 462 character sets is encoded into a stream of (typically 8-bit) bytes.
 463 The coding system is used to decode the stream into a series of
 464 characters (which may be from multiple charsets) when the text is read
 465 from a file or process, and is used to encode the text back into the
 466 same format when it is written out to a file or process.
 467
 468 For example, many ISO2022-compliant coding systems (such as Compound
 469 Text, which is used for inter-client data under the X Window System)
 470 use escape sequences to switch between different charsets -- Japanese
 471 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 472 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 473 `make-coding-system' for more information.
 474
 475 Coding systems are normally identified using a symbol, and the
 476 symbol is accepted in place of the actual coding system object whenever
 477 a coding system is called for. (This is similar to how faces work.)
 478 */
 479        (object))
 480 {
 481   return CODING_SYSTEMP (object) ? Qt : Qnil;
 482 }
 483
 484 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 485 Retrieve the coding system of the given name.
 486
 487 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 488 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 489 If there is no such coding system, nil is returned.  Otherwise the
 490 associated coding system object is returned.
 491 */
 492        (coding_system_or_name))
 493 {
 494   if (CODING_SYSTEMP (coding_system_or_name))
 495     return coding_system_or_name;
 496
 497   if (NILP (coding_system_or_name))
 498     coding_system_or_name = Qbinary;
 499   else
 500     CHECK_SYMBOL (coding_system_or_name);
 501
 502   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 503 }
 504
 505 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 506 Retrieve the coding system of the given name.
 507 Same as `find-coding-system' except that if there is no such
 508 coding system, an error is signaled instead of returning nil.
 509 */
 510        (name))
 511 {
 512   Lisp_Object coding_system = Ffind_coding_system (name);
 513
 514   if (NILP (coding_system))
 515     signal_simple_error ("No such coding system", name);
 516   return coding_system;
 517 }
 518
 519 /* We store the coding systems in hash tables with the names as the key and the
 520    actual coding system object as the value.  Occasionally we need to use them
 521    in a list format.  These routines provide us with that. */
 522 struct coding_system_list_closure
 523 {
 524   Lisp_Object *coding_system_list;
 525 };
 526
 527 static int
 528 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 529                                   void *coding_system_list_closure)
 530 {
 531   /* This function can GC */
 532   struct coding_system_list_closure *cscl =
 533     (struct coding_system_list_closure *) coding_system_list_closure;
 534   Lisp_Object *coding_system_list = cscl->coding_system_list;
 535
 536   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 537                                *coding_system_list);
 538   return 0;
 539 }
 540
 541 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 542 Return a list of the names of all defined coding systems.
 543 */
 544        ())
 545 {
 546   Lisp_Object coding_system_list = Qnil;
 547   struct gcpro gcpro1;
 548   struct coding_system_list_closure coding_system_list_closure;
 549
 550   GCPRO1 (coding_system_list);
 551   coding_system_list_closure.coding_system_list = &coding_system_list;
 552   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 553                  &coding_system_list_closure);
 554   UNGCPRO;
 555
 556   return coding_system_list;
 557 }
 558
 559 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 560 Return the name of the given coding system.
 561 */
 562        (coding_system))
 563 {
 564   coding_system = Fget_coding_system (coding_system);
 565   return XCODING_SYSTEM_NAME (coding_system);
 566 }
 567
 568 static Lisp_Coding_System *
 569 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 570 {
 571   Lisp_Coding_System *codesys =
 572     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 573
 574   zero_lcrecord (codesys);
 575   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 576   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 577   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 578   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 579   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 580   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 581   CODING_SYSTEM_TYPE     (codesys) = type;
 582   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 583 #ifdef MULE
 584   if (type == CODESYS_ISO2022)
 585     {
 586       int i;
 587       for (i = 0; i < 4; i++)
 588         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 589     }
 590   else if (type == CODESYS_CCL)
 591     {
 592       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 593       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 594     }
 595 #endif /* MULE */
 596   CODING_SYSTEM_NAME (codesys) = name;
 597
 598   return codesys;
 599 }
 600
 601 #ifdef MULE
 602 /* Given a list of charset conversion specs as specified in a Lisp
 603    program, parse it into STORE_HERE. */
 604
 605 static void
 606 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 607                                 Lisp_Object spec_list)
 608 {
 609   Lisp_Object rest;
 610
 611   EXTERNAL_LIST_LOOP (rest, spec_list)
 612     {
 613       Lisp_Object car = XCAR (rest);
 614       Lisp_Object from, to;
 615       struct charset_conversion_spec spec;
 616
 617       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 618         signal_simple_error ("Invalid charset conversion spec", car);
 619       from = Fget_charset (XCAR (car));
 620       to = Fget_charset (XCAR (XCDR (car)));
 621       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 622         signal_simple_error_2
 623           ("Attempted conversion between different charset types",
 624            from, to);
 625       spec.from_charset = from;
 626       spec.to_charset = to;
 627
 628       Dynarr_add (store_here, spec);
 629     }
 630 }
 631
 632 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 633    specs, return the equivalent as the Lisp programmer would see it.
 634
 635    If LOAD_HERE is 0, return Qnil. */
 636
 637 static Lisp_Object
 638 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 639 {
 640   int i;
 641   Lisp_Object result;
 642
 643   if (!load_here)
 644     return Qnil;
 645   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 646     {
 647       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 648       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 649     }
 650
 651   return Fnreverse (result);
 652 }
 653
 654 #endif /* MULE */
 655
 656 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 657 Register symbol NAME as a coding system.
 658
 659 TYPE describes the conversion method used and should be one of
 660
 661 nil or 'undecided
 662      Automatic conversion.  XEmacs attempts to detect the coding system
 663      used in the file.
 664 'no-conversion
 665      No conversion.  Use this for binary files and such.  On output,
 666      graphic characters that are not in ASCII or Latin-1 will be
 667      replaced by a ?. (For a no-conversion-encoded buffer, these
 668      characters will only be present if you explicitly insert them.)
 669 'shift-jis
 670      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 671 'ucs-4
 672      ISO 10646 UCS-4 encoding.
 673 'utf-8
 674      ISO 10646 UTF-8 encoding.
 675 'iso2022
 676      Any ISO2022-compliant encoding.  Among other things, this includes
 677      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 678      standard Unix encoding for Japanese and other languages), and
 679      Compound Text (the encoding used in X11).  You can specify more
 680      specific information about the conversion with the FLAGS argument.
 681 'big5
 682      Big5 (the encoding commonly used for Taiwanese).
 683 'ccl
 684      The conversion is performed using a user-written pseudo-code
 685      program.  CCL (Code Conversion Language) is the name of this
 686      pseudo-code.
 687 'internal
 688      Write out or read in the raw contents of the memory representing
 689      the buffer's text.  This is primarily useful for debugging
 690      purposes, and is only enabled when XEmacs has been compiled with
 691      DEBUG_XEMACS defined (via the --debug configure option).
 692      WARNING: Reading in a file using 'internal conversion can result
 693      in an internal inconsistency in the memory representing a
 694      buffer's text, which will produce unpredictable results and may
 695      cause XEmacs to crash.  Under normal circumstances you should
 696      never use 'internal conversion.
 697
 698 DOC-STRING is a string describing the coding system.
 699
 700 PROPS is a property list, describing the specific nature of the
 701 character set.  Recognized properties are:
 702
 703 'mnemonic
 704      String to be displayed in the modeline when this coding system is
 705      active.
 706
 707 'eol-type
 708      End-of-line conversion to be used.  It should be one of
 709
 710         nil
 711                 Automatically detect the end-of-line type (LF, CRLF,
 712                 or CR).  Also generate subsidiary coding systems named
 713                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 714                 identical to this coding system but have an EOL-TYPE
 715                 value of 'lf, 'crlf, and 'cr, respectively.
 716         'lf
 717                 The end of a line is marked externally using ASCII LF.
 718                 Since this is also the way that XEmacs represents an
 719                 end-of-line internally, specifying this option results
 720                 in no end-of-line conversion.  This is the standard
 721                 format for Unix text files.
 722         'crlf
 723                 The end of a line is marked externally using ASCII
 724                 CRLF.  This is the standard format for MS-DOS text
 725                 files.
 726         'cr
 727                 The end of a line is marked externally using ASCII CR.
 728                 This is the standard format for Macintosh text files.
 729         t
 730                 Automatically detect the end-of-line type but do not
 731                 generate subsidiary coding systems.  (This value is
 732                 converted to nil when stored internally, and
 733                 `coding-system-property' will return nil.)
 734
 735 'post-read-conversion
 736      Function called after a file has been read in, to perform the
 737      decoding.  Called with two arguments, BEG and END, denoting
 738      a region of the current buffer to be decoded.
 739
 740 'pre-write-conversion
 741      Function called before a file is written out, to perform the
 742      encoding.  Called with two arguments, BEG and END, denoting
 743      a region of the current buffer to be encoded.
 744
 745
 746 The following additional properties are recognized if TYPE is 'iso2022:
 747
 748 'charset-g0
 749 'charset-g1
 750 'charset-g2
 751 'charset-g3
 752      The character set initially designated to the G0 - G3 registers.
 753      The value should be one of
 754
 755           -- A charset object (designate that character set)
 756           -- nil (do not ever use this register)
 757           -- t (no character set is initially designated to
 758                 the register, but may be later on; this automatically
 759                 sets the corresponding `force-g*-on-output' property)
 760
 761 'force-g0-on-output
 762 'force-g1-on-output
 763 'force-g2-on-output
 764 'force-g2-on-output
 765      If non-nil, send an explicit designation sequence on output before
 766      using the specified register.
 767
 768 'short
 769      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 770      "ESC $ B" on output in place of the full designation sequences
 771      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 772
 773 'no-ascii-eol
 774      If non-nil, don't designate ASCII to G0 at each end of line on output.
 775      Setting this to non-nil also suppresses other state-resetting that
 776      normally happens at the end of a line.
 777
 778 'no-ascii-cntl
 779      If non-nil, don't designate ASCII to G0 before control chars on output.
 780
 781 'seven
 782      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 783      environment.
 784
 785 'lock-shift
 786      If non-nil, use locking-shift (SO/SI) instead of single-shift
 787      or designation by escape sequence.
 788
 789 'no-iso6429
 790      If non-nil, don't use ISO6429's direction specification.
 791
 792 'escape-quoted
 793      If non-nil, literal control characters that are the same as
 794      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 795      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 796      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 797      so that they can be properly distinguished from an escape sequence.
 798      (Note that doing this results in a non-portable encoding.) This
 799      encoding flag is used for byte-compiled files.  Note that ESC
 800      is a good choice for a quoting character because there are no
 801      escape sequences whose second byte is a character from the Control-0
 802      or Control-1 character sets; this is explicitly disallowed by the
 803      ISO2022 standard.
 804
 805 'input-charset-conversion
 806      A list of conversion specifications, specifying conversion of
 807      characters in one charset to another when decoding is performed.
 808      Each specification is a list of two elements: the source charset,
 809      and the destination charset.
 810
 811 'output-charset-conversion
 812      A list of conversion specifications, specifying conversion of
 813      characters in one charset to another when encoding is performed.
 814      The form of each specification is the same as for
 815      'input-charset-conversion.
 816
 817
 818 The following additional properties are recognized (and required)
 819 if TYPE is 'ccl:
 820
 821 'decode
 822      CCL program used for decoding (converting to internal format).
 823
 824 'encode
 825      CCL program used for encoding (converting to external format).
 826 */
 827        (name, type, doc_string, props))
 828 {
 829   Lisp_Coding_System *codesys;
 830   Lisp_Object rest, key, value;
 831   enum coding_system_type ty;
 832   int need_to_setup_eol_systems = 1;
 833
 834   /* Convert type to constant */
 835   if (NILP (type) || EQ (type, Qundecided))
 836                                       { ty = CODESYS_AUTODETECT; }
 837 #ifdef MULE
 838   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 839   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 840   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 841   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 842   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 843   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 844 #endif
 845   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 846 #ifdef DEBUG_XEMACS
 847   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 848 #endif
 849   else
 850     signal_simple_error ("Invalid coding system type", type);
 851
 852   CHECK_SYMBOL (name);
 853
 854   codesys = allocate_coding_system (ty, name);
 855
 856   if (NILP (doc_string))
 857     doc_string = build_string ("");
 858   else
 859     CHECK_STRING (doc_string);
 860   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 861
 862   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 863     {
 864       if (EQ (key, Qmnemonic))
 865         {
 866           if (!NILP (value))
 867             CHECK_STRING (value);
 868           CODING_SYSTEM_MNEMONIC (codesys) = value;
 869         }
 870
 871       else if (EQ (key, Qeol_type))
 872         {
 873           need_to_setup_eol_systems = NILP (value);
 874           if (EQ (value, Qt))
 875             value = Qnil;
 876           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 877         }
 878
 879       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 880       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 881 #ifdef MULE
 882       else if (ty == CODESYS_ISO2022)
 883         {
 884 #define FROB_INITIAL_CHARSET(charset_num) \
 885   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 886     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 887
 888           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 889           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 890           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 891           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 892
 893 #define FROB_FORCE_CHARSET(charset_num) \
 894   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 895
 896           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 897           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 898           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 899           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 900
 901 #define FROB_BOOLEAN_PROPERTY(prop) \
 902   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 903
 904           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 905           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 906           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 907           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 908           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 909           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 910           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 911
 912           else if (EQ (key, Qinput_charset_conversion))
 913             {
 914               codesys->iso2022.input_conv =
 915                 Dynarr_new (charset_conversion_spec);
 916               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 917                                               value);
 918             }
 919           else if (EQ (key, Qoutput_charset_conversion))
 920             {
 921               codesys->iso2022.output_conv =
 922                 Dynarr_new (charset_conversion_spec);
 923               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 924                                               value);
 925             }
 926           else
 927             signal_simple_error ("Unrecognized property", key);
 928         }
 929       else if (EQ (type, Qccl))
 930         {
 931           if (EQ (key, Qdecode))
 932             {
 933               CHECK_VECTOR (value);
 934               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 935             }
 936           else if (EQ (key, Qencode))
 937             {
 938               CHECK_VECTOR (value);
 939               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 940             }
 941           else
 942             signal_simple_error ("Unrecognized property", key);
 943         }
 944 #endif /* MULE */
 945       else
 946         signal_simple_error ("Unrecognized property", key);
 947     }
 948
 949   if (need_to_setup_eol_systems)
 950     setup_eol_coding_systems (codesys);
 951
 952   {
 953     Lisp_Object codesys_obj;
 954     XSETCODING_SYSTEM (codesys_obj, codesys);
 955     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 956     return codesys_obj;
 957   }
 958 }
 959
 960 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 961 Copy OLD-CODING-SYSTEM to NEW-NAME.
 962 If NEW-NAME does not name an existing coding system, a new one will
 963 be created.
 964 */
 965        (old_coding_system, new_name))
 966 {
 967   Lisp_Object new_coding_system;
 968   old_coding_system = Fget_coding_system (old_coding_system);
 969   new_coding_system = Ffind_coding_system (new_name);
 970   if (NILP (new_coding_system))
 971     {
 972       XSETCODING_SYSTEM (new_coding_system,
 973                          allocate_coding_system
 974                          (XCODING_SYSTEM_TYPE (old_coding_system),
 975                           new_name));
 976       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 977     }
 978
 979   {
 980     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 981     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 982     memcpy (((char *) to  ) + sizeof (to->header),
 983             ((char *) from) + sizeof (from->header),
 984             sizeof (*from) - sizeof (from->header));
 985     to->name = new_name;
 986   }
 987   return new_coding_system;
 988 }
 989
 990 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
 991 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
 992 */
 993        (alias, coding_system))
 994 {
 995   CHECK_SYMBOL (alias);
 996   if (!NILP (Ffind_coding_system (alias)))
 997     signal_simple_error ("Symbol already names a coding system", alias);
 998   coding_system = Fget_coding_system (coding_system);
 999   Fputhash (alias, coding_system, Vcoding_system_hash_table);
1000
1001   /* Set up aliases for subsidiaries. */
1002   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1003     {
1004       Lisp_Object str;
1005       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1006 #define FROB(type, name)                                                        \
1007       do {                                                                      \
1008         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1009         if (!NILP (subsidiary))                                                 \
1010           Fdefine_coding_system_alias                                           \
1011             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1012       } while (0)
1013       FROB (LF,   "-unix");
1014       FROB (CRLF, "-dos");
1015       FROB (CR,   "-mac");
1016 #undef FROB
1017     }
1018   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1019      but it doesn't look intentional, so I'd rather return something
1020      meaningful or nothing at all. */
1021   return Qnil;
1022 }
1023
1024 static Lisp_Object
1025 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1026 {
1027   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1028   Lisp_Object new_coding_system;
1029
1030   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1031     return coding_system;
1032
1033   switch (type)
1034     {
1035     case EOL_AUTODETECT: return coding_system;
1036     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1037     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1038     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1039     default:       abort ();
1040     }
1041
1042   return NILP (new_coding_system) ? coding_system : new_coding_system;
1043 }
1044
1045 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1046 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1047 */
1048        (coding_system, eol_type))
1049 {
1050   coding_system = Fget_coding_system (coding_system);
1051
1052   return subsidiary_coding_system (coding_system,
1053                                    symbol_to_eol_type (eol_type));
1054 }
1055
1056 \f
1057 /************************************************************************/
1058 /*                         Coding system accessors                      */
1059 /************************************************************************/
1060
1061 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1062 Return the doc string for CODING-SYSTEM.
1063 */
1064        (coding_system))
1065 {
1066   coding_system = Fget_coding_system (coding_system);
1067   return XCODING_SYSTEM_DOC_STRING (coding_system);
1068 }
1069
1070 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1071 Return the type of CODING-SYSTEM.
1072 */
1073        (coding_system))
1074 {
1075   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1076     {
1077     default: abort ();
1078     case CODESYS_AUTODETECT:    return Qundecided;
1079 #ifdef MULE
1080     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1081     case CODESYS_ISO2022:       return Qiso2022;
1082     case CODESYS_BIG5:          return Qbig5;
1083     case CODESYS_UCS4:          return Qucs4;
1084     case CODESYS_UTF8:          return Qutf8;
1085     case CODESYS_CCL:           return Qccl;
1086 #endif
1087     case CODESYS_NO_CONVERSION: return Qno_conversion;
1088 #ifdef DEBUG_XEMACS
1089     case CODESYS_INTERNAL:      return Qinternal;
1090 #endif
1091     }
1092 }
1093
1094 #ifdef MULE
1095 static
1096 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1097 {
1098   Lisp_Object cs
1099     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1100
1101   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1102 }
1103
1104 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1105 Return initial charset of CODING-SYSTEM designated to GNUM.
1106 GNUM allows 0 .. 3.
1107 */
1108        (coding_system, gnum))
1109 {
1110   coding_system = Fget_coding_system (coding_system);
1111   CHECK_INT (gnum);
1112
1113   return coding_system_charset (coding_system, XINT (gnum));
1114 }
1115 #endif /* MULE */
1116
1117 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1118 Return the PROP property of CODING-SYSTEM.
1119 */
1120        (coding_system, prop))
1121 {
1122   int i, ok = 0;
1123   enum coding_system_type type;
1124
1125   coding_system = Fget_coding_system (coding_system);
1126   CHECK_SYMBOL (prop);
1127   type = XCODING_SYSTEM_TYPE (coding_system);
1128
1129   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1130     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1131       {
1132         ok = 1;
1133         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1134           {
1135           case CODESYS_PROP_ALL_OK:
1136             break;
1137 #ifdef MULE
1138           case CODESYS_PROP_ISO2022:
1139             if (type != CODESYS_ISO2022)
1140               signal_simple_error
1141                 ("Property only valid in ISO2022 coding systems",
1142                  prop);
1143             break;
1144
1145           case CODESYS_PROP_CCL:
1146             if (type != CODESYS_CCL)
1147               signal_simple_error
1148                 ("Property only valid in CCL coding systems",
1149                  prop);
1150             break;
1151 #endif /* MULE */
1152           default:
1153             abort ();
1154           }
1155       }
1156
1157   if (!ok)
1158     signal_simple_error ("Unrecognized property", prop);
1159
1160   if (EQ (prop, Qname))
1161     return XCODING_SYSTEM_NAME (coding_system);
1162   else if (EQ (prop, Qtype))
1163     return Fcoding_system_type (coding_system);
1164   else if (EQ (prop, Qdoc_string))
1165     return XCODING_SYSTEM_DOC_STRING (coding_system);
1166   else if (EQ (prop, Qmnemonic))
1167     return XCODING_SYSTEM_MNEMONIC (coding_system);
1168   else if (EQ (prop, Qeol_type))
1169     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1170   else if (EQ (prop, Qeol_lf))
1171     return XCODING_SYSTEM_EOL_LF (coding_system);
1172   else if (EQ (prop, Qeol_crlf))
1173     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1174   else if (EQ (prop, Qeol_cr))
1175     return XCODING_SYSTEM_EOL_CR (coding_system);
1176   else if (EQ (prop, Qpost_read_conversion))
1177     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1178   else if (EQ (prop, Qpre_write_conversion))
1179     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1180 #ifdef MULE
1181   else if (type == CODESYS_ISO2022)
1182     {
1183       if (EQ (prop, Qcharset_g0))
1184         return coding_system_charset (coding_system, 0);
1185       else if (EQ (prop, Qcharset_g1))
1186         return coding_system_charset (coding_system, 1);
1187       else if (EQ (prop, Qcharset_g2))
1188         return coding_system_charset (coding_system, 2);
1189       else if (EQ (prop, Qcharset_g3))
1190         return coding_system_charset (coding_system, 3);
1191
1192 #define FORCE_CHARSET(charset_num) \
1193   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1194    (coding_system, charset_num) ? Qt : Qnil)
1195
1196       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1197       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1198       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1199       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1200
1201 #define LISP_BOOLEAN(prop) \
1202   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1203
1204       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1205       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1206       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1207       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1208       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1209       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1210       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1211
1212       else if (EQ (prop, Qinput_charset_conversion))
1213         return
1214           unparse_charset_conversion_specs
1215             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1216       else if (EQ (prop, Qoutput_charset_conversion))
1217         return
1218           unparse_charset_conversion_specs
1219             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1220       else
1221         abort ();
1222     }
1223   else if (type == CODESYS_CCL)
1224     {
1225       if (EQ (prop, Qdecode))
1226         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1227       else if (EQ (prop, Qencode))
1228         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1229       else
1230         abort ();
1231     }
1232 #endif /* MULE */
1233   else
1234     abort ();
1235
1236   return Qnil; /* not reached */
1237 }
1238
1239 \f
1240 /************************************************************************/
1241 /*                       Coding category functions                      */
1242 /************************************************************************/
1243
1244 static int
1245 decode_coding_category (Lisp_Object symbol)
1246 {
1247   int i;
1248
1249   CHECK_SYMBOL (symbol);
1250   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1251     if (EQ (coding_category_symbol[i], symbol))
1252       return i;
1253
1254   signal_simple_error ("Unrecognized coding category", symbol);
1255   return 0; /* not reached */
1256 }
1257
1258 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1259 Return a list of all recognized coding categories.
1260 */
1261        ())
1262 {
1263   int i;
1264   Lisp_Object list = Qnil;
1265
1266   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1267     list = Fcons (coding_category_symbol[i], list);
1268   return list;
1269 }
1270
1271 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1272 Change the priority order of the coding categories.
1273 LIST should be list of coding categories, in descending order of
1274 priority.  Unspecified coding categories will be lower in priority
1275 than all specified ones, in the same relative order they were in
1276 previously.
1277 */
1278        (list))
1279 {
1280   int category_to_priority[CODING_CATEGORY_LAST + 1];
1281   int i, j;
1282   Lisp_Object rest;
1283
1284   /* First generate a list that maps coding categories to priorities. */
1285
1286   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1287     category_to_priority[i] = -1;
1288
1289   /* Highest priority comes from the specified list. */
1290   i = 0;
1291   EXTERNAL_LIST_LOOP (rest, list)
1292     {
1293       int cat = decode_coding_category (XCAR (rest));
1294
1295       if (category_to_priority[cat] >= 0)
1296         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1297       category_to_priority[cat] = i++;
1298     }
1299
1300   /* Now go through the existing categories by priority to retrieve
1301      the categories not yet specified and preserve their priority
1302      order. */
1303   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1304     {
1305       int cat = coding_category_by_priority[j];
1306       if (category_to_priority[cat] < 0)
1307         category_to_priority[cat] = i++;
1308     }
1309
1310   /* Now we need to construct the inverse of the mapping we just
1311      constructed. */
1312
1313   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1314     coding_category_by_priority[category_to_priority[i]] = i;
1315
1316   /* Phew!  That was confusing. */
1317   return Qnil;
1318 }
1319
1320 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1321 Return a list of coding categories in descending order of priority.
1322 */
1323        ())
1324 {
1325   int i;
1326   Lisp_Object list = Qnil;
1327
1328   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1329     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1330                   list);
1331   return list;
1332 }
1333
1334 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1335 Change the coding system associated with a coding category.
1336 */
1337        (coding_category, coding_system))
1338 {
1339   int cat = decode_coding_category (coding_category);
1340
1341   coding_system = Fget_coding_system (coding_system);
1342   coding_category_system[cat] = coding_system;
1343   return Qnil;
1344 }
1345
1346 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1347 Return the coding system associated with a coding category.
1348 */
1349        (coding_category))
1350 {
1351   int cat = decode_coding_category (coding_category);
1352   Lisp_Object sys = coding_category_system[cat];
1353
1354   if (!NILP (sys))
1355     return XCODING_SYSTEM_NAME (sys);
1356   return Qnil;
1357 }
1358
1359 \f
1360 /************************************************************************/
1361 /*                     Detecting the encoding of data                   */
1362 /************************************************************************/
1363
1364 struct detection_state
1365 {
1366   enum eol_type eol_type;
1367   int seen_non_ascii;
1368   int mask;
1369 #ifdef MULE
1370   struct
1371     {
1372       int mask;
1373       int in_second_byte;
1374     }
1375   big5;
1376
1377   struct
1378     {
1379       int mask;
1380       int in_second_byte;
1381     }
1382   shift_jis;
1383
1384   struct
1385     {
1386       int mask;
1387       int in_byte;
1388   }
1389   ucs4;
1390
1391   struct
1392     {
1393       int mask;
1394       int in_byte;
1395     }
1396   utf8;
1397
1398   struct
1399     {
1400       int mask;
1401       int initted;
1402       struct iso2022_decoder iso;
1403       unsigned int flags;
1404       int high_byte_count;
1405       unsigned int saw_single_shift:1;
1406     }
1407   iso2022;
1408 #endif
1409   struct
1410     {
1411       int seen_anything;
1412       int just_saw_cr;
1413     }
1414   eol;
1415 };
1416
1417 static int
1418 acceptable_control_char_p (int c)
1419 {
1420   switch (c)
1421     {
1422       /* Allow and ignore control characters that you might
1423          reasonably see in a text file */
1424     case '\r':
1425     case '\n':
1426     case '\t':
1427     case  7: /* bell */
1428     case  8: /* backspace */
1429     case 11: /* vertical tab */
1430     case 12: /* form feed */
1431     case 26: /* MS-DOS C-z junk */
1432     case 31: /* '^_' -- for info */
1433       return 1;
1434     default:
1435       return 0;
1436     }
1437 }
1438
1439 static int
1440 mask_has_at_most_one_bit_p (int mask)
1441 {
1442   /* Perhaps the only thing useful you learn from intensive Microsoft
1443      technical interviews */
1444   return (mask & (mask - 1)) == 0;
1445 }
1446
1447 static enum eol_type
1448 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1449                  unsigned int n)
1450 {
1451   int c;
1452
1453   while (n--)
1454     {
1455       c = *src++;
1456       if (c == '\r')
1457         st->eol.just_saw_cr = 1;
1458       else
1459         {
1460           if (c == '\n')
1461             {
1462               if (st->eol.just_saw_cr)
1463                 return EOL_CRLF;
1464               else if (st->eol.seen_anything)
1465                 return EOL_LF;
1466             }
1467           else if (st->eol.just_saw_cr)
1468             return EOL_CR;
1469           st->eol.just_saw_cr = 0;
1470         }
1471       st->eol.seen_anything = 1;
1472     }
1473
1474   return EOL_AUTODETECT;
1475 }
1476
1477 /* Attempt to determine the encoding and EOL type of the given text.
1478    Before calling this function for the first type, you must initialize
1479    st->eol_type as appropriate and initialize st->mask to ~0.
1480
1481    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1482    not yet known.
1483
1484    st->mask holds the determined coding category mask, or ~0 if only
1485    ASCII has been seen so far.
1486
1487    Returns:
1488
1489    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1490         is present in st->mask
1491    1 == definitive answers are here for both st->eol_type and st->mask
1492 */
1493
1494 static int
1495 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1496                     unsigned int n, int just_do_eol)
1497 {
1498   int c;
1499
1500   if (st->eol_type == EOL_AUTODETECT)
1501     st->eol_type = detect_eol_type (st, src, n);
1502
1503   if (just_do_eol)
1504     return st->eol_type != EOL_AUTODETECT;
1505
1506   if (!st->seen_non_ascii)
1507     {
1508       for (; n; n--, src++)
1509         {
1510           c = *src;
1511           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1512             {
1513               st->seen_non_ascii = 1;
1514 #ifdef MULE
1515               st->shift_jis.mask = ~0;
1516               st->big5.mask = ~0;
1517               st->ucs4.mask = ~0;
1518               st->utf8.mask = ~0;
1519               st->iso2022.mask = ~0;
1520 #endif
1521               break;
1522             }
1523         }
1524     }
1525
1526   if (!n)
1527     return 0;
1528 #ifdef MULE
1529   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1530     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1531   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1532     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1533   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1534     st->big5.mask = detect_coding_big5 (st, src, n);
1535   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1536     st->utf8.mask = detect_coding_utf8 (st, src, n);
1537   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1538     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1539
1540   st->mask
1541     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1542     | st->utf8.mask | st->ucs4.mask;
1543 #endif
1544   {
1545     int retval = mask_has_at_most_one_bit_p (st->mask);
1546     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1547     return retval && st->eol_type != EOL_AUTODETECT;
1548   }
1549 }
1550
1551 static Lisp_Object
1552 coding_system_from_mask (int mask)
1553 {
1554   if (mask == ~0)
1555     {
1556       /* If the file was entirely or basically ASCII, use the
1557          default value of `buffer-file-coding-system'. */
1558       Lisp_Object retval =
1559         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1560       if (!NILP (retval))
1561         {
1562           retval = Ffind_coding_system (retval);
1563           if (NILP (retval))
1564             {
1565               warn_when_safe
1566                 (Qbad_variable, Qwarning,
1567                  "Invalid `default-buffer-file-coding-system', set to nil");
1568               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1569             }
1570         }
1571       if (NILP (retval))
1572         retval = Fget_coding_system (Qraw_text);
1573       return retval;
1574     }
1575   else
1576     {
1577       int i;
1578       int cat = -1;
1579 #ifdef MULE
1580       mask = postprocess_iso2022_mask (mask);
1581 #endif
1582       /* Look through the coding categories by priority and find
1583          the first one that is allowed. */
1584       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1585         {
1586           cat = coding_category_by_priority[i];
1587           if ((mask & (1 << cat)) &&
1588               !NILP (coding_category_system[cat]))
1589             break;
1590         }
1591       if (cat >= 0)
1592         return coding_category_system[cat];
1593       else
1594         return Fget_coding_system (Qraw_text);
1595     }
1596 }
1597
1598 /* Given a seekable read stream and potential coding system and EOL type
1599    as specified, do any autodetection that is called for.  If the
1600    coding system and/or EOL type are not autodetect, they will be left
1601    alone; but this function will never return an autodetect coding system
1602    or EOL type.
1603
1604    This function does not automatically fetch subsidiary coding systems;
1605    that should be unnecessary with the explicit eol-type argument. */
1606
1607 void
1608 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1609                               enum eol_type *eol_type_in_out)
1610 {
1611   struct detection_state decst;
1612
1613   if (*eol_type_in_out == EOL_AUTODETECT)
1614     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1615
1616   xzero (decst);
1617   decst.eol_type = *eol_type_in_out;
1618   decst.mask = ~0;
1619
1620   /* If autodetection is called for, do it now. */
1621   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1622       *eol_type_in_out == EOL_AUTODETECT)
1623     {
1624
1625       while (1)
1626         {
1627           unsigned char random_buffer[4096];
1628           int nread;
1629
1630           nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1631           if (!nread)
1632             break;
1633           if (detect_coding_type (&decst, random_buffer, nread,
1634                                   XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1635                                   CODESYS_AUTODETECT))
1636             break;
1637         }
1638
1639       *eol_type_in_out = decst.eol_type;
1640       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1641         *codesys_in_out = coding_system_from_mask (decst.mask);
1642     }
1643
1644   /* If we absolutely can't determine the EOL type, just assume LF. */
1645   if (*eol_type_in_out == EOL_AUTODETECT)
1646     *eol_type_in_out = EOL_LF;
1647
1648   Lstream_rewind (stream);
1649 }
1650
1651 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1652 Detect coding system of the text in the region between START and END.
1653 Returned a list of possible coding systems ordered by priority.
1654 If only ASCII characters are found, it returns 'undecided or one of
1655 its subsidiary coding systems according to a detected end-of-line
1656 type.  Optional arg BUFFER defaults to the current buffer.
1657 */
1658        (start, end, buffer))
1659 {
1660   Lisp_Object val = Qnil;
1661   struct buffer *buf = decode_buffer (buffer, 0);
1662   Bufpos b, e;
1663   Lisp_Object instream, lb_instream;
1664   Lstream *istr, *lb_istr;
1665   struct detection_state decst;
1666   struct gcpro gcpro1, gcpro2;
1667
1668   get_buffer_range_char (buf, start, end, &b, &e, 0);
1669   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1670   lb_istr = XLSTREAM (lb_instream);
1671   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1672   istr = XLSTREAM (instream);
1673   GCPRO2 (instream, lb_instream);
1674   xzero (decst);
1675   decst.eol_type = EOL_AUTODETECT;
1676   decst.mask = ~0;
1677   while (1)
1678     {
1679       unsigned char random_buffer[4096];
1680       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1681
1682       if (!nread)
1683         break;
1684       if (detect_coding_type (&decst, random_buffer, nread, 0))
1685         break;
1686     }
1687
1688   if (decst.mask == ~0)
1689     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1690                                     decst.eol_type);
1691   else
1692     {
1693       int i;
1694
1695       val = Qnil;
1696 #ifdef MULE
1697       decst.mask = postprocess_iso2022_mask (decst.mask);
1698 #endif
1699       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1700         {
1701           int sys = coding_category_by_priority[i];
1702           if (decst.mask & (1 << sys))
1703             {
1704               Lisp_Object codesys = coding_category_system[sys];
1705               if (!NILP (codesys))
1706                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1707               val = Fcons (codesys, val);
1708             }
1709         }
1710     }
1711   Lstream_close (istr);
1712   UNGCPRO;
1713   Lstream_delete (istr);
1714   Lstream_delete (lb_istr);
1715   return val;
1716 }
1717
1718 \f
1719 /************************************************************************/
1720 /*           Converting to internal Mule format ("decoding")            */
1721 /************************************************************************/
1722
1723 /* A decoding stream is a stream used for decoding text (i.e.
1724    converting from some external format to internal format).
1725    The decoding-stream object keeps track of the actual coding
1726    stream, the stream that is at the other end, and data that
1727    needs to be persistent across the lifetime of the stream. */
1728
1729 /* Handle the EOL stuff related to just-read-in character C.
1730    EOL_TYPE is the EOL type of the coding stream.
1731    FLAGS is the current value of FLAGS in the coding stream, and may
1732    be modified by this macro.  (The macro only looks at the
1733    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1734    bytes are to be written.  You need to also define a local goto
1735    label "label_continue_loop" that is at the end of the main
1736    character-reading loop.
1737
1738    If C is a CR character, then this macro handles it entirely and
1739    jumps to label_continue_loop.  Otherwise, this macro does not add
1740    anything to DST, and continues normally.  You should continue
1741    processing C normally after this macro. */
1742
1743 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1744 do {                                                            \
1745   if (c == '\r')                                                \
1746     {                                                           \
1747       if (eol_type == EOL_CR)                                   \
1748         Dynarr_add (dst, '\n');                                 \
1749       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1750         Dynarr_add (dst, c);                                    \
1751       else                                                      \
1752         flags |= CODING_STATE_CR;                               \
1753       goto label_continue_loop;                                 \
1754     }                                                           \
1755   else if (flags & CODING_STATE_CR)                             \
1756     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1757       if (c != '\n')                                            \
1758         Dynarr_add (dst, '\r');                                 \
1759       flags &= ~CODING_STATE_CR;                                \
1760     }                                                           \
1761 } while (0)
1762
1763 /* C should be a binary character in the range 0 - 255; convert
1764    to internal format and add to Dynarr DST. */
1765
1766 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1767 do {                                            \
1768   if (BYTE_ASCII_P (c))                         \
1769     Dynarr_add (dst, c);                        \
1770   else if (BYTE_C1_P (c))                       \
1771     {                                           \
1772       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1773       Dynarr_add (dst, c + 0x20);               \
1774     }                                           \
1775   else                                          \
1776     {                                           \
1777       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1778       Dynarr_add (dst, c);                      \
1779     }                                           \
1780 } while (0)
1781
1782 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1783 do {                                    \
1784   if (ch)                               \
1785     {                                   \
1786       DECODE_ADD_BINARY_CHAR (ch, dst); \
1787       ch = 0;                           \
1788     }                                   \
1789 } while (0)
1790
1791 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1792 do {                                    \
1793   if (flags & CODING_STATE_END)         \
1794     {                                   \
1795       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1796       if (flags & CODING_STATE_CR)      \
1797         Dynarr_add (dst, '\r');         \
1798     }                                   \
1799 } while (0)
1800
1801 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1802
1803 struct decoding_stream
1804 {
1805   /* Coding system that governs the conversion. */
1806   Lisp_Coding_System *codesys;
1807
1808   /* Stream that we read the encoded data from or
1809      write the decoded data to. */
1810   Lstream *other_end;
1811
1812   /* If we are reading, then we can return only a fixed amount of
1813      data, so if the conversion resulted in too much data, we store it
1814      here for retrieval the next time around. */
1815   unsigned_char_dynarr *runoff;
1816
1817   /* FLAGS holds flags indicating the current state of the decoding.
1818      Some of these flags are dependent on the coding system. */
1819   unsigned int flags;
1820
1821   /* CH holds a partially built-up character.  Since we only deal
1822      with one- and two-byte characters at the moment, we only use
1823      this to store the first byte of a two-byte character. */
1824   unsigned int ch;
1825
1826   /* EOL_TYPE specifies the type of end-of-line conversion that
1827      currently applies.  We need to keep this separate from the
1828      EOL type stored in CODESYS because the latter might indicate
1829      automatic EOL-type detection while the former will always
1830      indicate a particular EOL type. */
1831   enum eol_type eol_type;
1832 #ifdef MULE
1833   /* Additional ISO2022 information.  We define the structure above
1834      because it's also needed by the detection routines. */
1835   struct iso2022_decoder iso2022;
1836
1837   /* Additional information (the state of the running CCL program)
1838      used by the CCL decoder. */
1839   struct ccl_program ccl;
1840 #endif
1841   struct detection_state decst;
1842 };
1843
1844 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1845 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1846 static int decoding_rewinder   (Lstream *stream);
1847 static int decoding_seekable_p (Lstream *stream);
1848 static int decoding_flusher    (Lstream *stream);
1849 static int decoding_closer     (Lstream *stream);
1850
1851 static Lisp_Object decoding_marker (Lisp_Object stream,
1852                                     void (*markobj) (Lisp_Object));
1853
1854 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1855                                sizeof (struct decoding_stream));
1856
1857 static Lisp_Object
1858 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1859 {
1860   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1861   Lisp_Object str_obj;
1862
1863   /* We do not need to mark the coding systems or charsets stored
1864      within the stream because they are stored in a global list
1865      and automatically marked. */
1866
1867   XSETLSTREAM (str_obj, str);
1868   markobj (str_obj);
1869   if (str->imp->marker)
1870     return (str->imp->marker) (str_obj, markobj);
1871   else
1872     return Qnil;
1873 }
1874
1875 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1876    so we read data from the other end, decode it, and store it into DATA. */
1877
1878 static int
1879 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1880 {
1881   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1882   unsigned char *orig_data = data;
1883   int read_size;
1884   int error_occurred = 0;
1885
1886   /* We need to interface to mule_decode(), which expects to take some
1887      amount of data and store the result into a Dynarr.  We have
1888      mule_decode() store into str->runoff, and take data from there
1889      as necessary. */
1890
1891   /* We loop until we have enough data, reading chunks from the other
1892      end and decoding it. */
1893   while (1)
1894     {
1895       /* Take data from the runoff if we can.  Make sure to take at
1896          most SIZE bytes, and delete the data from the runoff. */
1897       if (Dynarr_length (str->runoff) > 0)
1898         {
1899           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1900           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1901           Dynarr_delete_many (str->runoff, 0, chunk);
1902           data += chunk;
1903           size -= chunk;
1904         }
1905
1906       if (size == 0)
1907         break; /* No more room for data */
1908
1909       if (str->flags & CODING_STATE_END)
1910         /* This means that on the previous iteration, we hit the EOF on
1911            the other end.  We loop once more so that mule_decode() can
1912            output any final stuff it may be holding, or any "go back
1913            to a sane state" escape sequences. (This latter makes sense
1914            during encoding.) */
1915         break;
1916
1917       /* Exhausted the runoff, so get some more.  DATA has at least
1918          SIZE bytes left of storage in it, so it's OK to read directly
1919          into it.  (We'll be overwriting above, after we've decoded it
1920          into the runoff.) */
1921       read_size = Lstream_read (str->other_end, data, size);
1922       if (read_size < 0)
1923         {
1924           error_occurred = 1;
1925           break;
1926         }
1927       if (read_size == 0)
1928         /* There might be some more end data produced in the translation.
1929            See the comment above. */
1930         str->flags |= CODING_STATE_END;
1931       mule_decode (stream, data, str->runoff, read_size);
1932     }
1933
1934   if (data - orig_data == 0)
1935     return error_occurred ? -1 : 0;
1936   else
1937     return data - orig_data;
1938 }
1939
1940 static int
1941 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1942 {
1943   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1944   int retval;
1945
1946   /* Decode all our data into the runoff, and then attempt to write
1947      it all out to the other end.  Remove whatever chunk we succeeded
1948      in writing. */
1949   mule_decode (stream, data, str->runoff, size);
1950   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1951                           Dynarr_length (str->runoff));
1952   if (retval > 0)
1953     Dynarr_delete_many (str->runoff, 0, retval);
1954   /* Do NOT return retval.  The return value indicates how much
1955      of the incoming data was written, not how many bytes were
1956      written. */
1957   return size;
1958 }
1959
1960 static void
1961 reset_decoding_stream (struct decoding_stream *str)
1962 {
1963 #ifdef MULE
1964   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1965     {
1966       Lisp_Object coding_system;
1967       XSETCODING_SYSTEM (coding_system, str->codesys);
1968       reset_iso2022 (coding_system, &str->iso2022);
1969     }
1970   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1971     {
1972       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1973     }
1974 #endif /* MULE */
1975   str->flags = str->ch = 0;
1976 }
1977
1978 static int
1979 decoding_rewinder (Lstream *stream)
1980 {
1981   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1982   reset_decoding_stream (str);
1983   Dynarr_reset (str->runoff);
1984   return Lstream_rewind (str->other_end);
1985 }
1986
1987 static int
1988 decoding_seekable_p (Lstream *stream)
1989 {
1990   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1991   return Lstream_seekable_p (str->other_end);
1992 }
1993
1994 static int
1995 decoding_flusher (Lstream *stream)
1996 {
1997   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1998   return Lstream_flush (str->other_end);
1999 }
2000
2001 static int
2002 decoding_closer (Lstream *stream)
2003 {
2004   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2005   if (stream->flags & LSTREAM_FL_WRITE)
2006     {
2007       str->flags |= CODING_STATE_END;
2008       decoding_writer (stream, 0, 0);
2009     }
2010   Dynarr_free (str->runoff);
2011 #ifdef MULE
2012 #ifdef ENABLE_COMPOSITE_CHARS
2013   if (str->iso2022.composite_chars)
2014     Dynarr_free (str->iso2022.composite_chars);
2015 #endif
2016 #endif
2017   return Lstream_close (str->other_end);
2018 }
2019
2020 Lisp_Object
2021 decoding_stream_coding_system (Lstream *stream)
2022 {
2023   Lisp_Object coding_system;
2024   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2025
2026   XSETCODING_SYSTEM (coding_system, str->codesys);
2027   return subsidiary_coding_system (coding_system, str->eol_type);
2028 }
2029
2030 void
2031 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2032 {
2033   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2034   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2035   str->codesys = cs;
2036   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2037     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2038   reset_decoding_stream (str);
2039 }
2040
2041 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2042    stream for writing, no automatic code detection will be performed.
2043    The reason for this is that automatic code detection requires a
2044    seekable input.  Things will also fail if you open a decoding
2045    stream for reading using a non-fully-specified coding system and
2046    a non-seekable input stream. */
2047
2048 static Lisp_Object
2049 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2050                         CONST char *mode)
2051 {
2052   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2053   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2054   Lisp_Object obj;
2055
2056   xzero (*str);
2057   str->other_end = stream;
2058   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2059   str->eol_type = EOL_AUTODETECT;
2060   if (!strcmp (mode, "r")
2061       && Lstream_seekable_p (stream))
2062     /* We can determine the coding system now. */
2063     determine_real_coding_system (stream, &codesys, &str->eol_type);
2064   set_decoding_stream_coding_system (lstr, codesys);
2065   str->decst.eol_type = str->eol_type;
2066   str->decst.mask = ~0;
2067   XSETLSTREAM (obj, lstr);
2068   return obj;
2069 }
2070
2071 Lisp_Object
2072 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2073 {
2074   return make_decoding_stream_1 (stream, codesys, "r");
2075 }
2076
2077 Lisp_Object
2078 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2079 {
2080   return make_decoding_stream_1 (stream, codesys, "w");
2081 }
2082
2083 /* Note: the decode_coding_* functions all take the same
2084    arguments as mule_decode(), which is to say some SRC data of
2085    size N, which is to be stored into dynamic array DST.
2086    DECODING is the stream within which the decoding is
2087    taking place, but no data is actually read from or
2088    written to that stream; that is handled in decoding_reader()
2089    or decoding_writer().  This allows the same functions to
2090    be used for both reading and writing. */
2091
2092 static void
2093 mule_decode (Lstream *decoding, CONST unsigned char *src,
2094              unsigned_char_dynarr *dst, unsigned int n)
2095 {
2096   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2097
2098   /* If necessary, do encoding-detection now.  We do this when
2099      we're a writing stream or a non-seekable reading stream,
2100      meaning that we can't just process the whole input,
2101      rewind, and start over. */
2102
2103   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2104       str->eol_type == EOL_AUTODETECT)
2105     {
2106       Lisp_Object codesys;
2107
2108       XSETCODING_SYSTEM (codesys, str->codesys);
2109       detect_coding_type (&str->decst, src, n,
2110                           CODING_SYSTEM_TYPE (str->codesys) !=
2111                           CODESYS_AUTODETECT);
2112       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2113           str->decst.mask != ~0)
2114         /* #### This is cheesy.  What we really ought to do is
2115            buffer up a certain amount of data so as to get a
2116            less random result. */
2117         codesys = coding_system_from_mask (str->decst.mask);
2118       str->eol_type = str->decst.eol_type;
2119       if (XCODING_SYSTEM (codesys) != str->codesys)
2120         {
2121           /* Preserve the CODING_STATE_END flag in case it was set.
2122              If we erase it, bad things might happen. */
2123           int was_end = str->flags & CODING_STATE_END;
2124           set_decoding_stream_coding_system (decoding, codesys);
2125           if (was_end)
2126             str->flags |= CODING_STATE_END;
2127         }
2128     }
2129
2130   switch (CODING_SYSTEM_TYPE (str->codesys))
2131     {
2132 #ifdef DEBUG_XEMACS
2133     case CODESYS_INTERNAL:
2134       Dynarr_add_many (dst, src, n);
2135       break;
2136 #endif
2137     case CODESYS_AUTODETECT:
2138       /* If we got this far and still haven't decided on the coding
2139          system, then do no conversion. */
2140     case CODESYS_NO_CONVERSION:
2141       decode_coding_no_conversion (decoding, src, dst, n);
2142       break;
2143 #ifdef MULE
2144     case CODESYS_SHIFT_JIS:
2145       decode_coding_sjis (decoding, src, dst, n);
2146       break;
2147     case CODESYS_BIG5:
2148       decode_coding_big5 (decoding, src, dst, n);
2149       break;
2150     case CODESYS_UCS4:
2151       decode_coding_ucs4 (decoding, src, dst, n);
2152       break;
2153     case CODESYS_UTF8:
2154       decode_coding_utf8 (decoding, src, dst, n);
2155       break;
2156     case CODESYS_CCL:
2157       str->ccl.last_block = str->flags & CODING_STATE_END;
2158       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2159       break;
2160     case CODESYS_ISO2022:
2161       decode_coding_iso2022 (decoding, src, dst, n);
2162       break;
2163 #endif /* MULE */
2164     default:
2165       abort ();
2166     }
2167 }
2168
2169 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2170 Decode the text between START and END which is encoded in CODING-SYSTEM.
2171 This is useful if you've read in encoded text from a file without decoding
2172 it (e.g. you read in a JIS-formatted file but used the `binary' or
2173 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2174 Return length of decoded text.
2175 BUFFER defaults to the current buffer if unspecified.
2176 */
2177        (start, end, coding_system, buffer))
2178 {
2179   Bufpos b, e;
2180   struct buffer *buf = decode_buffer (buffer, 0);
2181   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2182   Lstream *istr, *ostr;
2183   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2184
2185   get_buffer_range_char (buf, start, end, &b, &e, 0);
2186
2187   barf_if_buffer_read_only (buf, b, e);
2188
2189   coding_system = Fget_coding_system (coding_system);
2190   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2191   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2192   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2193                                               coding_system);
2194   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2195                                            Fget_coding_system (Qbinary));
2196   istr = XLSTREAM (instream);
2197   ostr = XLSTREAM (outstream);
2198   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2199
2200   /* The chain of streams looks like this:
2201
2202      [BUFFER] <----- send through
2203                      ------> [ENCODE AS BINARY]
2204                              ------> [DECODE AS SPECIFIED]
2205                                      ------> [BUFFER]
2206    */
2207
2208   while (1)
2209     {
2210       char tempbuf[1024]; /* some random amount */
2211       Bufpos newpos, even_newer_pos;
2212       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2213       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2214
2215       if (!size_in_bytes)
2216         break;
2217       newpos = lisp_buffer_stream_startpos (istr);
2218       Lstream_write (ostr, tempbuf, size_in_bytes);
2219       even_newer_pos = lisp_buffer_stream_startpos (istr);
2220       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2221                            even_newer_pos, 0);
2222     }
2223   Lstream_close (istr);
2224   Lstream_close (ostr);
2225   UNGCPRO;
2226   Lstream_delete (istr);
2227   Lstream_delete (ostr);
2228   Lstream_delete (XLSTREAM (de_outstream));
2229   Lstream_delete (XLSTREAM (lb_outstream));
2230   return Qnil;
2231 }
2232
2233 \f
2234 /************************************************************************/
2235 /*           Converting to an external encoding ("encoding")            */
2236 /************************************************************************/
2237
2238 /* An encoding stream is an output stream.  When you create the
2239    stream, you specify the coding system that governs the encoding
2240    and another stream that the resulting encoded data is to be
2241    sent to, and then start sending data to it. */
2242
2243 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2244
2245 struct encoding_stream
2246 {
2247   /* Coding system that governs the conversion. */
2248   Lisp_Coding_System *codesys;
2249
2250   /* Stream that we read the encoded data from or
2251      write the decoded data to. */
2252   Lstream *other_end;
2253
2254   /* If we are reading, then we can return only a fixed amount of
2255      data, so if the conversion resulted in too much data, we store it
2256      here for retrieval the next time around. */
2257   unsigned_char_dynarr *runoff;
2258
2259   /* FLAGS holds flags indicating the current state of the encoding.
2260      Some of these flags are dependent on the coding system. */
2261   unsigned int flags;
2262
2263   /* CH holds a partially built-up character.  Since we only deal
2264      with one- and two-byte characters at the moment, we only use
2265      this to store the first byte of a two-byte character. */
2266   unsigned int ch;
2267 #ifdef MULE
2268   /* Additional information used by the ISO2022 encoder. */
2269   struct
2270     {
2271       /* CHARSET holds the character sets currently assigned to the G0
2272          through G3 registers.  It is initialized from the array
2273          INITIAL_CHARSET in CODESYS. */
2274       Lisp_Object charset[4];
2275
2276       /* Which registers are currently invoked into the left (GL) and
2277          right (GR) halves of the 8-bit encoding space? */
2278       int register_left, register_right;
2279
2280       /* Whether we need to explicitly designate the charset in the
2281          G? register before using it.  It is initialized from the
2282          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2283       unsigned char force_charset_on_output[4];
2284
2285       /* Other state variables that need to be preserved across
2286          invocations. */
2287       Lisp_Object current_charset;
2288       int current_half;
2289       int current_char_boundary;
2290     } iso2022;
2291
2292   /* Additional information (the state of the running CCL program)
2293      used by the CCL encoder. */
2294   struct ccl_program ccl;
2295 #endif /* MULE */
2296 };
2297
2298 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2299 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2300                             size_t size);
2301 static int encoding_rewinder   (Lstream *stream);
2302 static int encoding_seekable_p (Lstream *stream);
2303 static int encoding_flusher    (Lstream *stream);
2304 static int encoding_closer     (Lstream *stream);
2305
2306 static Lisp_Object encoding_marker (Lisp_Object stream,
2307                                     void (*markobj) (Lisp_Object));
2308
2309 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2310                                sizeof (struct encoding_stream));
2311
2312 static Lisp_Object
2313 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2314 {
2315   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2316   Lisp_Object str_obj;
2317
2318   /* We do not need to mark the coding systems or charsets stored
2319      within the stream because they are stored in a global list
2320      and automatically marked. */
2321
2322   XSETLSTREAM (str_obj, str);
2323   markobj (str_obj);
2324   if (str->imp->marker)
2325     return (str->imp->marker) (str_obj, markobj);
2326   else
2327     return Qnil;
2328 }
2329
2330 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2331    so we read data from the other end, encode it, and store it into DATA. */
2332
2333 static int
2334 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2335 {
2336   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2337   unsigned char *orig_data = data;
2338   int read_size;
2339   int error_occurred = 0;
2340
2341   /* We need to interface to mule_encode(), which expects to take some
2342      amount of data and store the result into a Dynarr.  We have
2343      mule_encode() store into str->runoff, and take data from there
2344      as necessary. */
2345
2346   /* We loop until we have enough data, reading chunks from the other
2347      end and encoding it. */
2348   while (1)
2349     {
2350       /* Take data from the runoff if we can.  Make sure to take at
2351          most SIZE bytes, and delete the data from the runoff. */
2352       if (Dynarr_length (str->runoff) > 0)
2353         {
2354           int chunk = min ((int) size, Dynarr_length (str->runoff));
2355           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2356           Dynarr_delete_many (str->runoff, 0, chunk);
2357           data += chunk;
2358           size -= chunk;
2359         }
2360
2361       if (size == 0)
2362         break; /* No more room for data */
2363
2364       if (str->flags & CODING_STATE_END)
2365         /* This means that on the previous iteration, we hit the EOF on
2366            the other end.  We loop once more so that mule_encode() can
2367            output any final stuff it may be holding, or any "go back
2368            to a sane state" escape sequences. (This latter makes sense
2369            during encoding.) */
2370         break;
2371
2372       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2373          left of storage in it, so it's OK to read directly into it.
2374          (We'll be overwriting above, after we've encoded it into the
2375          runoff.) */
2376       read_size = Lstream_read (str->other_end, data, size);
2377       if (read_size < 0)
2378         {
2379           error_occurred = 1;
2380           break;
2381         }
2382       if (read_size == 0)
2383         /* There might be some more end data produced in the translation.
2384            See the comment above. */
2385         str->flags |= CODING_STATE_END;
2386       mule_encode (stream, data, str->runoff, read_size);
2387     }
2388
2389   if (data == orig_data)
2390     return error_occurred ? -1 : 0;
2391   else
2392     return data - orig_data;
2393 }
2394
2395 static int
2396 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2397 {
2398   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2399   int retval;
2400
2401   /* Encode all our data into the runoff, and then attempt to write
2402      it all out to the other end.  Remove whatever chunk we succeeded
2403      in writing. */
2404   mule_encode (stream, data, str->runoff, size);
2405   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2406                           Dynarr_length (str->runoff));
2407   if (retval > 0)
2408     Dynarr_delete_many (str->runoff, 0, retval);
2409   /* Do NOT return retval.  The return value indicates how much
2410      of the incoming data was written, not how many bytes were
2411      written. */
2412   return size;
2413 }
2414
2415 static void
2416 reset_encoding_stream (struct encoding_stream *str)
2417 {
2418 #ifdef MULE
2419   switch (CODING_SYSTEM_TYPE (str->codesys))
2420     {
2421     case CODESYS_ISO2022:
2422       {
2423         int i;
2424
2425         for (i = 0; i < 4; i++)
2426           {
2427             str->iso2022.charset[i] =
2428               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2429             str->iso2022.force_charset_on_output[i] =
2430               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2431           }
2432         str->iso2022.register_left = 0;
2433         str->iso2022.register_right = 1;
2434         str->iso2022.current_charset = Qnil;
2435         str->iso2022.current_half = 0;
2436         str->iso2022.current_char_boundary = 1;
2437         break;
2438       }
2439     case CODESYS_CCL:
2440       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2441       break;
2442     default:
2443       break;
2444     }
2445 #endif /* MULE */
2446
2447   str->flags = str->ch = 0;
2448 }
2449
2450 static int
2451 encoding_rewinder (Lstream *stream)
2452 {
2453   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2454   reset_encoding_stream (str);
2455   Dynarr_reset (str->runoff);
2456   return Lstream_rewind (str->other_end);
2457 }
2458
2459 static int
2460 encoding_seekable_p (Lstream *stream)
2461 {
2462   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2463   return Lstream_seekable_p (str->other_end);
2464 }
2465
2466 static int
2467 encoding_flusher (Lstream *stream)
2468 {
2469   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2470   return Lstream_flush (str->other_end);
2471 }
2472
2473 static int
2474 encoding_closer (Lstream *stream)
2475 {
2476   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2477   if (stream->flags & LSTREAM_FL_WRITE)
2478     {
2479       str->flags |= CODING_STATE_END;
2480       encoding_writer (stream, 0, 0);
2481     }
2482   Dynarr_free (str->runoff);
2483   return Lstream_close (str->other_end);
2484 }
2485
2486 Lisp_Object
2487 encoding_stream_coding_system (Lstream *stream)
2488 {
2489   Lisp_Object coding_system;
2490   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2491
2492   XSETCODING_SYSTEM (coding_system, str->codesys);
2493   return coding_system;
2494 }
2495
2496 void
2497 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2498 {
2499   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2500   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2501   str->codesys = cs;
2502   reset_encoding_stream (str);
2503 }
2504
2505 static Lisp_Object
2506 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2507                         CONST char *mode)
2508 {
2509   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2510   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2511   Lisp_Object obj;
2512
2513   xzero (*str);
2514   str->runoff = Dynarr_new (unsigned_char);
2515   str->other_end = stream;
2516   set_encoding_stream_coding_system (lstr, codesys);
2517   XSETLSTREAM (obj, lstr);
2518   return obj;
2519 }
2520
2521 Lisp_Object
2522 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2523 {
2524   return make_encoding_stream_1 (stream, codesys, "r");
2525 }
2526
2527 Lisp_Object
2528 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2529 {
2530   return make_encoding_stream_1 (stream, codesys, "w");
2531 }
2532
2533 /* Convert N bytes of internally-formatted data stored in SRC to an
2534    external format, according to the encoding stream ENCODING.
2535    Store the encoded data into DST. */
2536
2537 static void
2538 mule_encode (Lstream *encoding, CONST unsigned char *src,
2539              unsigned_char_dynarr *dst, unsigned int n)
2540 {
2541   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2542
2543   switch (CODING_SYSTEM_TYPE (str->codesys))
2544     {
2545 #ifdef DEBUG_XEMACS
2546     case CODESYS_INTERNAL:
2547       Dynarr_add_many (dst, src, n);
2548       break;
2549 #endif
2550     case CODESYS_AUTODETECT:
2551       /* If we got this far and still haven't decided on the coding
2552          system, then do no conversion. */
2553     case CODESYS_NO_CONVERSION:
2554       encode_coding_no_conversion (encoding, src, dst, n);
2555       break;
2556 #ifdef MULE
2557     case CODESYS_SHIFT_JIS:
2558       encode_coding_sjis (encoding, src, dst, n);
2559       break;
2560     case CODESYS_BIG5:
2561       encode_coding_big5 (encoding, src, dst, n);
2562       break;
2563     case CODESYS_UCS4:
2564       encode_coding_ucs4 (encoding, src, dst, n);
2565       break;
2566     case CODESYS_UTF8:
2567       encode_coding_utf8 (encoding, src, dst, n);
2568       break;
2569     case CODESYS_CCL:
2570       str->ccl.last_block = str->flags & CODING_STATE_END;
2571       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2572       break;
2573     case CODESYS_ISO2022:
2574       encode_coding_iso2022 (encoding, src, dst, n);
2575       break;
2576 #endif /* MULE */
2577     default:
2578       abort ();
2579     }
2580 }
2581
2582 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2583 Encode the text between START and END using CODING-SYSTEM.
2584 This will, for example, convert Japanese characters into stuff such as
2585 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2586 text.  BUFFER defaults to the current buffer if unspecified.
2587 */
2588        (start, end, coding_system, buffer))
2589 {
2590   Bufpos b, e;
2591   struct buffer *buf = decode_buffer (buffer, 0);
2592   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2593   Lstream *istr, *ostr;
2594   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2595
2596   get_buffer_range_char (buf, start, end, &b, &e, 0);
2597
2598   barf_if_buffer_read_only (buf, b, e);
2599
2600   coding_system = Fget_coding_system (coding_system);
2601   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2602   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2603   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2604                                               Fget_coding_system (Qbinary));
2605   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2606                                            coding_system);
2607   istr = XLSTREAM (instream);
2608   ostr = XLSTREAM (outstream);
2609   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2610   /* The chain of streams looks like this:
2611
2612      [BUFFER] <----- send through
2613                      ------> [ENCODE AS SPECIFIED]
2614                              ------> [DECODE AS BINARY]
2615                                      ------> [BUFFER]
2616    */
2617   while (1)
2618     {
2619       char tempbuf[1024]; /* some random amount */
2620       Bufpos newpos, even_newer_pos;
2621       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2622       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2623
2624       if (!size_in_bytes)
2625         break;
2626       newpos = lisp_buffer_stream_startpos (istr);
2627       Lstream_write (ostr, tempbuf, size_in_bytes);
2628       even_newer_pos = lisp_buffer_stream_startpos (istr);
2629       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2630                            even_newer_pos, 0);
2631     }
2632
2633   {
2634     Charcount retlen =
2635       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2636     Lstream_close (istr);
2637     Lstream_close (ostr);
2638     UNGCPRO;
2639     Lstream_delete (istr);
2640     Lstream_delete (ostr);
2641     Lstream_delete (XLSTREAM (de_outstream));
2642     Lstream_delete (XLSTREAM (lb_outstream));
2643     return make_int (retlen);
2644   }
2645 }
2646
2647 #ifdef MULE
2648 \f
2649 /************************************************************************/
2650 /*                          Shift-JIS methods                           */
2651 /************************************************************************/
2652
2653 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2654    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2655    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2656    encoded by "position-code + 0x80".  A character of JISX0208
2657    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2658    position-codes are divided and shifted so that it fit in the range
2659    below.
2660
2661    --- CODE RANGE of Shift-JIS ---
2662    (character set)      (range)
2663    ASCII                0x00 .. 0x7F
2664    JISX0201-Kana        0xA0 .. 0xDF
2665    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2666             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2667    -------------------------------
2668
2669 */
2670
2671 /* Is this the first byte of a Shift-JIS two-byte char? */
2672
2673 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2674   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2675
2676 /* Is this the second byte of a Shift-JIS two-byte char? */
2677
2678 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2679   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2680
2681 #define BYTE_SJIS_KATAKANA_P(c) \
2682   ((c) >= 0xA1 && (c) <= 0xDF)
2683
2684 static int
2685 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2686                     unsigned int n)
2687 {
2688   int c;
2689
2690   while (n--)
2691     {
2692       c = *src++;
2693       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2694         return 0;
2695       if (st->shift_jis.in_second_byte)
2696         {
2697           st->shift_jis.in_second_byte = 0;
2698           if (c < 0x40)
2699             return 0;
2700         }
2701       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2702         st->shift_jis.in_second_byte = 1;
2703     }
2704   return CODING_CATEGORY_SHIFT_JIS_MASK;
2705 }
2706
2707 /* Convert Shift-JIS data to internal format. */
2708
2709 static void
2710 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2711                     unsigned_char_dynarr *dst, unsigned int n)
2712 {
2713   unsigned char c;
2714   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2715   unsigned int flags  = str->flags;
2716   unsigned int ch     = str->ch;
2717   eol_type_t eol_type = str->eol_type;
2718
2719   while (n--)
2720     {
2721       c = *src++;
2722
2723       if (ch)
2724         {
2725           /* Previous character was first byte of Shift-JIS Kanji char. */
2726           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2727             {
2728               unsigned char e1, e2;
2729
2730               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2731               DECODE_SJIS (ch, c, e1, e2);
2732               Dynarr_add (dst, e1);
2733               Dynarr_add (dst, e2);
2734             }
2735           else
2736             {
2737               DECODE_ADD_BINARY_CHAR (ch, dst);
2738               DECODE_ADD_BINARY_CHAR (c, dst);
2739             }
2740           ch = 0;
2741         }
2742       else
2743         {
2744           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2745           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2746             ch = c;
2747           else if (BYTE_SJIS_KATAKANA_P (c))
2748             {
2749               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2750               Dynarr_add (dst, c);
2751             }
2752           else
2753             DECODE_ADD_BINARY_CHAR (c, dst);
2754         }
2755     label_continue_loop:;
2756     }
2757
2758   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2759
2760   str->flags = flags;
2761   str->ch    = ch;
2762 }
2763
2764 /* Convert internally-formatted data to Shift-JIS. */
2765
2766 static void
2767 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2768                     unsigned_char_dynarr *dst, unsigned int n)
2769 {
2770   unsigned char c;
2771   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2772   unsigned int flags  = str->flags;
2773   unsigned int ch     = str->ch;
2774   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2775
2776   while (n--)
2777     {
2778       c = *src++;
2779       if (c == '\n')
2780         {
2781           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2782             Dynarr_add (dst, '\r');
2783           if (eol_type != EOL_CR)
2784             Dynarr_add (dst, '\n');
2785           ch = 0;
2786         }
2787       else if (BYTE_ASCII_P (c))
2788         {
2789           Dynarr_add (dst, c);
2790           ch = 0;
2791         }
2792       else if (BUFBYTE_LEADING_BYTE_P (c))
2793         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2794               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2795               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2796       else if (ch)
2797         {
2798           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2799             {
2800               Dynarr_add (dst, c);
2801               ch = 0;
2802             }
2803           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2804                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2805             ch = c;
2806           else
2807             {
2808               unsigned char j1, j2;
2809               ENCODE_SJIS (ch, c, j1, j2);
2810               Dynarr_add (dst, j1);
2811               Dynarr_add (dst, j2);
2812               ch = 0;
2813             }
2814         }
2815     }
2816
2817   str->flags = flags;
2818   str->ch    = ch;
2819 }
2820
2821 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2822 Decode a JISX0208 character of Shift-JIS coding-system.
2823 CODE is the character code in Shift-JIS as a cons of type bytes.
2824 Return the corresponding character.
2825 */
2826        (code))
2827 {
2828   unsigned char c1, c2, s1, s2;
2829
2830   CHECK_CONS (code);
2831   CHECK_INT (XCAR (code));
2832   CHECK_INT (XCDR (code));
2833   s1 = XINT (XCAR (code));
2834   s2 = XINT (XCDR (code));
2835   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2836       BYTE_SJIS_TWO_BYTE_2_P (s2))
2837     {
2838       DECODE_SJIS (s1, s2, c1, c2);
2839       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2840                                    c1 & 0x7F, c2 & 0x7F));
2841     }
2842   else
2843     return Qnil;
2844 }
2845
2846 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2847 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2848 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2849 */
2850        (ch))
2851 {
2852   Lisp_Object charset;
2853   int c1, c2, s1, s2;
2854
2855   CHECK_CHAR_COERCE_INT (ch);
2856   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2857   if (EQ (charset, Vcharset_japanese_jisx0208))
2858     {
2859       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2860       return Fcons (make_int (s1), make_int (s2));
2861     }
2862   else
2863     return Qnil;
2864 }
2865
2866 \f
2867 /************************************************************************/
2868 /*                            Big5 methods                              */
2869 /************************************************************************/
2870
2871 /* BIG5 is a coding system encoding two character sets: ASCII and
2872    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2873    character set and is encoded in two-byte.
2874
2875    --- CODE RANGE of BIG5 ---
2876    (character set)      (range)
2877    ASCII                0x00 .. 0x7F
2878    Big5 (1st byte)      0xA1 .. 0xFE
2879         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2880    --------------------------
2881
2882    Since the number of characters in Big5 is larger than maximum
2883    characters in Emacs' charset (96x96), it can't be handled as one
2884    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2885    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
2886    contains frequently used characters and the latter contains less
2887    frequently used characters.  */
2888
2889 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2890   ((c) >= 0xA1 && (c) <= 0xFE)
2891
2892 /* Is this the second byte of a Shift-JIS two-byte char? */
2893
2894 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2895   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2896
2897 /* Number of Big5 characters which have the same code in 1st byte.  */
2898
2899 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2900
2901 /* Code conversion macros.  These are macros because they are used in
2902    inner loops during code conversion.
2903
2904    Note that temporary variables in macros introduce the classic
2905    dynamic-scoping problems with variable names.  We use capital-
2906    lettered variables in the assumption that XEmacs does not use
2907    capital letters in variables except in a very formalized way
2908    (e.g. Qstring). */
2909
2910 /* Convert Big5 code (b1, b2) into its internal string representation
2911    (lb, c1, c2). */
2912
2913 /* There is a much simpler way to split the Big5 charset into two.
2914    For the moment I'm going to leave the algorithm as-is because it
2915    claims to separate out the most-used characters into a single
2916    charset, which perhaps will lead to optimizations in various
2917    places.
2918
2919    The way the algorithm works is something like this:
2920
2921    Big5 can be viewed as a 94x157 charset, where the row is
2922    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2923    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
2924    the split between low and high column numbers is apparently
2925    meaningless; ascending rows produce less and less frequent chars.
2926    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2927    the first charset, and the upper half (0xC9 .. 0xFE) to the
2928    second.  To do the conversion, we convert the character into
2929    a single number where 0 .. 156 is the first row, 157 .. 313
2930    is the second, etc.  That way, the characters are ordered by
2931    decreasing frequency.  Then we just chop the space in two
2932    and coerce the result into a 94x94 space.
2933    */
2934
2935 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
2936 {                                                                       \
2937   int B1 = b1, B2 = b2;                                                 \
2938   unsigned int I                                                        \
2939     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
2940                                                                         \
2941   if (B1 < 0xC9)                                                        \
2942     {                                                                   \
2943       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
2944     }                                                                   \
2945   else                                                                  \
2946     {                                                                   \
2947       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
2948       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
2949     }                                                                   \
2950   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
2951   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
2952 } while (0)
2953
2954 /* Convert the internal string representation of a Big5 character
2955    (lb, c1, c2) into Big5 code (b1, b2). */
2956
2957 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
2958 {                                                                       \
2959   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
2960                                                                         \
2961   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
2962     {                                                                   \
2963       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
2964     }                                                                   \
2965   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
2966   b2 = I % BIG5_SAME_ROW;                                               \
2967   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
2968 } while (0)
2969
2970 static int
2971 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2972                     unsigned int n)
2973 {
2974   int c;
2975
2976   while (n--)
2977     {
2978       c = *src++;
2979       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2980           (c >= 0x80 && c <= 0xA0))
2981         return 0;
2982       if (st->big5.in_second_byte)
2983         {
2984           st->big5.in_second_byte = 0;
2985           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2986             return 0;
2987         }
2988       else if (c >= 0xA1)
2989         st->big5.in_second_byte = 1;
2990     }
2991   return CODING_CATEGORY_BIG5_MASK;
2992 }
2993
2994 /* Convert Big5 data to internal format. */
2995
2996 static void
2997 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2998                     unsigned_char_dynarr *dst, unsigned int n)
2999 {
3000   unsigned char c;
3001   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3002   unsigned int flags  = str->flags;
3003   unsigned int ch     = str->ch;
3004   eol_type_t eol_type = str->eol_type;
3005
3006   while (n--)
3007     {
3008       c = *src++;
3009       if (ch)
3010         {
3011           /* Previous character was first byte of Big5 char. */
3012           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3013             {
3014               unsigned char b1, b2, b3;
3015               DECODE_BIG5 (ch, c, b1, b2, b3);
3016               Dynarr_add (dst, b1);
3017               Dynarr_add (dst, b2);
3018               Dynarr_add (dst, b3);
3019             }
3020           else
3021             {
3022               DECODE_ADD_BINARY_CHAR (ch, dst);
3023               DECODE_ADD_BINARY_CHAR (c, dst);
3024             }
3025           ch = 0;
3026         }
3027       else
3028         {
3029           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3030           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3031             ch = c;
3032           else
3033             DECODE_ADD_BINARY_CHAR (c, dst);
3034         }
3035     label_continue_loop:;
3036     }
3037
3038   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3039
3040   str->flags = flags;
3041   str->ch    = ch;
3042 }
3043
3044 /* Convert internally-formatted data to Big5. */
3045
3046 static void
3047 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3048                     unsigned_char_dynarr *dst, unsigned int n)
3049 {
3050   unsigned char c;
3051   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3052   unsigned int flags  = str->flags;
3053   unsigned int ch     = str->ch;
3054   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3055
3056   while (n--)
3057     {
3058       c = *src++;
3059       if (c == '\n')
3060         {
3061           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3062             Dynarr_add (dst, '\r');
3063           if (eol_type != EOL_CR)
3064             Dynarr_add (dst, '\n');
3065         }
3066       else if (BYTE_ASCII_P (c))
3067         {
3068           /* ASCII. */
3069           Dynarr_add (dst, c);
3070         }
3071       else if (BUFBYTE_LEADING_BYTE_P (c))
3072         {
3073           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3074               c == LEADING_BYTE_CHINESE_BIG5_2)
3075             {
3076               /* A recognized leading byte. */
3077               ch = c;
3078               continue; /* not done with this character. */
3079             }
3080           /* otherwise just ignore this character. */
3081         }
3082       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3083                ch == LEADING_BYTE_CHINESE_BIG5_2)
3084         {
3085           /* Previous char was a recognized leading byte. */
3086           ch = (ch << 8) | c;
3087           continue; /* not done with this character. */
3088         }
3089       else if (ch)
3090         {
3091           /* Encountering second byte of a Big5 character. */
3092           unsigned char b1, b2;
3093
3094           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3095           Dynarr_add (dst, b1);
3096           Dynarr_add (dst, b2);
3097         }
3098
3099       ch = 0;
3100     }
3101
3102   str->flags = flags;
3103   str->ch    = ch;
3104 }
3105
3106
3107 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3108 Decode a Big5 character CODE of BIG5 coding-system.
3109 CODE is the character code in BIG5, a cons of two integers.
3110 Return the corresponding character.
3111 */
3112        (code))
3113 {
3114   unsigned char c1, c2, b1, b2;
3115
3116   CHECK_CONS (code);
3117   CHECK_INT (XCAR (code));
3118   CHECK_INT (XCDR (code));
3119   b1 = XINT (XCAR (code));
3120   b2 = XINT (XCDR (code));
3121   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3122       BYTE_BIG5_TWO_BYTE_2_P (b2))
3123     {
3124       int leading_byte;
3125       Lisp_Object charset;
3126       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3127       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3128       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3129     }
3130   else
3131     return Qnil;
3132 }
3133
3134 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3135 Encode the Big5 character CH to BIG5 coding-system.
3136 Return the corresponding character code in Big5.
3137 */
3138        (ch))
3139 {
3140   Lisp_Object charset;
3141   int c1, c2, b1, b2;
3142
3143   CHECK_CHAR_COERCE_INT (ch);
3144   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3145   if (EQ (charset, Vcharset_chinese_big5_1) ||
3146       EQ (charset, Vcharset_chinese_big5_2))
3147     {
3148       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3149                    b1, b2);
3150       return Fcons (make_int (b1), make_int (b2));
3151     }
3152   else
3153     return Qnil;
3154 }
3155
3156 \f
3157 /************************************************************************/
3158 /*                           UCS-4 methods                              */
3159 /*                                                                      */
3160 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3161 /*                                                                      */
3162 /************************************************************************/
3163
3164 Lisp_Object ucs_to_mule_table[65536];
3165 Lisp_Object mule_to_ucs_table;
3166
3167 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3168 Map UCS-4 code CODE to Mule character CHARACTER.
3169
3170 Return T on success, NIL on failure.
3171 */
3172        (code, character))
3173 {
3174   unsigned int c;
3175
3176   CHECK_CHAR (character);
3177   CHECK_INT (code);
3178   c = XINT (code);
3179
3180   if (c < sizeof (ucs_to_mule_table))
3181     {
3182       ucs_to_mule_table[c] = character;
3183       return Qt;
3184     }
3185   else
3186     return Qnil;
3187 }
3188
3189 static Lisp_Object
3190 ucs_to_char (unsigned long code)
3191 {
3192   if (code < sizeof (ucs_to_mule_table))
3193     {
3194       return ucs_to_mule_table[code];
3195     }
3196   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3197     {
3198       unsigned int c;
3199
3200       code -= 0xe00000;
3201       c = code % (94 * 94);
3202       return make_char
3203         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3204                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3205                      CHARSET_LEFT_TO_RIGHT),
3206                     c / 94 + 33, c % 94 + 33));
3207     }
3208   else
3209     return Qnil;
3210 }
3211
3212 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3213 Return Mule character corresponding to UCS code CODE (a positive integer).
3214 */
3215        (code))
3216 {
3217   CHECK_NATNUM (code);
3218   return ucs_to_char (XINT (code));
3219 }
3220
3221 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3222 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3223 */
3224        (character, code))
3225 {
3226   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3227           Fset_char_ucs is more restrictive on index arg, but should
3228           check code arg in a char_table method. */
3229   CHECK_CHAR (character);
3230   CHECK_NATNUM (code);
3231   return Fput_char_table (character, code, mule_to_ucs_table);
3232 }
3233
3234 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3235 Return the UCS code (a positive integer) corresponding to CHARACTER.
3236 */
3237        (character))
3238 {
3239   return Fget_char_table (character, mule_to_ucs_table);
3240 }
3241
3242 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3243    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3244    is not found, instead.
3245    #### do something more appropriate (use blob?)
3246         Danger, Will Robinson!  Data loss.  Should we signal user? */
3247 static void
3248 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3249 {
3250   Lisp_Object chr = ucs_to_char (ch);
3251
3252   if (! NILP (chr))
3253     {
3254       Bufbyte work[MAX_EMCHAR_LEN];
3255       int len;
3256
3257       ch = XCHAR (chr);
3258       len = (ch < 128) ?
3259         simple_set_charptr_emchar (work, ch) :
3260         non_ascii_set_charptr_emchar (work, ch);
3261       Dynarr_add_many (dst, work, len);
3262     }
3263   else
3264     {
3265       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3266       Dynarr_add (dst, 34 + 128);
3267       Dynarr_add (dst, 46 + 128);
3268     }
3269 }
3270
3271 static unsigned long
3272 mule_char_to_ucs4 (Lisp_Object charset,
3273                    unsigned char h, unsigned char l)
3274 {
3275   Lisp_Object code
3276     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3277                        mule_to_ucs_table);
3278
3279   if (INTP (code))
3280     {
3281       return XINT (code);
3282     }
3283   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3284             (XCHARSET_CHARS (charset) == 94) )
3285     {
3286       unsigned char final = XCHARSET_FINAL (charset);
3287
3288       if ( ('@' <= final) && (final < 0x7f) )
3289         {
3290           return 0xe00000 + (final - '@') * 94 * 94
3291             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3292         }
3293       else
3294         {
3295           return '?';
3296         }
3297     }
3298   else
3299     {
3300       return '?';
3301     }
3302 }
3303
3304 static void
3305 encode_ucs4 (Lisp_Object charset,
3306              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3307 {
3308   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3309   Dynarr_add (dst,  code >> 24);
3310   Dynarr_add (dst, (code >> 16) & 255);
3311   Dynarr_add (dst, (code >>  8) & 255);
3312   Dynarr_add (dst,  code        & 255);
3313 }
3314
3315 static int
3316 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3317                     unsigned int n)
3318 {
3319   while (n--)
3320     {
3321       int c = *src++;
3322       switch (st->ucs4.in_byte)
3323         {
3324         case 0:
3325           if (c >= 128)
3326             return 0;
3327           else
3328             st->ucs4.in_byte++;
3329           break;
3330         case 3:
3331           st->ucs4.in_byte = 0;
3332           break;
3333         default:
3334           st->ucs4.in_byte++;
3335         }
3336     }
3337   return CODING_CATEGORY_UCS4_MASK;
3338 }
3339
3340 static void
3341 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3342                     unsigned_char_dynarr *dst, unsigned int n)
3343 {
3344   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3345   unsigned int flags = str->flags;
3346   unsigned int ch    = str->ch;
3347
3348   while (n--)
3349     {
3350       unsigned char c = *src++;
3351       switch (flags)
3352         {
3353         case 0:
3354           ch = c;
3355           flags = 3;
3356           break;
3357         case 1:
3358           decode_ucs4 ( ( ch << 8 ) | c, dst);
3359           ch = 0;
3360           flags = 0;
3361           break;
3362         default:
3363           ch = ( ch << 8 ) | c;
3364           flags--;
3365         }
3366     }
3367   if (flags & CODING_STATE_END)
3368     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3369
3370   str->flags = flags;
3371   str->ch    = ch;
3372 }
3373
3374 static void
3375 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3376                     unsigned_char_dynarr *dst, unsigned int n)
3377 {
3378   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3379   unsigned int flags = str->flags;
3380   unsigned int ch = str->ch;
3381   unsigned char char_boundary = str->iso2022.current_char_boundary;
3382   Lisp_Object charset = str->iso2022.current_charset;
3383
3384 #ifdef ENABLE_COMPOSITE_CHARS
3385   /* flags for handling composite chars.  We do a little switcharoo
3386      on the source while we're outputting the composite char. */
3387   unsigned int saved_n = 0;
3388   CONST unsigned char *saved_src = NULL;
3389   int in_composite = 0;
3390
3391  back_to_square_n:
3392 #endif
3393
3394   while (n--)
3395     {
3396       unsigned char c = *src++;
3397
3398       if (BYTE_ASCII_P (c))
3399         {               /* Processing ASCII character */
3400           ch = 0;
3401           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3402           char_boundary = 1;
3403         }
3404       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3405         { /* Processing Leading Byte */
3406           ch = 0;
3407           charset = CHARSET_BY_LEADING_BYTE (c);
3408           if (LEADING_BYTE_PREFIX_P(c))
3409             ch = c;
3410           char_boundary = 0;
3411         }
3412       else
3413         {                       /* Processing Non-ASCII character */
3414           char_boundary = 1;
3415           if (EQ (charset, Vcharset_control_1))
3416             {
3417               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3418             }
3419           else
3420             {
3421               switch (XCHARSET_REP_BYTES (charset))
3422                 {
3423                 case 2:
3424                   encode_ucs4 (charset, c, 0, dst);
3425                   break;
3426                 case 3:
3427                   if (XCHARSET_PRIVATE_P (charset))
3428                     {
3429                       encode_ucs4 (charset, c, 0, dst);
3430                       ch = 0;
3431                     }
3432                   else if (ch)
3433                     {
3434 #ifdef ENABLE_COMPOSITE_CHARS
3435                       if (EQ (charset, Vcharset_composite))
3436                         {
3437                           if (in_composite)
3438                             {
3439                               /* #### Bother! We don't know how to
3440                                  handle this yet. */
3441                               Dynarr_add (dst, 0);
3442                               Dynarr_add (dst, 0);
3443                               Dynarr_add (dst, 0);
3444                               Dynarr_add (dst, '~');
3445                             }
3446                           else
3447                             {
3448                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3449                                                        ch & 0x7F, c & 0x7F);
3450                               Lisp_Object lstr = composite_char_string (emch);
3451                               saved_n = n;
3452                               saved_src = src;
3453                               in_composite = 1;
3454                               src = XSTRING_DATA   (lstr);
3455                               n   = XSTRING_LENGTH (lstr);
3456                             }
3457                         }
3458                       else
3459 #endif /* ENABLE_COMPOSITE_CHARS */
3460                         {
3461                           encode_ucs4(charset, ch, c, dst);
3462                         }
3463                       ch = 0;
3464                     }
3465                   else
3466                     {
3467                       ch = c;
3468                       char_boundary = 0;
3469                     }
3470                   break;
3471                 case 4:
3472                   if (ch)
3473                     {
3474                       encode_ucs4 (charset, ch, c, dst);
3475                       ch = 0;
3476                     }
3477                   else
3478                     {
3479                       ch = c;
3480                       char_boundary = 0;
3481                     }
3482                   break;
3483                 default:
3484                   abort ();
3485                 }
3486             }
3487         }
3488     }
3489
3490 #ifdef ENABLE_COMPOSITE_CHARS
3491   if (in_composite)
3492     {
3493       n = saved_n;
3494       src = saved_src;
3495       in_composite = 0;
3496       goto back_to_square_n; /* Wheeeeeeeee ..... */
3497     }
3498 #endif /* ENABLE_COMPOSITE_CHARS */
3499
3500   str->flags = flags;
3501   str->ch = ch;
3502   str->iso2022.current_char_boundary = char_boundary;
3503   str->iso2022.current_charset = charset;
3504
3505   /* Verbum caro factum est! */
3506 }
3507
3508 \f
3509 /************************************************************************/
3510 /*                           UTF-8 methods                              */
3511 /************************************************************************/
3512
3513 static int
3514 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3515                     unsigned int n)
3516 {
3517   while (n--)
3518     {
3519       unsigned char c = *src++;
3520       switch (st->utf8.in_byte)
3521         {
3522         case 0:
3523           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3524             return 0;
3525           else if (c >= 0xfc)
3526             st->utf8.in_byte = 5;
3527           else if (c >= 0xf8)
3528             st->utf8.in_byte = 4;
3529           else if (c >= 0xf0)
3530             st->utf8.in_byte = 3;
3531           else if (c >= 0xe0)
3532             st->utf8.in_byte = 2;
3533           else if (c >= 0xc0)
3534             st->utf8.in_byte = 1;
3535           else if (c >= 0x80)
3536             return 0;
3537           break;
3538         default:
3539           if ((c & 0xc0) != 0x80)
3540             return 0;
3541           else
3542             st->utf8.in_byte--;
3543         }
3544     }
3545   return CODING_CATEGORY_UTF8_MASK;
3546 }
3547
3548 static void
3549 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3550                     unsigned_char_dynarr *dst, unsigned int n)
3551 {
3552   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3553   unsigned int flags  = str->flags;
3554   unsigned int ch     = str->ch;
3555   eol_type_t eol_type = str->eol_type;
3556
3557   while (n--)
3558     {
3559       unsigned char c = *src++;
3560       switch (flags)
3561         {
3562         case 0:
3563           if ( c >= 0xfc )
3564             {
3565               ch = c & 0x01;
3566               flags = 5;
3567             }
3568           else if ( c >= 0xf8 )
3569             {
3570               ch = c & 0x03;
3571               flags = 4;
3572             }
3573           else if ( c >= 0xf0 )
3574             {
3575               ch = c & 0x07;
3576               flags = 3;
3577             }
3578           else if ( c >= 0xe0 )
3579             {
3580               ch = c & 0x0f;
3581               flags = 2;
3582             }
3583           else if ( c >= 0xc0 )
3584             {
3585               ch = c & 0x1f;
3586               flags = 1;
3587             }
3588           else
3589             {
3590               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3591               decode_ucs4 (c, dst);
3592             }
3593           break;
3594         case 1:
3595           ch = ( ch << 6 ) | ( c & 0x3f );
3596           decode_ucs4 (ch, dst);
3597           ch = 0;
3598           flags = 0;
3599           break;
3600         default:
3601           ch = ( ch << 6 ) | ( c & 0x3f );
3602           flags--;
3603         }
3604     label_continue_loop:;
3605     }
3606
3607   if (flags & CODING_STATE_END)
3608     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3609
3610   str->flags = flags;
3611   str->ch    = ch;
3612 }
3613
3614 static void
3615 encode_utf8 (Lisp_Object charset,
3616              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3617 {
3618   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3619   if ( code <= 0x7f )
3620     {
3621       Dynarr_add (dst, code);
3622     }
3623   else if ( code <= 0x7ff )
3624     {
3625       Dynarr_add (dst, (code >> 6) | 0xc0);
3626       Dynarr_add (dst, (code & 0x3f) | 0x80);
3627     }
3628   else if ( code <= 0xffff )
3629     {
3630       Dynarr_add (dst,  (code >> 12) | 0xe0);
3631       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3632       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3633     }
3634   else if ( code <= 0x1fffff )
3635     {
3636       Dynarr_add (dst,  (code >> 18) | 0xf0);
3637       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3638       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3639       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3640     }
3641   else if ( code <= 0x3ffffff )
3642     {
3643       Dynarr_add (dst,  (code >> 24) | 0xf8);
3644       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3645       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3646       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3647       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3648     }
3649   else
3650     {
3651       Dynarr_add (dst,  (code >> 30) | 0xfc);
3652       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3653       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3654       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3655       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3656       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3657     }
3658 }
3659
3660 static void
3661 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3662                     unsigned_char_dynarr *dst, unsigned int n)
3663 {
3664   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3665   unsigned int flags  = str->flags;
3666   unsigned int ch     = str->ch;
3667   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3668   unsigned char char_boundary = str->iso2022.current_char_boundary;
3669   Lisp_Object charset = str->iso2022.current_charset;
3670
3671 #ifdef ENABLE_COMPOSITE_CHARS
3672   /* flags for handling composite chars.  We do a little switcharoo
3673      on the source while we're outputting the composite char. */
3674   unsigned int saved_n = 0;
3675   CONST unsigned char *saved_src = NULL;
3676   int in_composite = 0;
3677
3678  back_to_square_n:
3679 #endif /* ENABLE_COMPOSITE_CHARS */
3680
3681   while (n--)
3682     {
3683       unsigned char c = *src++;
3684
3685       if (BYTE_ASCII_P (c))
3686         {               /* Processing ASCII character */
3687           ch = 0;
3688           if (c == '\n')
3689             {
3690               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3691                 Dynarr_add (dst, '\r');
3692               if (eol_type != EOL_CR)
3693                 Dynarr_add (dst, c);
3694             }
3695           else
3696             encode_utf8 (Vcharset_ascii, c, 0, dst);
3697           char_boundary = 1;
3698         }
3699       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3700         { /* Processing Leading Byte */
3701           ch = 0;
3702           charset = CHARSET_BY_LEADING_BYTE (c);
3703           if (LEADING_BYTE_PREFIX_P(c))
3704             ch = c;
3705           char_boundary = 0;
3706         }
3707       else
3708         {                       /* Processing Non-ASCII character */
3709           char_boundary = 1;
3710           if (EQ (charset, Vcharset_control_1))
3711             {
3712               encode_utf8 (Vcharset_control_1, c, 0, dst);
3713             }
3714           else
3715             {
3716               switch (XCHARSET_REP_BYTES (charset))
3717                 {
3718                 case 2:
3719                   encode_utf8 (charset, c, 0, dst);
3720                   break;
3721                 case 3:
3722                   if (XCHARSET_PRIVATE_P (charset))
3723                     {
3724                       encode_utf8 (charset, c, 0, dst);
3725                       ch = 0;
3726                     }
3727                   else if (ch)
3728                     {
3729 #ifdef ENABLE_COMPOSITE_CHARS
3730                       if (EQ (charset, Vcharset_composite))
3731                         {
3732                           if (in_composite)
3733                             {
3734                               /* #### Bother! We don't know how to
3735                                  handle this yet. */
3736                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
3737                             }
3738                           else
3739                             {
3740                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3741                                                        ch & 0x7F, c & 0x7F);
3742                               Lisp_Object lstr = composite_char_string (emch);
3743                               saved_n = n;
3744                               saved_src = src;
3745                               in_composite = 1;
3746                               src = XSTRING_DATA   (lstr);
3747                               n   = XSTRING_LENGTH (lstr);
3748                             }
3749                         }
3750                       else
3751 #endif /* ENABLE_COMPOSITE_CHARS */
3752                         {
3753                           encode_utf8 (charset, ch, c, dst);
3754                         }
3755                       ch = 0;
3756                     }
3757                   else
3758                     {
3759                       ch = c;
3760                       char_boundary = 0;
3761                     }
3762                   break;
3763                 case 4:
3764                   if (ch)
3765                     {
3766                       encode_utf8 (charset, ch, c, dst);
3767                       ch = 0;
3768                     }
3769                   else
3770                     {
3771                       ch = c;
3772                       char_boundary = 0;
3773                     }
3774                   break;
3775                 default:
3776                   abort ();
3777                 }
3778             }
3779         }
3780     }
3781
3782 #ifdef ENABLE_COMPOSITE_CHARS
3783   if (in_composite)
3784     {
3785       n = saved_n;
3786       src = saved_src;
3787       in_composite = 0;
3788       goto back_to_square_n; /* Wheeeeeeeee ..... */
3789     }
3790 #endif
3791
3792   str->flags = flags;
3793   str->ch    = ch;
3794   str->iso2022.current_char_boundary = char_boundary;
3795   str->iso2022.current_charset = charset;
3796
3797   /* Verbum caro factum est! */
3798 }
3799
3800 \f
3801 /************************************************************************/
3802 /*                           ISO2022 methods                            */
3803 /************************************************************************/
3804
3805 /* The following note describes the coding system ISO2022 briefly.
3806    Since the intention of this note is to help understand the
3807    functions in this file, some parts are NOT ACCURATE or OVERLY
3808    SIMPLIFIED.  For thorough understanding, please refer to the
3809    original document of ISO2022.
3810
3811    ISO2022 provides many mechanisms to encode several character sets
3812    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3813    is encoded using bytes less than 128.  This may make the encoded
3814    text a little bit longer, but the text passes more easily through
3815    several gateways, some of which strip off MSB (Most Signigant Bit).
3816
3817    There are two kinds of character sets: control character set and
3818    graphic character set.  The former contains control characters such
3819    as `newline' and `escape' to provide control functions (control
3820    functions are also provided by escape sequences).  The latter
3821    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3822    two control character sets and many graphic character sets.
3823
3824    Graphic character sets are classified into one of the following
3825    four classes, according to the number of bytes (DIMENSION) and
3826    number of characters in one dimension (CHARS) of the set:
3827    - DIMENSION1_CHARS94
3828    - DIMENSION1_CHARS96
3829    - DIMENSION2_CHARS94
3830    - DIMENSION2_CHARS96
3831
3832    In addition, each character set is assigned an identification tag,
3833    unique for each set, called "final character" (denoted as <F>
3834    hereafter).  The <F> of each character set is decided by ECMA(*)
3835    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3836    (0x30..0x3F are for private use only).
3837
3838    Note (*): ECMA = European Computer Manufacturers Association
3839
3840    Here are examples of graphic character set [NAME(<F>)]:
3841         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3842         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3843         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3844         o DIMENSION2_CHARS96 -- none for the moment
3845
3846    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3847         C0 [0x00..0x1F] -- control character plane 0
3848         GL [0x20..0x7F] -- graphic character plane 0
3849         C1 [0x80..0x9F] -- control character plane 1
3850         GR [0xA0..0xFF] -- graphic character plane 1
3851
3852    A control character set is directly designated and invoked to C0 or
3853    C1 by an escape sequence.  The most common case is that:
3854    - ISO646's  control character set is designated/invoked to C0, and
3855    - ISO6429's control character set is designated/invoked to C1,
3856    and usually these designations/invocations are omitted in encoded
3857    text.  In a 7-bit environment, only C0 can be used, and a control
3858    character for C1 is encoded by an appropriate escape sequence to
3859    fit into the environment.  All control characters for C1 are
3860    defined to have corresponding escape sequences.
3861
3862    A graphic character set is at first designated to one of four
3863    graphic registers (G0 through G3), then these graphic registers are
3864    invoked to GL or GR.  These designations and invocations can be
3865    done independently.  The most common case is that G0 is invoked to
3866    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3867    these invocations and designations are omitted in encoded text.
3868    In a 7-bit environment, only GL can be used.
3869
3870    When a graphic character set of CHARS94 is invoked to GL, codes
3871    0x20 and 0x7F of the GL area work as control characters SPACE and
3872    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3873    be used.
3874
3875    There are two ways of invocation: locking-shift and single-shift.
3876    With locking-shift, the invocation lasts until the next different
3877    invocation, whereas with single-shift, the invocation affects the
3878    following character only and doesn't affect the locking-shift
3879    state.  Invocations are done by the following control characters or
3880    escape sequences:
3881
3882    ----------------------------------------------------------------------
3883    abbrev  function                  cntrl escape seq   description
3884    ----------------------------------------------------------------------
3885    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3886    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3887    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3888    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3889    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3890    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3891    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3892    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3893    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3894    ----------------------------------------------------------------------
3895    (*) These are not used by any known coding system.
3896
3897    Control characters for these functions are defined by macros
3898    ISO_CODE_XXX in `coding.h'.
3899
3900    Designations are done by the following escape sequences:
3901    ----------------------------------------------------------------------
3902    escape sequence      description
3903    ----------------------------------------------------------------------
3904    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3905    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3906    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3907    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3908    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3909    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3910    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3911    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3912    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3913    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3914    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3915    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3916    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3917    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3918    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3919    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3920    ----------------------------------------------------------------------
3921
3922    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3923    of dimension 1, chars 94, and final character <F>, etc...
3924
3925    Note (*): Although these designations are not allowed in ISO2022,
3926    Emacs accepts them on decoding, and produces them on encoding
3927    CHARS96 character sets in a coding system which is characterized as
3928    7-bit environment, non-locking-shift, and non-single-shift.
3929
3930    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3931    '(' can be omitted.  We refer to this as "short-form" hereafter.
3932
3933    Now you may notice that there are a lot of ways for encoding the
3934    same multilingual text in ISO2022.  Actually, there exist many
3935    coding systems such as Compound Text (used in X11's inter client
3936    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3937    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3938    localized platforms), and all of these are variants of ISO2022.
3939
3940    In addition to the above, Emacs handles two more kinds of escape
3941    sequences: ISO6429's direction specification and Emacs' private
3942    sequence for specifying character composition.
3943
3944    ISO6429's direction specification takes the following form:
3945         o CSI ']'      -- end of the current direction
3946         o CSI '0' ']'  -- end of the current direction
3947         o CSI '1' ']'  -- start of left-to-right text
3948         o CSI '2' ']'  -- start of right-to-left text
3949    The control character CSI (0x9B: control sequence introducer) is
3950    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3951
3952    Character composition specification takes the following form:
3953         o ESC '0' -- start character composition
3954         o ESC '1' -- end character composition
3955    Since these are not standard escape sequences of any ISO standard,
3956    their use with these meanings is restricted to Emacs only.  */
3957
3958 static void
3959 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3960 {
3961   int i;
3962
3963   for (i = 0; i < 4; i++)
3964     {
3965       if (!NILP (coding_system))
3966         iso->charset[i] =
3967           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3968       else
3969         iso->charset[i] = Qt;
3970       iso->invalid_designated[i] = 0;
3971     }
3972   iso->esc = ISO_ESC_NOTHING;
3973   iso->esc_bytes_index = 0;
3974   iso->register_left = 0;
3975   iso->register_right = 1;
3976   iso->switched_dir_and_no_valid_charset_yet = 0;
3977   iso->invalid_switch_dir = 0;
3978   iso->output_direction_sequence = 0;
3979   iso->output_literally = 0;
3980 #ifdef ENABLE_COMPOSITE_CHARS
3981   if (iso->composite_chars)
3982     Dynarr_reset (iso->composite_chars);
3983 #endif
3984 }
3985
3986 static int
3987 fit_to_be_escape_quoted (unsigned char c)
3988 {
3989   switch (c)
3990     {
3991     case ISO_CODE_ESC:
3992     case ISO_CODE_CSI:
3993     case ISO_CODE_SS2:
3994     case ISO_CODE_SS3:
3995     case ISO_CODE_SO:
3996     case ISO_CODE_SI:
3997       return 1;
3998
3999     default:
4000       return 0;
4001     }
4002 }
4003
4004 /* Parse one byte of an ISO2022 escape sequence.
4005    If the result is an invalid escape sequence, return 0 and
4006    do not change anything in STR.  Otherwise, if the result is
4007    an incomplete escape sequence, update ISO2022.ESC and
4008    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4009    all the state variables (but not ISO2022.ESC_BYTES) and
4010    return 1.
4011
4012    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4013    or invocation of an invalid character set and treat that as
4014    an unrecognized escape sequence. */
4015
4016 static int
4017 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4018                    unsigned char c, unsigned int *flags,
4019                    int check_invalid_charsets)
4020 {
4021   /* (1) If we're at the end of a designation sequence, CS is the
4022      charset being designated and REG is the register to designate
4023      it to.
4024
4025      (2) If we're at the end of a locking-shift sequence, REG is
4026      the register to invoke and HALF (0 == left, 1 == right) is
4027      the half to invoke it into.
4028
4029      (3) If we're at the end of a single-shift sequence, REG is
4030      the register to invoke. */
4031   Lisp_Object cs = Qnil;
4032   int reg, half;
4033
4034   /* NOTE: This code does goto's all over the fucking place.
4035      The reason for this is that we're basically implementing
4036      a state machine here, and hierarchical languages like C
4037      don't really provide a clean way of doing this. */
4038
4039   if (! (*flags & CODING_STATE_ESCAPE))
4040     /* At beginning of escape sequence; we need to reset our
4041        escape-state variables. */
4042     iso->esc = ISO_ESC_NOTHING;
4043
4044   iso->output_literally = 0;
4045   iso->output_direction_sequence = 0;
4046
4047   switch (iso->esc)
4048     {
4049     case ISO_ESC_NOTHING:
4050       iso->esc_bytes_index = 0;
4051       switch (c)
4052         {
4053         case ISO_CODE_ESC:      /* Start escape sequence */
4054           *flags |= CODING_STATE_ESCAPE;
4055           iso->esc = ISO_ESC;
4056           goto not_done;
4057
4058         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4059           *flags |= CODING_STATE_ESCAPE;
4060           iso->esc = ISO_ESC_5_11;
4061           goto not_done;
4062
4063         case ISO_CODE_SO:       /* locking shift 1 */
4064           reg = 1; half = 0;
4065           goto locking_shift;
4066         case ISO_CODE_SI:       /* locking shift 0 */
4067           reg = 0; half = 0;
4068           goto locking_shift;
4069
4070         case ISO_CODE_SS2:      /* single shift */
4071           reg = 2;
4072           goto single_shift;
4073         case ISO_CODE_SS3:      /* single shift */
4074           reg = 3;
4075           goto single_shift;
4076
4077         default:                        /* Other control characters */
4078           return 0;
4079         }
4080
4081     case ISO_ESC:
4082       switch (c)
4083         {
4084           /**** single shift ****/
4085
4086         case 'N':       /* single shift 2 */
4087           reg = 2;
4088           goto single_shift;
4089         case 'O':       /* single shift 3 */
4090           reg = 3;
4091           goto single_shift;
4092
4093           /**** locking shift ****/
4094
4095         case '~':       /* locking shift 1 right */
4096           reg = 1; half = 1;
4097           goto locking_shift;
4098         case 'n':       /* locking shift 2 */
4099           reg = 2; half = 0;
4100           goto locking_shift;
4101         case '}':       /* locking shift 2 right */
4102           reg = 2; half = 1;
4103           goto locking_shift;
4104         case 'o':       /* locking shift 3 */
4105           reg = 3; half = 0;
4106           goto locking_shift;
4107         case '|':       /* locking shift 3 right */
4108           reg = 3; half = 1;
4109           goto locking_shift;
4110
4111 #ifdef ENABLE_COMPOSITE_CHARS
4112           /**** composite ****/
4113
4114         case '0':
4115           iso->esc = ISO_ESC_START_COMPOSITE;
4116           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4117             CODING_STATE_COMPOSITE;
4118           return 1;
4119
4120         case '1':
4121           iso->esc = ISO_ESC_END_COMPOSITE;
4122           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4123             ~CODING_STATE_COMPOSITE;
4124           return 1;
4125 #endif /* ENABLE_COMPOSITE_CHARS */
4126
4127           /**** directionality ****/
4128
4129         case '[':
4130           iso->esc = ISO_ESC_5_11;
4131           goto not_done;
4132
4133           /**** designation ****/
4134
4135         case '$':       /* multibyte charset prefix */
4136           iso->esc = ISO_ESC_2_4;
4137           goto not_done;
4138
4139         default:
4140           if (0x28 <= c && c <= 0x2F)
4141             {
4142               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4143               goto not_done;
4144             }
4145
4146           /* This function is called with CODESYS equal to nil when
4147              doing coding-system detection. */
4148           if (!NILP (codesys)
4149               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4150               && fit_to_be_escape_quoted (c))
4151             {
4152               iso->esc = ISO_ESC_LITERAL;
4153               *flags &= CODING_STATE_ISO2022_LOCK;
4154               return 1;
4155             }
4156
4157           /* bzzzt! */
4158           return 0;
4159         }
4160
4161
4162
4163       /**** directionality ****/
4164
4165     case ISO_ESC_5_11:          /* ISO6429 direction control */
4166       if (c == ']')
4167         {
4168           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4169           goto directionality;
4170         }
4171       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4172       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4173       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4174       else               return 0;
4175       goto not_done;
4176
4177     case ISO_ESC_5_11_0:
4178       if (c == ']')
4179         {
4180           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4181           goto directionality;
4182         }
4183       return 0;
4184
4185     case ISO_ESC_5_11_1:
4186       if (c == ']')
4187         {
4188           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4189           goto directionality;
4190         }
4191       return 0;
4192
4193     case ISO_ESC_5_11_2:
4194       if (c == ']')
4195         {
4196           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4197           goto directionality;
4198         }
4199       return 0;
4200
4201     directionality:
4202       iso->esc = ISO_ESC_DIRECTIONALITY;
4203       /* Various junk here to attempt to preserve the direction sequences
4204          literally in the text if they would otherwise be swallowed due
4205          to invalid designations that don't show up as actual charset
4206          changes in the text. */
4207       if (iso->invalid_switch_dir)
4208         {
4209           /* We already inserted a direction switch literally into the
4210              text.  We assume (#### this may not be right) that the
4211              next direction switch is the one going the other way,
4212              and we need to output that literally as well. */
4213           iso->output_literally = 1;
4214           iso->invalid_switch_dir = 0;
4215         }
4216       else
4217         {
4218           int jj;
4219
4220           /* If we are in the thrall of an invalid designation,
4221            then stick the directionality sequence literally into the
4222            output stream so it ends up in the original text again. */
4223           for (jj = 0; jj < 4; jj++)
4224             if (iso->invalid_designated[jj])
4225               break;
4226           if (jj < 4)
4227             {
4228               iso->output_literally = 1;
4229               iso->invalid_switch_dir = 1;
4230             }
4231           else
4232             /* Indicate that we haven't yet seen a valid designation,
4233                so that if a switch-dir is directly followed by an
4234                invalid designation, both get inserted literally. */
4235             iso->switched_dir_and_no_valid_charset_yet = 1;
4236         }
4237       return 1;
4238
4239
4240       /**** designation ****/
4241
4242     case ISO_ESC_2_4:
4243       if (0x28 <= c && c <= 0x2F)
4244         {
4245           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4246           goto not_done;
4247         }
4248       if (0x40 <= c && c <= 0x42)
4249         {
4250           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4251                                       *flags & CODING_STATE_R2L ?
4252                                       CHARSET_RIGHT_TO_LEFT :
4253                                       CHARSET_LEFT_TO_RIGHT);
4254           reg = 0;
4255           goto designated;
4256         }
4257       return 0;
4258
4259     default:
4260       {
4261         int type =-1;
4262
4263         if (c < '0' || c > '~')
4264           return 0; /* bad final byte */
4265
4266         if (iso->esc >= ISO_ESC_2_8 &&
4267             iso->esc <= ISO_ESC_2_15)
4268           {
4269             type = ((iso->esc >= ISO_ESC_2_12) ?
4270                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4271             reg = (iso->esc - ISO_ESC_2_8) & 3;
4272           }
4273         else if (iso->esc >= ISO_ESC_2_4_8 &&
4274                  iso->esc <= ISO_ESC_2_4_15)
4275           {
4276             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4277                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4278             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4279           }
4280         else
4281           {
4282             /* Can this ever be reached? -slb */
4283             abort();
4284           }
4285
4286         cs = CHARSET_BY_ATTRIBUTES (type, c,
4287                                     *flags & CODING_STATE_R2L ?
4288                                     CHARSET_RIGHT_TO_LEFT :
4289                                     CHARSET_LEFT_TO_RIGHT);
4290         goto designated;
4291       }
4292     }
4293
4294  not_done:
4295   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4296   return -1;
4297
4298  single_shift:
4299   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4300     /* can't invoke something that ain't there. */
4301     return 0;
4302   iso->esc = ISO_ESC_SINGLE_SHIFT;
4303   *flags &= CODING_STATE_ISO2022_LOCK;
4304   if (reg == 2)
4305     *flags |= CODING_STATE_SS2;
4306   else
4307     *flags |= CODING_STATE_SS3;
4308   return 1;
4309
4310  locking_shift:
4311   if (check_invalid_charsets &&
4312       !CHARSETP (iso->charset[reg]))
4313     /* can't invoke something that ain't there. */
4314     return 0;
4315   if (half)
4316     iso->register_right = reg;
4317   else
4318     iso->register_left = reg;
4319   *flags &= CODING_STATE_ISO2022_LOCK;
4320   iso->esc = ISO_ESC_LOCKING_SHIFT;
4321   return 1;
4322
4323  designated:
4324   if (NILP (cs) && check_invalid_charsets)
4325     {
4326       iso->invalid_designated[reg] = 1;
4327       iso->charset[reg] = Vcharset_ascii;
4328       iso->esc = ISO_ESC_DESIGNATE;
4329       *flags &= CODING_STATE_ISO2022_LOCK;
4330       iso->output_literally = 1;
4331       if (iso->switched_dir_and_no_valid_charset_yet)
4332         {
4333           /* We encountered a switch-direction followed by an
4334              invalid designation.  Ensure that the switch-direction
4335              gets outputted; otherwise it will probably get eaten
4336              when the text is written out again. */
4337           iso->switched_dir_and_no_valid_charset_yet = 0;
4338           iso->output_direction_sequence = 1;
4339           /* And make sure that the switch-dir going the other
4340              way gets outputted, as well. */
4341           iso->invalid_switch_dir = 1;
4342         }
4343       return 1;
4344     }
4345   /* This function is called with CODESYS equal to nil when
4346      doing coding-system detection. */
4347   if (!NILP (codesys))
4348     {
4349       charset_conversion_spec_dynarr *dyn =
4350         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4351
4352       if (dyn)
4353         {
4354           int i;
4355
4356           for (i = 0; i < Dynarr_length (dyn); i++)
4357             {
4358               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4359               if (EQ (cs, spec->from_charset))
4360                 cs = spec->to_charset;
4361             }
4362         }
4363     }
4364
4365   iso->charset[reg] = cs;
4366   iso->esc = ISO_ESC_DESIGNATE;
4367   *flags &= CODING_STATE_ISO2022_LOCK;
4368   if (iso->invalid_designated[reg])
4369     {
4370       iso->invalid_designated[reg] = 0;
4371       iso->output_literally = 1;
4372     }
4373   if (iso->switched_dir_and_no_valid_charset_yet)
4374     iso->switched_dir_and_no_valid_charset_yet = 0;
4375   return 1;
4376 }
4377
4378 static int
4379 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4380                        unsigned int n)
4381 {
4382   int mask;
4383
4384   /* #### There are serious deficiencies in the recognition mechanism
4385      here.  This needs to be much smarter if it's going to cut it.
4386      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4387      it should be detected as Latin-1.
4388      All the ISO2022 stuff in this file should be synced up with the
4389      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4390      Perhaps we should wait till R2L works in FSF Emacs? */
4391
4392   if (!st->iso2022.initted)
4393     {
4394       reset_iso2022 (Qnil, &st->iso2022.iso);
4395       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4396                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4397                           CODING_CATEGORY_ISO_8_1_MASK |
4398                           CODING_CATEGORY_ISO_8_2_MASK |
4399                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4400       st->iso2022.flags = 0;
4401       st->iso2022.high_byte_count = 0;
4402       st->iso2022.saw_single_shift = 0;
4403       st->iso2022.initted = 1;
4404     }
4405
4406   mask = st->iso2022.mask;
4407
4408   while (n--)
4409     {
4410       int c = *src++;
4411       if (c >= 0xA0)
4412         {
4413           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4414           st->iso2022.high_byte_count++;
4415         }
4416       else
4417         {
4418           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4419             {
4420               if (st->iso2022.high_byte_count & 1)
4421                 /* odd number of high bytes; assume not iso-8-2 */
4422                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4423             }
4424           st->iso2022.high_byte_count = 0;
4425           st->iso2022.saw_single_shift = 0;
4426           if (c > 0x80)
4427             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4428         }
4429       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4430           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4431         { /* control chars */
4432           switch (c)
4433             {
4434               /* Allow and ignore control characters that you might
4435                  reasonably see in a text file */
4436             case '\r':
4437             case '\n':
4438             case '\t':
4439             case  7: /* bell */
4440             case  8: /* backspace */
4441             case 11: /* vertical tab */
4442             case 12: /* form feed */
4443             case 26: /* MS-DOS C-z junk */
4444             case 31: /* '^_' -- for info */
4445               goto label_continue_loop;
4446
4447             default:
4448               break;
4449             }
4450         }
4451
4452       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4453           || BYTE_C1_P (c))
4454         {
4455           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4456                                  &st->iso2022.flags, 0))
4457             {
4458               switch (st->iso2022.iso.esc)
4459                 {
4460                 case ISO_ESC_DESIGNATE:
4461                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4462                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4463                   break;
4464                 case ISO_ESC_LOCKING_SHIFT:
4465                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4466                   goto ran_out_of_chars;
4467                 case ISO_ESC_SINGLE_SHIFT:
4468                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4469                   st->iso2022.saw_single_shift = 1;
4470                   break;
4471                 default:
4472                   break;
4473                 }
4474             }
4475           else
4476             {
4477               mask = 0;
4478               goto ran_out_of_chars;
4479             }
4480         }
4481     label_continue_loop:;
4482     }
4483
4484  ran_out_of_chars:
4485
4486   return mask;
4487 }
4488
4489 static int
4490 postprocess_iso2022_mask (int mask)
4491 {
4492   /* #### kind of cheesy */
4493   /* If seven-bit ISO is allowed, then assume that the encoding is
4494      entirely seven-bit and turn off the eight-bit ones. */
4495   if (mask & CODING_CATEGORY_ISO_7_MASK)
4496     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4497                CODING_CATEGORY_ISO_8_1_MASK |
4498                CODING_CATEGORY_ISO_8_2_MASK);
4499   return mask;
4500 }
4501
4502 /* If FLAGS is a null pointer or specifies right-to-left motion,
4503    output a switch-dir-to-left-to-right sequence to DST.
4504    Also update FLAGS if it is not a null pointer.
4505    If INTERNAL_P is set, we are outputting in internal format and
4506    need to handle the CSI differently. */
4507
4508 static void
4509 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4510                                  unsigned_char_dynarr *dst,
4511                                  unsigned int *flags,
4512                                  int internal_p)
4513 {
4514   if (!flags || (*flags & CODING_STATE_R2L))
4515     {
4516       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4517         {
4518           Dynarr_add (dst, ISO_CODE_ESC);
4519           Dynarr_add (dst, '[');
4520         }
4521       else if (internal_p)
4522         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4523       else
4524         Dynarr_add (dst, ISO_CODE_CSI);
4525       Dynarr_add (dst, '0');
4526       Dynarr_add (dst, ']');
4527       if (flags)
4528         *flags &= ~CODING_STATE_R2L;
4529     }
4530 }
4531
4532 /* If FLAGS is a null pointer or specifies a direction different from
4533    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4534    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4535    sequence to DST.  Also update FLAGS if it is not a null pointer.
4536    If INTERNAL_P is set, we are outputting in internal format and
4537    need to handle the CSI differently. */
4538
4539 static void
4540 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4541                           unsigned_char_dynarr *dst, unsigned int *flags,
4542                           int internal_p)
4543 {
4544   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4545       direction == CHARSET_LEFT_TO_RIGHT)
4546     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4547   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4548            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4549            direction == CHARSET_RIGHT_TO_LEFT)
4550     {
4551       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4552         {
4553           Dynarr_add (dst, ISO_CODE_ESC);
4554           Dynarr_add (dst, '[');
4555         }
4556       else if (internal_p)
4557         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4558       else
4559         Dynarr_add (dst, ISO_CODE_CSI);
4560       Dynarr_add (dst, '2');
4561       Dynarr_add (dst, ']');
4562       if (flags)
4563         *flags |= CODING_STATE_R2L;
4564     }
4565 }
4566
4567 /* Convert ISO2022-format data to internal format. */
4568
4569 static void
4570 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4571                        unsigned_char_dynarr *dst, unsigned int n)
4572 {
4573   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4574   unsigned int flags  = str->flags;
4575   unsigned int ch     = str->ch;
4576   eol_type_t eol_type = str->eol_type;
4577 #ifdef ENABLE_COMPOSITE_CHARS
4578   unsigned_char_dynarr *real_dst = dst;
4579 #endif
4580   Lisp_Object coding_system;
4581
4582   XSETCODING_SYSTEM (coding_system, str->codesys);
4583
4584 #ifdef ENABLE_COMPOSITE_CHARS
4585   if (flags & CODING_STATE_COMPOSITE)
4586     dst = str->iso2022.composite_chars;
4587 #endif /* ENABLE_COMPOSITE_CHARS */
4588
4589   while (n--)
4590     {
4591       unsigned char c = *src++;
4592       if (flags & CODING_STATE_ESCAPE)
4593         {       /* Within ESC sequence */
4594           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4595                                           c, &flags, 1);
4596
4597           if (retval)
4598             {
4599               switch (str->iso2022.esc)
4600                 {
4601 #ifdef ENABLE_COMPOSITE_CHARS
4602                 case ISO_ESC_START_COMPOSITE:
4603                   if (str->iso2022.composite_chars)
4604                     Dynarr_reset (str->iso2022.composite_chars);
4605                   else
4606                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4607                   dst = str->iso2022.composite_chars;
4608                   break;
4609                 case ISO_ESC_END_COMPOSITE:
4610                   {
4611                     Bufbyte comstr[MAX_EMCHAR_LEN];
4612                     Bytecount len;
4613                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4614                                                          Dynarr_length (dst));
4615                     dst = real_dst;
4616                     len = set_charptr_emchar (comstr, emch);
4617                     Dynarr_add_many (dst, comstr, len);
4618                     break;
4619                   }
4620 #endif /* ENABLE_COMPOSITE_CHARS */
4621
4622                 case ISO_ESC_LITERAL:
4623                   DECODE_ADD_BINARY_CHAR (c, dst);
4624                   break;
4625
4626                 default:
4627                   /* Everything else handled already */
4628                   break;
4629                 }
4630             }
4631
4632           /* Attempted error recovery. */
4633           if (str->iso2022.output_direction_sequence)
4634             ensure_correct_direction (flags & CODING_STATE_R2L ?
4635                                       CHARSET_RIGHT_TO_LEFT :
4636                                       CHARSET_LEFT_TO_RIGHT,
4637                                       str->codesys, dst, 0, 1);
4638           /* More error recovery. */
4639           if (!retval || str->iso2022.output_literally)
4640             {
4641               /* Output the (possibly invalid) sequence */
4642               int i;
4643               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4644                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4645               flags &= CODING_STATE_ISO2022_LOCK;
4646               if (!retval)
4647                 n++, src--;/* Repeat the loop with the same character. */
4648               else
4649                 {
4650                   /* No sense in reprocessing the final byte of the
4651                      escape sequence; it could mess things up anyway.
4652                      Just add it now. */
4653                   DECODE_ADD_BINARY_CHAR (c, dst);
4654                 }
4655             }
4656           ch = 0;
4657         }
4658       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4659         { /* Control characters */
4660
4661           /***** Error-handling *****/
4662
4663           /* If we were in the middle of a character, dump out the
4664              partial character. */
4665           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4666
4667           /* If we just saw a single-shift character, dump it out.
4668              This may dump out the wrong sort of single-shift character,
4669              but least it will give an indication that something went
4670              wrong. */
4671           if (flags & CODING_STATE_SS2)
4672             {
4673               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4674               flags &= ~CODING_STATE_SS2;
4675             }
4676           if (flags & CODING_STATE_SS3)
4677             {
4678               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4679               flags &= ~CODING_STATE_SS3;
4680             }
4681
4682           /***** Now handle the control characters. *****/
4683
4684           /* Handle CR/LF */
4685           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4686
4687           flags &= CODING_STATE_ISO2022_LOCK;
4688
4689           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4690             DECODE_ADD_BINARY_CHAR (c, dst);
4691         }
4692       else
4693         {                       /* Graphic characters */
4694           Lisp_Object charset;
4695           int lb;
4696           int reg;
4697
4698           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4699
4700           /* Now determine the charset. */
4701           reg = ((flags & CODING_STATE_SS2) ? 2
4702                  : (flags & CODING_STATE_SS3) ? 3
4703                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4704                  : str->iso2022.register_left);
4705           charset = str->iso2022.charset[reg];
4706
4707           /* Error checking: */
4708           if (! CHARSETP (charset)
4709               || str->iso2022.invalid_designated[reg]
4710               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4711                   && XCHARSET_CHARS (charset) == 94))
4712             /* Mrmph.  We are trying to invoke a register that has no
4713                or an invalid charset in it, or trying to add a character
4714                outside the range of the charset.  Insert that char literally
4715                to preserve it for the output. */
4716             {
4717               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4718               DECODE_ADD_BINARY_CHAR (c, dst);
4719             }
4720
4721           else
4722             {
4723               /* Things are probably hunky-dorey. */
4724
4725               /* Fetch reverse charset, maybe. */
4726               if (((flags & CODING_STATE_R2L) &&
4727                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4728                   ||
4729                   (!(flags & CODING_STATE_R2L) &&
4730                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4731                 {
4732                   Lisp_Object new_charset =
4733                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4734                   if (!NILP (new_charset))
4735                     charset = new_charset;
4736                 }
4737
4738               lb = XCHARSET_LEADING_BYTE (charset);
4739               switch (XCHARSET_REP_BYTES (charset))
4740                 {
4741                 case 1: /* ASCII */
4742                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4743                   Dynarr_add (dst, c & 0x7F);
4744                   break;
4745
4746                 case 2: /* one-byte official */
4747                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4748                   Dynarr_add (dst, lb);
4749                   Dynarr_add (dst, c | 0x80);
4750                   break;
4751
4752                 case 3: /* one-byte private or two-byte official */
4753                   if (XCHARSET_PRIVATE_P (charset))
4754                     {
4755                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4756                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4757                       Dynarr_add (dst, lb);
4758                       Dynarr_add (dst, c | 0x80);
4759                     }
4760                   else
4761                     {
4762                       if (ch)
4763                         {
4764                           Dynarr_add (dst, lb);
4765                           Dynarr_add (dst, ch | 0x80);
4766                           Dynarr_add (dst, c | 0x80);
4767                           ch = 0;
4768                         }
4769                       else
4770                         ch = c;
4771                     }
4772                   break;
4773
4774                 default:        /* two-byte private */
4775                   if (ch)
4776                     {
4777                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4778                       Dynarr_add (dst, lb);
4779                       Dynarr_add (dst, ch | 0x80);
4780                       Dynarr_add (dst, c | 0x80);
4781                       ch = 0;
4782                     }
4783                   else
4784                     ch = c;
4785                 }
4786             }
4787
4788           if (!ch)
4789             flags &= CODING_STATE_ISO2022_LOCK;
4790         }
4791
4792     label_continue_loop:;
4793     }
4794
4795   if (flags & CODING_STATE_END)
4796     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4797
4798   str->flags = flags;
4799   str->ch    = ch;
4800 }
4801
4802
4803 /***** ISO2022 encoder *****/
4804
4805 /* Designate CHARSET into register REG. */
4806
4807 static void
4808 iso2022_designate (Lisp_Object charset, unsigned char reg,
4809                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4810 {
4811   static CONST char inter94[] = "()*+";
4812   static CONST char inter96[] = ",-./";
4813   unsigned int type;
4814   unsigned char final;
4815   Lisp_Object old_charset = str->iso2022.charset[reg];
4816
4817   str->iso2022.charset[reg] = charset;
4818   if (!CHARSETP (charset))
4819     /* charset might be an initial nil or t. */
4820     return;
4821   type = XCHARSET_TYPE (charset);
4822   final = XCHARSET_FINAL (charset);
4823   if (!str->iso2022.force_charset_on_output[reg] &&
4824       CHARSETP (old_charset) &&
4825       XCHARSET_TYPE (old_charset) == type &&
4826       XCHARSET_FINAL (old_charset) == final)
4827     return;
4828
4829   str->iso2022.force_charset_on_output[reg] = 0;
4830
4831   {
4832     charset_conversion_spec_dynarr *dyn =
4833       str->codesys->iso2022.output_conv;
4834
4835     if (dyn)
4836       {
4837         int i;
4838
4839         for (i = 0; i < Dynarr_length (dyn); i++)
4840           {
4841             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4842             if (EQ (charset, spec->from_charset))
4843                 charset = spec->to_charset;
4844           }
4845       }
4846   }
4847
4848   Dynarr_add (dst, ISO_CODE_ESC);
4849   switch (type)
4850     {
4851     case CHARSET_TYPE_94:
4852       Dynarr_add (dst, inter94[reg]);
4853       break;
4854     case CHARSET_TYPE_96:
4855       Dynarr_add (dst, inter96[reg]);
4856       break;
4857     case CHARSET_TYPE_94X94:
4858       Dynarr_add (dst, '$');
4859       if (reg != 0
4860           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4861           || final < '@'
4862           || final > 'B')
4863         Dynarr_add (dst, inter94[reg]);
4864       break;
4865     case CHARSET_TYPE_96X96:
4866       Dynarr_add (dst, '$');
4867       Dynarr_add (dst, inter96[reg]);
4868       break;
4869     }
4870   Dynarr_add (dst, final);
4871 }
4872
4873 static void
4874 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4875 {
4876   if (str->iso2022.register_left != 0)
4877     {
4878       Dynarr_add (dst, ISO_CODE_SI);
4879       str->iso2022.register_left = 0;
4880     }
4881 }
4882
4883 static void
4884 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4885 {
4886   if (str->iso2022.register_left != 1)
4887     {
4888       Dynarr_add (dst, ISO_CODE_SO);
4889       str->iso2022.register_left = 1;
4890     }
4891 }
4892
4893 /* Convert internally-formatted data to ISO2022 format. */
4894
4895 static void
4896 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4897                        unsigned_char_dynarr *dst, unsigned int n)
4898 {
4899   unsigned char charmask, c;
4900   unsigned char char_boundary;
4901   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4902   unsigned int flags          = str->flags;
4903   unsigned int ch             = str->ch;
4904   Lisp_Coding_System *codesys = str->codesys;
4905   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4906   int i;
4907   Lisp_Object charset;
4908   int half;
4909
4910 #ifdef ENABLE_COMPOSITE_CHARS
4911   /* flags for handling composite chars.  We do a little switcharoo
4912      on the source while we're outputting the composite char. */
4913   unsigned int saved_n = 0;
4914   CONST unsigned char *saved_src = NULL;
4915   int in_composite = 0;
4916 #endif /* ENABLE_COMPOSITE_CHARS */
4917
4918   char_boundary = str->iso2022.current_char_boundary;
4919   charset = str->iso2022.current_charset;
4920   half = str->iso2022.current_half;
4921
4922 #ifdef ENABLE_COMPOSITE_CHARS
4923  back_to_square_n:
4924 #endif
4925   while (n--)
4926     {
4927       c = *src++;
4928
4929       if (BYTE_ASCII_P (c))
4930         {               /* Processing ASCII character */
4931           ch = 0;
4932
4933           restore_left_to_right_direction (codesys, dst, &flags, 0);
4934
4935           /* Make sure G0 contains ASCII */
4936           if ((c > ' ' && c < ISO_CODE_DEL) ||
4937               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4938             {
4939               ensure_normal_shift (str, dst);
4940               iso2022_designate (Vcharset_ascii, 0, str, dst);
4941             }
4942
4943           /* If necessary, restore everything to the default state
4944              at end-of-line */
4945           if (c == '\n' &&
4946               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4947             {
4948               restore_left_to_right_direction (codesys, dst, &flags, 0);
4949
4950               ensure_normal_shift (str, dst);
4951
4952               for (i = 0; i < 4; i++)
4953                 {
4954                   Lisp_Object initial_charset =
4955                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4956                   iso2022_designate (initial_charset, i, str, dst);
4957                 }
4958             }
4959           if (c == '\n')
4960             {
4961               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4962                 Dynarr_add (dst, '\r');
4963               if (eol_type != EOL_CR)
4964                 Dynarr_add (dst, c);
4965             }
4966           else
4967             {
4968               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4969                   && fit_to_be_escape_quoted (c))
4970                 Dynarr_add (dst, ISO_CODE_ESC);
4971               Dynarr_add (dst, c);
4972             }
4973           char_boundary = 1;
4974         }
4975
4976       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4977         { /* Processing Leading Byte */
4978           ch = 0;
4979           charset = CHARSET_BY_LEADING_BYTE (c);
4980           if (LEADING_BYTE_PREFIX_P(c))
4981             ch = c;
4982           else if (!EQ (charset, Vcharset_control_1)
4983 #ifdef ENABLE_COMPOSITE_CHARS
4984                    && !EQ (charset, Vcharset_composite)
4985 #endif
4986                    )
4987             {
4988               int reg;
4989
4990               ensure_correct_direction (XCHARSET_DIRECTION (charset),
4991                                         codesys, dst, &flags, 0);
4992
4993               /* Now determine which register to use. */
4994               reg = -1;
4995               for (i = 0; i < 4; i++)
4996                 {
4997                   if (EQ (charset, str->iso2022.charset[i]) ||
4998                       EQ (charset,
4999                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5000                     {
5001                       reg = i;
5002                       break;
5003                     }
5004                 }
5005
5006               if (reg == -1)
5007                 {
5008                   if (XCHARSET_GRAPHIC (charset) != 0)
5009                     {
5010                       if (!NILP (str->iso2022.charset[1]) &&
5011                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5012                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5013                         reg = 1;
5014                       else if (!NILP (str->iso2022.charset[2]))
5015                         reg = 2;
5016                       else if (!NILP (str->iso2022.charset[3]))
5017                         reg = 3;
5018                       else
5019                         reg = 0;
5020                     }
5021                   else
5022                     reg = 0;
5023                 }
5024
5025               iso2022_designate (charset, reg, str, dst);
5026
5027               /* Now invoke that register. */
5028               switch (reg)
5029                 {
5030                 case 0:
5031                   ensure_normal_shift (str, dst);
5032                   half = 0;
5033                   break;
5034
5035                 case 1:
5036                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5037                     {
5038                       ensure_shift_out (str, dst);
5039                       half = 0;
5040                     }
5041                   else
5042                     half = 1;
5043                   break;
5044
5045                 case 2:
5046                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5047                     {
5048                       Dynarr_add (dst, ISO_CODE_ESC);
5049                       Dynarr_add (dst, 'N');
5050                       half = 0;
5051                     }
5052                   else
5053                     {
5054                       Dynarr_add (dst, ISO_CODE_SS2);
5055                       half = 1;
5056                     }
5057                   break;
5058
5059                 case 3:
5060                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5061                     {
5062                       Dynarr_add (dst, ISO_CODE_ESC);
5063                       Dynarr_add (dst, 'O');
5064                       half = 0;
5065                     }
5066                   else
5067                     {
5068                       Dynarr_add (dst, ISO_CODE_SS3);
5069                       half = 1;
5070                     }
5071                   break;
5072
5073                 default:
5074                   abort ();
5075                 }
5076             }
5077           char_boundary = 0;
5078         }
5079       else
5080         {                       /* Processing Non-ASCII character */
5081           charmask = (half == 0 ? 0x7F : 0xFF);
5082           char_boundary = 1;
5083           if (EQ (charset, Vcharset_control_1))
5084             {
5085               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5086                   && fit_to_be_escape_quoted (c))
5087                 Dynarr_add (dst, ISO_CODE_ESC);
5088               /* you asked for it ... */
5089               Dynarr_add (dst, c - 0x20);
5090             }
5091           else
5092             {
5093               switch (XCHARSET_REP_BYTES (charset))
5094                 {
5095                 case 2:
5096                   Dynarr_add (dst, c & charmask);
5097                   break;
5098                 case 3:
5099                   if (XCHARSET_PRIVATE_P (charset))
5100                     {
5101                       Dynarr_add (dst, c & charmask);
5102                       ch = 0;
5103                     }
5104                   else if (ch)
5105                     {
5106 #ifdef ENABLE_COMPOSITE_CHARS
5107                       if (EQ (charset, Vcharset_composite))
5108                         {
5109                           if (in_composite)
5110                             {
5111                               /* #### Bother! We don't know how to
5112                                  handle this yet. */
5113                               Dynarr_add (dst, '~');
5114                             }
5115                           else
5116                             {
5117                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5118                                                        ch & 0x7F, c & 0x7F);
5119                               Lisp_Object lstr = composite_char_string (emch);
5120                               saved_n = n;
5121                               saved_src = src;
5122                               in_composite = 1;
5123                               src = XSTRING_DATA   (lstr);
5124                               n   = XSTRING_LENGTH (lstr);
5125                               Dynarr_add (dst, ISO_CODE_ESC);
5126                               Dynarr_add (dst, '0'); /* start composing */
5127                             }
5128                         }
5129                       else
5130 #endif /* ENABLE_COMPOSITE_CHARS */
5131                         {
5132                           Dynarr_add (dst, ch & charmask);
5133                           Dynarr_add (dst, c & charmask);
5134                         }
5135                       ch = 0;
5136                     }
5137                   else
5138                     {
5139                       ch = c;
5140                       char_boundary = 0;
5141                     }
5142                   break;
5143                 case 4:
5144                   if (ch)
5145                     {
5146                       Dynarr_add (dst, ch & charmask);
5147                       Dynarr_add (dst, c & charmask);
5148                       ch = 0;
5149                     }
5150                   else
5151                     {
5152                       ch = c;
5153                       char_boundary = 0;
5154                     }
5155                   break;
5156                 default:
5157                   abort ();
5158                 }
5159             }
5160         }
5161     }
5162
5163 #ifdef ENABLE_COMPOSITE_CHARS
5164   if (in_composite)
5165     {
5166       n = saved_n;
5167       src = saved_src;
5168       in_composite = 0;
5169       Dynarr_add (dst, ISO_CODE_ESC);
5170       Dynarr_add (dst, '1'); /* end composing */
5171       goto back_to_square_n; /* Wheeeeeeeee ..... */
5172     }
5173 #endif /* ENABLE_COMPOSITE_CHARS */
5174
5175   if (char_boundary && flags & CODING_STATE_END)
5176     {
5177       restore_left_to_right_direction (codesys, dst, &flags, 0);
5178       ensure_normal_shift (str, dst);
5179       for (i = 0; i < 4; i++)
5180         {
5181           Lisp_Object initial_charset =
5182             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5183           iso2022_designate (initial_charset, i, str, dst);
5184         }
5185     }
5186
5187   str->flags = flags;
5188   str->ch    = ch;
5189   str->iso2022.current_char_boundary = char_boundary;
5190   str->iso2022.current_charset = charset;
5191   str->iso2022.current_half = half;
5192
5193   /* Verbum caro factum est! */
5194 }
5195 #endif /* MULE */
5196 \f
5197 /************************************************************************/
5198 /*                     No-conversion methods                            */
5199 /************************************************************************/
5200
5201 /* This is used when reading in "binary" files -- i.e. files that may
5202    contain all 256 possible byte values and that are not to be
5203    interpreted as being in any particular decoding. */
5204 static void
5205 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5206                              unsigned_char_dynarr *dst, unsigned int n)
5207 {
5208   unsigned char c;
5209   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5210   unsigned int flags  = str->flags;
5211   unsigned int ch     = str->ch;
5212   eol_type_t eol_type = str->eol_type;
5213
5214   while (n--)
5215     {
5216       c = *src++;
5217
5218       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5219       DECODE_ADD_BINARY_CHAR (c, dst);
5220     label_continue_loop:;
5221     }
5222
5223   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5224
5225   str->flags = flags;
5226   str->ch    = ch;
5227 }
5228
5229 static void
5230 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5231                              unsigned_char_dynarr *dst, unsigned int n)
5232 {
5233   unsigned char c;
5234   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5235   unsigned int flags  = str->flags;
5236   unsigned int ch     = str->ch;
5237   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5238
5239   while (n--)
5240     {
5241       c = *src++;
5242       if (c == '\n')
5243         {
5244           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5245             Dynarr_add (dst, '\r');
5246           if (eol_type != EOL_CR)
5247             Dynarr_add (dst, '\n');
5248           ch = 0;
5249         }
5250       else if (BYTE_ASCII_P (c))
5251         {
5252           assert (ch == 0);
5253           Dynarr_add (dst, c);
5254         }
5255       else if (BUFBYTE_LEADING_BYTE_P (c))
5256         {
5257           assert (ch == 0);
5258           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5259               c == LEADING_BYTE_CONTROL_1)
5260             ch = c;
5261           else
5262             Dynarr_add (dst, '~'); /* untranslatable character */
5263         }
5264       else
5265         {
5266           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5267             Dynarr_add (dst, c);
5268           else if (ch == LEADING_BYTE_CONTROL_1)
5269             {
5270               assert (c < 0xC0);
5271               Dynarr_add (dst, c - 0x20);
5272             }
5273           /* else it should be the second or third byte of an
5274              untranslatable character, so ignore it */
5275           ch = 0;
5276         }
5277     }
5278
5279   str->flags = flags;
5280   str->ch    = ch;
5281 }
5282
5283 \f
5284 /************************************************************************/
5285 /*                   Simple internal/external functions                 */
5286 /************************************************************************/
5287
5288 static Extbyte_dynarr *conversion_out_dynarr;
5289 static Bufbyte_dynarr *conversion_in_dynarr;
5290
5291 /* Determine coding system from coding format */
5292
5293 /* #### not correct for all values of `fmt'! */
5294 static Lisp_Object
5295 external_data_format_to_coding_system (enum external_data_format fmt)
5296 {
5297   switch (fmt)
5298     {
5299     case FORMAT_FILENAME:
5300     case FORMAT_TERMINAL:
5301       if (EQ (Vfile_name_coding_system, Qnil) ||
5302           EQ (Vfile_name_coding_system, Qbinary))
5303         return Qnil;
5304       else
5305         return Fget_coding_system (Vfile_name_coding_system);
5306 #ifdef MULE
5307     case FORMAT_CTEXT:
5308       return Fget_coding_system (Qctext);
5309 #endif
5310     default:
5311       return Qnil;
5312     }
5313 }
5314
5315 Extbyte *
5316 convert_to_external_format (CONST Bufbyte *ptr,
5317                             Bytecount len,
5318                             Extcount *len_out,
5319                             enum external_data_format fmt)
5320 {
5321   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5322
5323   if (!conversion_out_dynarr)
5324     conversion_out_dynarr = Dynarr_new (Extbyte);
5325   else
5326     Dynarr_reset (conversion_out_dynarr);
5327
5328   if (NILP (coding_system))
5329     {
5330       CONST Bufbyte *end = ptr + len;
5331
5332       for (; ptr < end;)
5333         {
5334           Bufbyte c =
5335             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5336             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5337             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5338             '~';
5339
5340           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5341           INC_CHARPTR (ptr);
5342         }
5343
5344 #ifdef ERROR_CHECK_BUFPOS
5345       assert (ptr == end);
5346 #endif
5347     }
5348   else
5349     {
5350       Lisp_Object instream, outstream, da_outstream;
5351       Lstream *istr, *ostr;
5352       struct gcpro gcpro1, gcpro2, gcpro3;
5353       char tempbuf[1024]; /* some random amount */
5354
5355       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5356       da_outstream = make_dynarr_output_stream
5357         ((unsigned_char_dynarr *) conversion_out_dynarr);
5358       outstream =
5359         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5360       istr = XLSTREAM (instream);
5361       ostr = XLSTREAM (outstream);
5362       GCPRO3 (instream, outstream, da_outstream);
5363       while (1)
5364         {
5365           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5366           if (!size_in_bytes)
5367             break;
5368           Lstream_write (ostr, tempbuf, size_in_bytes);
5369         }
5370       Lstream_close (istr);
5371       Lstream_close (ostr);
5372       UNGCPRO;
5373       Lstream_delete (istr);
5374       Lstream_delete (ostr);
5375       Lstream_delete (XLSTREAM (da_outstream));
5376     }
5377
5378   *len_out = Dynarr_length (conversion_out_dynarr);
5379   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5380   return Dynarr_atp (conversion_out_dynarr, 0);
5381 }
5382
5383 Bufbyte *
5384 convert_from_external_format (CONST Extbyte *ptr,
5385                               Extcount len,
5386                               Bytecount *len_out,
5387                               enum external_data_format fmt)
5388 {
5389   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5390
5391   if (!conversion_in_dynarr)
5392     conversion_in_dynarr = Dynarr_new (Bufbyte);
5393   else
5394     Dynarr_reset (conversion_in_dynarr);
5395
5396   if (NILP (coding_system))
5397     {
5398       CONST Extbyte *end = ptr + len;
5399       for (; ptr < end; ptr++)
5400         {
5401           Extbyte c = *ptr;
5402           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5403         }
5404     }
5405   else
5406     {
5407       Lisp_Object instream, outstream, da_outstream;
5408       Lstream *istr, *ostr;
5409       struct gcpro gcpro1, gcpro2, gcpro3;
5410       char tempbuf[1024]; /* some random amount */
5411
5412       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5413       da_outstream = make_dynarr_output_stream
5414         ((unsigned_char_dynarr *) conversion_in_dynarr);
5415       outstream =
5416         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5417       istr = XLSTREAM (instream);
5418       ostr = XLSTREAM (outstream);
5419       GCPRO3 (instream, outstream, da_outstream);
5420       while (1)
5421         {
5422           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5423           if (!size_in_bytes)
5424             break;
5425           Lstream_write (ostr, tempbuf, size_in_bytes);
5426         }
5427       Lstream_close (istr);
5428       Lstream_close (ostr);
5429       UNGCPRO;
5430       Lstream_delete (istr);
5431       Lstream_delete (ostr);
5432       Lstream_delete (XLSTREAM (da_outstream));
5433     }
5434
5435   *len_out = Dynarr_length (conversion_in_dynarr);
5436   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5437   return Dynarr_atp (conversion_in_dynarr, 0);
5438 }
5439
5440 \f
5441 /************************************************************************/
5442 /*                             Initialization                           */
5443 /************************************************************************/
5444
5445 void
5446 syms_of_file_coding (void)
5447 {
5448   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5449   deferror (&Qcoding_system_error, "coding-system-error",
5450             "Coding-system error", Qio_error);
5451
5452   DEFSUBR (Fcoding_system_p);
5453   DEFSUBR (Ffind_coding_system);
5454   DEFSUBR (Fget_coding_system);
5455   DEFSUBR (Fcoding_system_list);
5456   DEFSUBR (Fcoding_system_name);
5457   DEFSUBR (Fmake_coding_system);
5458   DEFSUBR (Fcopy_coding_system);
5459   DEFSUBR (Fdefine_coding_system_alias);
5460   DEFSUBR (Fsubsidiary_coding_system);
5461
5462   DEFSUBR (Fcoding_system_type);
5463   DEFSUBR (Fcoding_system_doc_string);
5464 #ifdef MULE
5465   DEFSUBR (Fcoding_system_charset);
5466 #endif
5467   DEFSUBR (Fcoding_system_property);
5468
5469   DEFSUBR (Fcoding_category_list);
5470   DEFSUBR (Fset_coding_priority_list);
5471   DEFSUBR (Fcoding_priority_list);
5472   DEFSUBR (Fset_coding_category_system);
5473   DEFSUBR (Fcoding_category_system);
5474
5475   DEFSUBR (Fdetect_coding_region);
5476   DEFSUBR (Fdecode_coding_region);
5477   DEFSUBR (Fencode_coding_region);
5478 #ifdef MULE
5479   DEFSUBR (Fdecode_shift_jis_char);
5480   DEFSUBR (Fencode_shift_jis_char);
5481   DEFSUBR (Fdecode_big5_char);
5482   DEFSUBR (Fencode_big5_char);
5483   DEFSUBR (Fset_ucs_char);
5484   DEFSUBR (Fucs_char);
5485   DEFSUBR (Fset_char_ucs);
5486   DEFSUBR (Fchar_ucs);
5487 #endif /* MULE */
5488   defsymbol (&Qcoding_system_p, "coding-system-p");
5489   defsymbol (&Qno_conversion, "no-conversion");
5490   defsymbol (&Qraw_text, "raw-text");
5491 #ifdef MULE
5492   defsymbol (&Qbig5, "big5");
5493   defsymbol (&Qshift_jis, "shift-jis");
5494   defsymbol (&Qucs4, "ucs-4");
5495   defsymbol (&Qutf8, "utf-8");
5496   defsymbol (&Qccl, "ccl");
5497   defsymbol (&Qiso2022, "iso2022");
5498 #endif /* MULE */
5499   defsymbol (&Qmnemonic, "mnemonic");
5500   defsymbol (&Qeol_type, "eol-type");
5501   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5502   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5503
5504   defsymbol (&Qcr, "cr");
5505   defsymbol (&Qlf, "lf");
5506   defsymbol (&Qcrlf, "crlf");
5507   defsymbol (&Qeol_cr, "eol-cr");
5508   defsymbol (&Qeol_lf, "eol-lf");
5509   defsymbol (&Qeol_crlf, "eol-crlf");
5510 #ifdef MULE
5511   defsymbol (&Qcharset_g0, "charset-g0");
5512   defsymbol (&Qcharset_g1, "charset-g1");
5513   defsymbol (&Qcharset_g2, "charset-g2");
5514   defsymbol (&Qcharset_g3, "charset-g3");
5515   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5516   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5517   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5518   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5519   defsymbol (&Qno_iso6429, "no-iso6429");
5520   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5521   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5522
5523   defsymbol (&Qshort, "short");
5524   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5525   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5526   defsymbol (&Qseven, "seven");
5527   defsymbol (&Qlock_shift, "lock-shift");
5528   defsymbol (&Qescape_quoted, "escape-quoted");
5529 #endif /* MULE */
5530   defsymbol (&Qencode, "encode");
5531   defsymbol (&Qdecode, "decode");
5532
5533 #ifdef MULE
5534   defsymbol (&Qctext, "ctext");
5535   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5536              "shift-jis");
5537   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5538              "big5");
5539   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5540              "ucs-4");
5541   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5542              "utf-8");
5543   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5544              "iso-7");
5545   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5546              "iso-8-designate");
5547   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5548              "iso-8-1");
5549   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5550              "iso-8-2");
5551   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5552              "iso-lock-shift");
5553 #endif /* MULE */
5554   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5555              "no-conversion");
5556 }
5557
5558 void
5559 lstream_type_create_file_coding (void)
5560 {
5561   LSTREAM_HAS_METHOD (decoding, reader);
5562   LSTREAM_HAS_METHOD (decoding, writer);
5563   LSTREAM_HAS_METHOD (decoding, rewinder);
5564   LSTREAM_HAS_METHOD (decoding, seekable_p);
5565   LSTREAM_HAS_METHOD (decoding, flusher);
5566   LSTREAM_HAS_METHOD (decoding, closer);
5567   LSTREAM_HAS_METHOD (decoding, marker);
5568
5569   LSTREAM_HAS_METHOD (encoding, reader);
5570   LSTREAM_HAS_METHOD (encoding, writer);
5571   LSTREAM_HAS_METHOD (encoding, rewinder);
5572   LSTREAM_HAS_METHOD (encoding, seekable_p);
5573   LSTREAM_HAS_METHOD (encoding, flusher);
5574   LSTREAM_HAS_METHOD (encoding, closer);
5575   LSTREAM_HAS_METHOD (encoding, marker);
5576 }
5577
5578 void
5579 vars_of_file_coding (void)
5580 {
5581   int i;
5582
5583   /* Initialize to something reasonable ... */
5584   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5585     {
5586       coding_category_system[i] = Qnil;
5587       coding_category_by_priority[i] = i;
5588     }
5589
5590   Fprovide (intern ("file-coding"));
5591
5592   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5593 Coding system used for TTY keyboard input.
5594 Not used under a windowing system.
5595 */ );
5596   Vkeyboard_coding_system = Qnil;
5597
5598   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5599 Coding system used for TTY display output.
5600 Not used under a windowing system.
5601 */ );
5602   Vterminal_coding_system = Qnil;
5603
5604   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5605 Overriding coding system used when writing a file or process.
5606 You should *bind* this, not set it.  If this is non-nil, it specifies
5607 the coding system that will be used when a file or process is read
5608 in, and overrides `buffer-file-coding-system-for-read',
5609 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5610 this one for permanent changes to the environment.
5611 */ );
5612   Vcoding_system_for_read = Qnil;
5613
5614   DEFVAR_LISP ("coding-system-for-write",
5615                &Vcoding_system_for_write /*
5616 Overriding coding system used when writing a file or process.
5617 You should *bind* this, not set it.  If this is non-nil, it specifies
5618 the coding system that will be used when a file or process is wrote
5619 in, and overrides `buffer-file-coding-system',
5620 `write-region-pre-hook', etc.  Use those variables instead of this one
5621 for permanent changes to the environment.
5622 */ );
5623   Vcoding_system_for_write = Qnil;
5624
5625   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5626 Coding system used to convert pathnames when accessing files.
5627 */ );
5628   Vfile_name_coding_system = Qnil;
5629
5630   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5631 Non-nil means the buffer contents are regarded as multi-byte form
5632 of characters, not a binary code.  This affects the display, file I/O,
5633 and behaviors of various editing commands.
5634
5635 Setting this to nil does not do anything.
5636 */ );
5637   enable_multibyte_characters = 1;
5638 }
5639
5640 void
5641 complex_vars_of_file_coding (void)
5642 {
5643   staticpro (&Vcoding_system_hash_table);
5644   Vcoding_system_hash_table =
5645     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5646
5647   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5648
5649 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5650 {                                               \
5651   struct codesys_prop csp;                      \
5652   csp.sym = (Sym);                              \
5653   csp.prop_type = (Prop_Type);                  \
5654   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5655 } while (0)
5656
5657   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5658   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5659   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5660   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5661   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5662   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5663   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5664 #ifdef MULE
5665   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5666   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5667   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5668   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5669   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5670   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5671   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5672   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5673   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5674   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5675   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5676   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5677   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5678   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5679   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5680   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5681   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5682
5683   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5684   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5685 #endif /* MULE */
5686   /* Need to create this here or we're really screwed. */
5687   Fmake_coding_system
5688     (Qraw_text, Qno_conversion,
5689      build_string ("Raw text, which means it converts only line-break-codes."),
5690      list2 (Qmnemonic, build_string ("Raw")));
5691
5692   Fmake_coding_system
5693     (Qbinary, Qno_conversion,
5694      build_string ("Binary, which means it does not convert anything."),
5695      list4 (Qeol_type, Qlf,
5696             Qmnemonic, build_string ("Binary")));
5697
5698   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5699
5700   /* Need this for bootstrapping */
5701   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5702     Fget_coding_system (Qraw_text);
5703
5704 #ifdef MULE
5705   {
5706     unsigned int i;
5707
5708     for (i = 0; i < 65536; i++)
5709       ucs_to_mule_table[i] = Qnil;
5710   }
5711   staticpro (&mule_to_ucs_table);
5712   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5713 #endif /* MULE */
5714 }