src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #ifdef MULE
  34 #include "mule-ccl.h"
  35 #include "chartab.h"
  36 #endif
  37 #include "file-coding.h"
  38
  39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  40
  41 Lisp_Object Vkeyboard_coding_system;
  42 Lisp_Object Vterminal_coding_system;
  43 Lisp_Object Vcoding_system_for_read;
  44 Lisp_Object Vcoding_system_for_write;
  45 Lisp_Object Vfile_name_coding_system;
  46
  47 /* Table of symbols identifying each coding category. */
  48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  49
  50 /* Coding system currently associated with each coding category. */
  51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  52
  53 /* Table of all coding categories in decreasing order of priority.
  54    This describes a permutation of the possible coding categories. */
  55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  56
  57 Lisp_Object Qcoding_system_p;
  58
  59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  60 /* Qinternal in general.c */
  61
  62 Lisp_Object Qmnemonic, Qeol_type;
  63 Lisp_Object Qcr, Qcrlf, Qlf;
  64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  65 Lisp_Object Qpost_read_conversion;
  66 Lisp_Object Qpre_write_conversion;
  67
  68 #ifdef MULE
  69 Lisp_Object Qucs4, Qutf8;
  70 Lisp_Object Qbig5, Qshift_jis;
  71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  74 Lisp_Object Qno_iso6429;
  75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  76 Lisp_Object Qctext, Qescape_quoted;
  77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  78 #endif
  79 Lisp_Object Qencode, Qdecode;
  80
  81 Lisp_Object Vcoding_system_hash_table;
  82
  83 int enable_multibyte_characters;
  84
  85 #ifdef MULE
  86 /* Additional information used by the ISO2022 decoder and detector. */
  87 struct iso2022_decoder
  88 {
  89   /* CHARSET holds the character sets currently assigned to the G0
  90      through G3 variables.  It is initialized from the array
  91      INITIAL_CHARSET in CODESYS. */
  92   Lisp_Object charset[4];
  93
  94   /* Which registers are currently invoked into the left (GL) and
  95      right (GR) halves of the 8-bit encoding space? */
  96   int register_left, register_right;
  97
  98   /* ISO_ESC holds a value indicating part of an escape sequence
  99      that has already been seen. */
 100   enum iso_esc_flag esc;
 101
 102   /* This records the bytes we've seen so far in an escape sequence,
 103      in case the sequence is invalid (we spit out the bytes unchanged). */
 104   unsigned char esc_bytes[8];
 105
 106   /* Index for next byte to store in ISO escape sequence. */
 107   int esc_bytes_index;
 108
 109 #ifdef ENABLE_COMPOSITE_CHARS
 110   /* Stuff seen so far when composing a string. */
 111   unsigned_char_dynarr *composite_chars;
 112 #endif
 113
 114   /* If we saw an invalid designation sequence for a particular
 115      register, we flag it here and switch to ASCII.  The next time we
 116      see a valid designation for this register, we turn off the flag
 117      and do the designation normally, but pretend the sequence was
 118      invalid.  The effect of all this is that (most of the time) the
 119      escape sequences for both the switch to the unknown charset, and
 120      the switch back to the known charset, get inserted literally into
 121      the buffer and saved out as such.  The hope is that we can
 122      preserve the escape sequences so that the resulting written out
 123      file makes sense.  If we don't do any of this, the designation
 124      to the invalid charset will be preserved but that switch back
 125      to the known charset will probably get eaten because it was
 126      the same charset that was already present in the register. */
 127   unsigned char invalid_designated[4];
 128
 129   /* We try to do similar things as above for direction-switching
 130      sequences.  If we encountered a direction switch while an
 131      invalid designation was present, or an invalid designation
 132      just after a direction switch (i.e. no valid designation
 133      encountered yet), we insert the direction-switch escape
 134      sequence literally into the output stream, and later on
 135      insert the corresponding direction-restoring escape sequence
 136      literally also. */
 137   unsigned int switched_dir_and_no_valid_charset_yet :1;
 138   unsigned int invalid_switch_dir :1;
 139
 140   /* Tells the decoder to output the escape sequence literally
 141      even though it was valid.  Used in the games we play to
 142      avoid lossage when we encounter invalid designations. */
 143   unsigned int output_literally :1;
 144   /* We encountered a direction switch followed by an invalid
 145      designation.  We didn't output the direction switch
 146      literally because we didn't know about the invalid designation;
 147      but we have to do so now. */
 148   unsigned int output_direction_sequence :1;
 149 };
 150 #endif /* MULE */
 151 EXFUN (Fcopy_coding_system, 2);
 152 #ifdef MULE
 153 struct detection_state;
 154 static int detect_coding_sjis (struct detection_state *st,
 155                                CONST unsigned char *src,
 156                                unsigned int n);
 157 static void decode_coding_sjis (Lstream *decoding,
 158                                 CONST unsigned char *src,
 159                                 unsigned_char_dynarr *dst,
 160                                 unsigned int n);
 161 static void encode_coding_sjis (Lstream *encoding,
 162                                 CONST unsigned char *src,
 163                                 unsigned_char_dynarr *dst,
 164                                 unsigned int n);
 165 static int detect_coding_big5 (struct detection_state *st,
 166                                CONST unsigned char *src,
 167                                unsigned int n);
 168 static void decode_coding_big5 (Lstream *decoding,
 169                                 CONST unsigned char *src,
 170                                 unsigned_char_dynarr *dst, unsigned int n);
 171 static void encode_coding_big5 (Lstream *encoding,
 172                                 CONST unsigned char *src,
 173                                 unsigned_char_dynarr *dst, unsigned int n);
 174 static int detect_coding_ucs4 (struct detection_state *st,
 175                                CONST unsigned char *src,
 176                                unsigned int n);
 177 static void decode_coding_ucs4 (Lstream *decoding,
 178                                 CONST unsigned char *src,
 179                                 unsigned_char_dynarr *dst, unsigned int n);
 180 static void encode_coding_ucs4 (Lstream *encoding,
 181                                 CONST unsigned char *src,
 182                                 unsigned_char_dynarr *dst, unsigned int n);
 183 static int detect_coding_utf8 (struct detection_state *st,
 184                                CONST unsigned char *src,
 185                                unsigned int n);
 186 static void decode_coding_utf8 (Lstream *decoding,
 187                                 CONST unsigned char *src,
 188                                 unsigned_char_dynarr *dst, unsigned int n);
 189 static void encode_coding_utf8 (Lstream *encoding,
 190                                 CONST unsigned char *src,
 191                                 unsigned_char_dynarr *dst, unsigned int n);
 192 static int postprocess_iso2022_mask (int mask);
 193 static void reset_iso2022 (Lisp_Object coding_system,
 194                            struct iso2022_decoder *iso);
 195 static int detect_coding_iso2022 (struct detection_state *st,
 196                                   CONST unsigned char *src,
 197                                   unsigned int n);
 198 static void decode_coding_iso2022 (Lstream *decoding,
 199                                    CONST unsigned char *src,
 200                                    unsigned_char_dynarr *dst, unsigned int n);
 201 static void encode_coding_iso2022 (Lstream *encoding,
 202                                    CONST unsigned char *src,
 203                                    unsigned_char_dynarr *dst, unsigned int n);
 204 #endif /* MULE */
 205 static void decode_coding_no_conversion (Lstream *decoding,
 206                                          CONST unsigned char *src,
 207                                          unsigned_char_dynarr *dst,
 208                                          unsigned int n);
 209 static void encode_coding_no_conversion (Lstream *encoding,
 210                                          CONST unsigned char *src,
 211                                          unsigned_char_dynarr *dst,
 212                                          unsigned int n);
 213 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 214                          unsigned_char_dynarr *dst, unsigned int n);
 215 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 216                          unsigned_char_dynarr *dst, unsigned int n);
 217
 218 typedef struct codesys_prop codesys_prop;
 219 struct codesys_prop
 220 {
 221   Lisp_Object sym;
 222   int prop_type;
 223 };
 224
 225 typedef struct
 226 {
 227   Dynarr_declare (codesys_prop);
 228 } codesys_prop_dynarr;
 229
 230 codesys_prop_dynarr *the_codesys_prop_dynarr;
 231
 232 enum codesys_prop_enum
 233 {
 234   CODESYS_PROP_ALL_OK,
 235   CODESYS_PROP_ISO2022,
 236   CODESYS_PROP_CCL
 237 };
 238
 239 \f
 240 /************************************************************************/
 241 /*                       Coding system functions                        */
 242 /************************************************************************/
 243
 244 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 245 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 246 static void finalize_coding_system (void *header, int for_disksave);
 247
 248 #ifdef MULE
 249 static const struct lrecord_description ccs_description_1[] = {
 250   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 251   { XD_END }
 252 };
 253
 254 static const struct struct_description ccs_description = {
 255   sizeof(charset_conversion_spec),
 256   ccs_description_1
 257 };
 258
 259 static const struct lrecord_description ccsd_description_1[] = {
 260   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 261   { XD_END }
 262 };
 263
 264 static const struct struct_description ccsd_description = {
 265   sizeof(charset_conversion_spec_dynarr),
 266   ccsd_description_1
 267 };
 268 #endif
 269
 270 static const struct lrecord_description coding_system_description[] = {
 271   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 272   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 273   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 274 #ifdef MULE
 275   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 276   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 277   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 278   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 279 #endif
 280   { XD_END }
 281 };
 282
 283 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 284                                mark_coding_system, print_coding_system,
 285                                finalize_coding_system,
 286                                0, 0, coding_system_description,
 287                                struct Lisp_Coding_System);
 288
 289 static Lisp_Object
 290 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 291 {
 292   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 293
 294   markobj (CODING_SYSTEM_NAME (codesys));
 295   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 296   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 297   markobj (CODING_SYSTEM_EOL_LF (codesys));
 298   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 299   markobj (CODING_SYSTEM_EOL_CR (codesys));
 300
 301   switch (CODING_SYSTEM_TYPE (codesys))
 302     {
 303 #ifdef MULE
 304       int i;
 305     case CODESYS_ISO2022:
 306       for (i = 0; i < 4; i++)
 307         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 308       if (codesys->iso2022.input_conv)
 309         {
 310           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 311             {
 312               struct charset_conversion_spec *ccs =
 313                 Dynarr_atp (codesys->iso2022.input_conv, i);
 314               markobj (ccs->from_charset);
 315               markobj (ccs->to_charset);
 316             }
 317         }
 318       if (codesys->iso2022.output_conv)
 319         {
 320           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 321             {
 322               struct charset_conversion_spec *ccs =
 323                 Dynarr_atp (codesys->iso2022.output_conv, i);
 324               markobj (ccs->from_charset);
 325               markobj (ccs->to_charset);
 326             }
 327         }
 328       break;
 329
 330     case CODESYS_CCL:
 331       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 332       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 333       break;
 334 #endif /* MULE */
 335     default:
 336       break;
 337     }
 338
 339   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 340   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 341 }
 342
 343 static void
 344 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 345                      int escapeflag)
 346 {
 347   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 348   if (print_readably)
 349     error ("printing unreadable object #<coding_system 0x%x>",
 350            c->header.uid);
 351
 352   write_c_string ("#<coding_system ", printcharfun);
 353   print_internal (c->name, printcharfun, 1);
 354   write_c_string (">", printcharfun);
 355 }
 356
 357 static void
 358 finalize_coding_system (void *header, int for_disksave)
 359 {
 360   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 361   /* Since coding systems never go away, this function is not
 362      necessary.  But it would be necessary if we changed things
 363      so that coding systems could go away. */
 364   if (!for_disksave) /* see comment in lstream.c */
 365     {
 366       switch (CODING_SYSTEM_TYPE (c))
 367         {
 368 #ifdef MULE
 369         case CODESYS_ISO2022:
 370           if (c->iso2022.input_conv)
 371             {
 372               Dynarr_free (c->iso2022.input_conv);
 373               c->iso2022.input_conv = 0;
 374             }
 375           if (c->iso2022.output_conv)
 376             {
 377               Dynarr_free (c->iso2022.output_conv);
 378               c->iso2022.output_conv = 0;
 379             }
 380           break;
 381 #endif /* MULE */
 382         default:
 383           break;
 384         }
 385     }
 386 }
 387
 388 static enum eol_type
 389 symbol_to_eol_type (Lisp_Object symbol)
 390 {
 391   CHECK_SYMBOL (symbol);
 392   if (NILP (symbol))      return EOL_AUTODETECT;
 393   if (EQ (symbol, Qlf))   return EOL_LF;
 394   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 395   if (EQ (symbol, Qcr))   return EOL_CR;
 396
 397   signal_simple_error ("Unrecognized eol type", symbol);
 398   return EOL_AUTODETECT; /* not reached */
 399 }
 400
 401 static Lisp_Object
 402 eol_type_to_symbol (enum eol_type type)
 403 {
 404   switch (type)
 405     {
 406     default: abort ();
 407     case EOL_LF:         return Qlf;
 408     case EOL_CRLF:       return Qcrlf;
 409     case EOL_CR:         return Qcr;
 410     case EOL_AUTODETECT: return Qnil;
 411     }
 412 }
 413
 414 static void
 415 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 416 {
 417   Lisp_Object codesys_obj;
 418   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 419   char *codesys_name = (char *) alloca (len + 7);
 420   int mlen = -1;
 421   char *codesys_mnemonic=0;
 422
 423   Lisp_Object codesys_name_sym, sub_codesys_obj;
 424
 425   /* kludge */
 426
 427   XSETCODING_SYSTEM (codesys_obj, codesys);
 428
 429   memcpy (codesys_name,
 430           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 431
 432   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 433     {
 434       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 435       codesys_mnemonic = (char *) alloca (mlen + 7);
 436       memcpy (codesys_mnemonic,
 437               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 438     }
 439
 440 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 441   strcpy (codesys_name + len, "-" op_sys);                                      \
 442   if (mlen != -1)                                                               \
 443     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 444   codesys_name_sym = intern (codesys_name);                                     \
 445   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 446   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 447   if (mlen != -1)                                                               \
 448     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 449       build_string (codesys_mnemonic);                                          \
 450   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 451 } while (0)
 452
 453   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 454   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 455   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 456 }
 457
 458 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 459 Return t if OBJECT is a coding system.
 460 A coding system is an object that defines how text containing multiple
 461 character sets is encoded into a stream of (typically 8-bit) bytes.
 462 The coding system is used to decode the stream into a series of
 463 characters (which may be from multiple charsets) when the text is read
 464 from a file or process, and is used to encode the text back into the
 465 same format when it is written out to a file or process.
 466
 467 For example, many ISO2022-compliant coding systems (such as Compound
 468 Text, which is used for inter-client data under the X Window System)
 469 use escape sequences to switch between different charsets -- Japanese
 470 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 471 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 472 `make-coding-system' for more information.
 473
 474 Coding systems are normally identified using a symbol, and the
 475 symbol is accepted in place of the actual coding system object whenever
 476 a coding system is called for. (This is similar to how faces work.)
 477 */
 478        (object))
 479 {
 480   return CODING_SYSTEMP (object) ? Qt : Qnil;
 481 }
 482
 483 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 484 Retrieve the coding system of the given name.
 485
 486 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 487 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 488 If there is no such coding system, nil is returned.  Otherwise the
 489 associated coding system object is returned.
 490 */
 491        (coding_system_or_name))
 492 {
 493   if (CODING_SYSTEMP (coding_system_or_name))
 494     return coding_system_or_name;
 495
 496   if (NILP (coding_system_or_name))
 497     coding_system_or_name = Qbinary;
 498   else
 499     CHECK_SYMBOL (coding_system_or_name);
 500
 501   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 502 }
 503
 504 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 505 Retrieve the coding system of the given name.
 506 Same as `find-coding-system' except that if there is no such
 507 coding system, an error is signaled instead of returning nil.
 508 */
 509        (name))
 510 {
 511   Lisp_Object coding_system = Ffind_coding_system (name);
 512
 513   if (NILP (coding_system))
 514     signal_simple_error ("No such coding system", name);
 515   return coding_system;
 516 }
 517
 518 /* We store the coding systems in hash tables with the names as the key and the
 519    actual coding system object as the value.  Occasionally we need to use them
 520    in a list format.  These routines provide us with that. */
 521 struct coding_system_list_closure
 522 {
 523   Lisp_Object *coding_system_list;
 524 };
 525
 526 static int
 527 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 528                                   void *coding_system_list_closure)
 529 {
 530   /* This function can GC */
 531   struct coding_system_list_closure *cscl =
 532     (struct coding_system_list_closure *) coding_system_list_closure;
 533   Lisp_Object *coding_system_list = cscl->coding_system_list;
 534
 535   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 536                                *coding_system_list);
 537   return 0;
 538 }
 539
 540 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 541 Return a list of the names of all defined coding systems.
 542 */
 543        ())
 544 {
 545   Lisp_Object coding_system_list = Qnil;
 546   struct gcpro gcpro1;
 547   struct coding_system_list_closure coding_system_list_closure;
 548
 549   GCPRO1 (coding_system_list);
 550   coding_system_list_closure.coding_system_list = &coding_system_list;
 551   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 552                  &coding_system_list_closure);
 553   UNGCPRO;
 554
 555   return coding_system_list;
 556 }
 557
 558 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 559 Return the name of the given coding system.
 560 */
 561        (coding_system))
 562 {
 563   coding_system = Fget_coding_system (coding_system);
 564   return XCODING_SYSTEM_NAME (coding_system);
 565 }
 566
 567 static Lisp_Coding_System *
 568 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 569 {
 570   Lisp_Coding_System *codesys =
 571     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 572
 573   zero_lcrecord (codesys);
 574   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 575   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 576   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 577   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 578   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 579   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 580   CODING_SYSTEM_TYPE     (codesys) = type;
 581   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 582 #ifdef MULE
 583   if (type == CODESYS_ISO2022)
 584     {
 585       int i;
 586       for (i = 0; i < 4; i++)
 587         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 588     }
 589   else if (type == CODESYS_CCL)
 590     {
 591       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 592       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 593     }
 594 #endif /* MULE */
 595   CODING_SYSTEM_NAME (codesys) = name;
 596
 597   return codesys;
 598 }
 599
 600 #ifdef MULE
 601 /* Given a list of charset conversion specs as specified in a Lisp
 602    program, parse it into STORE_HERE. */
 603
 604 static void
 605 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 606                                 Lisp_Object spec_list)
 607 {
 608   Lisp_Object rest;
 609
 610   EXTERNAL_LIST_LOOP (rest, spec_list)
 611     {
 612       Lisp_Object car = XCAR (rest);
 613       Lisp_Object from, to;
 614       struct charset_conversion_spec spec;
 615
 616       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 617         signal_simple_error ("Invalid charset conversion spec", car);
 618       from = Fget_charset (XCAR (car));
 619       to = Fget_charset (XCAR (XCDR (car)));
 620       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 621         signal_simple_error_2
 622           ("Attempted conversion between different charset types",
 623            from, to);
 624       spec.from_charset = from;
 625       spec.to_charset = to;
 626
 627       Dynarr_add (store_here, spec);
 628     }
 629 }
 630
 631 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 632    specs, return the equivalent as the Lisp programmer would see it.
 633
 634    If LOAD_HERE is 0, return Qnil. */
 635
 636 static Lisp_Object
 637 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 638 {
 639   int i;
 640   Lisp_Object result;
 641
 642   if (!load_here)
 643     return Qnil;
 644   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 645     {
 646       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 647       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 648     }
 649
 650   return Fnreverse (result);
 651 }
 652
 653 #endif /* MULE */
 654
 655 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 656 Register symbol NAME as a coding system.
 657
 658 TYPE describes the conversion method used and should be one of
 659
 660 nil or 'undecided
 661      Automatic conversion.  XEmacs attempts to detect the coding system
 662      used in the file.
 663 'no-conversion
 664      No conversion.  Use this for binary files and such.  On output,
 665      graphic characters that are not in ASCII or Latin-1 will be
 666      replaced by a ?. (For a no-conversion-encoded buffer, these
 667      characters will only be present if you explicitly insert them.)
 668 'shift-jis
 669      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 670 'ucs-4
 671      ISO 10646 UCS-4 encoding.
 672 'utf-8
 673      ISO 10646 UTF-8 encoding.
 674 'iso2022
 675      Any ISO2022-compliant encoding.  Among other things, this includes
 676      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 677      standard Unix encoding for Japanese and other languages), and
 678      Compound Text (the encoding used in X11).  You can specify more
 679      specific information about the conversion with the FLAGS argument.
 680 'big5
 681      Big5 (the encoding commonly used for Taiwanese).
 682 'ccl
 683      The conversion is performed using a user-written pseudo-code
 684      program.  CCL (Code Conversion Language) is the name of this
 685      pseudo-code.
 686 'internal
 687      Write out or read in the raw contents of the memory representing
 688      the buffer's text.  This is primarily useful for debugging
 689      purposes, and is only enabled when XEmacs has been compiled with
 690      DEBUG_XEMACS defined (via the --debug configure option).
 691      WARNING: Reading in a file using 'internal conversion can result
 692      in an internal inconsistency in the memory representing a
 693      buffer's text, which will produce unpredictable results and may
 694      cause XEmacs to crash.  Under normal circumstances you should
 695      never use 'internal conversion.
 696
 697 DOC-STRING is a string describing the coding system.
 698
 699 PROPS is a property list, describing the specific nature of the
 700 character set.  Recognized properties are:
 701
 702 'mnemonic
 703      String to be displayed in the modeline when this coding system is
 704      active.
 705
 706 'eol-type
 707      End-of-line conversion to be used.  It should be one of
 708
 709         nil
 710                 Automatically detect the end-of-line type (LF, CRLF,
 711                 or CR).  Also generate subsidiary coding systems named
 712                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 713                 identical to this coding system but have an EOL-TYPE
 714                 value of 'lf, 'crlf, and 'cr, respectively.
 715         'lf
 716                 The end of a line is marked externally using ASCII LF.
 717                 Since this is also the way that XEmacs represents an
 718                 end-of-line internally, specifying this option results
 719                 in no end-of-line conversion.  This is the standard
 720                 format for Unix text files.
 721         'crlf
 722                 The end of a line is marked externally using ASCII
 723                 CRLF.  This is the standard format for MS-DOS text
 724                 files.
 725         'cr
 726                 The end of a line is marked externally using ASCII CR.
 727                 This is the standard format for Macintosh text files.
 728         t
 729                 Automatically detect the end-of-line type but do not
 730                 generate subsidiary coding systems.  (This value is
 731                 converted to nil when stored internally, and
 732                 `coding-system-property' will return nil.)
 733
 734 'post-read-conversion
 735      Function called after a file has been read in, to perform the
 736      decoding.  Called with two arguments, BEG and END, denoting
 737      a region of the current buffer to be decoded.
 738
 739 'pre-write-conversion
 740      Function called before a file is written out, to perform the
 741      encoding.  Called with two arguments, BEG and END, denoting
 742      a region of the current buffer to be encoded.
 743
 744
 745 The following additional properties are recognized if TYPE is 'iso2022:
 746
 747 'charset-g0
 748 'charset-g1
 749 'charset-g2
 750 'charset-g3
 751      The character set initially designated to the G0 - G3 registers.
 752      The value should be one of
 753
 754           -- A charset object (designate that character set)
 755           -- nil (do not ever use this register)
 756           -- t (no character set is initially designated to
 757                 the register, but may be later on; this automatically
 758                 sets the corresponding `force-g*-on-output' property)
 759
 760 'force-g0-on-output
 761 'force-g1-on-output
 762 'force-g2-on-output
 763 'force-g2-on-output
 764      If non-nil, send an explicit designation sequence on output before
 765      using the specified register.
 766
 767 'short
 768      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 769      "ESC $ B" on output in place of the full designation sequences
 770      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 771
 772 'no-ascii-eol
 773      If non-nil, don't designate ASCII to G0 at each end of line on output.
 774      Setting this to non-nil also suppresses other state-resetting that
 775      normally happens at the end of a line.
 776
 777 'no-ascii-cntl
 778      If non-nil, don't designate ASCII to G0 before control chars on output.
 779
 780 'seven
 781      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 782      environment.
 783
 784 'lock-shift
 785      If non-nil, use locking-shift (SO/SI) instead of single-shift
 786      or designation by escape sequence.
 787
 788 'no-iso6429
 789      If non-nil, don't use ISO6429's direction specification.
 790
 791 'escape-quoted
 792      If non-nil, literal control characters that are the same as
 793      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 794      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 795      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 796      so that they can be properly distinguished from an escape sequence.
 797      (Note that doing this results in a non-portable encoding.) This
 798      encoding flag is used for byte-compiled files.  Note that ESC
 799      is a good choice for a quoting character because there are no
 800      escape sequences whose second byte is a character from the Control-0
 801      or Control-1 character sets; this is explicitly disallowed by the
 802      ISO2022 standard.
 803
 804 'input-charset-conversion
 805      A list of conversion specifications, specifying conversion of
 806      characters in one charset to another when decoding is performed.
 807      Each specification is a list of two elements: the source charset,
 808      and the destination charset.
 809
 810 'output-charset-conversion
 811      A list of conversion specifications, specifying conversion of
 812      characters in one charset to another when encoding is performed.
 813      The form of each specification is the same as for
 814      'input-charset-conversion.
 815
 816
 817 The following additional properties are recognized (and required)
 818 if TYPE is 'ccl:
 819
 820 'decode
 821      CCL program used for decoding (converting to internal format).
 822
 823 'encode
 824      CCL program used for encoding (converting to external format).
 825 */
 826        (name, type, doc_string, props))
 827 {
 828   Lisp_Coding_System *codesys;
 829   Lisp_Object rest, key, value;
 830   enum coding_system_type ty;
 831   int need_to_setup_eol_systems = 1;
 832
 833   /* Convert type to constant */
 834   if (NILP (type) || EQ (type, Qundecided))
 835                                       { ty = CODESYS_AUTODETECT; }
 836 #ifdef MULE
 837   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 838   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 839   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 840   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 841   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 842   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 843 #endif
 844   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 845 #ifdef DEBUG_XEMACS
 846   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 847 #endif
 848   else
 849     signal_simple_error ("Invalid coding system type", type);
 850
 851   CHECK_SYMBOL (name);
 852
 853   codesys = allocate_coding_system (ty, name);
 854
 855   if (NILP (doc_string))
 856     doc_string = build_string ("");
 857   else
 858     CHECK_STRING (doc_string);
 859   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 860
 861 #ifdef UTF2000
 862   if (ty == CODESYS_NO_CONVERSION)
 863     codesys->fixed.size = 1;
 864 #endif
 865   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 866     {
 867       if (EQ (key, Qmnemonic))
 868         {
 869           if (!NILP (value))
 870             CHECK_STRING (value);
 871           CODING_SYSTEM_MNEMONIC (codesys) = value;
 872         }
 873
 874       else if (EQ (key, Qeol_type))
 875         {
 876           need_to_setup_eol_systems = NILP (value);
 877           if (EQ (value, Qt))
 878             value = Qnil;
 879           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 880         }
 881
 882       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 883       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 884 #ifdef MULE
 885       else if (ty == CODESYS_ISO2022)
 886         {
 887 #define FROB_INITIAL_CHARSET(charset_num) \
 888   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 889     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 890
 891           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 892           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 893           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 894           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 895
 896 #define FROB_FORCE_CHARSET(charset_num) \
 897   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 898
 899           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 900           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 901           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 902           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 903
 904 #define FROB_BOOLEAN_PROPERTY(prop) \
 905   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 906
 907           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 908           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 909           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 910           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 911           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 912           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 913           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 914
 915           else if (EQ (key, Qinput_charset_conversion))
 916             {
 917               codesys->iso2022.input_conv =
 918                 Dynarr_new (charset_conversion_spec);
 919               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 920                                               value);
 921             }
 922           else if (EQ (key, Qoutput_charset_conversion))
 923             {
 924               codesys->iso2022.output_conv =
 925                 Dynarr_new (charset_conversion_spec);
 926               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 927                                               value);
 928             }
 929           else
 930             signal_simple_error ("Unrecognized property", key);
 931         }
 932       else if (EQ (type, Qccl))
 933         {
 934           if (EQ (key, Qdecode))
 935             {
 936               CHECK_VECTOR (value);
 937               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 938             }
 939           else if (EQ (key, Qencode))
 940             {
 941               CHECK_VECTOR (value);
 942               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 943             }
 944           else
 945             signal_simple_error ("Unrecognized property", key);
 946         }
 947 #endif /* MULE */
 948       else
 949         signal_simple_error ("Unrecognized property", key);
 950     }
 951
 952   if (need_to_setup_eol_systems)
 953     setup_eol_coding_systems (codesys);
 954
 955   {
 956     Lisp_Object codesys_obj;
 957     XSETCODING_SYSTEM (codesys_obj, codesys);
 958     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 959     return codesys_obj;
 960   }
 961 }
 962
 963 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 964 Copy OLD-CODING-SYSTEM to NEW-NAME.
 965 If NEW-NAME does not name an existing coding system, a new one will
 966 be created.
 967 */
 968        (old_coding_system, new_name))
 969 {
 970   Lisp_Object new_coding_system;
 971   old_coding_system = Fget_coding_system (old_coding_system);
 972   new_coding_system = Ffind_coding_system (new_name);
 973   if (NILP (new_coding_system))
 974     {
 975       XSETCODING_SYSTEM (new_coding_system,
 976                          allocate_coding_system
 977                          (XCODING_SYSTEM_TYPE (old_coding_system),
 978                           new_name));
 979       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 980     }
 981
 982   {
 983     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 984     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 985     memcpy (((char *) to  ) + sizeof (to->header),
 986             ((char *) from) + sizeof (from->header),
 987             sizeof (*from) - sizeof (from->header));
 988     to->name = new_name;
 989   }
 990   return new_coding_system;
 991 }
 992
 993 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
 994 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
 995 */
 996        (alias, coding_system))
 997 {
 998   CHECK_SYMBOL (alias);
 999   if (!NILP (Ffind_coding_system (alias)))
1000     signal_simple_error ("Symbol already names a coding system", alias);
1001   coding_system = Fget_coding_system (coding_system);
1002   Fputhash (alias, coding_system, Vcoding_system_hash_table);
1003
1004   /* Set up aliases for subsidiaries. */
1005   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1006     {
1007       Lisp_Object str;
1008       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1009 #define FROB(type, name)                                                        \
1010       do {                                                                      \
1011         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1012         if (!NILP (subsidiary))                                                 \
1013           Fdefine_coding_system_alias                                           \
1014             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1015       } while (0)
1016       FROB (LF,   "-unix");
1017       FROB (CRLF, "-dos");
1018       FROB (CR,   "-mac");
1019 #undef FROB
1020     }
1021   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1022      but it doesn't look intentional, so I'd rather return something
1023      meaningful or nothing at all. */
1024   return Qnil;
1025 }
1026
1027 static Lisp_Object
1028 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1029 {
1030   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1031   Lisp_Object new_coding_system;
1032
1033   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1034     return coding_system;
1035
1036   switch (type)
1037     {
1038     case EOL_AUTODETECT: return coding_system;
1039     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1040     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1041     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1042     default:       abort ();
1043     }
1044
1045   return NILP (new_coding_system) ? coding_system : new_coding_system;
1046 }
1047
1048 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1049 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1050 */
1051        (coding_system, eol_type))
1052 {
1053   coding_system = Fget_coding_system (coding_system);
1054
1055   return subsidiary_coding_system (coding_system,
1056                                    symbol_to_eol_type (eol_type));
1057 }
1058
1059 \f
1060 /************************************************************************/
1061 /*                         Coding system accessors                      */
1062 /************************************************************************/
1063
1064 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1065 Return the doc string for CODING-SYSTEM.
1066 */
1067        (coding_system))
1068 {
1069   coding_system = Fget_coding_system (coding_system);
1070   return XCODING_SYSTEM_DOC_STRING (coding_system);
1071 }
1072
1073 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1074 Return the type of CODING-SYSTEM.
1075 */
1076        (coding_system))
1077 {
1078   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1079     {
1080     default: abort ();
1081     case CODESYS_AUTODETECT:    return Qundecided;
1082 #ifdef MULE
1083     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1084     case CODESYS_ISO2022:       return Qiso2022;
1085     case CODESYS_BIG5:          return Qbig5;
1086     case CODESYS_UCS4:          return Qucs4;
1087     case CODESYS_UTF8:          return Qutf8;
1088     case CODESYS_CCL:           return Qccl;
1089 #endif
1090     case CODESYS_NO_CONVERSION: return Qno_conversion;
1091 #ifdef DEBUG_XEMACS
1092     case CODESYS_INTERNAL:      return Qinternal;
1093 #endif
1094     }
1095 }
1096
1097 #ifdef MULE
1098 static
1099 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1100 {
1101   Lisp_Object cs
1102     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1103
1104   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1105 }
1106
1107 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1108 Return initial charset of CODING-SYSTEM designated to GNUM.
1109 GNUM allows 0 .. 3.
1110 */
1111        (coding_system, gnum))
1112 {
1113   coding_system = Fget_coding_system (coding_system);
1114   CHECK_INT (gnum);
1115
1116   return coding_system_charset (coding_system, XINT (gnum));
1117 }
1118 #endif /* MULE */
1119
1120 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1121 Return the PROP property of CODING-SYSTEM.
1122 */
1123        (coding_system, prop))
1124 {
1125   int i, ok = 0;
1126   enum coding_system_type type;
1127
1128   coding_system = Fget_coding_system (coding_system);
1129   CHECK_SYMBOL (prop);
1130   type = XCODING_SYSTEM_TYPE (coding_system);
1131
1132   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1133     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1134       {
1135         ok = 1;
1136         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1137           {
1138           case CODESYS_PROP_ALL_OK:
1139             break;
1140 #ifdef MULE
1141           case CODESYS_PROP_ISO2022:
1142             if (type != CODESYS_ISO2022)
1143               signal_simple_error
1144                 ("Property only valid in ISO2022 coding systems",
1145                  prop);
1146             break;
1147
1148           case CODESYS_PROP_CCL:
1149             if (type != CODESYS_CCL)
1150               signal_simple_error
1151                 ("Property only valid in CCL coding systems",
1152                  prop);
1153             break;
1154 #endif /* MULE */
1155           default:
1156             abort ();
1157           }
1158       }
1159
1160   if (!ok)
1161     signal_simple_error ("Unrecognized property", prop);
1162
1163   if (EQ (prop, Qname))
1164     return XCODING_SYSTEM_NAME (coding_system);
1165   else if (EQ (prop, Qtype))
1166     return Fcoding_system_type (coding_system);
1167   else if (EQ (prop, Qdoc_string))
1168     return XCODING_SYSTEM_DOC_STRING (coding_system);
1169   else if (EQ (prop, Qmnemonic))
1170     return XCODING_SYSTEM_MNEMONIC (coding_system);
1171   else if (EQ (prop, Qeol_type))
1172     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1173   else if (EQ (prop, Qeol_lf))
1174     return XCODING_SYSTEM_EOL_LF (coding_system);
1175   else if (EQ (prop, Qeol_crlf))
1176     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1177   else if (EQ (prop, Qeol_cr))
1178     return XCODING_SYSTEM_EOL_CR (coding_system);
1179   else if (EQ (prop, Qpost_read_conversion))
1180     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1181   else if (EQ (prop, Qpre_write_conversion))
1182     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1183 #ifdef MULE
1184   else if (type == CODESYS_ISO2022)
1185     {
1186       if (EQ (prop, Qcharset_g0))
1187         return coding_system_charset (coding_system, 0);
1188       else if (EQ (prop, Qcharset_g1))
1189         return coding_system_charset (coding_system, 1);
1190       else if (EQ (prop, Qcharset_g2))
1191         return coding_system_charset (coding_system, 2);
1192       else if (EQ (prop, Qcharset_g3))
1193         return coding_system_charset (coding_system, 3);
1194
1195 #define FORCE_CHARSET(charset_num) \
1196   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1197    (coding_system, charset_num) ? Qt : Qnil)
1198
1199       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1200       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1201       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1202       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1203
1204 #define LISP_BOOLEAN(prop) \
1205   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1206
1207       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1208       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1209       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1210       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1211       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1212       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1213       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1214
1215       else if (EQ (prop, Qinput_charset_conversion))
1216         return
1217           unparse_charset_conversion_specs
1218             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1219       else if (EQ (prop, Qoutput_charset_conversion))
1220         return
1221           unparse_charset_conversion_specs
1222             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1223       else
1224         abort ();
1225     }
1226   else if (type == CODESYS_CCL)
1227     {
1228       if (EQ (prop, Qdecode))
1229         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1230       else if (EQ (prop, Qencode))
1231         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1232       else
1233         abort ();
1234     }
1235 #endif /* MULE */
1236   else
1237     abort ();
1238
1239   return Qnil; /* not reached */
1240 }
1241
1242 \f
1243 /************************************************************************/
1244 /*                       Coding category functions                      */
1245 /************************************************************************/
1246
1247 static int
1248 decode_coding_category (Lisp_Object symbol)
1249 {
1250   int i;
1251
1252   CHECK_SYMBOL (symbol);
1253   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1254     if (EQ (coding_category_symbol[i], symbol))
1255       return i;
1256
1257   signal_simple_error ("Unrecognized coding category", symbol);
1258   return 0; /* not reached */
1259 }
1260
1261 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1262 Return a list of all recognized coding categories.
1263 */
1264        ())
1265 {
1266   int i;
1267   Lisp_Object list = Qnil;
1268
1269   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1270     list = Fcons (coding_category_symbol[i], list);
1271   return list;
1272 }
1273
1274 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1275 Change the priority order of the coding categories.
1276 LIST should be list of coding categories, in descending order of
1277 priority.  Unspecified coding categories will be lower in priority
1278 than all specified ones, in the same relative order they were in
1279 previously.
1280 */
1281        (list))
1282 {
1283   int category_to_priority[CODING_CATEGORY_LAST + 1];
1284   int i, j;
1285   Lisp_Object rest;
1286
1287   /* First generate a list that maps coding categories to priorities. */
1288
1289   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1290     category_to_priority[i] = -1;
1291
1292   /* Highest priority comes from the specified list. */
1293   i = 0;
1294   EXTERNAL_LIST_LOOP (rest, list)
1295     {
1296       int cat = decode_coding_category (XCAR (rest));
1297
1298       if (category_to_priority[cat] >= 0)
1299         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1300       category_to_priority[cat] = i++;
1301     }
1302
1303   /* Now go through the existing categories by priority to retrieve
1304      the categories not yet specified and preserve their priority
1305      order. */
1306   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1307     {
1308       int cat = coding_category_by_priority[j];
1309       if (category_to_priority[cat] < 0)
1310         category_to_priority[cat] = i++;
1311     }
1312
1313   /* Now we need to construct the inverse of the mapping we just
1314      constructed. */
1315
1316   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1317     coding_category_by_priority[category_to_priority[i]] = i;
1318
1319   /* Phew!  That was confusing. */
1320   return Qnil;
1321 }
1322
1323 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1324 Return a list of coding categories in descending order of priority.
1325 */
1326        ())
1327 {
1328   int i;
1329   Lisp_Object list = Qnil;
1330
1331   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1332     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1333                   list);
1334   return list;
1335 }
1336
1337 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1338 Change the coding system associated with a coding category.
1339 */
1340        (coding_category, coding_system))
1341 {
1342   int cat = decode_coding_category (coding_category);
1343
1344   coding_system = Fget_coding_system (coding_system);
1345   coding_category_system[cat] = coding_system;
1346   return Qnil;
1347 }
1348
1349 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1350 Return the coding system associated with a coding category.
1351 */
1352        (coding_category))
1353 {
1354   int cat = decode_coding_category (coding_category);
1355   Lisp_Object sys = coding_category_system[cat];
1356
1357   if (!NILP (sys))
1358     return XCODING_SYSTEM_NAME (sys);
1359   return Qnil;
1360 }
1361
1362 \f
1363 /************************************************************************/
1364 /*                     Detecting the encoding of data                   */
1365 /************************************************************************/
1366
1367 struct detection_state
1368 {
1369   enum eol_type eol_type;
1370   int seen_non_ascii;
1371   int mask;
1372 #ifdef MULE
1373   struct
1374     {
1375       int mask;
1376       int in_second_byte;
1377     }
1378   big5;
1379
1380   struct
1381     {
1382       int mask;
1383       int in_second_byte;
1384     }
1385   shift_jis;
1386
1387   struct
1388     {
1389       int mask;
1390       int in_byte;
1391   }
1392   ucs4;
1393
1394   struct
1395     {
1396       int mask;
1397       int in_byte;
1398     }
1399   utf8;
1400
1401   struct
1402     {
1403       int mask;
1404       int initted;
1405       struct iso2022_decoder iso;
1406       unsigned int flags;
1407       int high_byte_count;
1408       unsigned int saw_single_shift:1;
1409     }
1410   iso2022;
1411 #endif
1412   struct
1413     {
1414       int seen_anything;
1415       int just_saw_cr;
1416     }
1417   eol;
1418 };
1419
1420 static int
1421 acceptable_control_char_p (int c)
1422 {
1423   switch (c)
1424     {
1425       /* Allow and ignore control characters that you might
1426          reasonably see in a text file */
1427     case '\r':
1428     case '\n':
1429     case '\t':
1430     case  7: /* bell */
1431     case  8: /* backspace */
1432     case 11: /* vertical tab */
1433     case 12: /* form feed */
1434     case 26: /* MS-DOS C-z junk */
1435     case 31: /* '^_' -- for info */
1436       return 1;
1437     default:
1438       return 0;
1439     }
1440 }
1441
1442 static int
1443 mask_has_at_most_one_bit_p (int mask)
1444 {
1445   /* Perhaps the only thing useful you learn from intensive Microsoft
1446      technical interviews */
1447   return (mask & (mask - 1)) == 0;
1448 }
1449
1450 static enum eol_type
1451 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1452                  unsigned int n)
1453 {
1454   int c;
1455
1456   while (n--)
1457     {
1458       c = *src++;
1459       if (c == '\r')
1460         st->eol.just_saw_cr = 1;
1461       else
1462         {
1463           if (c == '\n')
1464             {
1465               if (st->eol.just_saw_cr)
1466                 return EOL_CRLF;
1467               else if (st->eol.seen_anything)
1468                 return EOL_LF;
1469             }
1470           else if (st->eol.just_saw_cr)
1471             return EOL_CR;
1472           st->eol.just_saw_cr = 0;
1473         }
1474       st->eol.seen_anything = 1;
1475     }
1476
1477   return EOL_AUTODETECT;
1478 }
1479
1480 /* Attempt to determine the encoding and EOL type of the given text.
1481    Before calling this function for the first type, you must initialize
1482    st->eol_type as appropriate and initialize st->mask to ~0.
1483
1484    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1485    not yet known.
1486
1487    st->mask holds the determined coding category mask, or ~0 if only
1488    ASCII has been seen so far.
1489
1490    Returns:
1491
1492    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1493         is present in st->mask
1494    1 == definitive answers are here for both st->eol_type and st->mask
1495 */
1496
1497 static int
1498 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1499                     unsigned int n, int just_do_eol)
1500 {
1501   int c;
1502
1503   if (st->eol_type == EOL_AUTODETECT)
1504     st->eol_type = detect_eol_type (st, src, n);
1505
1506   if (just_do_eol)
1507     return st->eol_type != EOL_AUTODETECT;
1508
1509   if (!st->seen_non_ascii)
1510     {
1511       for (; n; n--, src++)
1512         {
1513           c = *src;
1514           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1515             {
1516               st->seen_non_ascii = 1;
1517 #ifdef MULE
1518               st->shift_jis.mask = ~0;
1519               st->big5.mask = ~0;
1520               st->ucs4.mask = ~0;
1521               st->utf8.mask = ~0;
1522               st->iso2022.mask = ~0;
1523 #endif
1524               break;
1525             }
1526         }
1527     }
1528
1529   if (!n)
1530     return 0;
1531 #ifdef MULE
1532   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1533     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1534   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1535     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1536   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1537     st->big5.mask = detect_coding_big5 (st, src, n);
1538   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1539     st->utf8.mask = detect_coding_utf8 (st, src, n);
1540   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1541     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1542
1543   st->mask
1544     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1545     | st->utf8.mask | st->ucs4.mask;
1546 #endif
1547   {
1548     int retval = mask_has_at_most_one_bit_p (st->mask);
1549     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1550     return retval && st->eol_type != EOL_AUTODETECT;
1551   }
1552 }
1553
1554 static Lisp_Object
1555 coding_system_from_mask (int mask)
1556 {
1557   if (mask == ~0)
1558     {
1559       /* If the file was entirely or basically ASCII, use the
1560          default value of `buffer-file-coding-system'. */
1561       Lisp_Object retval =
1562         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1563       if (!NILP (retval))
1564         {
1565           retval = Ffind_coding_system (retval);
1566           if (NILP (retval))
1567             {
1568               warn_when_safe
1569                 (Qbad_variable, Qwarning,
1570                  "Invalid `default-buffer-file-coding-system', set to nil");
1571               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1572             }
1573         }
1574       if (NILP (retval))
1575         retval = Fget_coding_system (Qraw_text);
1576       return retval;
1577     }
1578   else
1579     {
1580       int i;
1581       int cat = -1;
1582 #ifdef MULE
1583       mask = postprocess_iso2022_mask (mask);
1584 #endif
1585       /* Look through the coding categories by priority and find
1586          the first one that is allowed. */
1587       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1588         {
1589           cat = coding_category_by_priority[i];
1590           if ((mask & (1 << cat)) &&
1591               !NILP (coding_category_system[cat]))
1592             break;
1593         }
1594       if (cat >= 0)
1595         return coding_category_system[cat];
1596       else
1597         return Fget_coding_system (Qraw_text);
1598     }
1599 }
1600
1601 /* Given a seekable read stream and potential coding system and EOL type
1602    as specified, do any autodetection that is called for.  If the
1603    coding system and/or EOL type are not autodetect, they will be left
1604    alone; but this function will never return an autodetect coding system
1605    or EOL type.
1606
1607    This function does not automatically fetch subsidiary coding systems;
1608    that should be unnecessary with the explicit eol-type argument. */
1609
1610 void
1611 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1612                               enum eol_type *eol_type_in_out)
1613 {
1614   struct detection_state decst;
1615
1616   if (*eol_type_in_out == EOL_AUTODETECT)
1617     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1618
1619   xzero (decst);
1620   decst.eol_type = *eol_type_in_out;
1621   decst.mask = ~0;
1622
1623   /* If autodetection is called for, do it now. */
1624   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1625       *eol_type_in_out == EOL_AUTODETECT)
1626     {
1627       unsigned char random_buffer[4096];
1628       int nread;
1629       Lisp_Object coding_system = Qnil;
1630
1631       nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1632       if (nread)
1633         {
1634           unsigned char *cp = random_buffer;
1635
1636           while (cp < random_buffer + nread)
1637             {
1638               if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1639                   (*cp++ == 'o') && (cp < random_buffer + nread) &&
1640                   (*cp++ == 'd') && (cp < random_buffer + nread) &&
1641                   (*cp++ == 'i') && (cp < random_buffer + nread) &&
1642                   (*cp++ == 'n') && (cp < random_buffer + nread) &&
1643                   (*cp++ == 'g') && (cp < random_buffer + nread) &&
1644                   (*cp++ == ':') && (cp < random_buffer + nread))
1645                 {
1646                   unsigned char coding_system_name[4096 - 6];
1647                   unsigned char *np = coding_system_name;
1648
1649                   while ( (cp < random_buffer + nread)
1650                           && ((*cp == ' ') || (*cp == '\t')) )
1651                     {
1652                       cp++;
1653                     }
1654                   while ( (cp < random_buffer + nread) &&
1655                           (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1656                     {
1657                       *np++ = *cp++;
1658                     }
1659                   *np = 0;
1660                   coding_system
1661                     = Ffind_coding_system (intern (coding_system_name));
1662                   break;
1663                 }
1664             }
1665           if (EQ(coding_system, Qnil))
1666             do{
1667               if (detect_coding_type (&decst, random_buffer, nread,
1668                                       XCODING_SYSTEM_TYPE (*codesys_in_out)
1669                                       != CODESYS_AUTODETECT))
1670                 break;
1671               nread = Lstream_read (stream,
1672                                     random_buffer, sizeof (random_buffer));
1673               if (!nread)
1674                 break;
1675             } while(1);
1676         }
1677       *eol_type_in_out = decst.eol_type;
1678       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1679         {
1680           if (EQ(coding_system, Qnil))
1681             *codesys_in_out = coding_system_from_mask (decst.mask);
1682           else
1683             *codesys_in_out = coding_system;
1684         }
1685     }
1686   /* If we absolutely can't determine the EOL type, just assume LF. */
1687   if (*eol_type_in_out == EOL_AUTODETECT)
1688     *eol_type_in_out = EOL_LF;
1689
1690   Lstream_rewind (stream);
1691 }
1692
1693 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1694 Detect coding system of the text in the region between START and END.
1695 Returned a list of possible coding systems ordered by priority.
1696 If only ASCII characters are found, it returns 'undecided or one of
1697 its subsidiary coding systems according to a detected end-of-line
1698 type.  Optional arg BUFFER defaults to the current buffer.
1699 */
1700        (start, end, buffer))
1701 {
1702   Lisp_Object val = Qnil;
1703   struct buffer *buf = decode_buffer (buffer, 0);
1704   Bufpos b, e;
1705   Lisp_Object instream, lb_instream;
1706   Lstream *istr, *lb_istr;
1707   struct detection_state decst;
1708   struct gcpro gcpro1, gcpro2;
1709
1710   get_buffer_range_char (buf, start, end, &b, &e, 0);
1711   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1712   lb_istr = XLSTREAM (lb_instream);
1713   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1714   istr = XLSTREAM (instream);
1715   GCPRO2 (instream, lb_instream);
1716   xzero (decst);
1717   decst.eol_type = EOL_AUTODETECT;
1718   decst.mask = ~0;
1719   while (1)
1720     {
1721       unsigned char random_buffer[4096];
1722       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1723
1724       if (!nread)
1725         break;
1726       if (detect_coding_type (&decst, random_buffer, nread, 0))
1727         break;
1728     }
1729
1730   if (decst.mask == ~0)
1731     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1732                                     decst.eol_type);
1733   else
1734     {
1735       int i;
1736
1737       val = Qnil;
1738 #ifdef MULE
1739       decst.mask = postprocess_iso2022_mask (decst.mask);
1740 #endif
1741       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1742         {
1743           int sys = coding_category_by_priority[i];
1744           if (decst.mask & (1 << sys))
1745             {
1746               Lisp_Object codesys = coding_category_system[sys];
1747               if (!NILP (codesys))
1748                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1749               val = Fcons (codesys, val);
1750             }
1751         }
1752     }
1753   Lstream_close (istr);
1754   UNGCPRO;
1755   Lstream_delete (istr);
1756   Lstream_delete (lb_istr);
1757   return val;
1758 }
1759
1760 \f
1761 /************************************************************************/
1762 /*           Converting to internal Mule format ("decoding")            */
1763 /************************************************************************/
1764
1765 /* A decoding stream is a stream used for decoding text (i.e.
1766    converting from some external format to internal format).
1767    The decoding-stream object keeps track of the actual coding
1768    stream, the stream that is at the other end, and data that
1769    needs to be persistent across the lifetime of the stream. */
1770
1771 /* Handle the EOL stuff related to just-read-in character C.
1772    EOL_TYPE is the EOL type of the coding stream.
1773    FLAGS is the current value of FLAGS in the coding stream, and may
1774    be modified by this macro.  (The macro only looks at the
1775    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1776    bytes are to be written.  You need to also define a local goto
1777    label "label_continue_loop" that is at the end of the main
1778    character-reading loop.
1779
1780    If C is a CR character, then this macro handles it entirely and
1781    jumps to label_continue_loop.  Otherwise, this macro does not add
1782    anything to DST, and continues normally.  You should continue
1783    processing C normally after this macro. */
1784
1785 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1786 do {                                                            \
1787   if (c == '\r')                                                \
1788     {                                                           \
1789       if (eol_type == EOL_CR)                                   \
1790         Dynarr_add (dst, '\n');                                 \
1791       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1792         Dynarr_add (dst, c);                                    \
1793       else                                                      \
1794         flags |= CODING_STATE_CR;                               \
1795       goto label_continue_loop;                                 \
1796     }                                                           \
1797   else if (flags & CODING_STATE_CR)                             \
1798     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1799       if (c != '\n')                                            \
1800         Dynarr_add (dst, '\r');                                 \
1801       flags &= ~CODING_STATE_CR;                                \
1802     }                                                           \
1803 } while (0)
1804
1805 /* C should be a binary character in the range 0 - 255; convert
1806    to internal format and add to Dynarr DST. */
1807
1808 #ifdef UTF2000
1809 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1810 do {                                            \
1811   if (BYTE_ASCII_P (c))                         \
1812     Dynarr_add (dst, c);                        \
1813   else                                          \
1814     {                                           \
1815       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1816       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1817     }                                           \
1818 } while (0)
1819
1820 INLINE void
1821 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1822 {
1823   if ( c <= 0x7f )
1824     {
1825       Dynarr_add (dst, c);
1826     }
1827   else if ( c <= 0x7ff )
1828     {
1829       Dynarr_add (dst, (c >> 6) | 0xc0);
1830       Dynarr_add (dst, (c & 0x3f) | 0x80);
1831     }
1832   else if ( c <= 0xffff )
1833     {
1834       Dynarr_add (dst,  (c >> 12) | 0xe0);
1835       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1836       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1837     }
1838   else if ( c <= 0x1fffff )
1839     {
1840       Dynarr_add (dst,  (c >> 18) | 0xf0);
1841       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1842       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1843       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1844     }
1845   else if ( c <= 0x3ffffff )
1846     {
1847       Dynarr_add (dst,  (c >> 24) | 0xf8);
1848       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1849       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1850       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1851       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1852     }
1853   else
1854     {
1855       Dynarr_add (dst,  (c >> 30) | 0xfc);
1856       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1857       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1858       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1859       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1860       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1861     }
1862 }
1863 #else
1864 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1865 do {                                            \
1866   if (BYTE_ASCII_P (c))                         \
1867     Dynarr_add (dst, c);                        \
1868   else if (BYTE_C1_P (c))                       \
1869     {                                           \
1870       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1871       Dynarr_add (dst, c + 0x20);               \
1872     }                                           \
1873   else                                          \
1874     {                                           \
1875       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1876       Dynarr_add (dst, c);                      \
1877     }                                           \
1878 } while (0)
1879 #endif
1880
1881 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1882 do {                                    \
1883   if (ch)                               \
1884     {                                   \
1885       DECODE_ADD_BINARY_CHAR (ch, dst); \
1886       ch = 0;                           \
1887     }                                   \
1888 } while (0)
1889
1890 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1891 do {                                    \
1892   if (flags & CODING_STATE_END)         \
1893     {                                   \
1894       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1895       if (flags & CODING_STATE_CR)      \
1896         Dynarr_add (dst, '\r');         \
1897     }                                   \
1898 } while (0)
1899
1900 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1901
1902 struct decoding_stream
1903 {
1904   /* Coding system that governs the conversion. */
1905   Lisp_Coding_System *codesys;
1906
1907   /* Stream that we read the encoded data from or
1908      write the decoded data to. */
1909   Lstream *other_end;
1910
1911   /* If we are reading, then we can return only a fixed amount of
1912      data, so if the conversion resulted in too much data, we store it
1913      here for retrieval the next time around. */
1914   unsigned_char_dynarr *runoff;
1915
1916   /* FLAGS holds flags indicating the current state of the decoding.
1917      Some of these flags are dependent on the coding system. */
1918   unsigned int flags;
1919
1920   /* CH holds a partially built-up character.  Since we only deal
1921      with one- and two-byte characters at the moment, we only use
1922      this to store the first byte of a two-byte character. */
1923   unsigned int ch;
1924
1925   /* EOL_TYPE specifies the type of end-of-line conversion that
1926      currently applies.  We need to keep this separate from the
1927      EOL type stored in CODESYS because the latter might indicate
1928      automatic EOL-type detection while the former will always
1929      indicate a particular EOL type. */
1930   enum eol_type eol_type;
1931 #ifdef MULE
1932   /* Additional ISO2022 information.  We define the structure above
1933      because it's also needed by the detection routines. */
1934   struct iso2022_decoder iso2022;
1935
1936   /* Additional information (the state of the running CCL program)
1937      used by the CCL decoder. */
1938   struct ccl_program ccl;
1939
1940   /* counter for UTF-8 or UCS-4 */
1941   unsigned char counter;
1942 #endif
1943   struct detection_state decst;
1944 };
1945
1946 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1947 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1948 static int decoding_rewinder   (Lstream *stream);
1949 static int decoding_seekable_p (Lstream *stream);
1950 static int decoding_flusher    (Lstream *stream);
1951 static int decoding_closer     (Lstream *stream);
1952
1953 static Lisp_Object decoding_marker (Lisp_Object stream,
1954                                     void (*markobj) (Lisp_Object));
1955
1956 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1957                                sizeof (struct decoding_stream));
1958
1959 static Lisp_Object
1960 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1961 {
1962   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1963   Lisp_Object str_obj;
1964
1965   /* We do not need to mark the coding systems or charsets stored
1966      within the stream because they are stored in a global list
1967      and automatically marked. */
1968
1969   XSETLSTREAM (str_obj, str);
1970   markobj (str_obj);
1971   if (str->imp->marker)
1972     return (str->imp->marker) (str_obj, markobj);
1973   else
1974     return Qnil;
1975 }
1976
1977 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1978    so we read data from the other end, decode it, and store it into DATA. */
1979
1980 static int
1981 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1982 {
1983   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1984   unsigned char *orig_data = data;
1985   int read_size;
1986   int error_occurred = 0;
1987
1988   /* We need to interface to mule_decode(), which expects to take some
1989      amount of data and store the result into a Dynarr.  We have
1990      mule_decode() store into str->runoff, and take data from there
1991      as necessary. */
1992
1993   /* We loop until we have enough data, reading chunks from the other
1994      end and decoding it. */
1995   while (1)
1996     {
1997       /* Take data from the runoff if we can.  Make sure to take at
1998          most SIZE bytes, and delete the data from the runoff. */
1999       if (Dynarr_length (str->runoff) > 0)
2000         {
2001           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2002           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2003           Dynarr_delete_many (str->runoff, 0, chunk);
2004           data += chunk;
2005           size -= chunk;
2006         }
2007
2008       if (size == 0)
2009         break; /* No more room for data */
2010
2011       if (str->flags & CODING_STATE_END)
2012         /* This means that on the previous iteration, we hit the EOF on
2013            the other end.  We loop once more so that mule_decode() can
2014            output any final stuff it may be holding, or any "go back
2015            to a sane state" escape sequences. (This latter makes sense
2016            during encoding.) */
2017         break;
2018
2019       /* Exhausted the runoff, so get some more.  DATA has at least
2020          SIZE bytes left of storage in it, so it's OK to read directly
2021          into it.  (We'll be overwriting above, after we've decoded it
2022          into the runoff.) */
2023       read_size = Lstream_read (str->other_end, data, size);
2024       if (read_size < 0)
2025         {
2026           error_occurred = 1;
2027           break;
2028         }
2029       if (read_size == 0)
2030         /* There might be some more end data produced in the translation.
2031            See the comment above. */
2032         str->flags |= CODING_STATE_END;
2033       mule_decode (stream, data, str->runoff, read_size);
2034     }
2035
2036   if (data - orig_data == 0)
2037     return error_occurred ? -1 : 0;
2038   else
2039     return data - orig_data;
2040 }
2041
2042 static int
2043 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2044 {
2045   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2046   int retval;
2047
2048   /* Decode all our data into the runoff, and then attempt to write
2049      it all out to the other end.  Remove whatever chunk we succeeded
2050      in writing. */
2051   mule_decode (stream, data, str->runoff, size);
2052   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2053                           Dynarr_length (str->runoff));
2054   if (retval > 0)
2055     Dynarr_delete_many (str->runoff, 0, retval);
2056   /* Do NOT return retval.  The return value indicates how much
2057      of the incoming data was written, not how many bytes were
2058      written. */
2059   return size;
2060 }
2061
2062 static void
2063 reset_decoding_stream (struct decoding_stream *str)
2064 {
2065 #ifdef MULE
2066   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2067     {
2068       Lisp_Object coding_system;
2069       XSETCODING_SYSTEM (coding_system, str->codesys);
2070       reset_iso2022 (coding_system, &str->iso2022);
2071     }
2072   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2073     {
2074       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2075     }
2076   str->counter = 0;
2077 #endif /* MULE */
2078   str->flags = str->ch = 0;
2079 }
2080
2081 static int
2082 decoding_rewinder (Lstream *stream)
2083 {
2084   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2085   reset_decoding_stream (str);
2086   Dynarr_reset (str->runoff);
2087   return Lstream_rewind (str->other_end);
2088 }
2089
2090 static int
2091 decoding_seekable_p (Lstream *stream)
2092 {
2093   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2094   return Lstream_seekable_p (str->other_end);
2095 }
2096
2097 static int
2098 decoding_flusher (Lstream *stream)
2099 {
2100   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2101   return Lstream_flush (str->other_end);
2102 }
2103
2104 static int
2105 decoding_closer (Lstream *stream)
2106 {
2107   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2108   if (stream->flags & LSTREAM_FL_WRITE)
2109     {
2110       str->flags |= CODING_STATE_END;
2111       decoding_writer (stream, 0, 0);
2112     }
2113   Dynarr_free (str->runoff);
2114 #ifdef MULE
2115 #ifdef ENABLE_COMPOSITE_CHARS
2116   if (str->iso2022.composite_chars)
2117     Dynarr_free (str->iso2022.composite_chars);
2118 #endif
2119 #endif
2120   return Lstream_close (str->other_end);
2121 }
2122
2123 Lisp_Object
2124 decoding_stream_coding_system (Lstream *stream)
2125 {
2126   Lisp_Object coding_system;
2127   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2128
2129   XSETCODING_SYSTEM (coding_system, str->codesys);
2130   return subsidiary_coding_system (coding_system, str->eol_type);
2131 }
2132
2133 void
2134 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2135 {
2136   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2137   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2138   str->codesys = cs;
2139   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2140     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2141   reset_decoding_stream (str);
2142 }
2143
2144 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2145    stream for writing, no automatic code detection will be performed.
2146    The reason for this is that automatic code detection requires a
2147    seekable input.  Things will also fail if you open a decoding
2148    stream for reading using a non-fully-specified coding system and
2149    a non-seekable input stream. */
2150
2151 static Lisp_Object
2152 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2153                         CONST char *mode)
2154 {
2155   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2156   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2157   Lisp_Object obj;
2158
2159   xzero (*str);
2160   str->other_end = stream;
2161   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2162   str->eol_type = EOL_AUTODETECT;
2163   if (!strcmp (mode, "r")
2164       && Lstream_seekable_p (stream))
2165     /* We can determine the coding system now. */
2166     determine_real_coding_system (stream, &codesys, &str->eol_type);
2167   set_decoding_stream_coding_system (lstr, codesys);
2168   str->decst.eol_type = str->eol_type;
2169   str->decst.mask = ~0;
2170   XSETLSTREAM (obj, lstr);
2171   return obj;
2172 }
2173
2174 Lisp_Object
2175 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2176 {
2177   return make_decoding_stream_1 (stream, codesys, "r");
2178 }
2179
2180 Lisp_Object
2181 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2182 {
2183   return make_decoding_stream_1 (stream, codesys, "w");
2184 }
2185
2186 /* Note: the decode_coding_* functions all take the same
2187    arguments as mule_decode(), which is to say some SRC data of
2188    size N, which is to be stored into dynamic array DST.
2189    DECODING is the stream within which the decoding is
2190    taking place, but no data is actually read from or
2191    written to that stream; that is handled in decoding_reader()
2192    or decoding_writer().  This allows the same functions to
2193    be used for both reading and writing. */
2194
2195 static void
2196 mule_decode (Lstream *decoding, CONST unsigned char *src,
2197              unsigned_char_dynarr *dst, unsigned int n)
2198 {
2199   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2200
2201   /* If necessary, do encoding-detection now.  We do this when
2202      we're a writing stream or a non-seekable reading stream,
2203      meaning that we can't just process the whole input,
2204      rewind, and start over. */
2205
2206   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2207       str->eol_type == EOL_AUTODETECT)
2208     {
2209       Lisp_Object codesys;
2210
2211       XSETCODING_SYSTEM (codesys, str->codesys);
2212       detect_coding_type (&str->decst, src, n,
2213                           CODING_SYSTEM_TYPE (str->codesys) !=
2214                           CODESYS_AUTODETECT);
2215       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2216           str->decst.mask != ~0)
2217         /* #### This is cheesy.  What we really ought to do is
2218            buffer up a certain amount of data so as to get a
2219            less random result. */
2220         codesys = coding_system_from_mask (str->decst.mask);
2221       str->eol_type = str->decst.eol_type;
2222       if (XCODING_SYSTEM (codesys) != str->codesys)
2223         {
2224           /* Preserve the CODING_STATE_END flag in case it was set.
2225              If we erase it, bad things might happen. */
2226           int was_end = str->flags & CODING_STATE_END;
2227           set_decoding_stream_coding_system (decoding, codesys);
2228           if (was_end)
2229             str->flags |= CODING_STATE_END;
2230         }
2231     }
2232
2233   switch (CODING_SYSTEM_TYPE (str->codesys))
2234     {
2235 #ifdef DEBUG_XEMACS
2236     case CODESYS_INTERNAL:
2237       Dynarr_add_many (dst, src, n);
2238       break;
2239 #endif
2240     case CODESYS_AUTODETECT:
2241       /* If we got this far and still haven't decided on the coding
2242          system, then do no conversion. */
2243     case CODESYS_NO_CONVERSION:
2244       decode_coding_no_conversion (decoding, src, dst, n);
2245       break;
2246 #ifdef MULE
2247     case CODESYS_SHIFT_JIS:
2248       decode_coding_sjis (decoding, src, dst, n);
2249       break;
2250     case CODESYS_BIG5:
2251       decode_coding_big5 (decoding, src, dst, n);
2252       break;
2253     case CODESYS_UCS4:
2254       decode_coding_ucs4 (decoding, src, dst, n);
2255       break;
2256     case CODESYS_UTF8:
2257       decode_coding_utf8 (decoding, src, dst, n);
2258       break;
2259     case CODESYS_CCL:
2260       str->ccl.last_block = str->flags & CODING_STATE_END;
2261       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2262       break;
2263     case CODESYS_ISO2022:
2264       decode_coding_iso2022 (decoding, src, dst, n);
2265       break;
2266 #endif /* MULE */
2267     default:
2268       abort ();
2269     }
2270 }
2271
2272 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2273 Decode the text between START and END which is encoded in CODING-SYSTEM.
2274 This is useful if you've read in encoded text from a file without decoding
2275 it (e.g. you read in a JIS-formatted file but used the `binary' or
2276 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2277 Return length of decoded text.
2278 BUFFER defaults to the current buffer if unspecified.
2279 */
2280        (start, end, coding_system, buffer))
2281 {
2282   Bufpos b, e;
2283   struct buffer *buf = decode_buffer (buffer, 0);
2284   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2285   Lstream *istr, *ostr;
2286   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2287
2288   get_buffer_range_char (buf, start, end, &b, &e, 0);
2289
2290   barf_if_buffer_read_only (buf, b, e);
2291
2292   coding_system = Fget_coding_system (coding_system);
2293   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2294   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2295   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2296                                               coding_system);
2297   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2298                                            Fget_coding_system (Qbinary));
2299   istr = XLSTREAM (instream);
2300   ostr = XLSTREAM (outstream);
2301   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2302
2303   /* The chain of streams looks like this:
2304
2305      [BUFFER] <----- send through
2306                      ------> [ENCODE AS BINARY]
2307                              ------> [DECODE AS SPECIFIED]
2308                                      ------> [BUFFER]
2309    */
2310
2311   while (1)
2312     {
2313       char tempbuf[1024]; /* some random amount */
2314       Bufpos newpos, even_newer_pos;
2315       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2316       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2317
2318       if (!size_in_bytes)
2319         break;
2320       newpos = lisp_buffer_stream_startpos (istr);
2321       Lstream_write (ostr, tempbuf, size_in_bytes);
2322       even_newer_pos = lisp_buffer_stream_startpos (istr);
2323       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2324                            even_newer_pos, 0);
2325     }
2326   Lstream_close (istr);
2327   Lstream_close (ostr);
2328   UNGCPRO;
2329   Lstream_delete (istr);
2330   Lstream_delete (ostr);
2331   Lstream_delete (XLSTREAM (de_outstream));
2332   Lstream_delete (XLSTREAM (lb_outstream));
2333   return Qnil;
2334 }
2335
2336 \f
2337 /************************************************************************/
2338 /*           Converting to an external encoding ("encoding")            */
2339 /************************************************************************/
2340
2341 /* An encoding stream is an output stream.  When you create the
2342    stream, you specify the coding system that governs the encoding
2343    and another stream that the resulting encoded data is to be
2344    sent to, and then start sending data to it. */
2345
2346 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2347
2348 struct encoding_stream
2349 {
2350   /* Coding system that governs the conversion. */
2351   Lisp_Coding_System *codesys;
2352
2353   /* Stream that we read the encoded data from or
2354      write the decoded data to. */
2355   Lstream *other_end;
2356
2357   /* If we are reading, then we can return only a fixed amount of
2358      data, so if the conversion resulted in too much data, we store it
2359      here for retrieval the next time around. */
2360   unsigned_char_dynarr *runoff;
2361
2362   /* FLAGS holds flags indicating the current state of the encoding.
2363      Some of these flags are dependent on the coding system. */
2364   unsigned int flags;
2365
2366   /* CH holds a partially built-up character.  Since we only deal
2367      with one- and two-byte characters at the moment, we only use
2368      this to store the first byte of a two-byte character. */
2369   unsigned int ch;
2370 #ifdef MULE
2371   /* Additional information used by the ISO2022 encoder. */
2372   struct
2373     {
2374       /* CHARSET holds the character sets currently assigned to the G0
2375          through G3 registers.  It is initialized from the array
2376          INITIAL_CHARSET in CODESYS. */
2377       Lisp_Object charset[4];
2378
2379       /* Which registers are currently invoked into the left (GL) and
2380          right (GR) halves of the 8-bit encoding space? */
2381       int register_left, register_right;
2382
2383       /* Whether we need to explicitly designate the charset in the
2384          G? register before using it.  It is initialized from the
2385          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2386       unsigned char force_charset_on_output[4];
2387
2388       /* Other state variables that need to be preserved across
2389          invocations. */
2390       Lisp_Object current_charset;
2391       int current_half;
2392       int current_char_boundary;
2393     } iso2022;
2394
2395   /* Additional information (the state of the running CCL program)
2396      used by the CCL encoder. */
2397   struct ccl_program ccl;
2398 #endif /* MULE */
2399 };
2400
2401 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2402 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2403                             size_t size);
2404 static int encoding_rewinder   (Lstream *stream);
2405 static int encoding_seekable_p (Lstream *stream);
2406 static int encoding_flusher    (Lstream *stream);
2407 static int encoding_closer     (Lstream *stream);
2408
2409 static Lisp_Object encoding_marker (Lisp_Object stream,
2410                                     void (*markobj) (Lisp_Object));
2411
2412 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2413                                sizeof (struct encoding_stream));
2414
2415 static Lisp_Object
2416 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2417 {
2418   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2419   Lisp_Object str_obj;
2420
2421   /* We do not need to mark the coding systems or charsets stored
2422      within the stream because they are stored in a global list
2423      and automatically marked. */
2424
2425   XSETLSTREAM (str_obj, str);
2426   markobj (str_obj);
2427   if (str->imp->marker)
2428     return (str->imp->marker) (str_obj, markobj);
2429   else
2430     return Qnil;
2431 }
2432
2433 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2434    so we read data from the other end, encode it, and store it into DATA. */
2435
2436 static int
2437 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2438 {
2439   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2440   unsigned char *orig_data = data;
2441   int read_size;
2442   int error_occurred = 0;
2443
2444   /* We need to interface to mule_encode(), which expects to take some
2445      amount of data and store the result into a Dynarr.  We have
2446      mule_encode() store into str->runoff, and take data from there
2447      as necessary. */
2448
2449   /* We loop until we have enough data, reading chunks from the other
2450      end and encoding it. */
2451   while (1)
2452     {
2453       /* Take data from the runoff if we can.  Make sure to take at
2454          most SIZE bytes, and delete the data from the runoff. */
2455       if (Dynarr_length (str->runoff) > 0)
2456         {
2457           int chunk = min ((int) size, Dynarr_length (str->runoff));
2458           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2459           Dynarr_delete_many (str->runoff, 0, chunk);
2460           data += chunk;
2461           size -= chunk;
2462         }
2463
2464       if (size == 0)
2465         break; /* No more room for data */
2466
2467       if (str->flags & CODING_STATE_END)
2468         /* This means that on the previous iteration, we hit the EOF on
2469            the other end.  We loop once more so that mule_encode() can
2470            output any final stuff it may be holding, or any "go back
2471            to a sane state" escape sequences. (This latter makes sense
2472            during encoding.) */
2473         break;
2474
2475       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2476          left of storage in it, so it's OK to read directly into it.
2477          (We'll be overwriting above, after we've encoded it into the
2478          runoff.) */
2479       read_size = Lstream_read (str->other_end, data, size);
2480       if (read_size < 0)
2481         {
2482           error_occurred = 1;
2483           break;
2484         }
2485       if (read_size == 0)
2486         /* There might be some more end data produced in the translation.
2487            See the comment above. */
2488         str->flags |= CODING_STATE_END;
2489       mule_encode (stream, data, str->runoff, read_size);
2490     }
2491
2492   if (data == orig_data)
2493     return error_occurred ? -1 : 0;
2494   else
2495     return data - orig_data;
2496 }
2497
2498 static int
2499 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2500 {
2501   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2502   int retval;
2503
2504   /* Encode all our data into the runoff, and then attempt to write
2505      it all out to the other end.  Remove whatever chunk we succeeded
2506      in writing. */
2507   mule_encode (stream, data, str->runoff, size);
2508   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2509                           Dynarr_length (str->runoff));
2510   if (retval > 0)
2511     Dynarr_delete_many (str->runoff, 0, retval);
2512   /* Do NOT return retval.  The return value indicates how much
2513      of the incoming data was written, not how many bytes were
2514      written. */
2515   return size;
2516 }
2517
2518 static void
2519 reset_encoding_stream (struct encoding_stream *str)
2520 {
2521 #ifdef MULE
2522   switch (CODING_SYSTEM_TYPE (str->codesys))
2523     {
2524     case CODESYS_ISO2022:
2525       {
2526         int i;
2527
2528         for (i = 0; i < 4; i++)
2529           {
2530             str->iso2022.charset[i] =
2531               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2532             str->iso2022.force_charset_on_output[i] =
2533               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2534           }
2535         str->iso2022.register_left = 0;
2536         str->iso2022.register_right = 1;
2537         str->iso2022.current_charset = Qnil;
2538         str->iso2022.current_half = 0;
2539 #ifdef UTF2000
2540         str->iso2022.current_char_boundary = 0;
2541 #else
2542         str->iso2022.current_char_boundary = 1;
2543 #endif
2544         break;
2545       }
2546     case CODESYS_CCL:
2547       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2548       break;
2549     default:
2550       break;
2551     }
2552 #endif /* MULE */
2553
2554   str->flags = str->ch = 0;
2555 }
2556
2557 static int
2558 encoding_rewinder (Lstream *stream)
2559 {
2560   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2561   reset_encoding_stream (str);
2562   Dynarr_reset (str->runoff);
2563   return Lstream_rewind (str->other_end);
2564 }
2565
2566 static int
2567 encoding_seekable_p (Lstream *stream)
2568 {
2569   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2570   return Lstream_seekable_p (str->other_end);
2571 }
2572
2573 static int
2574 encoding_flusher (Lstream *stream)
2575 {
2576   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2577   return Lstream_flush (str->other_end);
2578 }
2579
2580 static int
2581 encoding_closer (Lstream *stream)
2582 {
2583   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2584   if (stream->flags & LSTREAM_FL_WRITE)
2585     {
2586       str->flags |= CODING_STATE_END;
2587       encoding_writer (stream, 0, 0);
2588     }
2589   Dynarr_free (str->runoff);
2590   return Lstream_close (str->other_end);
2591 }
2592
2593 Lisp_Object
2594 encoding_stream_coding_system (Lstream *stream)
2595 {
2596   Lisp_Object coding_system;
2597   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2598
2599   XSETCODING_SYSTEM (coding_system, str->codesys);
2600   return coding_system;
2601 }
2602
2603 void
2604 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2605 {
2606   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2607   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2608   str->codesys = cs;
2609   reset_encoding_stream (str);
2610 }
2611
2612 static Lisp_Object
2613 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2614                         CONST char *mode)
2615 {
2616   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2617   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2618   Lisp_Object obj;
2619
2620   xzero (*str);
2621   str->runoff = Dynarr_new (unsigned_char);
2622   str->other_end = stream;
2623   set_encoding_stream_coding_system (lstr, codesys);
2624   XSETLSTREAM (obj, lstr);
2625   return obj;
2626 }
2627
2628 Lisp_Object
2629 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2630 {
2631   return make_encoding_stream_1 (stream, codesys, "r");
2632 }
2633
2634 Lisp_Object
2635 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2636 {
2637   return make_encoding_stream_1 (stream, codesys, "w");
2638 }
2639
2640 /* Convert N bytes of internally-formatted data stored in SRC to an
2641    external format, according to the encoding stream ENCODING.
2642    Store the encoded data into DST. */
2643
2644 static void
2645 mule_encode (Lstream *encoding, CONST unsigned char *src,
2646              unsigned_char_dynarr *dst, unsigned int n)
2647 {
2648   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2649
2650   switch (CODING_SYSTEM_TYPE (str->codesys))
2651     {
2652 #ifdef DEBUG_XEMACS
2653     case CODESYS_INTERNAL:
2654       Dynarr_add_many (dst, src, n);
2655       break;
2656 #endif
2657     case CODESYS_AUTODETECT:
2658       /* If we got this far and still haven't decided on the coding
2659          system, then do no conversion. */
2660     case CODESYS_NO_CONVERSION:
2661       encode_coding_no_conversion (encoding, src, dst, n);
2662       break;
2663 #ifdef MULE
2664     case CODESYS_SHIFT_JIS:
2665       encode_coding_sjis (encoding, src, dst, n);
2666       break;
2667     case CODESYS_BIG5:
2668       encode_coding_big5 (encoding, src, dst, n);
2669       break;
2670     case CODESYS_UCS4:
2671       encode_coding_ucs4 (encoding, src, dst, n);
2672       break;
2673     case CODESYS_UTF8:
2674       encode_coding_utf8 (encoding, src, dst, n);
2675       break;
2676     case CODESYS_CCL:
2677       str->ccl.last_block = str->flags & CODING_STATE_END;
2678       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2679       break;
2680     case CODESYS_ISO2022:
2681       encode_coding_iso2022 (encoding, src, dst, n);
2682       break;
2683 #endif /* MULE */
2684     default:
2685       abort ();
2686     }
2687 }
2688
2689 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2690 Encode the text between START and END using CODING-SYSTEM.
2691 This will, for example, convert Japanese characters into stuff such as
2692 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2693 text.  BUFFER defaults to the current buffer if unspecified.
2694 */
2695        (start, end, coding_system, buffer))
2696 {
2697   Bufpos b, e;
2698   struct buffer *buf = decode_buffer (buffer, 0);
2699   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2700   Lstream *istr, *ostr;
2701   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2702
2703   get_buffer_range_char (buf, start, end, &b, &e, 0);
2704
2705   barf_if_buffer_read_only (buf, b, e);
2706
2707   coding_system = Fget_coding_system (coding_system);
2708   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2709   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2710   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2711                                               Fget_coding_system (Qbinary));
2712   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2713                                            coding_system);
2714   istr = XLSTREAM (instream);
2715   ostr = XLSTREAM (outstream);
2716   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2717   /* The chain of streams looks like this:
2718
2719      [BUFFER] <----- send through
2720                      ------> [ENCODE AS SPECIFIED]
2721                              ------> [DECODE AS BINARY]
2722                                      ------> [BUFFER]
2723    */
2724   while (1)
2725     {
2726       char tempbuf[1024]; /* some random amount */
2727       Bufpos newpos, even_newer_pos;
2728       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2729       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2730
2731       if (!size_in_bytes)
2732         break;
2733       newpos = lisp_buffer_stream_startpos (istr);
2734       Lstream_write (ostr, tempbuf, size_in_bytes);
2735       even_newer_pos = lisp_buffer_stream_startpos (istr);
2736       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2737                            even_newer_pos, 0);
2738     }
2739
2740   {
2741     Charcount retlen =
2742       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2743     Lstream_close (istr);
2744     Lstream_close (ostr);
2745     UNGCPRO;
2746     Lstream_delete (istr);
2747     Lstream_delete (ostr);
2748     Lstream_delete (XLSTREAM (de_outstream));
2749     Lstream_delete (XLSTREAM (lb_outstream));
2750     return make_int (retlen);
2751   }
2752 }
2753
2754 #ifdef MULE
2755 \f
2756 /************************************************************************/
2757 /*                          Shift-JIS methods                           */
2758 /************************************************************************/
2759
2760 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2761    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2762    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2763    encoded by "position-code + 0x80".  A character of JISX0208
2764    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2765    position-codes are divided and shifted so that it fit in the range
2766    below.
2767
2768    --- CODE RANGE of Shift-JIS ---
2769    (character set)      (range)
2770    ASCII                0x00 .. 0x7F
2771    JISX0201-Kana        0xA0 .. 0xDF
2772    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2773             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2774    -------------------------------
2775
2776 */
2777
2778 /* Is this the first byte of a Shift-JIS two-byte char? */
2779
2780 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2781   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2782
2783 /* Is this the second byte of a Shift-JIS two-byte char? */
2784
2785 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2786   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2787
2788 #define BYTE_SJIS_KATAKANA_P(c) \
2789   ((c) >= 0xA1 && (c) <= 0xDF)
2790
2791 static int
2792 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2793                     unsigned int n)
2794 {
2795   int c;
2796
2797   while (n--)
2798     {
2799       c = *src++;
2800       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2801         return 0;
2802       if (st->shift_jis.in_second_byte)
2803         {
2804           st->shift_jis.in_second_byte = 0;
2805           if (c < 0x40)
2806             return 0;
2807         }
2808       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2809         st->shift_jis.in_second_byte = 1;
2810     }
2811   return CODING_CATEGORY_SHIFT_JIS_MASK;
2812 }
2813
2814 /* Convert Shift-JIS data to internal format. */
2815
2816 static void
2817 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2818                     unsigned_char_dynarr *dst, unsigned int n)
2819 {
2820   unsigned char c;
2821   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2822   unsigned int flags  = str->flags;
2823   unsigned int ch     = str->ch;
2824   eol_type_t eol_type = str->eol_type;
2825
2826   while (n--)
2827     {
2828       c = *src++;
2829
2830       if (ch)
2831         {
2832           /* Previous character was first byte of Shift-JIS Kanji char. */
2833           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2834             {
2835               unsigned char e1, e2;
2836
2837               DECODE_SJIS (ch, c, e1, e2);
2838 #ifdef UTF2000
2839               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2840                                             e1 & 0x7F,
2841                                             e2 & 0x7F), dst);
2842 #else
2843               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2844               Dynarr_add (dst, e1);
2845               Dynarr_add (dst, e2);
2846 #endif
2847             }
2848           else
2849             {
2850               DECODE_ADD_BINARY_CHAR (ch, dst);
2851               DECODE_ADD_BINARY_CHAR (c, dst);
2852             }
2853           ch = 0;
2854         }
2855       else
2856         {
2857           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2858           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2859             ch = c;
2860           else if (BYTE_SJIS_KATAKANA_P (c))
2861             {
2862 #ifdef UTF2000
2863               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2864                                             c & 0x7F, 0), dst);
2865 #else
2866               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2867               Dynarr_add (dst, c);
2868 #endif
2869             }
2870           else
2871             DECODE_ADD_BINARY_CHAR (c, dst);
2872         }
2873     label_continue_loop:;
2874     }
2875
2876   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2877
2878   str->flags = flags;
2879   str->ch    = ch;
2880 }
2881
2882 /* Convert internally-formatted data to Shift-JIS. */
2883
2884 static void
2885 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2886                     unsigned_char_dynarr *dst, unsigned int n)
2887 {
2888   unsigned char c;
2889   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2890   unsigned int flags  = str->flags;
2891   unsigned int ch     = str->ch;
2892   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2893 #ifdef UTF2000
2894   unsigned char char_boundary = str->iso2022.current_char_boundary;
2895 #endif
2896
2897   while (n--)
2898     {
2899       c = *src++;
2900 #ifdef UTF2000
2901       switch (char_boundary)
2902         {
2903         case 0:
2904           if ( c >= 0xfc )
2905             {
2906               ch = c & 0x01;
2907               char_boundary = 5;
2908             }
2909           else if ( c >= 0xf8 )
2910             {
2911               ch = c & 0x03;
2912               char_boundary = 4;
2913             }
2914           else if ( c >= 0xf0 )
2915             {
2916               ch = c & 0x07;
2917               char_boundary = 3;
2918             }
2919           else if ( c >= 0xe0 )
2920             {
2921               ch = c & 0x0f;
2922               char_boundary = 2;
2923             }
2924           else if ( c >= 0xc0 )
2925             {
2926               ch = c & 0x1f;
2927               char_boundary = 1;
2928             }
2929           else
2930             {
2931               ch = 0;
2932               if (c == '\n')
2933                 {
2934                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2935                     Dynarr_add (dst, '\r');
2936                   if (eol_type != EOL_CR)
2937                     Dynarr_add (dst, c);
2938                 }
2939               else
2940                 Dynarr_add (dst, c);
2941               char_boundary = 0;
2942             }
2943           break;
2944         case 1:
2945           ch = ( ch << 6 ) | ( c & 0x3f );
2946           {
2947             Lisp_Object charset;
2948             unsigned int c1, c2, s1, s2;
2949
2950             BREAKUP_CHAR (ch, charset, c1, c2);
2951             if (EQ(charset, Vcharset_katakana_jisx0201))
2952               {
2953                 Dynarr_add (dst, c1 | 0x80);
2954               }
2955             else if (EQ(charset, Vcharset_japanese_jisx0208))
2956               {
2957                 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2958                 Dynarr_add (dst, s1);
2959                 Dynarr_add (dst, s2);
2960               }
2961           }
2962           char_boundary = 0;
2963           break;
2964         default:
2965           ch = ( ch << 6 ) | ( c & 0x3f );
2966           char_boundary--;
2967         }
2968 #else
2969       if (c == '\n')
2970         {
2971           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2972             Dynarr_add (dst, '\r');
2973           if (eol_type != EOL_CR)
2974             Dynarr_add (dst, '\n');
2975           ch = 0;
2976         }
2977       else if (BYTE_ASCII_P (c))
2978         {
2979           Dynarr_add (dst, c);
2980           ch = 0;
2981         }
2982       else if (BUFBYTE_LEADING_BYTE_P (c))
2983         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2984               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2985               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2986       else if (ch)
2987         {
2988           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2989             {
2990               Dynarr_add (dst, c);
2991               ch = 0;
2992             }
2993           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2994                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2995             ch = c;
2996           else
2997             {
2998               unsigned char j1, j2;
2999               ENCODE_SJIS (ch, c, j1, j2);
3000               Dynarr_add (dst, j1);
3001               Dynarr_add (dst, j2);
3002               ch = 0;
3003             }
3004         }
3005 #endif
3006     }
3007
3008   str->flags = flags;
3009   str->ch    = ch;
3010 #ifdef UTF2000
3011   str->iso2022.current_char_boundary = char_boundary;
3012 #endif
3013 }
3014
3015 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3016 Decode a JISX0208 character of Shift-JIS coding-system.
3017 CODE is the character code in Shift-JIS as a cons of type bytes.
3018 Return the corresponding character.
3019 */
3020        (code))
3021 {
3022   unsigned char c1, c2, s1, s2;
3023
3024   CHECK_CONS (code);
3025   CHECK_INT (XCAR (code));
3026   CHECK_INT (XCDR (code));
3027   s1 = XINT (XCAR (code));
3028   s2 = XINT (XCDR (code));
3029   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3030       BYTE_SJIS_TWO_BYTE_2_P (s2))
3031     {
3032       DECODE_SJIS (s1, s2, c1, c2);
3033       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3034                                    c1 & 0x7F, c2 & 0x7F));
3035     }
3036   else
3037     return Qnil;
3038 }
3039
3040 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3041 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3042 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3043 */
3044        (ch))
3045 {
3046   Lisp_Object charset;
3047   int c1, c2, s1, s2;
3048
3049   CHECK_CHAR_COERCE_INT (ch);
3050   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3051   if (EQ (charset, Vcharset_japanese_jisx0208))
3052     {
3053       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3054       return Fcons (make_int (s1), make_int (s2));
3055     }
3056   else
3057     return Qnil;
3058 }
3059
3060 \f
3061 /************************************************************************/
3062 /*                            Big5 methods                              */
3063 /************************************************************************/
3064
3065 /* BIG5 is a coding system encoding two character sets: ASCII and
3066    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3067    character set and is encoded in two-byte.
3068
3069    --- CODE RANGE of BIG5 ---
3070    (character set)      (range)
3071    ASCII                0x00 .. 0x7F
3072    Big5 (1st byte)      0xA1 .. 0xFE
3073         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3074    --------------------------
3075
3076    Since the number of characters in Big5 is larger than maximum
3077    characters in Emacs' charset (96x96), it can't be handled as one
3078    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3079    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3080    contains frequently used characters and the latter contains less
3081    frequently used characters.  */
3082
3083 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3084   ((c) >= 0xA1 && (c) <= 0xFE)
3085
3086 /* Is this the second byte of a Shift-JIS two-byte char? */
3087
3088 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3089   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3090
3091 /* Number of Big5 characters which have the same code in 1st byte.  */
3092
3093 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3094
3095 /* Code conversion macros.  These are macros because they are used in
3096    inner loops during code conversion.
3097
3098    Note that temporary variables in macros introduce the classic
3099    dynamic-scoping problems with variable names.  We use capital-
3100    lettered variables in the assumption that XEmacs does not use
3101    capital letters in variables except in a very formalized way
3102    (e.g. Qstring). */
3103
3104 /* Convert Big5 code (b1, b2) into its internal string representation
3105    (lb, c1, c2). */
3106
3107 /* There is a much simpler way to split the Big5 charset into two.
3108    For the moment I'm going to leave the algorithm as-is because it
3109    claims to separate out the most-used characters into a single
3110    charset, which perhaps will lead to optimizations in various
3111    places.
3112
3113    The way the algorithm works is something like this:
3114
3115    Big5 can be viewed as a 94x157 charset, where the row is
3116    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3117    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3118    the split between low and high column numbers is apparently
3119    meaningless; ascending rows produce less and less frequent chars.
3120    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3121    the first charset, and the upper half (0xC9 .. 0xFE) to the
3122    second.  To do the conversion, we convert the character into
3123    a single number where 0 .. 156 is the first row, 157 .. 313
3124    is the second, etc.  That way, the characters are ordered by
3125    decreasing frequency.  Then we just chop the space in two
3126    and coerce the result into a 94x94 space.
3127    */
3128
3129 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3130 {                                                                       \
3131   int B1 = b1, B2 = b2;                                                 \
3132   unsigned int I                                                        \
3133     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3134                                                                         \
3135   if (B1 < 0xC9)                                                        \
3136     {                                                                   \
3137       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3138     }                                                                   \
3139   else                                                                  \
3140     {                                                                   \
3141       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3142       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3143     }                                                                   \
3144   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3145   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3146 } while (0)
3147
3148 /* Convert the internal string representation of a Big5 character
3149    (lb, c1, c2) into Big5 code (b1, b2). */
3150
3151 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3152 {                                                                       \
3153   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3154                                                                         \
3155   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3156     {                                                                   \
3157       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3158     }                                                                   \
3159   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3160   b2 = I % BIG5_SAME_ROW;                                               \
3161   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3162 } while (0)
3163
3164 static int
3165 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3166                     unsigned int n)
3167 {
3168   int c;
3169
3170   while (n--)
3171     {
3172       c = *src++;
3173       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3174           (c >= 0x80 && c <= 0xA0))
3175         return 0;
3176       if (st->big5.in_second_byte)
3177         {
3178           st->big5.in_second_byte = 0;
3179           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3180             return 0;
3181         }
3182       else if (c >= 0xA1)
3183         st->big5.in_second_byte = 1;
3184     }
3185   return CODING_CATEGORY_BIG5_MASK;
3186 }
3187
3188 /* Convert Big5 data to internal format. */
3189
3190 static void
3191 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3192                     unsigned_char_dynarr *dst, unsigned int n)
3193 {
3194   unsigned char c;
3195   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3196   unsigned int flags  = str->flags;
3197   unsigned int ch     = str->ch;
3198   eol_type_t eol_type = str->eol_type;
3199
3200   while (n--)
3201     {
3202       c = *src++;
3203       if (ch)
3204         {
3205           /* Previous character was first byte of Big5 char. */
3206           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3207             {
3208               unsigned char b1, b2, b3;
3209               DECODE_BIG5 (ch, c, b1, b2, b3);
3210               Dynarr_add (dst, b1);
3211               Dynarr_add (dst, b2);
3212               Dynarr_add (dst, b3);
3213             }
3214           else
3215             {
3216               DECODE_ADD_BINARY_CHAR (ch, dst);
3217               DECODE_ADD_BINARY_CHAR (c, dst);
3218             }
3219           ch = 0;
3220         }
3221       else
3222         {
3223           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3224           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3225             ch = c;
3226           else
3227             DECODE_ADD_BINARY_CHAR (c, dst);
3228         }
3229     label_continue_loop:;
3230     }
3231
3232   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3233
3234   str->flags = flags;
3235   str->ch    = ch;
3236 }
3237
3238 /* Convert internally-formatted data to Big5. */
3239
3240 static void
3241 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3242                     unsigned_char_dynarr *dst, unsigned int n)
3243 {
3244   unsigned char c;
3245   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3246   unsigned int flags  = str->flags;
3247   unsigned int ch     = str->ch;
3248   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3249
3250   while (n--)
3251     {
3252       c = *src++;
3253       if (c == '\n')
3254         {
3255           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3256             Dynarr_add (dst, '\r');
3257           if (eol_type != EOL_CR)
3258             Dynarr_add (dst, '\n');
3259         }
3260       else if (BYTE_ASCII_P (c))
3261         {
3262           /* ASCII. */
3263           Dynarr_add (dst, c);
3264         }
3265       else if (BUFBYTE_LEADING_BYTE_P (c))
3266         {
3267           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3268               c == LEADING_BYTE_CHINESE_BIG5_2)
3269             {
3270               /* A recognized leading byte. */
3271               ch = c;
3272               continue; /* not done with this character. */
3273             }
3274           /* otherwise just ignore this character. */
3275         }
3276       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3277                ch == LEADING_BYTE_CHINESE_BIG5_2)
3278         {
3279           /* Previous char was a recognized leading byte. */
3280           ch = (ch << 8) | c;
3281           continue; /* not done with this character. */
3282         }
3283       else if (ch)
3284         {
3285           /* Encountering second byte of a Big5 character. */
3286           unsigned char b1, b2;
3287
3288           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3289           Dynarr_add (dst, b1);
3290           Dynarr_add (dst, b2);
3291         }
3292
3293       ch = 0;
3294     }
3295
3296   str->flags = flags;
3297   str->ch    = ch;
3298 }
3299
3300
3301 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3302 Decode a Big5 character CODE of BIG5 coding-system.
3303 CODE is the character code in BIG5, a cons of two integers.
3304 Return the corresponding character.
3305 */
3306        (code))
3307 {
3308   unsigned char c1, c2, b1, b2;
3309
3310   CHECK_CONS (code);
3311   CHECK_INT (XCAR (code));
3312   CHECK_INT (XCDR (code));
3313   b1 = XINT (XCAR (code));
3314   b2 = XINT (XCDR (code));
3315   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3316       BYTE_BIG5_TWO_BYTE_2_P (b2))
3317     {
3318       int leading_byte;
3319       Lisp_Object charset;
3320       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3321       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3322       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3323     }
3324   else
3325     return Qnil;
3326 }
3327
3328 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3329 Encode the Big5 character CH to BIG5 coding-system.
3330 Return the corresponding character code in Big5.
3331 */
3332        (ch))
3333 {
3334   Lisp_Object charset;
3335   int c1, c2, b1, b2;
3336
3337   CHECK_CHAR_COERCE_INT (ch);
3338   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3339   if (EQ (charset, Vcharset_chinese_big5_1) ||
3340       EQ (charset, Vcharset_chinese_big5_2))
3341     {
3342       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3343                    b1, b2);
3344       return Fcons (make_int (b1), make_int (b2));
3345     }
3346   else
3347     return Qnil;
3348 }
3349
3350 \f
3351 /************************************************************************/
3352 /*                           UCS-4 methods                              */
3353 /*                                                                      */
3354 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3355 /*                                                                      */
3356 /************************************************************************/
3357
3358 Lisp_Object ucs_to_mule_table[65536];
3359 Lisp_Object mule_to_ucs_table;
3360
3361 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3362 Map UCS-4 code CODE to Mule character CHARACTER.
3363
3364 Return T on success, NIL on failure.
3365 */
3366        (code, character))
3367 {
3368   unsigned int c;
3369
3370   CHECK_CHAR (character);
3371   CHECK_INT (code);
3372   c = XINT (code);
3373
3374   if (c < sizeof (ucs_to_mule_table))
3375     {
3376       ucs_to_mule_table[c] = character;
3377       return Qt;
3378     }
3379   else
3380     return Qnil;
3381 }
3382
3383 static Lisp_Object
3384 ucs_to_char (unsigned long code)
3385 {
3386   if (code < sizeof (ucs_to_mule_table))
3387     {
3388       return ucs_to_mule_table[code];
3389     }
3390   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3391     {
3392       unsigned int c;
3393
3394       code -= 0xe00000;
3395       c = code % (94 * 94);
3396       return make_char
3397         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3398                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3399                      CHARSET_LEFT_TO_RIGHT),
3400                     c / 94 + 33, c % 94 + 33));
3401     }
3402   else
3403     return Qnil;
3404 }
3405
3406 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3407 Return Mule character corresponding to UCS code CODE (a positive integer).
3408 */
3409        (code))
3410 {
3411   CHECK_NATNUM (code);
3412   return ucs_to_char (XINT (code));
3413 }
3414
3415 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3416 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3417 */
3418        (character, code))
3419 {
3420   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3421           Fset_char_ucs is more restrictive on index arg, but should
3422           check code arg in a char_table method. */
3423   CHECK_CHAR (character);
3424   CHECK_NATNUM (code);
3425   return Fput_char_table (character, code, mule_to_ucs_table);
3426 }
3427
3428 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3429 Return the UCS code (a positive integer) corresponding to CHARACTER.
3430 */
3431        (character))
3432 {
3433   return Fget_char_table (character, mule_to_ucs_table);
3434 }
3435
3436 #ifdef UTF2000
3437 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3438 #else
3439 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3440    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3441    is not found, instead.
3442    #### do something more appropriate (use blob?)
3443         Danger, Will Robinson!  Data loss.  Should we signal user? */
3444 static void
3445 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3446 {
3447   Lisp_Object chr = ucs_to_char (ch);
3448
3449   if (! NILP (chr))
3450     {
3451       Bufbyte work[MAX_EMCHAR_LEN];
3452       int len;
3453
3454       ch = XCHAR (chr);
3455       len = (ch < 128) ?
3456         simple_set_charptr_emchar (work, ch) :
3457         non_ascii_set_charptr_emchar (work, ch);
3458       Dynarr_add_many (dst, work, len);
3459     }
3460   else
3461     {
3462       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3463       Dynarr_add (dst, 34 + 128);
3464       Dynarr_add (dst, 46 + 128);
3465     }
3466 }
3467 #endif
3468
3469 static unsigned long
3470 mule_char_to_ucs4 (Lisp_Object charset,
3471                    unsigned char h, unsigned char l)
3472 {
3473   Lisp_Object code
3474     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3475                        mule_to_ucs_table);
3476
3477   if (INTP (code))
3478     {
3479       return XINT (code);
3480     }
3481   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3482             (XCHARSET_CHARS (charset) == 94) )
3483     {
3484       unsigned char final = XCHARSET_FINAL (charset);
3485
3486       if ( ('@' <= final) && (final < 0x7f) )
3487         {
3488           return 0xe00000 + (final - '@') * 94 * 94
3489             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3490         }
3491       else
3492         {
3493           return '?';
3494         }
3495     }
3496   else
3497     {
3498       return '?';
3499     }
3500 }
3501
3502 static void
3503 encode_ucs4 (Lisp_Object charset,
3504              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3505 {
3506   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3507   Dynarr_add (dst,  code >> 24);
3508   Dynarr_add (dst, (code >> 16) & 255);
3509   Dynarr_add (dst, (code >>  8) & 255);
3510   Dynarr_add (dst,  code        & 255);
3511 }
3512
3513 static int
3514 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3515                     unsigned int n)
3516 {
3517   while (n--)
3518     {
3519       int c = *src++;
3520       switch (st->ucs4.in_byte)
3521         {
3522         case 0:
3523           if (c >= 128)
3524             return 0;
3525           else
3526             st->ucs4.in_byte++;
3527           break;
3528         case 3:
3529           st->ucs4.in_byte = 0;
3530           break;
3531         default:
3532           st->ucs4.in_byte++;
3533         }
3534     }
3535   return CODING_CATEGORY_UCS4_MASK;
3536 }
3537
3538 static void
3539 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3540                     unsigned_char_dynarr *dst, unsigned int n)
3541 {
3542   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3543   unsigned int flags = str->flags;
3544   unsigned int ch    = str->ch;
3545   unsigned char counter = str->counter;
3546
3547   while (n--)
3548     {
3549       unsigned char c = *src++;
3550       switch (counter)
3551         {
3552         case 0:
3553           ch = c;
3554           counter = 3;
3555           break;
3556         case 1:
3557           decode_ucs4 ( ( ch << 8 ) | c, dst);
3558           ch = 0;
3559           counter = 0;
3560           break;
3561         default:
3562           ch = ( ch << 8 ) | c;
3563           counter--;
3564         }
3565     }
3566   if (counter & CODING_STATE_END)
3567     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3568
3569   str->flags = flags;
3570   str->ch    = ch;
3571   str->counter = counter;
3572 }
3573
3574 static void
3575 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3576                     unsigned_char_dynarr *dst, unsigned int n)
3577 {
3578   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3579   unsigned int flags = str->flags;
3580   unsigned int ch = str->ch;
3581   unsigned char char_boundary = str->iso2022.current_char_boundary;
3582   Lisp_Object charset = str->iso2022.current_charset;
3583
3584 #ifdef ENABLE_COMPOSITE_CHARS
3585   /* flags for handling composite chars.  We do a little switcharoo
3586      on the source while we're outputting the composite char. */
3587   unsigned int saved_n = 0;
3588   CONST unsigned char *saved_src = NULL;
3589   int in_composite = 0;
3590
3591  back_to_square_n:
3592 #endif
3593
3594   while (n--)
3595     {
3596       unsigned char c = *src++;
3597
3598       if (BYTE_ASCII_P (c))
3599         {               /* Processing ASCII character */
3600           ch = 0;
3601           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3602           char_boundary = 1;
3603         }
3604       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3605         { /* Processing Leading Byte */
3606           ch = 0;
3607           charset = CHARSET_BY_LEADING_BYTE (c);
3608           if (LEADING_BYTE_PREFIX_P(c))
3609             ch = c;
3610           char_boundary = 0;
3611         }
3612       else
3613         {                       /* Processing Non-ASCII character */
3614           char_boundary = 1;
3615           if (EQ (charset, Vcharset_control_1))
3616             {
3617               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3618             }
3619           else
3620             {
3621               switch (XCHARSET_REP_BYTES (charset))
3622                 {
3623                 case 2:
3624                   encode_ucs4 (charset, c, 0, dst);
3625                   break;
3626                 case 3:
3627                   if (XCHARSET_PRIVATE_P (charset))
3628                     {
3629                       encode_ucs4 (charset, c, 0, dst);
3630                       ch = 0;
3631                     }
3632                   else if (ch)
3633                     {
3634 #ifdef ENABLE_COMPOSITE_CHARS
3635                       if (EQ (charset, Vcharset_composite))
3636                         {
3637                           if (in_composite)
3638                             {
3639                               /* #### Bother! We don't know how to
3640                                  handle this yet. */
3641                               Dynarr_add (dst, 0);
3642                               Dynarr_add (dst, 0);
3643                               Dynarr_add (dst, 0);
3644                               Dynarr_add (dst, '~');
3645                             }
3646                           else
3647                             {
3648                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3649                                                        ch & 0x7F, c & 0x7F);
3650                               Lisp_Object lstr = composite_char_string (emch);
3651                               saved_n = n;
3652                               saved_src = src;
3653                               in_composite = 1;
3654                               src = XSTRING_DATA   (lstr);
3655                               n   = XSTRING_LENGTH (lstr);
3656                             }
3657                         }
3658                       else
3659 #endif /* ENABLE_COMPOSITE_CHARS */
3660                         {
3661                           encode_ucs4(charset, ch, c, dst);
3662                         }
3663                       ch = 0;
3664                     }
3665                   else
3666                     {
3667                       ch = c;
3668                       char_boundary = 0;
3669                     }
3670                   break;
3671                 case 4:
3672                   if (ch)
3673                     {
3674                       encode_ucs4 (charset, ch, c, dst);
3675                       ch = 0;
3676                     }
3677                   else
3678                     {
3679                       ch = c;
3680                       char_boundary = 0;
3681                     }
3682                   break;
3683                 default:
3684                   abort ();
3685                 }
3686             }
3687         }
3688     }
3689
3690 #ifdef ENABLE_COMPOSITE_CHARS
3691   if (in_composite)
3692     {
3693       n = saved_n;
3694       src = saved_src;
3695       in_composite = 0;
3696       goto back_to_square_n; /* Wheeeeeeeee ..... */
3697     }
3698 #endif /* ENABLE_COMPOSITE_CHARS */
3699
3700   str->flags = flags;
3701   str->ch = ch;
3702   str->iso2022.current_char_boundary = char_boundary;
3703   str->iso2022.current_charset = charset;
3704
3705   /* Verbum caro factum est! */
3706 }
3707
3708 \f
3709 /************************************************************************/
3710 /*                           UTF-8 methods                              */
3711 /************************************************************************/
3712
3713 static int
3714 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3715                     unsigned int n)
3716 {
3717   while (n--)
3718     {
3719       unsigned char c = *src++;
3720       switch (st->utf8.in_byte)
3721         {
3722         case 0:
3723           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3724             return 0;
3725           else if (c >= 0xfc)
3726             st->utf8.in_byte = 5;
3727           else if (c >= 0xf8)
3728             st->utf8.in_byte = 4;
3729           else if (c >= 0xf0)
3730             st->utf8.in_byte = 3;
3731           else if (c >= 0xe0)
3732             st->utf8.in_byte = 2;
3733           else if (c >= 0xc0)
3734             st->utf8.in_byte = 1;
3735           else if (c >= 0x80)
3736             return 0;
3737           break;
3738         default:
3739           if ((c & 0xc0) != 0x80)
3740             return 0;
3741           else
3742             st->utf8.in_byte--;
3743         }
3744     }
3745   return CODING_CATEGORY_UTF8_MASK;
3746 }
3747
3748 static void
3749 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3750                     unsigned_char_dynarr *dst, unsigned int n)
3751 {
3752   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3753   unsigned int flags  = str->flags;
3754   unsigned int ch     = str->ch;
3755   eol_type_t eol_type = str->eol_type;
3756   unsigned char counter = str->counter;
3757
3758   while (n--)
3759     {
3760       unsigned char c = *src++;
3761       switch (counter)
3762         {
3763         case 0:
3764           if ( c >= 0xfc )
3765             {
3766               ch = c & 0x01;
3767               counter = 5;
3768             }
3769           else if ( c >= 0xf8 )
3770             {
3771               ch = c & 0x03;
3772               counter = 4;
3773             }
3774           else if ( c >= 0xf0 )
3775             {
3776               ch = c & 0x07;
3777               counter = 3;
3778             }
3779           else if ( c >= 0xe0 )
3780             {
3781               ch = c & 0x0f;
3782               counter = 2;
3783             }
3784           else if ( c >= 0xc0 )
3785             {
3786               ch = c & 0x1f;
3787               counter = 1;
3788             }
3789           else
3790             {
3791               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3792               decode_ucs4 (c, dst);
3793             }
3794           break;
3795         case 1:
3796           ch = ( ch << 6 ) | ( c & 0x3f );
3797           decode_ucs4 (ch, dst);
3798           ch = 0;
3799           counter = 0;
3800           break;
3801         default:
3802           ch = ( ch << 6 ) | ( c & 0x3f );
3803           counter--;
3804         }
3805     label_continue_loop:;
3806     }
3807
3808   if (flags & CODING_STATE_END)
3809     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3810
3811   str->flags = flags;
3812   str->ch    = ch;
3813   str->counter = counter;
3814 }
3815
3816 #ifndef UTF2000
3817 static void
3818 encode_utf8 (Lisp_Object charset,
3819              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3820 {
3821   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3822   if ( code <= 0x7f )
3823     {
3824       Dynarr_add (dst, code);
3825     }
3826   else if ( code <= 0x7ff )
3827     {
3828       Dynarr_add (dst, (code >> 6) | 0xc0);
3829       Dynarr_add (dst, (code & 0x3f) | 0x80);
3830     }
3831   else if ( code <= 0xffff )
3832     {
3833       Dynarr_add (dst,  (code >> 12) | 0xe0);
3834       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3835       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3836     }
3837   else if ( code <= 0x1fffff )
3838     {
3839       Dynarr_add (dst,  (code >> 18) | 0xf0);
3840       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3841       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3842       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3843     }
3844   else if ( code <= 0x3ffffff )
3845     {
3846       Dynarr_add (dst,  (code >> 24) | 0xf8);
3847       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3848       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3849       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3850       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3851     }
3852   else
3853     {
3854       Dynarr_add (dst,  (code >> 30) | 0xfc);
3855       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3856       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3857       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3858       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3859       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3860     }
3861 }
3862 #endif
3863
3864 static void
3865 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3866                     unsigned_char_dynarr *dst, unsigned int n)
3867 {
3868   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3869   unsigned int flags  = str->flags;
3870   unsigned int ch     = str->ch;
3871   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3872   unsigned char char_boundary = str->iso2022.current_char_boundary;
3873 #ifdef UTF2000
3874
3875   while (n--)
3876     {
3877       unsigned char c = *src++;
3878       switch (char_boundary)
3879         {
3880         case 0:
3881           if ( c >= 0xfc )
3882             {
3883               Dynarr_add (dst, c);
3884               char_boundary = 5;
3885             }
3886           else if ( c >= 0xf8 )
3887             {
3888               Dynarr_add (dst, c);
3889               char_boundary = 4;
3890             }
3891           else if ( c >= 0xf0 )
3892             {
3893               Dynarr_add (dst, c);
3894               char_boundary = 3;
3895             }
3896           else if ( c >= 0xe0 )
3897             {
3898               Dynarr_add (dst, c);
3899               char_boundary = 2;
3900             }
3901           else if ( c >= 0xc0 )
3902             {
3903               Dynarr_add (dst, c);
3904               char_boundary = 1;
3905             }
3906           else
3907             {
3908               if (c == '\n')
3909                 {
3910                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3911                     Dynarr_add (dst, '\r');
3912                   if (eol_type != EOL_CR)
3913                     Dynarr_add (dst, c);
3914                 }
3915               else
3916                 Dynarr_add (dst, c);
3917               char_boundary = 0;
3918             }
3919           break;
3920         case 1:
3921           Dynarr_add (dst, c);
3922           char_boundary = 0;
3923           break;
3924         default:
3925           Dynarr_add (dst, c);
3926           char_boundary--;
3927         }
3928     }
3929 #else /* not UTF2000 */
3930   Lisp_Object charset = str->iso2022.current_charset;
3931
3932 #ifdef ENABLE_COMPOSITE_CHARS
3933   /* flags for handling composite chars.  We do a little switcharoo
3934      on the source while we're outputting the composite char. */
3935   unsigned int saved_n = 0;
3936   CONST unsigned char *saved_src = NULL;
3937   int in_composite = 0;
3938
3939  back_to_square_n:
3940 #endif /* ENABLE_COMPOSITE_CHARS */
3941
3942   while (n--)
3943     {
3944       unsigned char c = *src++;
3945
3946       if (BYTE_ASCII_P (c))
3947         {               /* Processing ASCII character */
3948           ch = 0;
3949           if (c == '\n')
3950             {
3951               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3952                 Dynarr_add (dst, '\r');
3953               if (eol_type != EOL_CR)
3954                 Dynarr_add (dst, c);
3955             }
3956           else
3957             encode_utf8 (Vcharset_ascii, c, 0, dst);
3958           char_boundary = 1;
3959         }
3960       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3961         { /* Processing Leading Byte */
3962           ch = 0;
3963           charset = CHARSET_BY_LEADING_BYTE (c);
3964           if (LEADING_BYTE_PREFIX_P(c))
3965             ch = c;
3966           char_boundary = 0;
3967         }
3968       else
3969         {                       /* Processing Non-ASCII character */
3970           char_boundary = 1;
3971           if (EQ (charset, Vcharset_control_1))
3972             {
3973               encode_utf8 (Vcharset_control_1, c, 0, dst);
3974             }
3975           else
3976             {
3977               switch (XCHARSET_REP_BYTES (charset))
3978                 {
3979                 case 2:
3980                   encode_utf8 (charset, c, 0, dst);
3981                   break;
3982                 case 3:
3983                   if (XCHARSET_PRIVATE_P (charset))
3984                     {
3985                       encode_utf8 (charset, c, 0, dst);
3986                       ch = 0;
3987                     }
3988                   else if (ch)
3989                     {
3990 #ifdef ENABLE_COMPOSITE_CHARS
3991                       if (EQ (charset, Vcharset_composite))
3992                         {
3993                           if (in_composite)
3994                             {
3995                               /* #### Bother! We don't know how to
3996                                  handle this yet. */
3997                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
3998                             }
3999                           else
4000                             {
4001                               Emchar emch = MAKE_CHAR (Vcharset_composite,
4002                                                        ch & 0x7F, c & 0x7F);
4003                               Lisp_Object lstr = composite_char_string (emch);
4004                               saved_n = n;
4005                               saved_src = src;
4006                               in_composite = 1;
4007                               src = XSTRING_DATA   (lstr);
4008                               n   = XSTRING_LENGTH (lstr);
4009                             }
4010                         }
4011                       else
4012 #endif /* ENABLE_COMPOSITE_CHARS */
4013                         {
4014                           encode_utf8 (charset, ch, c, dst);
4015                         }
4016                       ch = 0;
4017                     }
4018                   else
4019                     {
4020                       ch = c;
4021                       char_boundary = 0;
4022                     }
4023                   break;
4024                 case 4:
4025                   if (ch)
4026                     {
4027                       encode_utf8 (charset, ch, c, dst);
4028                       ch = 0;
4029                     }
4030                   else
4031                     {
4032                       ch = c;
4033                       char_boundary = 0;
4034                     }
4035                   break;
4036                 default:
4037                   abort ();
4038                 }
4039             }
4040         }
4041     }
4042
4043 #ifdef ENABLE_COMPOSITE_CHARS
4044   if (in_composite)
4045     {
4046       n = saved_n;
4047       src = saved_src;
4048       in_composite = 0;
4049       goto back_to_square_n; /* Wheeeeeeeee ..... */
4050     }
4051 #endif
4052
4053 #endif /* not UTF2000 */
4054   str->flags = flags;
4055   str->ch    = ch;
4056   str->iso2022.current_char_boundary = char_boundary;
4057 #ifndef UTF2000
4058   str->iso2022.current_charset = charset;
4059 #endif
4060
4061   /* Verbum caro factum est! */
4062 }
4063
4064 \f
4065 /************************************************************************/
4066 /*                           ISO2022 methods                            */
4067 /************************************************************************/
4068
4069 /* The following note describes the coding system ISO2022 briefly.
4070    Since the intention of this note is to help understand the
4071    functions in this file, some parts are NOT ACCURATE or OVERLY
4072    SIMPLIFIED.  For thorough understanding, please refer to the
4073    original document of ISO2022.
4074
4075    ISO2022 provides many mechanisms to encode several character sets
4076    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4077    is encoded using bytes less than 128.  This may make the encoded
4078    text a little bit longer, but the text passes more easily through
4079    several gateways, some of which strip off MSB (Most Signigant Bit).
4080
4081    There are two kinds of character sets: control character set and
4082    graphic character set.  The former contains control characters such
4083    as `newline' and `escape' to provide control functions (control
4084    functions are also provided by escape sequences).  The latter
4085    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4086    two control character sets and many graphic character sets.
4087
4088    Graphic character sets are classified into one of the following
4089    four classes, according to the number of bytes (DIMENSION) and
4090    number of characters in one dimension (CHARS) of the set:
4091    - DIMENSION1_CHARS94
4092    - DIMENSION1_CHARS96
4093    - DIMENSION2_CHARS94
4094    - DIMENSION2_CHARS96
4095
4096    In addition, each character set is assigned an identification tag,
4097    unique for each set, called "final character" (denoted as <F>
4098    hereafter).  The <F> of each character set is decided by ECMA(*)
4099    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4100    (0x30..0x3F are for private use only).
4101
4102    Note (*): ECMA = European Computer Manufacturers Association
4103
4104    Here are examples of graphic character set [NAME(<F>)]:
4105         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4106         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4107         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4108         o DIMENSION2_CHARS96 -- none for the moment
4109
4110    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4111         C0 [0x00..0x1F] -- control character plane 0
4112         GL [0x20..0x7F] -- graphic character plane 0
4113         C1 [0x80..0x9F] -- control character plane 1
4114         GR [0xA0..0xFF] -- graphic character plane 1
4115
4116    A control character set is directly designated and invoked to C0 or
4117    C1 by an escape sequence.  The most common case is that:
4118    - ISO646's  control character set is designated/invoked to C0, and
4119    - ISO6429's control character set is designated/invoked to C1,
4120    and usually these designations/invocations are omitted in encoded
4121    text.  In a 7-bit environment, only C0 can be used, and a control
4122    character for C1 is encoded by an appropriate escape sequence to
4123    fit into the environment.  All control characters for C1 are
4124    defined to have corresponding escape sequences.
4125
4126    A graphic character set is at first designated to one of four
4127    graphic registers (G0 through G3), then these graphic registers are
4128    invoked to GL or GR.  These designations and invocations can be
4129    done independently.  The most common case is that G0 is invoked to
4130    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4131    these invocations and designations are omitted in encoded text.
4132    In a 7-bit environment, only GL can be used.
4133
4134    When a graphic character set of CHARS94 is invoked to GL, codes
4135    0x20 and 0x7F of the GL area work as control characters SPACE and
4136    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4137    be used.
4138
4139    There are two ways of invocation: locking-shift and single-shift.
4140    With locking-shift, the invocation lasts until the next different
4141    invocation, whereas with single-shift, the invocation affects the
4142    following character only and doesn't affect the locking-shift
4143    state.  Invocations are done by the following control characters or
4144    escape sequences:
4145
4146    ----------------------------------------------------------------------
4147    abbrev  function                  cntrl escape seq   description
4148    ----------------------------------------------------------------------
4149    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4150    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4151    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4152    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4153    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4154    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4155    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4156    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4157    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4158    ----------------------------------------------------------------------
4159    (*) These are not used by any known coding system.
4160
4161    Control characters for these functions are defined by macros
4162    ISO_CODE_XXX in `coding.h'.
4163
4164    Designations are done by the following escape sequences:
4165    ----------------------------------------------------------------------
4166    escape sequence      description
4167    ----------------------------------------------------------------------
4168    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4169    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4170    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4171    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4172    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4173    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4174    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4175    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4176    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4177    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4178    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4179    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4180    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4181    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4182    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4183    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4184    ----------------------------------------------------------------------
4185
4186    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4187    of dimension 1, chars 94, and final character <F>, etc...
4188
4189    Note (*): Although these designations are not allowed in ISO2022,
4190    Emacs accepts them on decoding, and produces them on encoding
4191    CHARS96 character sets in a coding system which is characterized as
4192    7-bit environment, non-locking-shift, and non-single-shift.
4193
4194    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4195    '(' can be omitted.  We refer to this as "short-form" hereafter.
4196
4197    Now you may notice that there are a lot of ways for encoding the
4198    same multilingual text in ISO2022.  Actually, there exist many
4199    coding systems such as Compound Text (used in X11's inter client
4200    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4201    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4202    localized platforms), and all of these are variants of ISO2022.
4203
4204    In addition to the above, Emacs handles two more kinds of escape
4205    sequences: ISO6429's direction specification and Emacs' private
4206    sequence for specifying character composition.
4207
4208    ISO6429's direction specification takes the following form:
4209         o CSI ']'      -- end of the current direction
4210         o CSI '0' ']'  -- end of the current direction
4211         o CSI '1' ']'  -- start of left-to-right text
4212         o CSI '2' ']'  -- start of right-to-left text
4213    The control character CSI (0x9B: control sequence introducer) is
4214    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4215
4216    Character composition specification takes the following form:
4217         o ESC '0' -- start character composition
4218         o ESC '1' -- end character composition
4219    Since these are not standard escape sequences of any ISO standard,
4220    their use with these meanings is restricted to Emacs only.  */
4221
4222 static void
4223 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4224 {
4225   int i;
4226
4227   for (i = 0; i < 4; i++)
4228     {
4229       if (!NILP (coding_system))
4230         iso->charset[i] =
4231           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4232       else
4233         iso->charset[i] = Qt;
4234       iso->invalid_designated[i] = 0;
4235     }
4236   iso->esc = ISO_ESC_NOTHING;
4237   iso->esc_bytes_index = 0;
4238   iso->register_left = 0;
4239   iso->register_right = 1;
4240   iso->switched_dir_and_no_valid_charset_yet = 0;
4241   iso->invalid_switch_dir = 0;
4242   iso->output_direction_sequence = 0;
4243   iso->output_literally = 0;
4244 #ifdef ENABLE_COMPOSITE_CHARS
4245   if (iso->composite_chars)
4246     Dynarr_reset (iso->composite_chars);
4247 #endif
4248 }
4249
4250 static int
4251 fit_to_be_escape_quoted (unsigned char c)
4252 {
4253   switch (c)
4254     {
4255     case ISO_CODE_ESC:
4256     case ISO_CODE_CSI:
4257     case ISO_CODE_SS2:
4258     case ISO_CODE_SS3:
4259     case ISO_CODE_SO:
4260     case ISO_CODE_SI:
4261       return 1;
4262
4263     default:
4264       return 0;
4265     }
4266 }
4267
4268 /* Parse one byte of an ISO2022 escape sequence.
4269    If the result is an invalid escape sequence, return 0 and
4270    do not change anything in STR.  Otherwise, if the result is
4271    an incomplete escape sequence, update ISO2022.ESC and
4272    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4273    all the state variables (but not ISO2022.ESC_BYTES) and
4274    return 1.
4275
4276    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4277    or invocation of an invalid character set and treat that as
4278    an unrecognized escape sequence. */
4279
4280 static int
4281 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4282                    unsigned char c, unsigned int *flags,
4283                    int check_invalid_charsets)
4284 {
4285   /* (1) If we're at the end of a designation sequence, CS is the
4286      charset being designated and REG is the register to designate
4287      it to.
4288
4289      (2) If we're at the end of a locking-shift sequence, REG is
4290      the register to invoke and HALF (0 == left, 1 == right) is
4291      the half to invoke it into.
4292
4293      (3) If we're at the end of a single-shift sequence, REG is
4294      the register to invoke. */
4295   Lisp_Object cs = Qnil;
4296   int reg, half;
4297
4298   /* NOTE: This code does goto's all over the fucking place.
4299      The reason for this is that we're basically implementing
4300      a state machine here, and hierarchical languages like C
4301      don't really provide a clean way of doing this. */
4302
4303   if (! (*flags & CODING_STATE_ESCAPE))
4304     /* At beginning of escape sequence; we need to reset our
4305        escape-state variables. */
4306     iso->esc = ISO_ESC_NOTHING;
4307
4308   iso->output_literally = 0;
4309   iso->output_direction_sequence = 0;
4310
4311   switch (iso->esc)
4312     {
4313     case ISO_ESC_NOTHING:
4314       iso->esc_bytes_index = 0;
4315       switch (c)
4316         {
4317         case ISO_CODE_ESC:      /* Start escape sequence */
4318           *flags |= CODING_STATE_ESCAPE;
4319           iso->esc = ISO_ESC;
4320           goto not_done;
4321
4322         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4323           *flags |= CODING_STATE_ESCAPE;
4324           iso->esc = ISO_ESC_5_11;
4325           goto not_done;
4326
4327         case ISO_CODE_SO:       /* locking shift 1 */
4328           reg = 1; half = 0;
4329           goto locking_shift;
4330         case ISO_CODE_SI:       /* locking shift 0 */
4331           reg = 0; half = 0;
4332           goto locking_shift;
4333
4334         case ISO_CODE_SS2:      /* single shift */
4335           reg = 2;
4336           goto single_shift;
4337         case ISO_CODE_SS3:      /* single shift */
4338           reg = 3;
4339           goto single_shift;
4340
4341         default:                        /* Other control characters */
4342           return 0;
4343         }
4344
4345     case ISO_ESC:
4346       switch (c)
4347         {
4348           /**** single shift ****/
4349
4350         case 'N':       /* single shift 2 */
4351           reg = 2;
4352           goto single_shift;
4353         case 'O':       /* single shift 3 */
4354           reg = 3;
4355           goto single_shift;
4356
4357           /**** locking shift ****/
4358
4359         case '~':       /* locking shift 1 right */
4360           reg = 1; half = 1;
4361           goto locking_shift;
4362         case 'n':       /* locking shift 2 */
4363           reg = 2; half = 0;
4364           goto locking_shift;
4365         case '}':       /* locking shift 2 right */
4366           reg = 2; half = 1;
4367           goto locking_shift;
4368         case 'o':       /* locking shift 3 */
4369           reg = 3; half = 0;
4370           goto locking_shift;
4371         case '|':       /* locking shift 3 right */
4372           reg = 3; half = 1;
4373           goto locking_shift;
4374
4375 #ifdef ENABLE_COMPOSITE_CHARS
4376           /**** composite ****/
4377
4378         case '0':
4379           iso->esc = ISO_ESC_START_COMPOSITE;
4380           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4381             CODING_STATE_COMPOSITE;
4382           return 1;
4383
4384         case '1':
4385           iso->esc = ISO_ESC_END_COMPOSITE;
4386           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4387             ~CODING_STATE_COMPOSITE;
4388           return 1;
4389 #endif /* ENABLE_COMPOSITE_CHARS */
4390
4391           /**** directionality ****/
4392
4393         case '[':
4394           iso->esc = ISO_ESC_5_11;
4395           goto not_done;
4396
4397           /**** designation ****/
4398
4399         case '$':       /* multibyte charset prefix */
4400           iso->esc = ISO_ESC_2_4;
4401           goto not_done;
4402
4403         default:
4404           if (0x28 <= c && c <= 0x2F)
4405             {
4406               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4407               goto not_done;
4408             }
4409
4410           /* This function is called with CODESYS equal to nil when
4411              doing coding-system detection. */
4412           if (!NILP (codesys)
4413               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4414               && fit_to_be_escape_quoted (c))
4415             {
4416               iso->esc = ISO_ESC_LITERAL;
4417               *flags &= CODING_STATE_ISO2022_LOCK;
4418               return 1;
4419             }
4420
4421           /* bzzzt! */
4422           return 0;
4423         }
4424
4425
4426
4427       /**** directionality ****/
4428
4429     case ISO_ESC_5_11:          /* ISO6429 direction control */
4430       if (c == ']')
4431         {
4432           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4433           goto directionality;
4434         }
4435       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4436       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4437       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4438       else               return 0;
4439       goto not_done;
4440
4441     case ISO_ESC_5_11_0:
4442       if (c == ']')
4443         {
4444           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4445           goto directionality;
4446         }
4447       return 0;
4448
4449     case ISO_ESC_5_11_1:
4450       if (c == ']')
4451         {
4452           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4453           goto directionality;
4454         }
4455       return 0;
4456
4457     case ISO_ESC_5_11_2:
4458       if (c == ']')
4459         {
4460           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4461           goto directionality;
4462         }
4463       return 0;
4464
4465     directionality:
4466       iso->esc = ISO_ESC_DIRECTIONALITY;
4467       /* Various junk here to attempt to preserve the direction sequences
4468          literally in the text if they would otherwise be swallowed due
4469          to invalid designations that don't show up as actual charset
4470          changes in the text. */
4471       if (iso->invalid_switch_dir)
4472         {
4473           /* We already inserted a direction switch literally into the
4474              text.  We assume (#### this may not be right) that the
4475              next direction switch is the one going the other way,
4476              and we need to output that literally as well. */
4477           iso->output_literally = 1;
4478           iso->invalid_switch_dir = 0;
4479         }
4480       else
4481         {
4482           int jj;
4483
4484           /* If we are in the thrall of an invalid designation,
4485            then stick the directionality sequence literally into the
4486            output stream so it ends up in the original text again. */
4487           for (jj = 0; jj < 4; jj++)
4488             if (iso->invalid_designated[jj])
4489               break;
4490           if (jj < 4)
4491             {
4492               iso->output_literally = 1;
4493               iso->invalid_switch_dir = 1;
4494             }
4495           else
4496             /* Indicate that we haven't yet seen a valid designation,
4497                so that if a switch-dir is directly followed by an
4498                invalid designation, both get inserted literally. */
4499             iso->switched_dir_and_no_valid_charset_yet = 1;
4500         }
4501       return 1;
4502
4503
4504       /**** designation ****/
4505
4506     case ISO_ESC_2_4:
4507       if (0x28 <= c && c <= 0x2F)
4508         {
4509           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4510           goto not_done;
4511         }
4512       if (0x40 <= c && c <= 0x42)
4513         {
4514           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4515                                       *flags & CODING_STATE_R2L ?
4516                                       CHARSET_RIGHT_TO_LEFT :
4517                                       CHARSET_LEFT_TO_RIGHT);
4518           reg = 0;
4519           goto designated;
4520         }
4521       return 0;
4522
4523     default:
4524       {
4525         int type =-1;
4526
4527         if (c < '0' || c > '~')
4528           return 0; /* bad final byte */
4529
4530         if (iso->esc >= ISO_ESC_2_8 &&
4531             iso->esc <= ISO_ESC_2_15)
4532           {
4533             type = ((iso->esc >= ISO_ESC_2_12) ?
4534                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4535             reg = (iso->esc - ISO_ESC_2_8) & 3;
4536           }
4537         else if (iso->esc >= ISO_ESC_2_4_8 &&
4538                  iso->esc <= ISO_ESC_2_4_15)
4539           {
4540             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4541                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4542             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4543           }
4544         else
4545           {
4546             /* Can this ever be reached? -slb */
4547             abort();
4548           }
4549
4550         cs = CHARSET_BY_ATTRIBUTES (type, c,
4551                                     *flags & CODING_STATE_R2L ?
4552                                     CHARSET_RIGHT_TO_LEFT :
4553                                     CHARSET_LEFT_TO_RIGHT);
4554         goto designated;
4555       }
4556     }
4557
4558  not_done:
4559   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4560   return -1;
4561
4562  single_shift:
4563   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4564     /* can't invoke something that ain't there. */
4565     return 0;
4566   iso->esc = ISO_ESC_SINGLE_SHIFT;
4567   *flags &= CODING_STATE_ISO2022_LOCK;
4568   if (reg == 2)
4569     *flags |= CODING_STATE_SS2;
4570   else
4571     *flags |= CODING_STATE_SS3;
4572   return 1;
4573
4574  locking_shift:
4575   if (check_invalid_charsets &&
4576       !CHARSETP (iso->charset[reg]))
4577     /* can't invoke something that ain't there. */
4578     return 0;
4579   if (half)
4580     iso->register_right = reg;
4581   else
4582     iso->register_left = reg;
4583   *flags &= CODING_STATE_ISO2022_LOCK;
4584   iso->esc = ISO_ESC_LOCKING_SHIFT;
4585   return 1;
4586
4587  designated:
4588   if (NILP (cs) && check_invalid_charsets)
4589     {
4590       iso->invalid_designated[reg] = 1;
4591       iso->charset[reg] = Vcharset_ascii;
4592       iso->esc = ISO_ESC_DESIGNATE;
4593       *flags &= CODING_STATE_ISO2022_LOCK;
4594       iso->output_literally = 1;
4595       if (iso->switched_dir_and_no_valid_charset_yet)
4596         {
4597           /* We encountered a switch-direction followed by an
4598              invalid designation.  Ensure that the switch-direction
4599              gets outputted; otherwise it will probably get eaten
4600              when the text is written out again. */
4601           iso->switched_dir_and_no_valid_charset_yet = 0;
4602           iso->output_direction_sequence = 1;
4603           /* And make sure that the switch-dir going the other
4604              way gets outputted, as well. */
4605           iso->invalid_switch_dir = 1;
4606         }
4607       return 1;
4608     }
4609   /* This function is called with CODESYS equal to nil when
4610      doing coding-system detection. */
4611   if (!NILP (codesys))
4612     {
4613       charset_conversion_spec_dynarr *dyn =
4614         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4615
4616       if (dyn)
4617         {
4618           int i;
4619
4620           for (i = 0; i < Dynarr_length (dyn); i++)
4621             {
4622               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4623               if (EQ (cs, spec->from_charset))
4624                 cs = spec->to_charset;
4625             }
4626         }
4627     }
4628
4629   iso->charset[reg] = cs;
4630   iso->esc = ISO_ESC_DESIGNATE;
4631   *flags &= CODING_STATE_ISO2022_LOCK;
4632   if (iso->invalid_designated[reg])
4633     {
4634       iso->invalid_designated[reg] = 0;
4635       iso->output_literally = 1;
4636     }
4637   if (iso->switched_dir_and_no_valid_charset_yet)
4638     iso->switched_dir_and_no_valid_charset_yet = 0;
4639   return 1;
4640 }
4641
4642 static int
4643 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4644                        unsigned int n)
4645 {
4646   int mask;
4647
4648   /* #### There are serious deficiencies in the recognition mechanism
4649      here.  This needs to be much smarter if it's going to cut it.
4650      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4651      it should be detected as Latin-1.
4652      All the ISO2022 stuff in this file should be synced up with the
4653      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4654      Perhaps we should wait till R2L works in FSF Emacs? */
4655
4656   if (!st->iso2022.initted)
4657     {
4658       reset_iso2022 (Qnil, &st->iso2022.iso);
4659       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4660                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4661                           CODING_CATEGORY_ISO_8_1_MASK |
4662                           CODING_CATEGORY_ISO_8_2_MASK |
4663                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4664       st->iso2022.flags = 0;
4665       st->iso2022.high_byte_count = 0;
4666       st->iso2022.saw_single_shift = 0;
4667       st->iso2022.initted = 1;
4668     }
4669
4670   mask = st->iso2022.mask;
4671
4672   while (n--)
4673     {
4674       int c = *src++;
4675       if (c >= 0xA0)
4676         {
4677           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4678           st->iso2022.high_byte_count++;
4679         }
4680       else
4681         {
4682           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4683             {
4684               if (st->iso2022.high_byte_count & 1)
4685                 /* odd number of high bytes; assume not iso-8-2 */
4686                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4687             }
4688           st->iso2022.high_byte_count = 0;
4689           st->iso2022.saw_single_shift = 0;
4690           if (c > 0x80)
4691             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4692         }
4693       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4694           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4695         { /* control chars */
4696           switch (c)
4697             {
4698               /* Allow and ignore control characters that you might
4699                  reasonably see in a text file */
4700             case '\r':
4701             case '\n':
4702             case '\t':
4703             case  7: /* bell */
4704             case  8: /* backspace */
4705             case 11: /* vertical tab */
4706             case 12: /* form feed */
4707             case 26: /* MS-DOS C-z junk */
4708             case 31: /* '^_' -- for info */
4709               goto label_continue_loop;
4710
4711             default:
4712               break;
4713             }
4714         }
4715
4716       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4717           || BYTE_C1_P (c))
4718         {
4719           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4720                                  &st->iso2022.flags, 0))
4721             {
4722               switch (st->iso2022.iso.esc)
4723                 {
4724                 case ISO_ESC_DESIGNATE:
4725                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4726                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4727                   break;
4728                 case ISO_ESC_LOCKING_SHIFT:
4729                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4730                   goto ran_out_of_chars;
4731                 case ISO_ESC_SINGLE_SHIFT:
4732                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4733                   st->iso2022.saw_single_shift = 1;
4734                   break;
4735                 default:
4736                   break;
4737                 }
4738             }
4739           else
4740             {
4741               mask = 0;
4742               goto ran_out_of_chars;
4743             }
4744         }
4745     label_continue_loop:;
4746     }
4747
4748  ran_out_of_chars:
4749
4750   return mask;
4751 }
4752
4753 static int
4754 postprocess_iso2022_mask (int mask)
4755 {
4756   /* #### kind of cheesy */
4757   /* If seven-bit ISO is allowed, then assume that the encoding is
4758      entirely seven-bit and turn off the eight-bit ones. */
4759   if (mask & CODING_CATEGORY_ISO_7_MASK)
4760     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4761                CODING_CATEGORY_ISO_8_1_MASK |
4762                CODING_CATEGORY_ISO_8_2_MASK);
4763   return mask;
4764 }
4765
4766 /* If FLAGS is a null pointer or specifies right-to-left motion,
4767    output a switch-dir-to-left-to-right sequence to DST.
4768    Also update FLAGS if it is not a null pointer.
4769    If INTERNAL_P is set, we are outputting in internal format and
4770    need to handle the CSI differently. */
4771
4772 static void
4773 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4774                                  unsigned_char_dynarr *dst,
4775                                  unsigned int *flags,
4776                                  int internal_p)
4777 {
4778   if (!flags || (*flags & CODING_STATE_R2L))
4779     {
4780       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4781         {
4782           Dynarr_add (dst, ISO_CODE_ESC);
4783           Dynarr_add (dst, '[');
4784         }
4785       else if (internal_p)
4786         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4787       else
4788         Dynarr_add (dst, ISO_CODE_CSI);
4789       Dynarr_add (dst, '0');
4790       Dynarr_add (dst, ']');
4791       if (flags)
4792         *flags &= ~CODING_STATE_R2L;
4793     }
4794 }
4795
4796 /* If FLAGS is a null pointer or specifies a direction different from
4797    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4798    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4799    sequence to DST.  Also update FLAGS if it is not a null pointer.
4800    If INTERNAL_P is set, we are outputting in internal format and
4801    need to handle the CSI differently. */
4802
4803 static void
4804 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4805                           unsigned_char_dynarr *dst, unsigned int *flags,
4806                           int internal_p)
4807 {
4808   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4809       direction == CHARSET_LEFT_TO_RIGHT)
4810     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4811   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4812            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4813            direction == CHARSET_RIGHT_TO_LEFT)
4814     {
4815       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4816         {
4817           Dynarr_add (dst, ISO_CODE_ESC);
4818           Dynarr_add (dst, '[');
4819         }
4820       else if (internal_p)
4821         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4822       else
4823         Dynarr_add (dst, ISO_CODE_CSI);
4824       Dynarr_add (dst, '2');
4825       Dynarr_add (dst, ']');
4826       if (flags)
4827         *flags |= CODING_STATE_R2L;
4828     }
4829 }
4830
4831 /* Convert ISO2022-format data to internal format. */
4832
4833 static void
4834 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4835                        unsigned_char_dynarr *dst, unsigned int n)
4836 {
4837   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4838   unsigned int flags  = str->flags;
4839   unsigned int ch     = str->ch;
4840   eol_type_t eol_type = str->eol_type;
4841 #ifdef ENABLE_COMPOSITE_CHARS
4842   unsigned_char_dynarr *real_dst = dst;
4843 #endif
4844   Lisp_Object coding_system;
4845
4846   XSETCODING_SYSTEM (coding_system, str->codesys);
4847
4848 #ifdef ENABLE_COMPOSITE_CHARS
4849   if (flags & CODING_STATE_COMPOSITE)
4850     dst = str->iso2022.composite_chars;
4851 #endif /* ENABLE_COMPOSITE_CHARS */
4852
4853   while (n--)
4854     {
4855       unsigned char c = *src++;
4856       if (flags & CODING_STATE_ESCAPE)
4857         {       /* Within ESC sequence */
4858           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4859                                           c, &flags, 1);
4860
4861           if (retval)
4862             {
4863               switch (str->iso2022.esc)
4864                 {
4865 #ifdef ENABLE_COMPOSITE_CHARS
4866                 case ISO_ESC_START_COMPOSITE:
4867                   if (str->iso2022.composite_chars)
4868                     Dynarr_reset (str->iso2022.composite_chars);
4869                   else
4870                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4871                   dst = str->iso2022.composite_chars;
4872                   break;
4873                 case ISO_ESC_END_COMPOSITE:
4874                   {
4875                     Bufbyte comstr[MAX_EMCHAR_LEN];
4876                     Bytecount len;
4877                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4878                                                          Dynarr_length (dst));
4879                     dst = real_dst;
4880                     len = set_charptr_emchar (comstr, emch);
4881                     Dynarr_add_many (dst, comstr, len);
4882                     break;
4883                   }
4884 #endif /* ENABLE_COMPOSITE_CHARS */
4885
4886                 case ISO_ESC_LITERAL:
4887                   DECODE_ADD_BINARY_CHAR (c, dst);
4888                   break;
4889
4890                 default:
4891                   /* Everything else handled already */
4892                   break;
4893                 }
4894             }
4895
4896           /* Attempted error recovery. */
4897           if (str->iso2022.output_direction_sequence)
4898             ensure_correct_direction (flags & CODING_STATE_R2L ?
4899                                       CHARSET_RIGHT_TO_LEFT :
4900                                       CHARSET_LEFT_TO_RIGHT,
4901                                       str->codesys, dst, 0, 1);
4902           /* More error recovery. */
4903           if (!retval || str->iso2022.output_literally)
4904             {
4905               /* Output the (possibly invalid) sequence */
4906               int i;
4907               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4908                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4909               flags &= CODING_STATE_ISO2022_LOCK;
4910               if (!retval)
4911                 n++, src--;/* Repeat the loop with the same character. */
4912               else
4913                 {
4914                   /* No sense in reprocessing the final byte of the
4915                      escape sequence; it could mess things up anyway.
4916                      Just add it now. */
4917                   DECODE_ADD_BINARY_CHAR (c, dst);
4918                 }
4919             }
4920           ch = 0;
4921         }
4922       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4923         { /* Control characters */
4924
4925           /***** Error-handling *****/
4926
4927           /* If we were in the middle of a character, dump out the
4928              partial character. */
4929           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4930
4931           /* If we just saw a single-shift character, dump it out.
4932              This may dump out the wrong sort of single-shift character,
4933              but least it will give an indication that something went
4934              wrong. */
4935           if (flags & CODING_STATE_SS2)
4936             {
4937               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4938               flags &= ~CODING_STATE_SS2;
4939             }
4940           if (flags & CODING_STATE_SS3)
4941             {
4942               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4943               flags &= ~CODING_STATE_SS3;
4944             }
4945
4946           /***** Now handle the control characters. *****/
4947
4948           /* Handle CR/LF */
4949           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4950
4951           flags &= CODING_STATE_ISO2022_LOCK;
4952
4953           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4954             DECODE_ADD_BINARY_CHAR (c, dst);
4955         }
4956       else
4957         {                       /* Graphic characters */
4958           Lisp_Object charset;
4959 #ifndef UTF2000
4960           int lb;
4961 #endif
4962           int reg;
4963
4964           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4965
4966           /* Now determine the charset. */
4967           reg = ((flags & CODING_STATE_SS2) ? 2
4968                  : (flags & CODING_STATE_SS3) ? 3
4969                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4970                  : str->iso2022.register_left);
4971           charset = str->iso2022.charset[reg];
4972
4973           /* Error checking: */
4974           if (! CHARSETP (charset)
4975               || str->iso2022.invalid_designated[reg]
4976               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4977                   && XCHARSET_CHARS (charset) == 94))
4978             /* Mrmph.  We are trying to invoke a register that has no
4979                or an invalid charset in it, or trying to add a character
4980                outside the range of the charset.  Insert that char literally
4981                to preserve it for the output. */
4982             {
4983               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4984               DECODE_ADD_BINARY_CHAR (c, dst);
4985             }
4986
4987           else
4988             {
4989               /* Things are probably hunky-dorey. */
4990
4991               /* Fetch reverse charset, maybe. */
4992               if (((flags & CODING_STATE_R2L) &&
4993                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4994                   ||
4995                   (!(flags & CODING_STATE_R2L) &&
4996                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4997                 {
4998                   Lisp_Object new_charset =
4999                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5000                   if (!NILP (new_charset))
5001                     charset = new_charset;
5002                 }
5003
5004 #ifndef UTF2000
5005               lb = XCHARSET_LEADING_BYTE (charset);
5006 #endif
5007               switch (XCHARSET_REP_BYTES (charset))
5008                 {
5009                 case 1: /* ASCII */
5010                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5011                   Dynarr_add (dst, c & 0x7F);
5012                   break;
5013
5014                 case 2: /* one-byte official */
5015                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5016 #ifdef UTF2000
5017                   DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0), dst);
5018 #else
5019                   Dynarr_add (dst, lb);
5020                   Dynarr_add (dst, c | 0x80);
5021 #endif
5022                   break;
5023
5024                 case 3: /* one-byte private or two-byte official */
5025                   if (XCHARSET_PRIVATE_P (charset))
5026                     {
5027                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5028 #ifdef UTF2000
5029                       DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0),
5030                                           dst);
5031 #else
5032                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5033                       Dynarr_add (dst, lb);
5034                       Dynarr_add (dst, c | 0x80);
5035 #endif
5036                     }
5037                   else
5038                     {
5039                       if (ch)
5040                         {
5041 #ifdef UTF2000
5042                           DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
5043                                                         ch & 0x7F,
5044                                                         c & 0x7F), dst);
5045 #else
5046                           Dynarr_add (dst, lb);
5047                           Dynarr_add (dst, ch | 0x80);
5048                           Dynarr_add (dst, c | 0x80);
5049 #endif
5050                           ch = 0;
5051                         }
5052                       else
5053                         ch = c;
5054                     }
5055                   break;
5056
5057                 default:        /* two-byte private */
5058                   if (ch)
5059                     {
5060 #ifdef UTF2000
5061                       DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
5062                                                     ch & 0x7F,
5063                                                     c & 0x7F), dst);
5064 #else
5065                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5066                       Dynarr_add (dst, lb);
5067                       Dynarr_add (dst, ch | 0x80);
5068                       Dynarr_add (dst, c | 0x80);
5069 #endif
5070                       ch = 0;
5071                     }
5072                   else
5073                     ch = c;
5074                 }
5075             }
5076
5077           if (!ch)
5078             flags &= CODING_STATE_ISO2022_LOCK;
5079         }
5080
5081     label_continue_loop:;
5082     }
5083
5084   if (flags & CODING_STATE_END)
5085     DECODE_OUTPUT_PARTIAL_CHAR (ch);
5086
5087   str->flags = flags;
5088   str->ch    = ch;
5089 }
5090
5091
5092 /***** ISO2022 encoder *****/
5093
5094 /* Designate CHARSET into register REG. */
5095
5096 static void
5097 iso2022_designate (Lisp_Object charset, unsigned char reg,
5098                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5099 {
5100   static CONST char inter94[] = "()*+";
5101   static CONST char inter96[] = ",-./";
5102   unsigned int type;
5103   unsigned char final;
5104   Lisp_Object old_charset = str->iso2022.charset[reg];
5105
5106   str->iso2022.charset[reg] = charset;
5107   if (!CHARSETP (charset))
5108     /* charset might be an initial nil or t. */
5109     return;
5110   type = XCHARSET_TYPE (charset);
5111   final = XCHARSET_FINAL (charset);
5112   if (!str->iso2022.force_charset_on_output[reg] &&
5113       CHARSETP (old_charset) &&
5114       XCHARSET_TYPE (old_charset) == type &&
5115       XCHARSET_FINAL (old_charset) == final)
5116     return;
5117
5118   str->iso2022.force_charset_on_output[reg] = 0;
5119
5120   {
5121     charset_conversion_spec_dynarr *dyn =
5122       str->codesys->iso2022.output_conv;
5123
5124     if (dyn)
5125       {
5126         int i;
5127
5128         for (i = 0; i < Dynarr_length (dyn); i++)
5129           {
5130             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5131             if (EQ (charset, spec->from_charset))
5132                 charset = spec->to_charset;
5133           }
5134       }
5135   }
5136
5137   Dynarr_add (dst, ISO_CODE_ESC);
5138   switch (type)
5139     {
5140     case CHARSET_TYPE_94:
5141       Dynarr_add (dst, inter94[reg]);
5142       break;
5143     case CHARSET_TYPE_96:
5144       Dynarr_add (dst, inter96[reg]);
5145       break;
5146     case CHARSET_TYPE_94X94:
5147       Dynarr_add (dst, '$');
5148       if (reg != 0
5149           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5150           || final < '@'
5151           || final > 'B')
5152         Dynarr_add (dst, inter94[reg]);
5153       break;
5154     case CHARSET_TYPE_96X96:
5155       Dynarr_add (dst, '$');
5156       Dynarr_add (dst, inter96[reg]);
5157       break;
5158     }
5159   Dynarr_add (dst, final);
5160 }
5161
5162 static void
5163 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5164 {
5165   if (str->iso2022.register_left != 0)
5166     {
5167       Dynarr_add (dst, ISO_CODE_SI);
5168       str->iso2022.register_left = 0;
5169     }
5170 }
5171
5172 static void
5173 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5174 {
5175   if (str->iso2022.register_left != 1)
5176     {
5177       Dynarr_add (dst, ISO_CODE_SO);
5178       str->iso2022.register_left = 1;
5179     }
5180 }
5181
5182 /* Convert internally-formatted data to ISO2022 format. */
5183
5184 static void
5185 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5186                        unsigned_char_dynarr *dst, unsigned int n)
5187 {
5188   unsigned char charmask, c;
5189   unsigned char char_boundary;
5190   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5191   unsigned int flags          = str->flags;
5192   Emchar ch                   = str->ch;
5193   Lisp_Coding_System *codesys = str->codesys;
5194   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5195   int i;
5196   Lisp_Object charset;
5197   int half;
5198 #ifdef UTF2000
5199   unsigned int byte1, byte2;
5200 #endif
5201
5202 #ifdef ENABLE_COMPOSITE_CHARS
5203   /* flags for handling composite chars.  We do a little switcharoo
5204      on the source while we're outputting the composite char. */
5205   unsigned int saved_n = 0;
5206   CONST unsigned char *saved_src = NULL;
5207   int in_composite = 0;
5208 #endif /* ENABLE_COMPOSITE_CHARS */
5209
5210   char_boundary = str->iso2022.current_char_boundary;
5211   charset = str->iso2022.current_charset;
5212   half = str->iso2022.current_half;
5213
5214 #ifdef ENABLE_COMPOSITE_CHARS
5215  back_to_square_n:
5216 #endif
5217 #ifdef UTF2000
5218   while (n--)
5219     {
5220       c = *src++;
5221
5222       switch (char_boundary)
5223         {
5224         case 0:
5225           if ( c >= 0xfc )
5226             {
5227               ch = c & 0x01;
5228               char_boundary = 5;
5229             }
5230           else if ( c >= 0xf8 )
5231             {
5232               ch = c & 0x03;
5233               char_boundary = 4;
5234             }
5235           else if ( c >= 0xf0 )
5236             {
5237               ch = c & 0x07;
5238               char_boundary = 3;
5239             }
5240           else if ( c >= 0xe0 )
5241             {
5242               ch = c & 0x0f;
5243               char_boundary = 2;
5244             }
5245           else if ( c >= 0xc0 )
5246             {
5247               ch = c & 0x1f;
5248               char_boundary = 1;
5249             }
5250           else
5251             {
5252               ch = 0;
5253
5254               restore_left_to_right_direction (codesys, dst, &flags, 0);
5255
5256               /* Make sure G0 contains ASCII */
5257               if ((c > ' ' && c < ISO_CODE_DEL) ||
5258                   !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5259                 {
5260                   ensure_normal_shift (str, dst);
5261                   iso2022_designate (Vcharset_ascii, 0, str, dst);
5262                 }
5263
5264               /* If necessary, restore everything to the default state
5265                  at end-of-line */
5266               if (c == '\n' &&
5267                   !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5268                 {
5269                   restore_left_to_right_direction (codesys, dst, &flags, 0);
5270
5271                   ensure_normal_shift (str, dst);
5272
5273                   for (i = 0; i < 4; i++)
5274                     {
5275                       Lisp_Object initial_charset =
5276                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5277                       iso2022_designate (initial_charset, i, str, dst);
5278                     }
5279                 }
5280               if (c == '\n')
5281                 {
5282                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5283                     Dynarr_add (dst, '\r');
5284                   if (eol_type != EOL_CR)
5285                     Dynarr_add (dst, c);
5286                 }
5287               else
5288                 {
5289                   if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5290                       && fit_to_be_escape_quoted (c))
5291                     Dynarr_add (dst, ISO_CODE_ESC);
5292                   Dynarr_add (dst, c);
5293                 }
5294               char_boundary = 0;
5295             }
5296           break;
5297         case 1:
5298           ch = ( ch << 6 ) | ( c & 0x3f );
5299
5300           char_boundary = 0;
5301           if ( (0x80 <= ch) && (ch <= 0x9f) )
5302             {
5303               charmask = (half == 0 ? 0x00 : 0x80);
5304
5305               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5306                   && fit_to_be_escape_quoted (ch))
5307                 Dynarr_add (dst, ISO_CODE_ESC);
5308               /* you asked for it ... */
5309               Dynarr_add (dst, ch);
5310             }
5311           else
5312             {
5313               int reg;
5314
5315               BREAKUP_CHAR (ch, charset, byte1, byte2);
5316               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5317                                         codesys, dst, &flags, 0);
5318
5319               /* Now determine which register to use. */
5320               reg = -1;
5321               for (i = 0; i < 4; i++)
5322                 {
5323                   if (EQ (charset, str->iso2022.charset[i]) ||
5324                       EQ (charset,
5325                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5326                     {
5327                       reg = i;
5328                       break;
5329                     }
5330                 }
5331
5332               if (reg == -1)
5333                 {
5334                   if (XCHARSET_GRAPHIC (charset) != 0)
5335                     {
5336                       if (!NILP (str->iso2022.charset[1]) &&
5337                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5338                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5339                         reg = 1;
5340                       else if (!NILP (str->iso2022.charset[2]))
5341                         reg = 2;
5342                       else if (!NILP (str->iso2022.charset[3]))
5343                         reg = 3;
5344                       else
5345                         reg = 0;
5346                     }
5347                   else
5348                     reg = 0;
5349                 }
5350
5351               iso2022_designate (charset, reg, str, dst);
5352
5353               /* Now invoke that register. */
5354               switch (reg)
5355                 {
5356                 case 0:
5357                   ensure_normal_shift (str, dst);
5358                   half = 0;
5359                   break;
5360
5361                 case 1:
5362                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5363                     {
5364                       ensure_shift_out (str, dst);
5365                       half = 0;
5366                     }
5367                   else
5368                     half = 1;
5369                   break;
5370
5371                 case 2:
5372                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5373                     {
5374                       Dynarr_add (dst, ISO_CODE_ESC);
5375                       Dynarr_add (dst, 'N');
5376                       half = 0;
5377                     }
5378                   else
5379                     {
5380                       Dynarr_add (dst, ISO_CODE_SS2);
5381                       half = 1;
5382                     }
5383                   break;
5384
5385                 case 3:
5386                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5387                     {
5388                       Dynarr_add (dst, ISO_CODE_ESC);
5389                       Dynarr_add (dst, 'O');
5390                       half = 0;
5391                     }
5392                   else
5393                     {
5394                       Dynarr_add (dst, ISO_CODE_SS3);
5395                       half = 1;
5396                     }
5397                   break;
5398
5399                 default:
5400                   abort ();
5401                 }
5402
5403               charmask = (half == 0 ? 0x00 : 0x80);
5404
5405               switch (XCHARSET_DIMENSION (charset))
5406                 {
5407                 case 1:
5408                   Dynarr_add (dst, byte1 | charmask);
5409                   break;
5410                 case 2:
5411                   Dynarr_add (dst, byte1 | charmask);
5412                   Dynarr_add (dst, byte2 | charmask);
5413                   break;
5414                 default:
5415                   abort ();
5416                 }
5417             }
5418           ch =0;
5419           break;
5420         default:
5421           ch = ( ch << 6 ) | ( c & 0x3f );
5422           char_boundary--;
5423         }
5424     }
5425 #else /* not UTF2000 */
5426
5427   while (n--)
5428     {
5429       c = *src++;
5430
5431       if (BYTE_ASCII_P (c))
5432         {               /* Processing ASCII character */
5433           ch = 0;
5434
5435           restore_left_to_right_direction (codesys, dst, &flags, 0);
5436
5437           /* Make sure G0 contains ASCII */
5438           if ((c > ' ' && c < ISO_CODE_DEL) ||
5439               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5440             {
5441               ensure_normal_shift (str, dst);
5442               iso2022_designate (Vcharset_ascii, 0, str, dst);
5443             }
5444
5445           /* If necessary, restore everything to the default state
5446              at end-of-line */
5447           if (c == '\n' &&
5448               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5449             {
5450               restore_left_to_right_direction (codesys, dst, &flags, 0);
5451
5452               ensure_normal_shift (str, dst);
5453
5454               for (i = 0; i < 4; i++)
5455                 {
5456                   Lisp_Object initial_charset =
5457                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5458                   iso2022_designate (initial_charset, i, str, dst);
5459                 }
5460             }
5461           if (c == '\n')
5462             {
5463               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5464                 Dynarr_add (dst, '\r');
5465               if (eol_type != EOL_CR)
5466                 Dynarr_add (dst, c);
5467             }
5468           else
5469             {
5470               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5471                   && fit_to_be_escape_quoted (c))
5472                 Dynarr_add (dst, ISO_CODE_ESC);
5473               Dynarr_add (dst, c);
5474             }
5475           char_boundary = 1;
5476         }
5477
5478       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5479         { /* Processing Leading Byte */
5480           ch = 0;
5481           charset = CHARSET_BY_LEADING_BYTE (c);
5482           if (LEADING_BYTE_PREFIX_P(c))
5483             ch = c;
5484           else if (!EQ (charset, Vcharset_control_1)
5485 #ifdef ENABLE_COMPOSITE_CHARS
5486                    && !EQ (charset, Vcharset_composite)
5487 #endif
5488                    )
5489             {
5490               int reg;
5491
5492               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5493                                         codesys, dst, &flags, 0);
5494
5495               /* Now determine which register to use. */
5496               reg = -1;
5497               for (i = 0; i < 4; i++)
5498                 {
5499                   if (EQ (charset, str->iso2022.charset[i]) ||
5500                       EQ (charset,
5501                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5502                     {
5503                       reg = i;
5504                       break;
5505                     }
5506                 }
5507
5508               if (reg == -1)
5509                 {
5510                   if (XCHARSET_GRAPHIC (charset) != 0)
5511                     {
5512                       if (!NILP (str->iso2022.charset[1]) &&
5513                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5514                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5515                         reg = 1;
5516                       else if (!NILP (str->iso2022.charset[2]))
5517                         reg = 2;
5518                       else if (!NILP (str->iso2022.charset[3]))
5519                         reg = 3;
5520                       else
5521                         reg = 0;
5522                     }
5523                   else
5524                     reg = 0;
5525                 }
5526
5527               iso2022_designate (charset, reg, str, dst);
5528
5529               /* Now invoke that register. */
5530               switch (reg)
5531                 {
5532                 case 0:
5533                   ensure_normal_shift (str, dst);
5534                   half = 0;
5535                   break;
5536
5537                 case 1:
5538                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5539                     {
5540                       ensure_shift_out (str, dst);
5541                       half = 0;
5542                     }
5543                   else
5544                     half = 1;
5545                   break;
5546
5547                 case 2:
5548                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5549                     {
5550                       Dynarr_add (dst, ISO_CODE_ESC);
5551                       Dynarr_add (dst, 'N');
5552                       half = 0;
5553                     }
5554                   else
5555                     {
5556                       Dynarr_add (dst, ISO_CODE_SS2);
5557                       half = 1;
5558                     }
5559                   break;
5560
5561                 case 3:
5562                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5563                     {
5564                       Dynarr_add (dst, ISO_CODE_ESC);
5565                       Dynarr_add (dst, 'O');
5566                       half = 0;
5567                     }
5568                   else
5569                     {
5570                       Dynarr_add (dst, ISO_CODE_SS3);
5571                       half = 1;
5572                     }
5573                   break;
5574
5575                 default:
5576                   abort ();
5577                 }
5578             }
5579           char_boundary = 0;
5580         }
5581       else
5582         {                       /* Processing Non-ASCII character */
5583           charmask = (half == 0 ? 0x7F : 0xFF);
5584           char_boundary = 1;
5585           if (EQ (charset, Vcharset_control_1))
5586             {
5587               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5588                   && fit_to_be_escape_quoted (c))
5589                 Dynarr_add (dst, ISO_CODE_ESC);
5590               /* you asked for it ... */
5591               Dynarr_add (dst, c - 0x20);
5592             }
5593           else
5594             {
5595               switch (XCHARSET_REP_BYTES (charset))
5596                 {
5597                 case 2:
5598                   Dynarr_add (dst, c & charmask);
5599                   break;
5600                 case 3:
5601                   if (XCHARSET_PRIVATE_P (charset))
5602                     {
5603                       Dynarr_add (dst, c & charmask);
5604                       ch = 0;
5605                     }
5606                   else if (ch)
5607                     {
5608 #ifdef ENABLE_COMPOSITE_CHARS
5609                       if (EQ (charset, Vcharset_composite))
5610                         {
5611                           if (in_composite)
5612                             {
5613                               /* #### Bother! We don't know how to
5614                                  handle this yet. */
5615                               Dynarr_add (dst, '~');
5616                             }
5617                           else
5618                             {
5619                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5620                                                        ch & 0x7F, c & 0x7F);
5621                               Lisp_Object lstr = composite_char_string (emch);
5622                               saved_n = n;
5623                               saved_src = src;
5624                               in_composite = 1;
5625                               src = XSTRING_DATA   (lstr);
5626                               n   = XSTRING_LENGTH (lstr);
5627                               Dynarr_add (dst, ISO_CODE_ESC);
5628                               Dynarr_add (dst, '0'); /* start composing */
5629                             }
5630                         }
5631                       else
5632 #endif /* ENABLE_COMPOSITE_CHARS */
5633                         {
5634                           Dynarr_add (dst, ch & charmask);
5635                           Dynarr_add (dst, c & charmask);
5636                         }
5637                       ch = 0;
5638                     }
5639                   else
5640                     {
5641                       ch = c;
5642                       char_boundary = 0;
5643                     }
5644                   break;
5645                 case 4:
5646                   if (ch)
5647                     {
5648                       Dynarr_add (dst, ch & charmask);
5649                       Dynarr_add (dst, c & charmask);
5650                       ch = 0;
5651                     }
5652                   else
5653                     {
5654                       ch = c;
5655                       char_boundary = 0;
5656                     }
5657                   break;
5658                 default:
5659                   abort ();
5660                 }
5661             }
5662         }
5663     }
5664 #endif /* not UTF2000 */
5665
5666 #ifdef ENABLE_COMPOSITE_CHARS
5667   if (in_composite)
5668     {
5669       n = saved_n;
5670       src = saved_src;
5671       in_composite = 0;
5672       Dynarr_add (dst, ISO_CODE_ESC);
5673       Dynarr_add (dst, '1'); /* end composing */
5674       goto back_to_square_n; /* Wheeeeeeeee ..... */
5675     }
5676 #endif /* ENABLE_COMPOSITE_CHARS */
5677
5678 #ifdef UTF2000
5679   if ( (char_boundary == 0) && flags & CODING_STATE_END)
5680 #else
5681   if (char_boundary && flags & CODING_STATE_END)
5682 #endif
5683     {
5684       restore_left_to_right_direction (codesys, dst, &flags, 0);
5685       ensure_normal_shift (str, dst);
5686       for (i = 0; i < 4; i++)
5687         {
5688           Lisp_Object initial_charset =
5689             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5690           iso2022_designate (initial_charset, i, str, dst);
5691         }
5692     }
5693
5694   str->flags = flags;
5695   str->ch    = ch;
5696   str->iso2022.current_char_boundary = char_boundary;
5697   str->iso2022.current_charset = charset;
5698   str->iso2022.current_half = half;
5699
5700   /* Verbum caro factum est! */
5701 }
5702 #endif /* MULE */
5703 \f
5704 /************************************************************************/
5705 /*                     No-conversion methods                            */
5706 /************************************************************************/
5707
5708 /* This is used when reading in "binary" files -- i.e. files that may
5709    contain all 256 possible byte values and that are not to be
5710    interpreted as being in any particular decoding. */
5711 static void
5712 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5713                              unsigned_char_dynarr *dst, unsigned int n)
5714 {
5715   unsigned char c;
5716   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5717   unsigned int flags  = str->flags;
5718   unsigned int ch     = str->ch;
5719   eol_type_t eol_type = str->eol_type;
5720
5721   while (n--)
5722     {
5723       c = *src++;
5724
5725       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5726       DECODE_ADD_BINARY_CHAR (c, dst);
5727     label_continue_loop:;
5728     }
5729
5730   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5731
5732   str->flags = flags;
5733   str->ch    = ch;
5734 }
5735
5736 static void
5737 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5738                              unsigned_char_dynarr *dst, unsigned int n)
5739 {
5740   unsigned char c;
5741   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5742   unsigned int flags  = str->flags;
5743   unsigned int ch     = str->ch;
5744   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5745 #ifdef UTF2000
5746   unsigned char char_boundary = str->iso2022.current_char_boundary;
5747 #endif
5748
5749   while (n--)
5750     {
5751       c = *src++;
5752 #ifdef UTF2000
5753       switch (char_boundary)
5754         {
5755         case 0:
5756           if ( c >= 0xfc )
5757             {
5758               ch = c & 0x01;
5759               char_boundary = 5;
5760             }
5761           else if ( c >= 0xf8 )
5762             {
5763               ch = c & 0x03;
5764               char_boundary = 4;
5765             }
5766           else if ( c >= 0xf0 )
5767             {
5768               ch = c & 0x07;
5769               char_boundary = 3;
5770             }
5771           else if ( c >= 0xe0 )
5772             {
5773               ch = c & 0x0f;
5774               char_boundary = 2;
5775             }
5776           else if ( c >= 0xc0 )
5777             {
5778               ch = c & 0x1f;
5779               char_boundary = 1;
5780             }
5781           else
5782             {
5783               ch = 0;
5784
5785               if (c == '\n')
5786                 {
5787                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5788                     Dynarr_add (dst, '\r');
5789                   if (eol_type != EOL_CR)
5790                     Dynarr_add (dst, c);
5791                 }
5792               else
5793                 Dynarr_add (dst, c);
5794               char_boundary = 0;
5795             }
5796           break;
5797         case 1:
5798           ch = ( ch << 6 ) | ( c & 0x3f );
5799           switch ( str->codesys->fixed.size )
5800             {
5801             case 1:
5802               Dynarr_add (dst, ch & 0xff);
5803               break;
5804             case 2:
5805               Dynarr_add (dst, (ch >> 8) & 0xff);
5806               Dynarr_add (dst,  ch       & 0xff);
5807               break;
5808             case 3:
5809               Dynarr_add (dst, (ch >> 16) & 0xff);
5810               Dynarr_add (dst, (ch >>  8) & 0xff);
5811               Dynarr_add (dst,  ch        & 0xff);
5812               break;
5813             case 4:
5814               Dynarr_add (dst, (ch >> 24) & 0xff);
5815               Dynarr_add (dst, (ch >> 16) & 0xff);
5816               Dynarr_add (dst, (ch >>  8) & 0xff);
5817               Dynarr_add (dst,  ch        & 0xff);
5818               break;
5819             default:
5820               fprintf(stderr, "It seems %d bytes stream.\n",
5821                       str->codesys->fixed.size);
5822               abort ();
5823             }
5824           char_boundary = 0;
5825           break;
5826         default:
5827           ch = ( ch << 6 ) | ( c & 0x3f );
5828           char_boundary--;
5829         }
5830 #else /* not UTF2000 */
5831       if (c == '\n')
5832         {
5833           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5834             Dynarr_add (dst, '\r');
5835           if (eol_type != EOL_CR)
5836             Dynarr_add (dst, '\n');
5837           ch = 0;
5838         }
5839       else if (BYTE_ASCII_P (c))
5840         {
5841           assert (ch == 0);
5842           Dynarr_add (dst, c);
5843         }
5844       else if (BUFBYTE_LEADING_BYTE_P (c))
5845         {
5846           assert (ch == 0);
5847           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5848               c == LEADING_BYTE_CONTROL_1)
5849             ch = c;
5850           else
5851             Dynarr_add (dst, '~'); /* untranslatable character */
5852         }
5853       else
5854         {
5855           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5856             Dynarr_add (dst, c);
5857           else if (ch == LEADING_BYTE_CONTROL_1)
5858             {
5859               assert (c < 0xC0);
5860               Dynarr_add (dst, c - 0x20);
5861             }
5862           /* else it should be the second or third byte of an
5863              untranslatable character, so ignore it */
5864           ch = 0;
5865         }
5866 #endif /* not UTF2000 */
5867     }
5868
5869   str->flags = flags;
5870   str->ch    = ch;
5871 #ifdef UTF2000
5872   str->iso2022.current_char_boundary = char_boundary;
5873 #endif
5874 }
5875
5876 \f
5877 /************************************************************************/
5878 /*                   Simple internal/external functions                 */
5879 /************************************************************************/
5880
5881 static Extbyte_dynarr *conversion_out_dynarr;
5882 static Bufbyte_dynarr *conversion_in_dynarr;
5883
5884 /* Determine coding system from coding format */
5885
5886 /* #### not correct for all values of `fmt'! */
5887 static Lisp_Object
5888 external_data_format_to_coding_system (enum external_data_format fmt)
5889 {
5890   switch (fmt)
5891     {
5892     case FORMAT_FILENAME:
5893     case FORMAT_TERMINAL:
5894       if (EQ (Vfile_name_coding_system, Qnil) ||
5895           EQ (Vfile_name_coding_system, Qbinary))
5896         return Qnil;
5897       else
5898         return Fget_coding_system (Vfile_name_coding_system);
5899 #ifdef MULE
5900     case FORMAT_CTEXT:
5901       return Fget_coding_system (Qctext);
5902 #endif
5903     default:
5904       return Qnil;
5905     }
5906 }
5907
5908 Extbyte *
5909 convert_to_external_format (CONST Bufbyte *ptr,
5910                             Bytecount len,
5911                             Extcount *len_out,
5912                             enum external_data_format fmt)
5913 {
5914   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5915
5916   if (!conversion_out_dynarr)
5917     conversion_out_dynarr = Dynarr_new (Extbyte);
5918   else
5919     Dynarr_reset (conversion_out_dynarr);
5920
5921   if (NILP (coding_system))
5922     {
5923       CONST Bufbyte *end = ptr + len;
5924
5925       for (; ptr < end;)
5926         {
5927 #ifdef UTF2000
5928           Bufbyte c =
5929             (*ptr < 0xc0) ? *ptr :
5930             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5931 #else
5932           Bufbyte c =
5933             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5934             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5935             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5936             '~';
5937 #endif
5938           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5939           INC_CHARPTR (ptr);
5940         }
5941
5942 #ifdef ERROR_CHECK_BUFPOS
5943       assert (ptr == end);
5944 #endif
5945     }
5946   else
5947     {
5948       Lisp_Object instream, outstream, da_outstream;
5949       Lstream *istr, *ostr;
5950       struct gcpro gcpro1, gcpro2, gcpro3;
5951       char tempbuf[1024]; /* some random amount */
5952
5953       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5954       da_outstream = make_dynarr_output_stream
5955         ((unsigned_char_dynarr *) conversion_out_dynarr);
5956       outstream =
5957         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5958       istr = XLSTREAM (instream);
5959       ostr = XLSTREAM (outstream);
5960       GCPRO3 (instream, outstream, da_outstream);
5961       while (1)
5962         {
5963           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5964           if (!size_in_bytes)
5965             break;
5966           Lstream_write (ostr, tempbuf, size_in_bytes);
5967         }
5968       Lstream_close (istr);
5969       Lstream_close (ostr);
5970       UNGCPRO;
5971       Lstream_delete (istr);
5972       Lstream_delete (ostr);
5973       Lstream_delete (XLSTREAM (da_outstream));
5974     }
5975
5976   *len_out = Dynarr_length (conversion_out_dynarr);
5977   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5978   return Dynarr_atp (conversion_out_dynarr, 0);
5979 }
5980
5981 Bufbyte *
5982 convert_from_external_format (CONST Extbyte *ptr,
5983                               Extcount len,
5984                               Bytecount *len_out,
5985                               enum external_data_format fmt)
5986 {
5987   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5988
5989   if (!conversion_in_dynarr)
5990     conversion_in_dynarr = Dynarr_new (Bufbyte);
5991   else
5992     Dynarr_reset (conversion_in_dynarr);
5993
5994   if (NILP (coding_system))
5995     {
5996       CONST Extbyte *end = ptr + len;
5997       for (; ptr < end; ptr++)
5998         {
5999           Extbyte c = *ptr;
6000           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
6001         }
6002     }
6003   else
6004     {
6005       Lisp_Object instream, outstream, da_outstream;
6006       Lstream *istr, *ostr;
6007       struct gcpro gcpro1, gcpro2, gcpro3;
6008       char tempbuf[1024]; /* some random amount */
6009
6010       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
6011       da_outstream = make_dynarr_output_stream
6012         ((unsigned_char_dynarr *) conversion_in_dynarr);
6013       outstream =
6014         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
6015       istr = XLSTREAM (instream);
6016       ostr = XLSTREAM (outstream);
6017       GCPRO3 (instream, outstream, da_outstream);
6018       while (1)
6019         {
6020           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
6021           if (!size_in_bytes)
6022             break;
6023           Lstream_write (ostr, tempbuf, size_in_bytes);
6024         }
6025       Lstream_close (istr);
6026       Lstream_close (ostr);
6027       UNGCPRO;
6028       Lstream_delete (istr);
6029       Lstream_delete (ostr);
6030       Lstream_delete (XLSTREAM (da_outstream));
6031     }
6032
6033   *len_out = Dynarr_length (conversion_in_dynarr);
6034   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
6035   return Dynarr_atp (conversion_in_dynarr, 0);
6036 }
6037
6038 \f
6039 /************************************************************************/
6040 /*                             Initialization                           */
6041 /************************************************************************/
6042
6043 void
6044 syms_of_file_coding (void)
6045 {
6046   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
6047   deferror (&Qcoding_system_error, "coding-system-error",
6048             "Coding-system error", Qio_error);
6049
6050   DEFSUBR (Fcoding_system_p);
6051   DEFSUBR (Ffind_coding_system);
6052   DEFSUBR (Fget_coding_system);
6053   DEFSUBR (Fcoding_system_list);
6054   DEFSUBR (Fcoding_system_name);
6055   DEFSUBR (Fmake_coding_system);
6056   DEFSUBR (Fcopy_coding_system);
6057   DEFSUBR (Fdefine_coding_system_alias);
6058   DEFSUBR (Fsubsidiary_coding_system);
6059
6060   DEFSUBR (Fcoding_system_type);
6061   DEFSUBR (Fcoding_system_doc_string);
6062 #ifdef MULE
6063   DEFSUBR (Fcoding_system_charset);
6064 #endif
6065   DEFSUBR (Fcoding_system_property);
6066
6067   DEFSUBR (Fcoding_category_list);
6068   DEFSUBR (Fset_coding_priority_list);
6069   DEFSUBR (Fcoding_priority_list);
6070   DEFSUBR (Fset_coding_category_system);
6071   DEFSUBR (Fcoding_category_system);
6072
6073   DEFSUBR (Fdetect_coding_region);
6074   DEFSUBR (Fdecode_coding_region);
6075   DEFSUBR (Fencode_coding_region);
6076 #ifdef MULE
6077   DEFSUBR (Fdecode_shift_jis_char);
6078   DEFSUBR (Fencode_shift_jis_char);
6079   DEFSUBR (Fdecode_big5_char);
6080   DEFSUBR (Fencode_big5_char);
6081   DEFSUBR (Fset_ucs_char);
6082   DEFSUBR (Fucs_char);
6083   DEFSUBR (Fset_char_ucs);
6084   DEFSUBR (Fchar_ucs);
6085 #endif /* MULE */
6086   defsymbol (&Qcoding_system_p, "coding-system-p");
6087   defsymbol (&Qno_conversion, "no-conversion");
6088   defsymbol (&Qraw_text, "raw-text");
6089 #ifdef MULE
6090   defsymbol (&Qbig5, "big5");
6091   defsymbol (&Qshift_jis, "shift-jis");
6092   defsymbol (&Qucs4, "ucs-4");
6093   defsymbol (&Qutf8, "utf-8");
6094   defsymbol (&Qccl, "ccl");
6095   defsymbol (&Qiso2022, "iso2022");
6096 #endif /* MULE */
6097   defsymbol (&Qmnemonic, "mnemonic");
6098   defsymbol (&Qeol_type, "eol-type");
6099   defsymbol (&Qpost_read_conversion, "post-read-conversion");
6100   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6101
6102   defsymbol (&Qcr, "cr");
6103   defsymbol (&Qlf, "lf");
6104   defsymbol (&Qcrlf, "crlf");
6105   defsymbol (&Qeol_cr, "eol-cr");
6106   defsymbol (&Qeol_lf, "eol-lf");
6107   defsymbol (&Qeol_crlf, "eol-crlf");
6108 #ifdef MULE
6109   defsymbol (&Qcharset_g0, "charset-g0");
6110   defsymbol (&Qcharset_g1, "charset-g1");
6111   defsymbol (&Qcharset_g2, "charset-g2");
6112   defsymbol (&Qcharset_g3, "charset-g3");
6113   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6114   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6115   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6116   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6117   defsymbol (&Qno_iso6429, "no-iso6429");
6118   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6119   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6120
6121   defsymbol (&Qshort, "short");
6122   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6123   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6124   defsymbol (&Qseven, "seven");
6125   defsymbol (&Qlock_shift, "lock-shift");
6126   defsymbol (&Qescape_quoted, "escape-quoted");
6127 #endif /* MULE */
6128   defsymbol (&Qencode, "encode");
6129   defsymbol (&Qdecode, "decode");
6130
6131 #ifdef MULE
6132   defsymbol (&Qctext, "ctext");
6133   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6134              "shift-jis");
6135   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6136              "big5");
6137   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6138              "ucs-4");
6139   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6140              "utf-8");
6141   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6142              "iso-7");
6143   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6144              "iso-8-designate");
6145   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6146              "iso-8-1");
6147   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6148              "iso-8-2");
6149   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6150              "iso-lock-shift");
6151 #endif /* MULE */
6152   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6153              "no-conversion");
6154 }
6155
6156 void
6157 lstream_type_create_file_coding (void)
6158 {
6159   LSTREAM_HAS_METHOD (decoding, reader);
6160   LSTREAM_HAS_METHOD (decoding, writer);
6161   LSTREAM_HAS_METHOD (decoding, rewinder);
6162   LSTREAM_HAS_METHOD (decoding, seekable_p);
6163   LSTREAM_HAS_METHOD (decoding, flusher);
6164   LSTREAM_HAS_METHOD (decoding, closer);
6165   LSTREAM_HAS_METHOD (decoding, marker);
6166
6167   LSTREAM_HAS_METHOD (encoding, reader);
6168   LSTREAM_HAS_METHOD (encoding, writer);
6169   LSTREAM_HAS_METHOD (encoding, rewinder);
6170   LSTREAM_HAS_METHOD (encoding, seekable_p);
6171   LSTREAM_HAS_METHOD (encoding, flusher);
6172   LSTREAM_HAS_METHOD (encoding, closer);
6173   LSTREAM_HAS_METHOD (encoding, marker);
6174 }
6175
6176 void
6177 vars_of_file_coding (void)
6178 {
6179   int i;
6180
6181   /* Initialize to something reasonable ... */
6182   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6183     {
6184       coding_category_system[i] = Qnil;
6185       coding_category_by_priority[i] = i;
6186     }
6187
6188   Fprovide (intern ("file-coding"));
6189
6190   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6191 Coding system used for TTY keyboard input.
6192 Not used under a windowing system.
6193 */ );
6194   Vkeyboard_coding_system = Qnil;
6195
6196   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6197 Coding system used for TTY display output.
6198 Not used under a windowing system.
6199 */ );
6200   Vterminal_coding_system = Qnil;
6201
6202   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6203 Overriding coding system used when writing a file or process.
6204 You should *bind* this, not set it.  If this is non-nil, it specifies
6205 the coding system that will be used when a file or process is read
6206 in, and overrides `buffer-file-coding-system-for-read',
6207 `insert-file-contents-pre-hook', etc.  Use those variables instead of
6208 this one for permanent changes to the environment.
6209 */ );
6210   Vcoding_system_for_read = Qnil;
6211
6212   DEFVAR_LISP ("coding-system-for-write",
6213                &Vcoding_system_for_write /*
6214 Overriding coding system used when writing a file or process.
6215 You should *bind* this, not set it.  If this is non-nil, it specifies
6216 the coding system that will be used when a file or process is wrote
6217 in, and overrides `buffer-file-coding-system',
6218 `write-region-pre-hook', etc.  Use those variables instead of this one
6219 for permanent changes to the environment.
6220 */ );
6221   Vcoding_system_for_write = Qnil;
6222
6223   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6224 Coding system used to convert pathnames when accessing files.
6225 */ );
6226   Vfile_name_coding_system = Qnil;
6227
6228   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6229 Non-nil means the buffer contents are regarded as multi-byte form
6230 of characters, not a binary code.  This affects the display, file I/O,
6231 and behaviors of various editing commands.
6232
6233 Setting this to nil does not do anything.
6234 */ );
6235   enable_multibyte_characters = 1;
6236 }
6237
6238 void
6239 complex_vars_of_file_coding (void)
6240 {
6241   staticpro (&Vcoding_system_hash_table);
6242   Vcoding_system_hash_table =
6243     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6244
6245   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6246
6247 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
6248 {                                               \
6249   struct codesys_prop csp;                      \
6250   csp.sym = (Sym);                              \
6251   csp.prop_type = (Prop_Type);                  \
6252   Dynarr_add (the_codesys_prop_dynarr, csp);    \
6253 } while (0)
6254
6255   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
6256   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
6257   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
6258   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
6259   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
6260   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
6261   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
6262 #ifdef MULE
6263   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6264   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6265   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6266   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6267   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6268   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6269   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6270   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6271   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6272   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6273   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6274   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6275   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6276   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6277   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6278   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6279   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6280
6281   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
6282   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
6283 #endif /* MULE */
6284   /* Need to create this here or we're really screwed. */
6285   Fmake_coding_system
6286     (Qraw_text, Qno_conversion,
6287      build_string ("Raw text, which means it converts only line-break-codes."),
6288      list2 (Qmnemonic, build_string ("Raw")));
6289
6290   Fmake_coding_system
6291     (Qbinary, Qno_conversion,
6292      build_string ("Binary, which means it does not convert anything."),
6293      list4 (Qeol_type, Qlf,
6294             Qmnemonic, build_string ("Binary")));
6295
6296 #ifdef UTF2000
6297   Fmake_coding_system
6298     (Qutf8, Qutf8,
6299      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6300      list2 (Qmnemonic, build_string ("UTF8")));
6301 #endif
6302
6303   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6304
6305   /* Need this for bootstrapping */
6306   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6307     Fget_coding_system (Qraw_text);
6308
6309 #ifdef UTF2000
6310   coding_category_system[CODING_CATEGORY_UTF8]
6311    = Fget_coding_system (Qutf8);
6312 #endif
6313
6314 #ifdef MULE
6315   {
6316     unsigned int i;
6317
6318     for (i = 0; i < 65536; i++)
6319       ucs_to_mule_table[i] = Qnil;
6320   }
6321   staticpro (&mule_to_ucs_table);
6322   mule_to_ucs_table = Fmake_char_table(Qgeneric);
6323 #endif /* MULE */
6324 }