src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28 #include "buffer.h"
  29 #include "elhash.h"
  30 #include "insdel.h"
  31 #include "lstream.h"
  32 #ifdef MULE
  33 #include "mule-ccl.h"
  34 #include "chartab.h"
  35 #endif
  36 #include "file-coding.h"
  37
  38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  39
  40 Lisp_Object Vkeyboard_coding_system;
  41 Lisp_Object Vterminal_coding_system;
  42 Lisp_Object Vcoding_system_for_read;
  43 Lisp_Object Vcoding_system_for_write;
  44 Lisp_Object Vfile_name_coding_system;
  45
  46 /* Table of symbols identifying each coding category. */
  47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  48
  49 /* Coding system currently associated with each coding category. */
  50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  51
  52 /* Table of all coding categories in decreasing order of priority.
  53    This describes a permutation of the possible coding categories. */
  54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  55
  56 Lisp_Object Qcoding_system_p;
  57
  58 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  59 /* Qinternal in general.c */
  60
  61 Lisp_Object Qmnemonic, Qeol_type;
  62 Lisp_Object Qcr, Qcrlf, Qlf;
  63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  64 Lisp_Object Qpost_read_conversion;
  65 Lisp_Object Qpre_write_conversion;
  66
  67 #ifdef MULE
  68 Lisp_Object Qucs4, Qutf8;
  69 Lisp_Object Qbig5, Qshift_jis;
  70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  73 Lisp_Object Qno_iso6429;
  74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  75 Lisp_Object Qctext, Qescape_quoted;
  76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  77 #endif
  78 Lisp_Object Qencode, Qdecode;
  79
  80 Lisp_Object Vcoding_system_hash_table;
  81
  82 int enable_multibyte_characters;
  83
  84 #ifdef MULE
  85 /* Additional information used by the ISO2022 decoder and detector. */
  86 struct iso2022_decoder
  87 {
  88   /* CHARSET holds the character sets currently assigned to the G0
  89      through G3 variables.  It is initialized from the array
  90      INITIAL_CHARSET in CODESYS. */
  91   Lisp_Object charset[4];
  92
  93   /* Which registers are currently invoked into the left (GL) and
  94      right (GR) halves of the 8-bit encoding space? */
  95   int register_left, register_right;
  96
  97   /* ISO_ESC holds a value indicating part of an escape sequence
  98      that has already been seen. */
  99   enum iso_esc_flag esc;
 100
 101   /* This records the bytes we've seen so far in an escape sequence,
 102      in case the sequence is invalid (we spit out the bytes unchanged). */
 103   unsigned char esc_bytes[8];
 104
 105   /* Index for next byte to store in ISO escape sequence. */
 106   int esc_bytes_index;
 107
 108 #ifdef ENABLE_COMPOSITE_CHARS
 109   /* Stuff seen so far when composing a string. */
 110   unsigned_char_dynarr *composite_chars;
 111 #endif
 112
 113   /* If we saw an invalid designation sequence for a particular
 114      register, we flag it here and switch to ASCII.  The next time we
 115      see a valid designation for this register, we turn off the flag
 116      and do the designation normally, but pretend the sequence was
 117      invalid.  The effect of all this is that (most of the time) the
 118      escape sequences for both the switch to the unknown charset, and
 119      the switch back to the known charset, get inserted literally into
 120      the buffer and saved out as such.  The hope is that we can
 121      preserve the escape sequences so that the resulting written out
 122      file makes sense.  If we don't do any of this, the designation
 123      to the invalid charset will be preserved but that switch back
 124      to the known charset will probably get eaten because it was
 125      the same charset that was already present in the register. */
 126   unsigned char invalid_designated[4];
 127
 128   /* We try to do similar things as above for direction-switching
 129      sequences.  If we encountered a direction switch while an
 130      invalid designation was present, or an invalid designation
 131      just after a direction switch (i.e. no valid designation
 132      encountered yet), we insert the direction-switch escape
 133      sequence literally into the output stream, and later on
 134      insert the corresponding direction-restoring escape sequence
 135      literally also. */
 136   unsigned int switched_dir_and_no_valid_charset_yet :1;
 137   unsigned int invalid_switch_dir :1;
 138
 139   /* Tells the decoder to output the escape sequence literally
 140      even though it was valid.  Used in the games we play to
 141      avoid lossage when we encounter invalid designations. */
 142   unsigned int output_literally :1;
 143   /* We encountered a direction switch followed by an invalid
 144      designation.  We didn't output the direction switch
 145      literally because we didn't know about the invalid designation;
 146      but we have to do so now. */
 147   unsigned int output_direction_sequence :1;
 148 };
 149 #endif /* MULE */
 150 EXFUN (Fcopy_coding_system, 2);
 151 #ifdef MULE
 152 struct detection_state;
 153 static int detect_coding_sjis (struct detection_state *st,
 154                                CONST unsigned char *src,
 155                                unsigned int n);
 156 static void decode_coding_sjis (Lstream *decoding,
 157                                 CONST unsigned char *src,
 158                                 unsigned_char_dynarr *dst,
 159                                 unsigned int n);
 160 static void encode_coding_sjis (Lstream *encoding,
 161                                 CONST unsigned char *src,
 162                                 unsigned_char_dynarr *dst,
 163                                 unsigned int n);
 164 static int detect_coding_big5 (struct detection_state *st,
 165                                CONST unsigned char *src,
 166                                unsigned int n);
 167 static void decode_coding_big5 (Lstream *decoding,
 168                                 CONST unsigned char *src,
 169                                 unsigned_char_dynarr *dst, unsigned int n);
 170 static void encode_coding_big5 (Lstream *encoding,
 171                                 CONST unsigned char *src,
 172                                 unsigned_char_dynarr *dst, unsigned int n);
 173 static int detect_coding_ucs4 (struct detection_state *st,
 174                                CONST unsigned char *src,
 175                                unsigned int n);
 176 static void decode_coding_ucs4 (Lstream *decoding,
 177                                 CONST unsigned char *src,
 178                                 unsigned_char_dynarr *dst, unsigned int n);
 179 static void encode_coding_ucs4 (Lstream *encoding,
 180                                 CONST unsigned char *src,
 181                                 unsigned_char_dynarr *dst, unsigned int n);
 182 static int detect_coding_utf8 (struct detection_state *st,
 183                                CONST unsigned char *src,
 184                                unsigned int n);
 185 static void decode_coding_utf8 (Lstream *decoding,
 186                                 CONST unsigned char *src,
 187                                 unsigned_char_dynarr *dst, unsigned int n);
 188 static void encode_coding_utf8 (Lstream *encoding,
 189                                 CONST unsigned char *src,
 190                                 unsigned_char_dynarr *dst, unsigned int n);
 191 static int postprocess_iso2022_mask (int mask);
 192 static void reset_iso2022 (Lisp_Object coding_system,
 193                            struct iso2022_decoder *iso);
 194 static int detect_coding_iso2022 (struct detection_state *st,
 195                                   CONST unsigned char *src,
 196                                   unsigned int n);
 197 static void decode_coding_iso2022 (Lstream *decoding,
 198                                    CONST unsigned char *src,
 199                                    unsigned_char_dynarr *dst, unsigned int n);
 200 static void encode_coding_iso2022 (Lstream *encoding,
 201                                    CONST unsigned char *src,
 202                                    unsigned_char_dynarr *dst, unsigned int n);
 203 #endif /* MULE */
 204 static void decode_coding_no_conversion (Lstream *decoding,
 205                                          CONST unsigned char *src,
 206                                          unsigned_char_dynarr *dst,
 207                                          unsigned int n);
 208 static void encode_coding_no_conversion (Lstream *encoding,
 209                                          CONST unsigned char *src,
 210                                          unsigned_char_dynarr *dst,
 211                                          unsigned int n);
 212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 213                          unsigned_char_dynarr *dst, unsigned int n);
 214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 215                          unsigned_char_dynarr *dst, unsigned int n);
 216
 217 typedef struct codesys_prop codesys_prop;
 218 struct codesys_prop
 219 {
 220   Lisp_Object sym;
 221   int prop_type;
 222 };
 223
 224 typedef struct
 225 {
 226   Dynarr_declare (codesys_prop);
 227 } codesys_prop_dynarr;
 228
 229 codesys_prop_dynarr *the_codesys_prop_dynarr;
 230
 231 enum codesys_prop_enum
 232 {
 233   CODESYS_PROP_ALL_OK,
 234   CODESYS_PROP_ISO2022,
 235   CODESYS_PROP_CCL
 236 };
 237
 238 \f
 239 /************************************************************************/
 240 /*                       Coding system functions                        */
 241 /************************************************************************/
 242
 243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 245 static void finalize_coding_system (void *header, int for_disksave);
 246
 247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 248                                mark_coding_system, print_coding_system,
 249                                finalize_coding_system,
 250                                0, 0, struct Lisp_Coding_System);
 251
 252 static Lisp_Object
 253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 254 {
 255   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 256
 257   markobj (CODING_SYSTEM_NAME (codesys));
 258   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 259   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 260   markobj (CODING_SYSTEM_EOL_LF (codesys));
 261   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 262   markobj (CODING_SYSTEM_EOL_CR (codesys));
 263
 264   switch (CODING_SYSTEM_TYPE (codesys))
 265     {
 266 #ifdef MULE
 267       int i;
 268     case CODESYS_ISO2022:
 269       for (i = 0; i < 4; i++)
 270         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 271       if (codesys->iso2022.input_conv)
 272         {
 273           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 274             {
 275               struct charset_conversion_spec *ccs =
 276                 Dynarr_atp (codesys->iso2022.input_conv, i);
 277               markobj (ccs->from_charset);
 278               markobj (ccs->to_charset);
 279             }
 280         }
 281       if (codesys->iso2022.output_conv)
 282         {
 283           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 284             {
 285               struct charset_conversion_spec *ccs =
 286                 Dynarr_atp (codesys->iso2022.output_conv, i);
 287               markobj (ccs->from_charset);
 288               markobj (ccs->to_charset);
 289             }
 290         }
 291       break;
 292
 293     case CODESYS_CCL:
 294       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 295       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 296       break;
 297 #endif /* MULE */
 298     default:
 299       break;
 300     }
 301
 302   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 303   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 304 }
 305
 306 static void
 307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 308                      int escapeflag)
 309 {
 310   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 311   if (print_readably)
 312     error ("printing unreadable object #<coding_system 0x%x>",
 313            c->header.uid);
 314
 315   write_c_string ("#<coding_system ", printcharfun);
 316   print_internal (c->name, printcharfun, 1);
 317   write_c_string (">", printcharfun);
 318 }
 319
 320 static void
 321 finalize_coding_system (void *header, int for_disksave)
 322 {
 323   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 324   /* Since coding systems never go away, this function is not
 325      necessary.  But it would be necessary if we changed things
 326      so that coding systems could go away. */
 327   if (!for_disksave) /* see comment in lstream.c */
 328     {
 329       switch (CODING_SYSTEM_TYPE (c))
 330         {
 331 #ifdef MULE
 332         case CODESYS_ISO2022:
 333           if (c->iso2022.input_conv)
 334             {
 335               Dynarr_free (c->iso2022.input_conv);
 336               c->iso2022.input_conv = 0;
 337             }
 338           if (c->iso2022.output_conv)
 339             {
 340               Dynarr_free (c->iso2022.output_conv);
 341               c->iso2022.output_conv = 0;
 342             }
 343           break;
 344 #endif /* MULE */
 345         default:
 346           break;
 347         }
 348     }
 349 }
 350
 351 static enum eol_type
 352 symbol_to_eol_type (Lisp_Object symbol)
 353 {
 354   CHECK_SYMBOL (symbol);
 355   if (NILP (symbol))      return EOL_AUTODETECT;
 356   if (EQ (symbol, Qlf))   return EOL_LF;
 357   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 358   if (EQ (symbol, Qcr))   return EOL_CR;
 359
 360   signal_simple_error ("Unrecognized eol type", symbol);
 361   return EOL_AUTODETECT; /* not reached */
 362 }
 363
 364 static Lisp_Object
 365 eol_type_to_symbol (enum eol_type type)
 366 {
 367   switch (type)
 368     {
 369     default: abort ();
 370     case EOL_LF:         return Qlf;
 371     case EOL_CRLF:       return Qcrlf;
 372     case EOL_CR:         return Qcr;
 373     case EOL_AUTODETECT: return Qnil;
 374     }
 375 }
 376
 377 static void
 378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 379 {
 380   Lisp_Object codesys_obj;
 381   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 382   char *codesys_name = (char *) alloca (len + 7);
 383   int mlen = -1;
 384   char *codesys_mnemonic=0;
 385
 386   Lisp_Object codesys_name_sym, sub_codesys_obj;
 387
 388   /* kludge */
 389
 390   XSETCODING_SYSTEM (codesys_obj, codesys);
 391
 392   memcpy (codesys_name,
 393           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 394
 395   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 396     {
 397       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 398       codesys_mnemonic = (char *) alloca (mlen + 7);
 399       memcpy (codesys_mnemonic,
 400               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 401     }
 402
 403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 404   strcpy (codesys_name + len, "-" op_sys);                                      \
 405   if (mlen != -1)                                                               \
 406     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 407   codesys_name_sym = intern (codesys_name);                                     \
 408   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 409   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 410   if (mlen != -1)                                                               \
 411     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 412       build_string (codesys_mnemonic);                                          \
 413   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 414 } while (0)
 415
 416   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 417   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 418   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 419 }
 420
 421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 422 Return t if OBJECT is a coding system.
 423 A coding system is an object that defines how text containing multiple
 424 character sets is encoded into a stream of (typically 8-bit) bytes.
 425 The coding system is used to decode the stream into a series of
 426 characters (which may be from multiple charsets) when the text is read
 427 from a file or process, and is used to encode the text back into the
 428 same format when it is written out to a file or process.
 429
 430 For example, many ISO2022-compliant coding systems (such as Compound
 431 Text, which is used for inter-client data under the X Window System)
 432 use escape sequences to switch between different charsets -- Japanese
 433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 435 `make-coding-system' for more information.
 436
 437 Coding systems are normally identified using a symbol, and the
 438 symbol is accepted in place of the actual coding system object whenever
 439 a coding system is called for. (This is similar to how faces work.)
 440 */
 441        (object))
 442 {
 443   return CODING_SYSTEMP (object) ? Qt : Qnil;
 444 }
 445
 446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 447 Retrieve the coding system of the given name.
 448
 449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 450 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 451 If there is no such coding system, nil is returned.  Otherwise the
 452 associated coding system object is returned.
 453 */
 454        (coding_system_or_name))
 455 {
 456   if (CODING_SYSTEMP (coding_system_or_name))
 457     return coding_system_or_name;
 458
 459   if (NILP (coding_system_or_name))
 460     coding_system_or_name = Qbinary;
 461   else
 462     CHECK_SYMBOL (coding_system_or_name);
 463
 464   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 465 }
 466
 467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 468 Retrieve the coding system of the given name.
 469 Same as `find-coding-system' except that if there is no such
 470 coding system, an error is signaled instead of returning nil.
 471 */
 472        (name))
 473 {
 474   Lisp_Object coding_system = Ffind_coding_system (name);
 475
 476   if (NILP (coding_system))
 477     signal_simple_error ("No such coding system", name);
 478   return coding_system;
 479 }
 480
 481 /* We store the coding systems in hash tables with the names as the key and the
 482    actual coding system object as the value.  Occasionally we need to use them
 483    in a list format.  These routines provide us with that. */
 484 struct coding_system_list_closure
 485 {
 486   Lisp_Object *coding_system_list;
 487 };
 488
 489 static int
 490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 491                                   void *coding_system_list_closure)
 492 {
 493   /* This function can GC */
 494   struct coding_system_list_closure *cscl =
 495     (struct coding_system_list_closure *) coding_system_list_closure;
 496   Lisp_Object *coding_system_list = cscl->coding_system_list;
 497
 498   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 499                                *coding_system_list);
 500   return 0;
 501 }
 502
 503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 504 Return a list of the names of all defined coding systems.
 505 */
 506        ())
 507 {
 508   Lisp_Object coding_system_list = Qnil;
 509   struct gcpro gcpro1;
 510   struct coding_system_list_closure coding_system_list_closure;
 511
 512   GCPRO1 (coding_system_list);
 513   coding_system_list_closure.coding_system_list = &coding_system_list;
 514   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 515                  &coding_system_list_closure);
 516   UNGCPRO;
 517
 518   return coding_system_list;
 519 }
 520
 521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 522 Return the name of the given coding system.
 523 */
 524        (coding_system))
 525 {
 526   coding_system = Fget_coding_system (coding_system);
 527   return XCODING_SYSTEM_NAME (coding_system);
 528 }
 529
 530 static Lisp_Coding_System *
 531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 532 {
 533   Lisp_Coding_System *codesys =
 534     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 535
 536   zero_lcrecord (codesys);
 537   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 538   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 539   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 540   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 541   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 542   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 543   CODING_SYSTEM_TYPE     (codesys) = type;
 544   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 545 #ifdef MULE
 546   if (type == CODESYS_ISO2022)
 547     {
 548       int i;
 549       for (i = 0; i < 4; i++)
 550         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 551     }
 552   else if (type == CODESYS_CCL)
 553     {
 554       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 555       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 556     }
 557 #endif /* MULE */
 558   CODING_SYSTEM_NAME (codesys) = name;
 559
 560   return codesys;
 561 }
 562
 563 #ifdef MULE
 564 /* Given a list of charset conversion specs as specified in a Lisp
 565    program, parse it into STORE_HERE. */
 566
 567 static void
 568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 569                                 Lisp_Object spec_list)
 570 {
 571   Lisp_Object rest;
 572
 573   EXTERNAL_LIST_LOOP (rest, spec_list)
 574     {
 575       Lisp_Object car = XCAR (rest);
 576       Lisp_Object from, to;
 577       struct charset_conversion_spec spec;
 578
 579       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 580         signal_simple_error ("Invalid charset conversion spec", car);
 581       from = Fget_charset (XCAR (car));
 582       to = Fget_charset (XCAR (XCDR (car)));
 583       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 584         signal_simple_error_2
 585           ("Attempted conversion between different charset types",
 586            from, to);
 587       spec.from_charset = from;
 588       spec.to_charset = to;
 589
 590       Dynarr_add (store_here, spec);
 591     }
 592 }
 593
 594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 595    specs, return the equivalent as the Lisp programmer would see it.
 596
 597    If LOAD_HERE is 0, return Qnil. */
 598
 599 static Lisp_Object
 600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 601 {
 602   int i;
 603   Lisp_Object result;
 604
 605   if (!load_here)
 606     return Qnil;
 607   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 608     {
 609       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 610       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 611     }
 612
 613   return Fnreverse (result);
 614 }
 615
 616 #endif /* MULE */
 617
 618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 619 Register symbol NAME as a coding system.
 620
 621 TYPE describes the conversion method used and should be one of
 622
 623 nil or 'undecided
 624      Automatic conversion.  XEmacs attempts to detect the coding system
 625      used in the file.
 626 'no-conversion
 627      No conversion.  Use this for binary files and such.  On output,
 628      graphic characters that are not in ASCII or Latin-1 will be
 629      replaced by a ?. (For a no-conversion-encoded buffer, these
 630      characters will only be present if you explicitly insert them.)
 631 'shift-jis
 632      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 633 'ucs-4
 634      ISO 10646 UCS-4 encoding.
 635 'utf-8
 636      ISO 10646 UTF-8 encoding.
 637 'iso2022
 638      Any ISO2022-compliant encoding.  Among other things, this includes
 639      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 640      standard Unix encoding for Japanese and other languages), and
 641      Compound Text (the encoding used in X11).  You can specify more
 642      specific information about the conversion with the FLAGS argument.
 643 'big5
 644      Big5 (the encoding commonly used for Taiwanese).
 645 'ccl
 646      The conversion is performed using a user-written pseudo-code
 647      program.  CCL (Code Conversion Language) is the name of this
 648      pseudo-code.
 649 'internal
 650      Write out or read in the raw contents of the memory representing
 651      the buffer's text.  This is primarily useful for debugging
 652      purposes, and is only enabled when XEmacs has been compiled with
 653      DEBUG_XEMACS defined (via the --debug configure option).
 654      WARNING: Reading in a file using 'internal conversion can result
 655      in an internal inconsistency in the memory representing a
 656      buffer's text, which will produce unpredictable results and may
 657      cause XEmacs to crash.  Under normal circumstances you should
 658      never use 'internal conversion.
 659
 660 DOC-STRING is a string describing the coding system.
 661
 662 PROPS is a property list, describing the specific nature of the
 663 character set.  Recognized properties are:
 664
 665 'mnemonic
 666      String to be displayed in the modeline when this coding system is
 667      active.
 668
 669 'eol-type
 670      End-of-line conversion to be used.  It should be one of
 671
 672         nil
 673                 Automatically detect the end-of-line type (LF, CRLF,
 674                 or CR).  Also generate subsidiary coding systems named
 675                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 676                 identical to this coding system but have an EOL-TYPE
 677                 value of 'lf, 'crlf, and 'cr, respectively.
 678         'lf
 679                 The end of a line is marked externally using ASCII LF.
 680                 Since this is also the way that XEmacs represents an
 681                 end-of-line internally, specifying this option results
 682                 in no end-of-line conversion.  This is the standard
 683                 format for Unix text files.
 684         'crlf
 685                 The end of a line is marked externally using ASCII
 686                 CRLF.  This is the standard format for MS-DOS text
 687                 files.
 688         'cr
 689                 The end of a line is marked externally using ASCII CR.
 690                 This is the standard format for Macintosh text files.
 691         t
 692                 Automatically detect the end-of-line type but do not
 693                 generate subsidiary coding systems.  (This value is
 694                 converted to nil when stored internally, and
 695                 `coding-system-property' will return nil.)
 696
 697 'post-read-conversion
 698      Function called after a file has been read in, to perform the
 699      decoding.  Called with two arguments, BEG and END, denoting
 700      a region of the current buffer to be decoded.
 701
 702 'pre-write-conversion
 703      Function called before a file is written out, to perform the
 704      encoding.  Called with two arguments, BEG and END, denoting
 705      a region of the current buffer to be encoded.
 706
 707
 708 The following additional properties are recognized if TYPE is 'iso2022:
 709
 710 'charset-g0
 711 'charset-g1
 712 'charset-g2
 713 'charset-g3
 714      The character set initially designated to the G0 - G3 registers.
 715      The value should be one of
 716
 717           -- A charset object (designate that character set)
 718           -- nil (do not ever use this register)
 719           -- t (no character set is initially designated to
 720                 the register, but may be later on; this automatically
 721                 sets the corresponding `force-g*-on-output' property)
 722
 723 'force-g0-on-output
 724 'force-g1-on-output
 725 'force-g2-on-output
 726 'force-g2-on-output
 727      If non-nil, send an explicit designation sequence on output before
 728      using the specified register.
 729
 730 'short
 731      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 732      "ESC $ B" on output in place of the full designation sequences
 733      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 734
 735 'no-ascii-eol
 736      If non-nil, don't designate ASCII to G0 at each end of line on output.
 737      Setting this to non-nil also suppresses other state-resetting that
 738      normally happens at the end of a line.
 739
 740 'no-ascii-cntl
 741      If non-nil, don't designate ASCII to G0 before control chars on output.
 742
 743 'seven
 744      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 745      environment.
 746
 747 'lock-shift
 748      If non-nil, use locking-shift (SO/SI) instead of single-shift
 749      or designation by escape sequence.
 750
 751 'no-iso6429
 752      If non-nil, don't use ISO6429's direction specification.
 753
 754 'escape-quoted
 755      If non-nil, literal control characters that are the same as
 756      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 757      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 758      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 759      so that they can be properly distinguished from an escape sequence.
 760      (Note that doing this results in a non-portable encoding.) This
 761      encoding flag is used for byte-compiled files.  Note that ESC
 762      is a good choice for a quoting character because there are no
 763      escape sequences whose second byte is a character from the Control-0
 764      or Control-1 character sets; this is explicitly disallowed by the
 765      ISO2022 standard.
 766
 767 'input-charset-conversion
 768      A list of conversion specifications, specifying conversion of
 769      characters in one charset to another when decoding is performed.
 770      Each specification is a list of two elements: the source charset,
 771      and the destination charset.
 772
 773 'output-charset-conversion
 774      A list of conversion specifications, specifying conversion of
 775      characters in one charset to another when encoding is performed.
 776      The form of each specification is the same as for
 777      'input-charset-conversion.
 778
 779
 780 The following additional properties are recognized (and required)
 781 if TYPE is 'ccl:
 782
 783 'decode
 784      CCL program used for decoding (converting to internal format).
 785
 786 'encode
 787      CCL program used for encoding (converting to external format).
 788 */
 789        (name, type, doc_string, props))
 790 {
 791   Lisp_Coding_System *codesys;
 792   Lisp_Object rest, key, value;
 793   enum coding_system_type ty;
 794   int need_to_setup_eol_systems = 1;
 795
 796   /* Convert type to constant */
 797   if (NILP (type) || EQ (type, Qundecided))
 798                                       { ty = CODESYS_AUTODETECT; }
 799 #ifdef MULE
 800   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 801   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 802   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 803   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 804   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 805   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 806 #endif
 807   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 808 #ifdef DEBUG_XEMACS
 809   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 810 #endif
 811   else
 812     signal_simple_error ("Invalid coding system type", type);
 813
 814   CHECK_SYMBOL (name);
 815
 816   codesys = allocate_coding_system (ty, name);
 817
 818   if (NILP (doc_string))
 819     doc_string = build_string ("");
 820   else
 821     CHECK_STRING (doc_string);
 822   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 823
 824 #ifdef UTF2000
 825   if (ty == CODESYS_NO_CONVERSION)
 826     codesys->fixed.size = 1;
 827 #endif
 828   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 829     {
 830       if (EQ (key, Qmnemonic))
 831         {
 832           if (!NILP (value))
 833             CHECK_STRING (value);
 834           CODING_SYSTEM_MNEMONIC (codesys) = value;
 835         }
 836
 837       else if (EQ (key, Qeol_type))
 838         {
 839           need_to_setup_eol_systems = NILP (value);
 840           if (EQ (value, Qt))
 841             value = Qnil;
 842           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 843         }
 844
 845       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 846       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 847 #ifdef MULE
 848       else if (ty == CODESYS_ISO2022)
 849         {
 850 #define FROB_INITIAL_CHARSET(charset_num) \
 851   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 852     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 853
 854           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 855           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 856           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 857           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 858
 859 #define FROB_FORCE_CHARSET(charset_num) \
 860   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 861
 862           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 863           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 864           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 865           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 866
 867 #define FROB_BOOLEAN_PROPERTY(prop) \
 868   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 869
 870           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 871           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 872           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 873           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 874           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 875           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 876           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 877
 878           else if (EQ (key, Qinput_charset_conversion))
 879             {
 880               codesys->iso2022.input_conv =
 881                 Dynarr_new (charset_conversion_spec);
 882               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 883                                               value);
 884             }
 885           else if (EQ (key, Qoutput_charset_conversion))
 886             {
 887               codesys->iso2022.output_conv =
 888                 Dynarr_new (charset_conversion_spec);
 889               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 890                                               value);
 891             }
 892           else
 893             signal_simple_error ("Unrecognized property", key);
 894         }
 895       else if (EQ (type, Qccl))
 896         {
 897           if (EQ (key, Qdecode))
 898             {
 899               CHECK_VECTOR (value);
 900               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 901             }
 902           else if (EQ (key, Qencode))
 903             {
 904               CHECK_VECTOR (value);
 905               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 906             }
 907           else
 908             signal_simple_error ("Unrecognized property", key);
 909         }
 910 #endif /* MULE */
 911       else
 912         signal_simple_error ("Unrecognized property", key);
 913     }
 914
 915   if (need_to_setup_eol_systems)
 916     setup_eol_coding_systems (codesys);
 917
 918   {
 919     Lisp_Object codesys_obj;
 920     XSETCODING_SYSTEM (codesys_obj, codesys);
 921     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 922     return codesys_obj;
 923   }
 924 }
 925
 926 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 927 Copy OLD-CODING-SYSTEM to NEW-NAME.
 928 If NEW-NAME does not name an existing coding system, a new one will
 929 be created.
 930 */
 931        (old_coding_system, new_name))
 932 {
 933   Lisp_Object new_coding_system;
 934   old_coding_system = Fget_coding_system (old_coding_system);
 935   new_coding_system = Ffind_coding_system (new_name);
 936   if (NILP (new_coding_system))
 937     {
 938       XSETCODING_SYSTEM (new_coding_system,
 939                          allocate_coding_system
 940                          (XCODING_SYSTEM_TYPE (old_coding_system),
 941                           new_name));
 942       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 943     }
 944
 945   {
 946     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 947     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 948     memcpy (((char *) to  ) + sizeof (to->header),
 949             ((char *) from) + sizeof (from->header),
 950             sizeof (*from) - sizeof (from->header));
 951     to->name = new_name;
 952   }
 953   return new_coding_system;
 954 }
 955
 956 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
 957 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
 958 */
 959        (alias, coding_system))
 960 {
 961   CHECK_SYMBOL (alias);
 962   if (!NILP (Ffind_coding_system (alias)))
 963     signal_simple_error ("Symbol already names a coding system", alias);
 964   coding_system = Fget_coding_system (coding_system);
 965   Fputhash (alias, coding_system, Vcoding_system_hash_table);
 966
 967   /* Set up aliases for subsidiaries. */
 968   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
 969     {
 970       Lisp_Object str;
 971       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
 972 #define FROB(type, name)                                                        \
 973       do {                                                                      \
 974         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
 975         if (!NILP (subsidiary))                                                 \
 976           Fdefine_coding_system_alias                                           \
 977             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
 978       } while (0)
 979       FROB (LF,   "-unix");
 980       FROB (CRLF, "-dos");
 981       FROB (CR,   "-mac");
 982 #undef FROB
 983     }
 984   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
 985      but it doesn't look intentional, so I'd rather return something
 986      meaningful or nothing at all. */
 987   return Qnil;
 988 }
 989
 990 static Lisp_Object
 991 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
 992 {
 993   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
 994   Lisp_Object new_coding_system;
 995
 996   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
 997     return coding_system;
 998
 999   switch (type)
1000     {
1001     case EOL_AUTODETECT: return coding_system;
1002     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1003     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1004     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1005     default:       abort ();
1006     }
1007
1008   return NILP (new_coding_system) ? coding_system : new_coding_system;
1009 }
1010
1011 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1012 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1013 */
1014        (coding_system, eol_type))
1015 {
1016   coding_system = Fget_coding_system (coding_system);
1017
1018   return subsidiary_coding_system (coding_system,
1019                                    symbol_to_eol_type (eol_type));
1020 }
1021
1022 \f
1023 /************************************************************************/
1024 /*                         Coding system accessors                      */
1025 /************************************************************************/
1026
1027 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1028 Return the doc string for CODING-SYSTEM.
1029 */
1030        (coding_system))
1031 {
1032   coding_system = Fget_coding_system (coding_system);
1033   return XCODING_SYSTEM_DOC_STRING (coding_system);
1034 }
1035
1036 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1037 Return the type of CODING-SYSTEM.
1038 */
1039        (coding_system))
1040 {
1041   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1042     {
1043     default: abort ();
1044     case CODESYS_AUTODETECT:    return Qundecided;
1045 #ifdef MULE
1046     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1047     case CODESYS_ISO2022:       return Qiso2022;
1048     case CODESYS_BIG5:          return Qbig5;
1049     case CODESYS_UCS4:          return Qucs4;
1050     case CODESYS_UTF8:          return Qutf8;
1051     case CODESYS_CCL:           return Qccl;
1052 #endif
1053     case CODESYS_NO_CONVERSION: return Qno_conversion;
1054 #ifdef DEBUG_XEMACS
1055     case CODESYS_INTERNAL:      return Qinternal;
1056 #endif
1057     }
1058 }
1059
1060 #ifdef MULE
1061 static
1062 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1063 {
1064   Lisp_Object cs
1065     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1066
1067   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1068 }
1069
1070 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1071 Return initial charset of CODING-SYSTEM designated to GNUM.
1072 GNUM allows 0 .. 3.
1073 */
1074        (coding_system, gnum))
1075 {
1076   coding_system = Fget_coding_system (coding_system);
1077   CHECK_INT (gnum);
1078
1079   return coding_system_charset (coding_system, XINT (gnum));
1080 }
1081 #endif /* MULE */
1082
1083 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1084 Return the PROP property of CODING-SYSTEM.
1085 */
1086        (coding_system, prop))
1087 {
1088   int i, ok = 0;
1089   enum coding_system_type type;
1090
1091   coding_system = Fget_coding_system (coding_system);
1092   CHECK_SYMBOL (prop);
1093   type = XCODING_SYSTEM_TYPE (coding_system);
1094
1095   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1096     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1097       {
1098         ok = 1;
1099         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1100           {
1101           case CODESYS_PROP_ALL_OK:
1102             break;
1103 #ifdef MULE
1104           case CODESYS_PROP_ISO2022:
1105             if (type != CODESYS_ISO2022)
1106               signal_simple_error
1107                 ("Property only valid in ISO2022 coding systems",
1108                  prop);
1109             break;
1110
1111           case CODESYS_PROP_CCL:
1112             if (type != CODESYS_CCL)
1113               signal_simple_error
1114                 ("Property only valid in CCL coding systems",
1115                  prop);
1116             break;
1117 #endif /* MULE */
1118           default:
1119             abort ();
1120           }
1121       }
1122
1123   if (!ok)
1124     signal_simple_error ("Unrecognized property", prop);
1125
1126   if (EQ (prop, Qname))
1127     return XCODING_SYSTEM_NAME (coding_system);
1128   else if (EQ (prop, Qtype))
1129     return Fcoding_system_type (coding_system);
1130   else if (EQ (prop, Qdoc_string))
1131     return XCODING_SYSTEM_DOC_STRING (coding_system);
1132   else if (EQ (prop, Qmnemonic))
1133     return XCODING_SYSTEM_MNEMONIC (coding_system);
1134   else if (EQ (prop, Qeol_type))
1135     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1136   else if (EQ (prop, Qeol_lf))
1137     return XCODING_SYSTEM_EOL_LF (coding_system);
1138   else if (EQ (prop, Qeol_crlf))
1139     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1140   else if (EQ (prop, Qeol_cr))
1141     return XCODING_SYSTEM_EOL_CR (coding_system);
1142   else if (EQ (prop, Qpost_read_conversion))
1143     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1144   else if (EQ (prop, Qpre_write_conversion))
1145     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1146 #ifdef MULE
1147   else if (type == CODESYS_ISO2022)
1148     {
1149       if (EQ (prop, Qcharset_g0))
1150         return coding_system_charset (coding_system, 0);
1151       else if (EQ (prop, Qcharset_g1))
1152         return coding_system_charset (coding_system, 1);
1153       else if (EQ (prop, Qcharset_g2))
1154         return coding_system_charset (coding_system, 2);
1155       else if (EQ (prop, Qcharset_g3))
1156         return coding_system_charset (coding_system, 3);
1157
1158 #define FORCE_CHARSET(charset_num) \
1159   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1160    (coding_system, charset_num) ? Qt : Qnil)
1161
1162       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1163       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1164       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1165       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1166
1167 #define LISP_BOOLEAN(prop) \
1168   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1169
1170       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1171       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1172       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1173       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1174       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1175       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1176       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1177
1178       else if (EQ (prop, Qinput_charset_conversion))
1179         return
1180           unparse_charset_conversion_specs
1181             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1182       else if (EQ (prop, Qoutput_charset_conversion))
1183         return
1184           unparse_charset_conversion_specs
1185             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1186       else
1187         abort ();
1188     }
1189   else if (type == CODESYS_CCL)
1190     {
1191       if (EQ (prop, Qdecode))
1192         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1193       else if (EQ (prop, Qencode))
1194         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1195       else
1196         abort ();
1197     }
1198 #endif /* MULE */
1199   else
1200     abort ();
1201
1202   return Qnil; /* not reached */
1203 }
1204
1205 \f
1206 /************************************************************************/
1207 /*                       Coding category functions                      */
1208 /************************************************************************/
1209
1210 static int
1211 decode_coding_category (Lisp_Object symbol)
1212 {
1213   int i;
1214
1215   CHECK_SYMBOL (symbol);
1216   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1217     if (EQ (coding_category_symbol[i], symbol))
1218       return i;
1219
1220   signal_simple_error ("Unrecognized coding category", symbol);
1221   return 0; /* not reached */
1222 }
1223
1224 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1225 Return a list of all recognized coding categories.
1226 */
1227        ())
1228 {
1229   int i;
1230   Lisp_Object list = Qnil;
1231
1232   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1233     list = Fcons (coding_category_symbol[i], list);
1234   return list;
1235 }
1236
1237 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1238 Change the priority order of the coding categories.
1239 LIST should be list of coding categories, in descending order of
1240 priority.  Unspecified coding categories will be lower in priority
1241 than all specified ones, in the same relative order they were in
1242 previously.
1243 */
1244        (list))
1245 {
1246   int category_to_priority[CODING_CATEGORY_LAST + 1];
1247   int i, j;
1248   Lisp_Object rest;
1249
1250   /* First generate a list that maps coding categories to priorities. */
1251
1252   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1253     category_to_priority[i] = -1;
1254
1255   /* Highest priority comes from the specified list. */
1256   i = 0;
1257   EXTERNAL_LIST_LOOP (rest, list)
1258     {
1259       int cat = decode_coding_category (XCAR (rest));
1260
1261       if (category_to_priority[cat] >= 0)
1262         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1263       category_to_priority[cat] = i++;
1264     }
1265
1266   /* Now go through the existing categories by priority to retrieve
1267      the categories not yet specified and preserve their priority
1268      order. */
1269   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1270     {
1271       int cat = coding_category_by_priority[j];
1272       if (category_to_priority[cat] < 0)
1273         category_to_priority[cat] = i++;
1274     }
1275
1276   /* Now we need to construct the inverse of the mapping we just
1277      constructed. */
1278
1279   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1280     coding_category_by_priority[category_to_priority[i]] = i;
1281
1282   /* Phew!  That was confusing. */
1283   return Qnil;
1284 }
1285
1286 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1287 Return a list of coding categories in descending order of priority.
1288 */
1289        ())
1290 {
1291   int i;
1292   Lisp_Object list = Qnil;
1293
1294   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1295     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1296                   list);
1297   return list;
1298 }
1299
1300 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1301 Change the coding system associated with a coding category.
1302 */
1303        (coding_category, coding_system))
1304 {
1305   int cat = decode_coding_category (coding_category);
1306
1307   coding_system = Fget_coding_system (coding_system);
1308   coding_category_system[cat] = coding_system;
1309   return Qnil;
1310 }
1311
1312 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1313 Return the coding system associated with a coding category.
1314 */
1315        (coding_category))
1316 {
1317   int cat = decode_coding_category (coding_category);
1318   Lisp_Object sys = coding_category_system[cat];
1319
1320   if (!NILP (sys))
1321     return XCODING_SYSTEM_NAME (sys);
1322   return Qnil;
1323 }
1324
1325 \f
1326 /************************************************************************/
1327 /*                     Detecting the encoding of data                   */
1328 /************************************************************************/
1329
1330 struct detection_state
1331 {
1332   enum eol_type eol_type;
1333   int seen_non_ascii;
1334   int mask;
1335 #ifdef MULE
1336   struct
1337     {
1338       int mask;
1339       int in_second_byte;
1340     }
1341   big5;
1342
1343   struct
1344     {
1345       int mask;
1346       int in_second_byte;
1347     }
1348   shift_jis;
1349
1350   struct
1351     {
1352       int mask;
1353       int in_byte;
1354   }
1355   ucs4;
1356
1357   struct
1358     {
1359       int mask;
1360       int in_byte;
1361     }
1362   utf8;
1363
1364   struct
1365     {
1366       int mask;
1367       int initted;
1368       struct iso2022_decoder iso;
1369       unsigned int flags;
1370       int high_byte_count;
1371       unsigned int saw_single_shift:1;
1372     }
1373   iso2022;
1374 #endif
1375   struct
1376     {
1377       int seen_anything;
1378       int just_saw_cr;
1379     }
1380   eol;
1381 };
1382
1383 static int
1384 acceptable_control_char_p (int c)
1385 {
1386   switch (c)
1387     {
1388       /* Allow and ignore control characters that you might
1389          reasonably see in a text file */
1390     case '\r':
1391     case '\n':
1392     case '\t':
1393     case  7: /* bell */
1394     case  8: /* backspace */
1395     case 11: /* vertical tab */
1396     case 12: /* form feed */
1397     case 26: /* MS-DOS C-z junk */
1398     case 31: /* '^_' -- for info */
1399       return 1;
1400     default:
1401       return 0;
1402     }
1403 }
1404
1405 static int
1406 mask_has_at_most_one_bit_p (int mask)
1407 {
1408   /* Perhaps the only thing useful you learn from intensive Microsoft
1409      technical interviews */
1410   return (mask & (mask - 1)) == 0;
1411 }
1412
1413 static enum eol_type
1414 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1415                  unsigned int n)
1416 {
1417   int c;
1418
1419   while (n--)
1420     {
1421       c = *src++;
1422       if (c == '\r')
1423         st->eol.just_saw_cr = 1;
1424       else
1425         {
1426           if (c == '\n')
1427             {
1428               if (st->eol.just_saw_cr)
1429                 return EOL_CRLF;
1430               else if (st->eol.seen_anything)
1431                 return EOL_LF;
1432             }
1433           else if (st->eol.just_saw_cr)
1434             return EOL_CR;
1435           st->eol.just_saw_cr = 0;
1436         }
1437       st->eol.seen_anything = 1;
1438     }
1439
1440   return EOL_AUTODETECT;
1441 }
1442
1443 /* Attempt to determine the encoding and EOL type of the given text.
1444    Before calling this function for the first type, you must initialize
1445    st->eol_type as appropriate and initialize st->mask to ~0.
1446
1447    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1448    not yet known.
1449
1450    st->mask holds the determined coding category mask, or ~0 if only
1451    ASCII has been seen so far.
1452
1453    Returns:
1454
1455    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1456         is present in st->mask
1457    1 == definitive answers are here for both st->eol_type and st->mask
1458 */
1459
1460 static int
1461 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1462                     unsigned int n, int just_do_eol)
1463 {
1464   int c;
1465
1466   if (st->eol_type == EOL_AUTODETECT)
1467     st->eol_type = detect_eol_type (st, src, n);
1468
1469   if (just_do_eol)
1470     return st->eol_type != EOL_AUTODETECT;
1471
1472   if (!st->seen_non_ascii)
1473     {
1474       for (; n; n--, src++)
1475         {
1476           c = *src;
1477           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1478             {
1479               st->seen_non_ascii = 1;
1480 #ifdef MULE
1481               st->shift_jis.mask = ~0;
1482               st->big5.mask = ~0;
1483               st->ucs4.mask = ~0;
1484               st->utf8.mask = ~0;
1485               st->iso2022.mask = ~0;
1486 #endif
1487               break;
1488             }
1489         }
1490     }
1491
1492   if (!n)
1493     return 0;
1494 #ifdef MULE
1495   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1496     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1497   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1498     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1499   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1500     st->big5.mask = detect_coding_big5 (st, src, n);
1501   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1502     st->utf8.mask = detect_coding_utf8 (st, src, n);
1503   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1504     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1505
1506   st->mask
1507     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1508     | st->utf8.mask | st->ucs4.mask;
1509 #endif
1510   {
1511     int retval = mask_has_at_most_one_bit_p (st->mask);
1512     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1513     return retval && st->eol_type != EOL_AUTODETECT;
1514   }
1515 }
1516
1517 static Lisp_Object
1518 coding_system_from_mask (int mask)
1519 {
1520   if (mask == ~0)
1521     {
1522       /* If the file was entirely or basically ASCII, use the
1523          default value of `buffer-file-coding-system'. */
1524       Lisp_Object retval =
1525         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1526       if (!NILP (retval))
1527         {
1528           retval = Ffind_coding_system (retval);
1529           if (NILP (retval))
1530             {
1531               warn_when_safe
1532                 (Qbad_variable, Qwarning,
1533                  "Invalid `default-buffer-file-coding-system', set to nil");
1534               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1535             }
1536         }
1537       if (NILP (retval))
1538         retval = Fget_coding_system (Qraw_text);
1539       return retval;
1540     }
1541   else
1542     {
1543       int i;
1544       int cat = -1;
1545 #ifdef MULE
1546       mask = postprocess_iso2022_mask (mask);
1547 #endif
1548       /* Look through the coding categories by priority and find
1549          the first one that is allowed. */
1550       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1551         {
1552           cat = coding_category_by_priority[i];
1553           if ((mask & (1 << cat)) &&
1554               !NILP (coding_category_system[cat]))
1555             break;
1556         }
1557       if (cat >= 0)
1558         return coding_category_system[cat];
1559       else
1560         return Fget_coding_system (Qraw_text);
1561     }
1562 }
1563
1564 /* Given a seekable read stream and potential coding system and EOL type
1565    as specified, do any autodetection that is called for.  If the
1566    coding system and/or EOL type are not autodetect, they will be left
1567    alone; but this function will never return an autodetect coding system
1568    or EOL type.
1569
1570    This function does not automatically fetch subsidiary coding systems;
1571    that should be unnecessary with the explicit eol-type argument. */
1572
1573 void
1574 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1575                               enum eol_type *eol_type_in_out)
1576 {
1577   struct detection_state decst;
1578
1579   if (*eol_type_in_out == EOL_AUTODETECT)
1580     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1581
1582   xzero (decst);
1583   decst.eol_type = *eol_type_in_out;
1584   decst.mask = ~0;
1585
1586   /* If autodetection is called for, do it now. */
1587   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1588       *eol_type_in_out == EOL_AUTODETECT)
1589     {
1590
1591       while (1)
1592         {
1593           unsigned char random_buffer[4096];
1594           int nread;
1595
1596           nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1597           if (!nread)
1598             break;
1599           if (detect_coding_type (&decst, random_buffer, nread,
1600                                   XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1601                                   CODESYS_AUTODETECT))
1602             break;
1603         }
1604
1605       *eol_type_in_out = decst.eol_type;
1606       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1607         *codesys_in_out = coding_system_from_mask (decst.mask);
1608     }
1609
1610   /* If we absolutely can't determine the EOL type, just assume LF. */
1611   if (*eol_type_in_out == EOL_AUTODETECT)
1612     *eol_type_in_out = EOL_LF;
1613
1614   Lstream_rewind (stream);
1615 }
1616
1617 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1618 Detect coding system of the text in the region between START and END.
1619 Returned a list of possible coding systems ordered by priority.
1620 If only ASCII characters are found, it returns 'undecided or one of
1621 its subsidiary coding systems according to a detected end-of-line
1622 type.  Optional arg BUFFER defaults to the current buffer.
1623 */
1624        (start, end, buffer))
1625 {
1626   Lisp_Object val = Qnil;
1627   struct buffer *buf = decode_buffer (buffer, 0);
1628   Bufpos b, e;
1629   Lisp_Object instream, lb_instream;
1630   Lstream *istr, *lb_istr;
1631   struct detection_state decst;
1632   struct gcpro gcpro1, gcpro2;
1633
1634   get_buffer_range_char (buf, start, end, &b, &e, 0);
1635   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1636   lb_istr = XLSTREAM (lb_instream);
1637   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1638   istr = XLSTREAM (instream);
1639   GCPRO2 (instream, lb_instream);
1640   xzero (decst);
1641   decst.eol_type = EOL_AUTODETECT;
1642   decst.mask = ~0;
1643   while (1)
1644     {
1645       unsigned char random_buffer[4096];
1646       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1647
1648       if (!nread)
1649         break;
1650       if (detect_coding_type (&decst, random_buffer, nread, 0))
1651         break;
1652     }
1653
1654   if (decst.mask == ~0)
1655     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1656                                     decst.eol_type);
1657   else
1658     {
1659       int i;
1660
1661       val = Qnil;
1662 #ifdef MULE
1663       decst.mask = postprocess_iso2022_mask (decst.mask);
1664 #endif
1665       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1666         {
1667           int sys = coding_category_by_priority[i];
1668           if (decst.mask & (1 << sys))
1669             {
1670               Lisp_Object codesys = coding_category_system[sys];
1671               if (!NILP (codesys))
1672                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1673               val = Fcons (codesys, val);
1674             }
1675         }
1676     }
1677   Lstream_close (istr);
1678   UNGCPRO;
1679   Lstream_delete (istr);
1680   Lstream_delete (lb_istr);
1681   return val;
1682 }
1683
1684 \f
1685 /************************************************************************/
1686 /*           Converting to internal Mule format ("decoding")            */
1687 /************************************************************************/
1688
1689 /* A decoding stream is a stream used for decoding text (i.e.
1690    converting from some external format to internal format).
1691    The decoding-stream object keeps track of the actual coding
1692    stream, the stream that is at the other end, and data that
1693    needs to be persistent across the lifetime of the stream. */
1694
1695 /* Handle the EOL stuff related to just-read-in character C.
1696    EOL_TYPE is the EOL type of the coding stream.
1697    FLAGS is the current value of FLAGS in the coding stream, and may
1698    be modified by this macro.  (The macro only looks at the
1699    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1700    bytes are to be written.  You need to also define a local goto
1701    label "label_continue_loop" that is at the end of the main
1702    character-reading loop.
1703
1704    If C is a CR character, then this macro handles it entirely and
1705    jumps to label_continue_loop.  Otherwise, this macro does not add
1706    anything to DST, and continues normally.  You should continue
1707    processing C normally after this macro. */
1708
1709 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1710 do {                                                            \
1711   if (c == '\r')                                                \
1712     {                                                           \
1713       if (eol_type == EOL_CR)                                   \
1714         Dynarr_add (dst, '\n');                                 \
1715       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1716         Dynarr_add (dst, c);                                    \
1717       else                                                      \
1718         flags |= CODING_STATE_CR;                               \
1719       goto label_continue_loop;                                 \
1720     }                                                           \
1721   else if (flags & CODING_STATE_CR)                             \
1722     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1723       if (c != '\n')                                            \
1724         Dynarr_add (dst, '\r');                                 \
1725       flags &= ~CODING_STATE_CR;                                \
1726     }                                                           \
1727 } while (0)
1728
1729 /* C should be a binary character in the range 0 - 255; convert
1730    to internal format and add to Dynarr DST. */
1731
1732 #ifdef UTF2000
1733 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1734 do {                                            \
1735   if (BYTE_ASCII_P (c))                         \
1736     Dynarr_add (dst, c);                        \
1737   else                                          \
1738     {                                           \
1739       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1740       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1741     }                                           \
1742 } while (0)
1743
1744 INLINE void
1745 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1746 {
1747   if ( c <= 0x7f )
1748     {
1749       Dynarr_add (dst, c);
1750     }
1751   else if ( c <= 0x7ff )
1752     {
1753       Dynarr_add (dst, (c >> 6) | 0xc0);
1754       Dynarr_add (dst, (c & 0x3f) | 0x80);
1755     }
1756   else if ( c <= 0xffff )
1757     {
1758       Dynarr_add (dst,  (c >> 12) | 0xe0);
1759       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1760       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1761     }
1762   else if ( c <= 0x1fffff )
1763     {
1764       Dynarr_add (dst,  (c >> 18) | 0xf0);
1765       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1766       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1767       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1768     }
1769   else if ( c <= 0x3ffffff )
1770     {
1771       Dynarr_add (dst,  (c >> 24) | 0xf8);
1772       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1773       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1774       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1775       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1776     }
1777   else
1778     {
1779       Dynarr_add (dst,  (c >> 30) | 0xfc);
1780       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1781       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1782       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1783       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1784       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1785     }
1786 }
1787 #else
1788 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1789 do {                                            \
1790   if (BYTE_ASCII_P (c))                         \
1791     Dynarr_add (dst, c);                        \
1792   else if (BYTE_C1_P (c))                       \
1793     {                                           \
1794       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1795       Dynarr_add (dst, c + 0x20);               \
1796     }                                           \
1797   else                                          \
1798     {                                           \
1799       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1800       Dynarr_add (dst, c);                      \
1801     }                                           \
1802 } while (0)
1803 #endif
1804
1805 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1806 do {                                    \
1807   if (ch)                               \
1808     {                                   \
1809       DECODE_ADD_BINARY_CHAR (ch, dst); \
1810       ch = 0;                           \
1811     }                                   \
1812 } while (0)
1813
1814 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1815 do {                                    \
1816   if (flags & CODING_STATE_END)         \
1817     {                                   \
1818       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1819       if (flags & CODING_STATE_CR)      \
1820         Dynarr_add (dst, '\r');         \
1821     }                                   \
1822 } while (0)
1823
1824 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1825
1826 struct decoding_stream
1827 {
1828   /* Coding system that governs the conversion. */
1829   Lisp_Coding_System *codesys;
1830
1831   /* Stream that we read the encoded data from or
1832      write the decoded data to. */
1833   Lstream *other_end;
1834
1835   /* If we are reading, then we can return only a fixed amount of
1836      data, so if the conversion resulted in too much data, we store it
1837      here for retrieval the next time around. */
1838   unsigned_char_dynarr *runoff;
1839
1840   /* FLAGS holds flags indicating the current state of the decoding.
1841      Some of these flags are dependent on the coding system. */
1842   unsigned int flags;
1843
1844   /* CH holds a partially built-up character.  Since we only deal
1845      with one- and two-byte characters at the moment, we only use
1846      this to store the first byte of a two-byte character. */
1847   unsigned int ch;
1848
1849   /* EOL_TYPE specifies the type of end-of-line conversion that
1850      currently applies.  We need to keep this separate from the
1851      EOL type stored in CODESYS because the latter might indicate
1852      automatic EOL-type detection while the former will always
1853      indicate a particular EOL type. */
1854   enum eol_type eol_type;
1855 #ifdef MULE
1856   /* Additional ISO2022 information.  We define the structure above
1857      because it's also needed by the detection routines. */
1858   struct iso2022_decoder iso2022;
1859
1860   /* Additional information (the state of the running CCL program)
1861      used by the CCL decoder. */
1862   struct ccl_program ccl;
1863 #endif
1864   struct detection_state decst;
1865 };
1866
1867 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1868 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1869 static int decoding_rewinder   (Lstream *stream);
1870 static int decoding_seekable_p (Lstream *stream);
1871 static int decoding_flusher    (Lstream *stream);
1872 static int decoding_closer     (Lstream *stream);
1873
1874 static Lisp_Object decoding_marker (Lisp_Object stream,
1875                                     void (*markobj) (Lisp_Object));
1876
1877 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1878                                sizeof (struct decoding_stream));
1879
1880 static Lisp_Object
1881 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1882 {
1883   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1884   Lisp_Object str_obj;
1885
1886   /* We do not need to mark the coding systems or charsets stored
1887      within the stream because they are stored in a global list
1888      and automatically marked. */
1889
1890   XSETLSTREAM (str_obj, str);
1891   markobj (str_obj);
1892   if (str->imp->marker)
1893     return (str->imp->marker) (str_obj, markobj);
1894   else
1895     return Qnil;
1896 }
1897
1898 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1899    so we read data from the other end, decode it, and store it into DATA. */
1900
1901 static int
1902 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1903 {
1904   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1905   unsigned char *orig_data = data;
1906   int read_size;
1907   int error_occurred = 0;
1908
1909   /* We need to interface to mule_decode(), which expects to take some
1910      amount of data and store the result into a Dynarr.  We have
1911      mule_decode() store into str->runoff, and take data from there
1912      as necessary. */
1913
1914   /* We loop until we have enough data, reading chunks from the other
1915      end and decoding it. */
1916   while (1)
1917     {
1918       /* Take data from the runoff if we can.  Make sure to take at
1919          most SIZE bytes, and delete the data from the runoff. */
1920       if (Dynarr_length (str->runoff) > 0)
1921         {
1922           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1923           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1924           Dynarr_delete_many (str->runoff, 0, chunk);
1925           data += chunk;
1926           size -= chunk;
1927         }
1928
1929       if (size == 0)
1930         break; /* No more room for data */
1931
1932       if (str->flags & CODING_STATE_END)
1933         /* This means that on the previous iteration, we hit the EOF on
1934            the other end.  We loop once more so that mule_decode() can
1935            output any final stuff it may be holding, or any "go back
1936            to a sane state" escape sequences. (This latter makes sense
1937            during encoding.) */
1938         break;
1939
1940       /* Exhausted the runoff, so get some more.  DATA has at least
1941          SIZE bytes left of storage in it, so it's OK to read directly
1942          into it.  (We'll be overwriting above, after we've decoded it
1943          into the runoff.) */
1944       read_size = Lstream_read (str->other_end, data, size);
1945       if (read_size < 0)
1946         {
1947           error_occurred = 1;
1948           break;
1949         }
1950       if (read_size == 0)
1951         /* There might be some more end data produced in the translation.
1952            See the comment above. */
1953         str->flags |= CODING_STATE_END;
1954       mule_decode (stream, data, str->runoff, read_size);
1955     }
1956
1957   if (data - orig_data == 0)
1958     return error_occurred ? -1 : 0;
1959   else
1960     return data - orig_data;
1961 }
1962
1963 static int
1964 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1965 {
1966   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1967   int retval;
1968
1969   /* Decode all our data into the runoff, and then attempt to write
1970      it all out to the other end.  Remove whatever chunk we succeeded
1971      in writing. */
1972   mule_decode (stream, data, str->runoff, size);
1973   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1974                           Dynarr_length (str->runoff));
1975   if (retval > 0)
1976     Dynarr_delete_many (str->runoff, 0, retval);
1977   /* Do NOT return retval.  The return value indicates how much
1978      of the incoming data was written, not how many bytes were
1979      written. */
1980   return size;
1981 }
1982
1983 static void
1984 reset_decoding_stream (struct decoding_stream *str)
1985 {
1986 #ifdef MULE
1987   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1988     {
1989       Lisp_Object coding_system;
1990       XSETCODING_SYSTEM (coding_system, str->codesys);
1991       reset_iso2022 (coding_system, &str->iso2022);
1992     }
1993   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1994     {
1995       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1996     }
1997 #endif /* MULE */
1998   str->flags = str->ch = 0;
1999 }
2000
2001 static int
2002 decoding_rewinder (Lstream *stream)
2003 {
2004   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2005   reset_decoding_stream (str);
2006   Dynarr_reset (str->runoff);
2007   return Lstream_rewind (str->other_end);
2008 }
2009
2010 static int
2011 decoding_seekable_p (Lstream *stream)
2012 {
2013   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2014   return Lstream_seekable_p (str->other_end);
2015 }
2016
2017 static int
2018 decoding_flusher (Lstream *stream)
2019 {
2020   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2021   return Lstream_flush (str->other_end);
2022 }
2023
2024 static int
2025 decoding_closer (Lstream *stream)
2026 {
2027   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2028   if (stream->flags & LSTREAM_FL_WRITE)
2029     {
2030       str->flags |= CODING_STATE_END;
2031       decoding_writer (stream, 0, 0);
2032     }
2033   Dynarr_free (str->runoff);
2034 #ifdef MULE
2035 #ifdef ENABLE_COMPOSITE_CHARS
2036   if (str->iso2022.composite_chars)
2037     Dynarr_free (str->iso2022.composite_chars);
2038 #endif
2039 #endif
2040   return Lstream_close (str->other_end);
2041 }
2042
2043 Lisp_Object
2044 decoding_stream_coding_system (Lstream *stream)
2045 {
2046   Lisp_Object coding_system;
2047   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2048
2049   XSETCODING_SYSTEM (coding_system, str->codesys);
2050   return subsidiary_coding_system (coding_system, str->eol_type);
2051 }
2052
2053 void
2054 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2055 {
2056   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2057   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2058   str->codesys = cs;
2059   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2060     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2061   reset_decoding_stream (str);
2062 }
2063
2064 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2065    stream for writing, no automatic code detection will be performed.
2066    The reason for this is that automatic code detection requires a
2067    seekable input.  Things will also fail if you open a decoding
2068    stream for reading using a non-fully-specified coding system and
2069    a non-seekable input stream. */
2070
2071 static Lisp_Object
2072 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2073                         CONST char *mode)
2074 {
2075   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2076   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2077   Lisp_Object obj;
2078
2079   xzero (*str);
2080   str->other_end = stream;
2081   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2082   str->eol_type = EOL_AUTODETECT;
2083   if (!strcmp (mode, "r")
2084       && Lstream_seekable_p (stream))
2085     /* We can determine the coding system now. */
2086     determine_real_coding_system (stream, &codesys, &str->eol_type);
2087   set_decoding_stream_coding_system (lstr, codesys);
2088   str->decst.eol_type = str->eol_type;
2089   str->decst.mask = ~0;
2090   XSETLSTREAM (obj, lstr);
2091   return obj;
2092 }
2093
2094 Lisp_Object
2095 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2096 {
2097   return make_decoding_stream_1 (stream, codesys, "r");
2098 }
2099
2100 Lisp_Object
2101 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2102 {
2103   return make_decoding_stream_1 (stream, codesys, "w");
2104 }
2105
2106 /* Note: the decode_coding_* functions all take the same
2107    arguments as mule_decode(), which is to say some SRC data of
2108    size N, which is to be stored into dynamic array DST.
2109    DECODING is the stream within which the decoding is
2110    taking place, but no data is actually read from or
2111    written to that stream; that is handled in decoding_reader()
2112    or decoding_writer().  This allows the same functions to
2113    be used for both reading and writing. */
2114
2115 static void
2116 mule_decode (Lstream *decoding, CONST unsigned char *src,
2117              unsigned_char_dynarr *dst, unsigned int n)
2118 {
2119   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2120
2121   /* If necessary, do encoding-detection now.  We do this when
2122      we're a writing stream or a non-seekable reading stream,
2123      meaning that we can't just process the whole input,
2124      rewind, and start over. */
2125
2126   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2127       str->eol_type == EOL_AUTODETECT)
2128     {
2129       Lisp_Object codesys;
2130
2131       XSETCODING_SYSTEM (codesys, str->codesys);
2132       detect_coding_type (&str->decst, src, n,
2133                           CODING_SYSTEM_TYPE (str->codesys) !=
2134                           CODESYS_AUTODETECT);
2135       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2136           str->decst.mask != ~0)
2137         /* #### This is cheesy.  What we really ought to do is
2138            buffer up a certain amount of data so as to get a
2139            less random result. */
2140         codesys = coding_system_from_mask (str->decst.mask);
2141       str->eol_type = str->decst.eol_type;
2142       if (XCODING_SYSTEM (codesys) != str->codesys)
2143         {
2144           /* Preserve the CODING_STATE_END flag in case it was set.
2145              If we erase it, bad things might happen. */
2146           int was_end = str->flags & CODING_STATE_END;
2147           set_decoding_stream_coding_system (decoding, codesys);
2148           if (was_end)
2149             str->flags |= CODING_STATE_END;
2150         }
2151     }
2152
2153   switch (CODING_SYSTEM_TYPE (str->codesys))
2154     {
2155 #ifdef DEBUG_XEMACS
2156     case CODESYS_INTERNAL:
2157       Dynarr_add_many (dst, src, n);
2158       break;
2159 #endif
2160     case CODESYS_AUTODETECT:
2161       /* If we got this far and still haven't decided on the coding
2162          system, then do no conversion. */
2163     case CODESYS_NO_CONVERSION:
2164       decode_coding_no_conversion (decoding, src, dst, n);
2165       break;
2166 #ifdef MULE
2167     case CODESYS_SHIFT_JIS:
2168       decode_coding_sjis (decoding, src, dst, n);
2169       break;
2170     case CODESYS_BIG5:
2171       decode_coding_big5 (decoding, src, dst, n);
2172       break;
2173     case CODESYS_UCS4:
2174       decode_coding_ucs4 (decoding, src, dst, n);
2175       break;
2176     case CODESYS_UTF8:
2177       decode_coding_utf8 (decoding, src, dst, n);
2178       break;
2179     case CODESYS_CCL:
2180       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2181       break;
2182     case CODESYS_ISO2022:
2183       decode_coding_iso2022 (decoding, src, dst, n);
2184       break;
2185 #endif /* MULE */
2186     default:
2187       abort ();
2188     }
2189 }
2190
2191 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2192 Decode the text between START and END which is encoded in CODING-SYSTEM.
2193 This is useful if you've read in encoded text from a file without decoding
2194 it (e.g. you read in a JIS-formatted file but used the `binary' or
2195 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2196 Return length of decoded text.
2197 BUFFER defaults to the current buffer if unspecified.
2198 */
2199        (start, end, coding_system, buffer))
2200 {
2201   Bufpos b, e;
2202   struct buffer *buf = decode_buffer (buffer, 0);
2203   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2204   Lstream *istr, *ostr;
2205   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2206
2207   get_buffer_range_char (buf, start, end, &b, &e, 0);
2208
2209   barf_if_buffer_read_only (buf, b, e);
2210
2211   coding_system = Fget_coding_system (coding_system);
2212   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2213   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2214   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2215                                               coding_system);
2216   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2217                                            Fget_coding_system (Qbinary));
2218   istr = XLSTREAM (instream);
2219   ostr = XLSTREAM (outstream);
2220   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2221
2222   /* The chain of streams looks like this:
2223
2224      [BUFFER] <----- send through
2225                      ------> [ENCODE AS BINARY]
2226                              ------> [DECODE AS SPECIFIED]
2227                                      ------> [BUFFER]
2228    */
2229
2230   while (1)
2231     {
2232       char tempbuf[1024]; /* some random amount */
2233       Bufpos newpos, even_newer_pos;
2234       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2235       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2236
2237       if (!size_in_bytes)
2238         break;
2239       newpos = lisp_buffer_stream_startpos (istr);
2240       Lstream_write (ostr, tempbuf, size_in_bytes);
2241       even_newer_pos = lisp_buffer_stream_startpos (istr);
2242       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2243                            even_newer_pos, 0);
2244     }
2245   Lstream_close (istr);
2246   Lstream_close (ostr);
2247   UNGCPRO;
2248   Lstream_delete (istr);
2249   Lstream_delete (ostr);
2250   Lstream_delete (XLSTREAM (de_outstream));
2251   Lstream_delete (XLSTREAM (lb_outstream));
2252   return Qnil;
2253 }
2254
2255 \f
2256 /************************************************************************/
2257 /*           Converting to an external encoding ("encoding")            */
2258 /************************************************************************/
2259
2260 /* An encoding stream is an output stream.  When you create the
2261    stream, you specify the coding system that governs the encoding
2262    and another stream that the resulting encoded data is to be
2263    sent to, and then start sending data to it. */
2264
2265 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2266
2267 struct encoding_stream
2268 {
2269   /* Coding system that governs the conversion. */
2270   Lisp_Coding_System *codesys;
2271
2272   /* Stream that we read the encoded data from or
2273      write the decoded data to. */
2274   Lstream *other_end;
2275
2276   /* If we are reading, then we can return only a fixed amount of
2277      data, so if the conversion resulted in too much data, we store it
2278      here for retrieval the next time around. */
2279   unsigned_char_dynarr *runoff;
2280
2281   /* FLAGS holds flags indicating the current state of the encoding.
2282      Some of these flags are dependent on the coding system. */
2283   unsigned int flags;
2284
2285   /* CH holds a partially built-up character.  Since we only deal
2286      with one- and two-byte characters at the moment, we only use
2287      this to store the first byte of a two-byte character. */
2288   unsigned int ch;
2289 #ifdef MULE
2290   /* Additional information used by the ISO2022 encoder. */
2291   struct
2292     {
2293       /* CHARSET holds the character sets currently assigned to the G0
2294          through G3 registers.  It is initialized from the array
2295          INITIAL_CHARSET in CODESYS. */
2296       Lisp_Object charset[4];
2297
2298       /* Which registers are currently invoked into the left (GL) and
2299          right (GR) halves of the 8-bit encoding space? */
2300       int register_left, register_right;
2301
2302       /* Whether we need to explicitly designate the charset in the
2303          G? register before using it.  It is initialized from the
2304          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2305       unsigned char force_charset_on_output[4];
2306
2307       /* Other state variables that need to be preserved across
2308          invocations. */
2309       Lisp_Object current_charset;
2310       int current_half;
2311       int current_char_boundary;
2312     } iso2022;
2313
2314   /* Additional information (the state of the running CCL program)
2315      used by the CCL encoder. */
2316   struct ccl_program ccl;
2317 #endif /* MULE */
2318 };
2319
2320 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2321 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2322                             size_t size);
2323 static int encoding_rewinder   (Lstream *stream);
2324 static int encoding_seekable_p (Lstream *stream);
2325 static int encoding_flusher    (Lstream *stream);
2326 static int encoding_closer     (Lstream *stream);
2327
2328 static Lisp_Object encoding_marker (Lisp_Object stream,
2329                                     void (*markobj) (Lisp_Object));
2330
2331 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2332                                sizeof (struct encoding_stream));
2333
2334 static Lisp_Object
2335 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2336 {
2337   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2338   Lisp_Object str_obj;
2339
2340   /* We do not need to mark the coding systems or charsets stored
2341      within the stream because they are stored in a global list
2342      and automatically marked. */
2343
2344   XSETLSTREAM (str_obj, str);
2345   markobj (str_obj);
2346   if (str->imp->marker)
2347     return (str->imp->marker) (str_obj, markobj);
2348   else
2349     return Qnil;
2350 }
2351
2352 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2353    so we read data from the other end, encode it, and store it into DATA. */
2354
2355 static int
2356 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2357 {
2358   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2359   unsigned char *orig_data = data;
2360   int read_size;
2361   int error_occurred = 0;
2362
2363   /* We need to interface to mule_encode(), which expects to take some
2364      amount of data and store the result into a Dynarr.  We have
2365      mule_encode() store into str->runoff, and take data from there
2366      as necessary. */
2367
2368   /* We loop until we have enough data, reading chunks from the other
2369      end and encoding it. */
2370   while (1)
2371     {
2372       /* Take data from the runoff if we can.  Make sure to take at
2373          most SIZE bytes, and delete the data from the runoff. */
2374       if (Dynarr_length (str->runoff) > 0)
2375         {
2376           int chunk = min ((int) size, Dynarr_length (str->runoff));
2377           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2378           Dynarr_delete_many (str->runoff, 0, chunk);
2379           data += chunk;
2380           size -= chunk;
2381         }
2382
2383       if (size == 0)
2384         break; /* No more room for data */
2385
2386       if (str->flags & CODING_STATE_END)
2387         /* This means that on the previous iteration, we hit the EOF on
2388            the other end.  We loop once more so that mule_encode() can
2389            output any final stuff it may be holding, or any "go back
2390            to a sane state" escape sequences. (This latter makes sense
2391            during encoding.) */
2392         break;
2393
2394       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2395          left of storage in it, so it's OK to read directly into it.
2396          (We'll be overwriting above, after we've encoded it into the
2397          runoff.) */
2398       read_size = Lstream_read (str->other_end, data, size);
2399       if (read_size < 0)
2400         {
2401           error_occurred = 1;
2402           break;
2403         }
2404       if (read_size == 0)
2405         /* There might be some more end data produced in the translation.
2406            See the comment above. */
2407         str->flags |= CODING_STATE_END;
2408       mule_encode (stream, data, str->runoff, read_size);
2409     }
2410
2411   if (data == orig_data)
2412     return error_occurred ? -1 : 0;
2413   else
2414     return data - orig_data;
2415 }
2416
2417 static int
2418 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2419 {
2420   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2421   int retval;
2422
2423   /* Encode all our data into the runoff, and then attempt to write
2424      it all out to the other end.  Remove whatever chunk we succeeded
2425      in writing. */
2426   mule_encode (stream, data, str->runoff, size);
2427   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2428                           Dynarr_length (str->runoff));
2429   if (retval > 0)
2430     Dynarr_delete_many (str->runoff, 0, retval);
2431   /* Do NOT return retval.  The return value indicates how much
2432      of the incoming data was written, not how many bytes were
2433      written. */
2434   return size;
2435 }
2436
2437 static void
2438 reset_encoding_stream (struct encoding_stream *str)
2439 {
2440 #ifdef MULE
2441   switch (CODING_SYSTEM_TYPE (str->codesys))
2442     {
2443     case CODESYS_ISO2022:
2444       {
2445         int i;
2446
2447         for (i = 0; i < 4; i++)
2448           {
2449             str->iso2022.charset[i] =
2450               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2451             str->iso2022.force_charset_on_output[i] =
2452               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2453           }
2454         str->iso2022.register_left = 0;
2455         str->iso2022.register_right = 1;
2456         str->iso2022.current_charset = Qnil;
2457         str->iso2022.current_half = 0;
2458 #ifdef UTF2000
2459         str->iso2022.current_char_boundary = 0;
2460 #else
2461         str->iso2022.current_char_boundary = 1;
2462 #endif
2463         break;
2464       }
2465     case CODESYS_CCL:
2466       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2467       break;
2468     default:
2469       break;
2470     }
2471 #endif /* MULE */
2472
2473   str->flags = str->ch = 0;
2474 }
2475
2476 static int
2477 encoding_rewinder (Lstream *stream)
2478 {
2479   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2480   reset_encoding_stream (str);
2481   Dynarr_reset (str->runoff);
2482   return Lstream_rewind (str->other_end);
2483 }
2484
2485 static int
2486 encoding_seekable_p (Lstream *stream)
2487 {
2488   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2489   return Lstream_seekable_p (str->other_end);
2490 }
2491
2492 static int
2493 encoding_flusher (Lstream *stream)
2494 {
2495   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2496   return Lstream_flush (str->other_end);
2497 }
2498
2499 static int
2500 encoding_closer (Lstream *stream)
2501 {
2502   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2503   if (stream->flags & LSTREAM_FL_WRITE)
2504     {
2505       str->flags |= CODING_STATE_END;
2506       encoding_writer (stream, 0, 0);
2507     }
2508   Dynarr_free (str->runoff);
2509   return Lstream_close (str->other_end);
2510 }
2511
2512 Lisp_Object
2513 encoding_stream_coding_system (Lstream *stream)
2514 {
2515   Lisp_Object coding_system;
2516   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2517
2518   XSETCODING_SYSTEM (coding_system, str->codesys);
2519   return coding_system;
2520 }
2521
2522 void
2523 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2524 {
2525   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2526   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2527   str->codesys = cs;
2528   reset_encoding_stream (str);
2529 }
2530
2531 static Lisp_Object
2532 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2533                         CONST char *mode)
2534 {
2535   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2536   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2537   Lisp_Object obj;
2538
2539   xzero (*str);
2540   str->runoff = Dynarr_new (unsigned_char);
2541   str->other_end = stream;
2542   set_encoding_stream_coding_system (lstr, codesys);
2543   XSETLSTREAM (obj, lstr);
2544   return obj;
2545 }
2546
2547 Lisp_Object
2548 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2549 {
2550   return make_encoding_stream_1 (stream, codesys, "r");
2551 }
2552
2553 Lisp_Object
2554 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2555 {
2556   return make_encoding_stream_1 (stream, codesys, "w");
2557 }
2558
2559 /* Convert N bytes of internally-formatted data stored in SRC to an
2560    external format, according to the encoding stream ENCODING.
2561    Store the encoded data into DST. */
2562
2563 static void
2564 mule_encode (Lstream *encoding, CONST unsigned char *src,
2565              unsigned_char_dynarr *dst, unsigned int n)
2566 {
2567   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2568
2569   switch (CODING_SYSTEM_TYPE (str->codesys))
2570     {
2571 #ifdef DEBUG_XEMACS
2572     case CODESYS_INTERNAL:
2573       Dynarr_add_many (dst, src, n);
2574       break;
2575 #endif
2576     case CODESYS_AUTODETECT:
2577       /* If we got this far and still haven't decided on the coding
2578          system, then do no conversion. */
2579     case CODESYS_NO_CONVERSION:
2580       encode_coding_no_conversion (encoding, src, dst, n);
2581       break;
2582 #ifdef MULE
2583     case CODESYS_SHIFT_JIS:
2584       encode_coding_sjis (encoding, src, dst, n);
2585       break;
2586     case CODESYS_BIG5:
2587       encode_coding_big5 (encoding, src, dst, n);
2588       break;
2589     case CODESYS_UCS4:
2590       encode_coding_ucs4 (encoding, src, dst, n);
2591       break;
2592     case CODESYS_UTF8:
2593       encode_coding_utf8 (encoding, src, dst, n);
2594       break;
2595     case CODESYS_CCL:
2596       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2597       break;
2598     case CODESYS_ISO2022:
2599       encode_coding_iso2022 (encoding, src, dst, n);
2600       break;
2601 #endif /* MULE */
2602     default:
2603       abort ();
2604     }
2605 }
2606
2607 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2608 Encode the text between START and END using CODING-SYSTEM.
2609 This will, for example, convert Japanese characters into stuff such as
2610 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2611 text.  BUFFER defaults to the current buffer if unspecified.
2612 */
2613        (start, end, coding_system, buffer))
2614 {
2615   Bufpos b, e;
2616   struct buffer *buf = decode_buffer (buffer, 0);
2617   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2618   Lstream *istr, *ostr;
2619   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2620
2621   get_buffer_range_char (buf, start, end, &b, &e, 0);
2622
2623   barf_if_buffer_read_only (buf, b, e);
2624
2625   coding_system = Fget_coding_system (coding_system);
2626   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2627   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2628   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2629                                               Fget_coding_system (Qbinary));
2630   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2631                                            coding_system);
2632   istr = XLSTREAM (instream);
2633   ostr = XLSTREAM (outstream);
2634   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2635   /* The chain of streams looks like this:
2636
2637      [BUFFER] <----- send through
2638                      ------> [ENCODE AS SPECIFIED]
2639                              ------> [DECODE AS BINARY]
2640                                      ------> [BUFFER]
2641    */
2642   while (1)
2643     {
2644       char tempbuf[1024]; /* some random amount */
2645       Bufpos newpos, even_newer_pos;
2646       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2647       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2648
2649       if (!size_in_bytes)
2650         break;
2651       newpos = lisp_buffer_stream_startpos (istr);
2652       Lstream_write (ostr, tempbuf, size_in_bytes);
2653       even_newer_pos = lisp_buffer_stream_startpos (istr);
2654       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2655                            even_newer_pos, 0);
2656     }
2657
2658   {
2659     Charcount retlen =
2660       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2661     Lstream_close (istr);
2662     Lstream_close (ostr);
2663     UNGCPRO;
2664     Lstream_delete (istr);
2665     Lstream_delete (ostr);
2666     Lstream_delete (XLSTREAM (de_outstream));
2667     Lstream_delete (XLSTREAM (lb_outstream));
2668     return make_int (retlen);
2669   }
2670 }
2671
2672 #ifdef MULE
2673 \f
2674 /************************************************************************/
2675 /*                          Shift-JIS methods                           */
2676 /************************************************************************/
2677
2678 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2679    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2680    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2681    encoded by "position-code + 0x80".  A character of JISX0208
2682    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2683    position-codes are divided and shifted so that it fit in the range
2684    below.
2685
2686    --- CODE RANGE of Shift-JIS ---
2687    (character set)      (range)
2688    ASCII                0x00 .. 0x7F
2689    JISX0201-Kana        0xA0 .. 0xDF
2690    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2691             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2692    -------------------------------
2693
2694 */
2695
2696 /* Is this the first byte of a Shift-JIS two-byte char? */
2697
2698 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2699   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2700
2701 /* Is this the second byte of a Shift-JIS two-byte char? */
2702
2703 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2704   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2705
2706 #define BYTE_SJIS_KATAKANA_P(c) \
2707   ((c) >= 0xA1 && (c) <= 0xDF)
2708
2709 static int
2710 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2711                     unsigned int n)
2712 {
2713   int c;
2714
2715   while (n--)
2716     {
2717       c = *src++;
2718       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2719         return 0;
2720       if (st->shift_jis.in_second_byte)
2721         {
2722           st->shift_jis.in_second_byte = 0;
2723           if (c < 0x40)
2724             return 0;
2725         }
2726       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2727         st->shift_jis.in_second_byte = 1;
2728     }
2729   return CODING_CATEGORY_SHIFT_JIS_MASK;
2730 }
2731
2732 /* Convert Shift-JIS data to internal format. */
2733
2734 static void
2735 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2736                     unsigned_char_dynarr *dst, unsigned int n)
2737 {
2738   unsigned char c;
2739   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2740   unsigned int flags  = str->flags;
2741   unsigned int ch     = str->ch;
2742   eol_type_t eol_type = str->eol_type;
2743
2744   while (n--)
2745     {
2746       c = *src++;
2747
2748       if (ch)
2749         {
2750           /* Previous character was first byte of Shift-JIS Kanji char. */
2751           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2752             {
2753               unsigned char e1, e2;
2754
2755               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2756               DECODE_SJIS (ch, c, e1, e2);
2757               Dynarr_add (dst, e1);
2758               Dynarr_add (dst, e2);
2759             }
2760           else
2761             {
2762               DECODE_ADD_BINARY_CHAR (ch, dst);
2763               DECODE_ADD_BINARY_CHAR (c, dst);
2764             }
2765           ch = 0;
2766         }
2767       else
2768         {
2769           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2770           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2771             ch = c;
2772           else if (BYTE_SJIS_KATAKANA_P (c))
2773             {
2774               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2775               Dynarr_add (dst, c);
2776             }
2777           else
2778             DECODE_ADD_BINARY_CHAR (c, dst);
2779         }
2780     label_continue_loop:;
2781     }
2782
2783   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2784
2785   str->flags = flags;
2786   str->ch    = ch;
2787 }
2788
2789 /* Convert internally-formatted data to Shift-JIS. */
2790
2791 static void
2792 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2793                     unsigned_char_dynarr *dst, unsigned int n)
2794 {
2795   unsigned char c;
2796   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2797   unsigned int flags  = str->flags;
2798   unsigned int ch     = str->ch;
2799   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2800
2801   while (n--)
2802     {
2803       c = *src++;
2804       if (c == '\n')
2805         {
2806           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2807             Dynarr_add (dst, '\r');
2808           if (eol_type != EOL_CR)
2809             Dynarr_add (dst, '\n');
2810           ch = 0;
2811         }
2812       else if (BYTE_ASCII_P (c))
2813         {
2814           Dynarr_add (dst, c);
2815           ch = 0;
2816         }
2817       else if (BUFBYTE_LEADING_BYTE_P (c))
2818         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2819               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2820               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2821       else if (ch)
2822         {
2823           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2824             {
2825               Dynarr_add (dst, c);
2826               ch = 0;
2827             }
2828           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2829                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2830             ch = c;
2831           else
2832             {
2833               unsigned char j1, j2;
2834               ENCODE_SJIS (ch, c, j1, j2);
2835               Dynarr_add (dst, j1);
2836               Dynarr_add (dst, j2);
2837               ch = 0;
2838             }
2839         }
2840     }
2841
2842   str->flags = flags;
2843   str->ch    = ch;
2844 }
2845
2846 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2847 Decode a JISX0208 character of Shift-JIS coding-system.
2848 CODE is the character code in Shift-JIS as a cons of type bytes.
2849 Return the corresponding character.
2850 */
2851        (code))
2852 {
2853   unsigned char c1, c2, s1, s2;
2854
2855   CHECK_CONS (code);
2856   CHECK_INT (XCAR (code));
2857   CHECK_INT (XCDR (code));
2858   s1 = XINT (XCAR (code));
2859   s2 = XINT (XCDR (code));
2860   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2861       BYTE_SJIS_TWO_BYTE_2_P (s2))
2862     {
2863       DECODE_SJIS (s1, s2, c1, c2);
2864       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2865                                    c1 & 0x7F, c2 & 0x7F));
2866     }
2867   else
2868     return Qnil;
2869 }
2870
2871 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2872 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2873 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2874 */
2875        (ch))
2876 {
2877   Lisp_Object charset;
2878   int c1, c2, s1, s2;
2879
2880   CHECK_CHAR_COERCE_INT (ch);
2881   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2882   if (EQ (charset, Vcharset_japanese_jisx0208))
2883     {
2884       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2885       return Fcons (make_int (s1), make_int (s2));
2886     }
2887   else
2888     return Qnil;
2889 }
2890
2891 \f
2892 /************************************************************************/
2893 /*                            Big5 methods                              */
2894 /************************************************************************/
2895
2896 /* BIG5 is a coding system encoding two character sets: ASCII and
2897    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2898    character set and is encoded in two-byte.
2899
2900    --- CODE RANGE of BIG5 ---
2901    (character set)      (range)
2902    ASCII                0x00 .. 0x7F
2903    Big5 (1st byte)      0xA1 .. 0xFE
2904         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2905    --------------------------
2906
2907    Since the number of characters in Big5 is larger than maximum
2908    characters in Emacs' charset (96x96), it can't be handled as one
2909    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2910    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
2911    contains frequently used characters and the latter contains less
2912    frequently used characters.  */
2913
2914 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2915   ((c) >= 0xA1 && (c) <= 0xFE)
2916
2917 /* Is this the second byte of a Shift-JIS two-byte char? */
2918
2919 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2920   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2921
2922 /* Number of Big5 characters which have the same code in 1st byte.  */
2923
2924 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2925
2926 /* Code conversion macros.  These are macros because they are used in
2927    inner loops during code conversion.
2928
2929    Note that temporary variables in macros introduce the classic
2930    dynamic-scoping problems with variable names.  We use capital-
2931    lettered variables in the assumption that XEmacs does not use
2932    capital letters in variables except in a very formalized way
2933    (e.g. Qstring). */
2934
2935 /* Convert Big5 code (b1, b2) into its internal string representation
2936    (lb, c1, c2). */
2937
2938 /* There is a much simpler way to split the Big5 charset into two.
2939    For the moment I'm going to leave the algorithm as-is because it
2940    claims to separate out the most-used characters into a single
2941    charset, which perhaps will lead to optimizations in various
2942    places.
2943
2944    The way the algorithm works is something like this:
2945
2946    Big5 can be viewed as a 94x157 charset, where the row is
2947    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2948    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
2949    the split between low and high column numbers is apparently
2950    meaningless; ascending rows produce less and less frequent chars.
2951    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2952    the first charset, and the upper half (0xC9 .. 0xFE) to the
2953    second.  To do the conversion, we convert the character into
2954    a single number where 0 .. 156 is the first row, 157 .. 313
2955    is the second, etc.  That way, the characters are ordered by
2956    decreasing frequency.  Then we just chop the space in two
2957    and coerce the result into a 94x94 space.
2958    */
2959
2960 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
2961 {                                                                       \
2962   int B1 = b1, B2 = b2;                                                 \
2963   unsigned int I                                                        \
2964     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
2965                                                                         \
2966   if (B1 < 0xC9)                                                        \
2967     {                                                                   \
2968       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
2969     }                                                                   \
2970   else                                                                  \
2971     {                                                                   \
2972       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
2973       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
2974     }                                                                   \
2975   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
2976   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
2977 } while (0)
2978
2979 /* Convert the internal string representation of a Big5 character
2980    (lb, c1, c2) into Big5 code (b1, b2). */
2981
2982 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
2983 {                                                                       \
2984   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
2985                                                                         \
2986   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
2987     {                                                                   \
2988       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
2989     }                                                                   \
2990   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
2991   b2 = I % BIG5_SAME_ROW;                                               \
2992   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
2993 } while (0)
2994
2995 static int
2996 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2997                     unsigned int n)
2998 {
2999   int c;
3000
3001   while (n--)
3002     {
3003       c = *src++;
3004       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3005           (c >= 0x80 && c <= 0xA0))
3006         return 0;
3007       if (st->big5.in_second_byte)
3008         {
3009           st->big5.in_second_byte = 0;
3010           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3011             return 0;
3012         }
3013       else if (c >= 0xA1)
3014         st->big5.in_second_byte = 1;
3015     }
3016   return CODING_CATEGORY_BIG5_MASK;
3017 }
3018
3019 /* Convert Big5 data to internal format. */
3020
3021 static void
3022 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3023                     unsigned_char_dynarr *dst, unsigned int n)
3024 {
3025   unsigned char c;
3026   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3027   unsigned int flags  = str->flags;
3028   unsigned int ch     = str->ch;
3029   eol_type_t eol_type = str->eol_type;
3030
3031   while (n--)
3032     {
3033       c = *src++;
3034       if (ch)
3035         {
3036           /* Previous character was first byte of Big5 char. */
3037           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3038             {
3039               unsigned char b1, b2, b3;
3040               DECODE_BIG5 (ch, c, b1, b2, b3);
3041               Dynarr_add (dst, b1);
3042               Dynarr_add (dst, b2);
3043               Dynarr_add (dst, b3);
3044             }
3045           else
3046             {
3047               DECODE_ADD_BINARY_CHAR (ch, dst);
3048               DECODE_ADD_BINARY_CHAR (c, dst);
3049             }
3050           ch = 0;
3051         }
3052       else
3053         {
3054           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3055           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3056             ch = c;
3057           else
3058             DECODE_ADD_BINARY_CHAR (c, dst);
3059         }
3060     label_continue_loop:;
3061     }
3062
3063   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3064
3065   str->flags = flags;
3066   str->ch    = ch;
3067 }
3068
3069 /* Convert internally-formatted data to Big5. */
3070
3071 static void
3072 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3073                     unsigned_char_dynarr *dst, unsigned int n)
3074 {
3075   unsigned char c;
3076   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3077   unsigned int flags  = str->flags;
3078   unsigned int ch     = str->ch;
3079   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3080
3081   while (n--)
3082     {
3083       c = *src++;
3084       if (c == '\n')
3085         {
3086           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3087             Dynarr_add (dst, '\r');
3088           if (eol_type != EOL_CR)
3089             Dynarr_add (dst, '\n');
3090         }
3091       else if (BYTE_ASCII_P (c))
3092         {
3093           /* ASCII. */
3094           Dynarr_add (dst, c);
3095         }
3096       else if (BUFBYTE_LEADING_BYTE_P (c))
3097         {
3098           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3099               c == LEADING_BYTE_CHINESE_BIG5_2)
3100             {
3101               /* A recognized leading byte. */
3102               ch = c;
3103               continue; /* not done with this character. */
3104             }
3105           /* otherwise just ignore this character. */
3106         }
3107       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3108                ch == LEADING_BYTE_CHINESE_BIG5_2)
3109         {
3110           /* Previous char was a recognized leading byte. */
3111           ch = (ch << 8) | c;
3112           continue; /* not done with this character. */
3113         }
3114       else if (ch)
3115         {
3116           /* Encountering second byte of a Big5 character. */
3117           unsigned char b1, b2;
3118
3119           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3120           Dynarr_add (dst, b1);
3121           Dynarr_add (dst, b2);
3122         }
3123
3124       ch = 0;
3125     }
3126
3127   str->flags = flags;
3128   str->ch    = ch;
3129 }
3130
3131
3132 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3133 Decode a Big5 character CODE of BIG5 coding-system.
3134 CODE is the character code in BIG5, a cons of two integers.
3135 Return the corresponding character.
3136 */
3137        (code))
3138 {
3139   unsigned char c1, c2, b1, b2;
3140
3141   CHECK_CONS (code);
3142   CHECK_INT (XCAR (code));
3143   CHECK_INT (XCDR (code));
3144   b1 = XINT (XCAR (code));
3145   b2 = XINT (XCDR (code));
3146   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3147       BYTE_BIG5_TWO_BYTE_2_P (b2))
3148     {
3149       int leading_byte;
3150       Lisp_Object charset;
3151       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3152       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3153       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3154     }
3155   else
3156     return Qnil;
3157 }
3158
3159 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3160 Encode the Big5 character CH to BIG5 coding-system.
3161 Return the corresponding character code in Big5.
3162 */
3163        (ch))
3164 {
3165   Lisp_Object charset;
3166   int c1, c2, b1, b2;
3167
3168   CHECK_CHAR_COERCE_INT (ch);
3169   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3170   if (EQ (charset, Vcharset_chinese_big5_1) ||
3171       EQ (charset, Vcharset_chinese_big5_2))
3172     {
3173       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3174                    b1, b2);
3175       return Fcons (make_int (b1), make_int (b2));
3176     }
3177   else
3178     return Qnil;
3179 }
3180
3181 \f
3182 /************************************************************************/
3183 /*                           UCS-4 methods                              */
3184 /*                                                                      */
3185 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3186 /*                                                                      */
3187 /************************************************************************/
3188
3189 Lisp_Object ucs_to_mule_table[65536];
3190 Lisp_Object mule_to_ucs_table;
3191
3192 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3193 Map UCS-4 code CODE to Mule character CHARACTER.
3194
3195 Return T on success, NIL on failure.
3196 */
3197        (code, character))
3198 {
3199   unsigned int c;
3200
3201   CHECK_CHAR (character);
3202   CHECK_INT (code);
3203   c = XINT (code);
3204
3205   if (c < sizeof (ucs_to_mule_table))
3206     {
3207       ucs_to_mule_table[c] = character;
3208       return Qt;
3209     }
3210   else
3211     return Qnil;
3212 }
3213
3214 static Lisp_Object
3215 ucs_to_char (unsigned long code)
3216 {
3217   if (code < sizeof (ucs_to_mule_table))
3218     {
3219       return ucs_to_mule_table[code];
3220     }
3221   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3222     {
3223       unsigned int c;
3224
3225       code -= 0xe00000;
3226       c = code % (94 * 94);
3227       return make_char
3228         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3229                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3230                      CHARSET_LEFT_TO_RIGHT),
3231                     c / 94 + 33, c % 94 + 33));
3232     }
3233   else
3234     return Qnil;
3235 }
3236
3237 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3238 Return Mule character corresponding to UCS code CODE (a positive integer).
3239 */
3240        (code))
3241 {
3242   CHECK_NATNUM (code);
3243   return ucs_to_char (XINT (code));
3244 }
3245
3246 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3247 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3248 */
3249        (character, code))
3250 {
3251   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3252           Fset_char_ucs is more restrictive on index arg, but should
3253           check code arg in a char_table method. */
3254   CHECK_CHAR (character);
3255   CHECK_NATNUM (code);
3256   return Fput_char_table (character, code, mule_to_ucs_table);
3257 }
3258
3259 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3260 Return the UCS code (a positive integer) corresponding to CHARACTER.
3261 */
3262        (character))
3263 {
3264   return Fget_char_table (character, mule_to_ucs_table);
3265 }
3266
3267 #ifdef UTF2000
3268 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3269 #else
3270 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3271    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3272    is not found, instead.
3273    #### do something more appropriate (use blob?)
3274         Danger, Will Robinson!  Data loss.  Should we signal user? */
3275 static void
3276 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3277 {
3278   Lisp_Object chr = ucs_to_char (ch);
3279
3280   if (! NILP (chr))
3281     {
3282       Bufbyte work[MAX_EMCHAR_LEN];
3283       int len;
3284
3285       ch = XCHAR (chr);
3286       len = (ch < 128) ?
3287         simple_set_charptr_emchar (work, ch) :
3288         non_ascii_set_charptr_emchar (work, ch);
3289       Dynarr_add_many (dst, work, len);
3290     }
3291   else
3292     {
3293       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3294       Dynarr_add (dst, 34 + 128);
3295       Dynarr_add (dst, 46 + 128);
3296     }
3297 }
3298 #endif
3299
3300 static unsigned long
3301 mule_char_to_ucs4 (Lisp_Object charset,
3302                    unsigned char h, unsigned char l)
3303 {
3304   Lisp_Object code
3305     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3306                        mule_to_ucs_table);
3307
3308   if (INTP (code))
3309     {
3310       return XINT (code);
3311     }
3312   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3313             (XCHARSET_CHARS (charset) == 94) )
3314     {
3315       unsigned char final = XCHARSET_FINAL (charset);
3316
3317       if ( ('@' <= final) && (final < 0x7f) )
3318         {
3319           return 0xe00000 + (final - '@') * 94 * 94
3320             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3321         }
3322       else
3323         {
3324           return '?';
3325         }
3326     }
3327   else
3328     {
3329       return '?';
3330     }
3331 }
3332
3333 static void
3334 encode_ucs4 (Lisp_Object charset,
3335              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3336 {
3337   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3338   Dynarr_add (dst,  code >> 24);
3339   Dynarr_add (dst, (code >> 16) & 255);
3340   Dynarr_add (dst, (code >>  8) & 255);
3341   Dynarr_add (dst,  code        & 255);
3342 }
3343
3344 static int
3345 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3346                     unsigned int n)
3347 {
3348   while (n--)
3349     {
3350       int c = *src++;
3351       switch (st->ucs4.in_byte)
3352         {
3353         case 0:
3354           if (c >= 128)
3355             return 0;
3356           else
3357             st->ucs4.in_byte++;
3358           break;
3359         case 3:
3360           st->ucs4.in_byte = 0;
3361           break;
3362         default:
3363           st->ucs4.in_byte++;
3364         }
3365     }
3366   return CODING_CATEGORY_UCS4_MASK;
3367 }
3368
3369 static void
3370 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3371                     unsigned_char_dynarr *dst, unsigned int n)
3372 {
3373   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3374   unsigned int flags = str->flags;
3375   unsigned int ch    = str->ch;
3376
3377   while (n--)
3378     {
3379       unsigned char c = *src++;
3380       switch (flags)
3381         {
3382         case 0:
3383           ch = c;
3384           flags = 3;
3385           break;
3386         case 1:
3387           decode_ucs4 ( ( ch << 8 ) | c, dst);
3388           ch = 0;
3389           flags = 0;
3390           break;
3391         default:
3392           ch = ( ch << 8 ) | c;
3393           flags--;
3394         }
3395     }
3396   if (flags & CODING_STATE_END)
3397     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3398
3399   str->flags = flags;
3400   str->ch    = ch;
3401 }
3402
3403 static void
3404 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3405                     unsigned_char_dynarr *dst, unsigned int n)
3406 {
3407   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3408   unsigned int flags = str->flags;
3409   unsigned int ch = str->ch;
3410   unsigned char char_boundary = str->iso2022.current_char_boundary;
3411   Lisp_Object charset = str->iso2022.current_charset;
3412
3413 #ifdef ENABLE_COMPOSITE_CHARS
3414   /* flags for handling composite chars.  We do a little switcharoo
3415      on the source while we're outputting the composite char. */
3416   unsigned int saved_n = 0;
3417   CONST unsigned char *saved_src = NULL;
3418   int in_composite = 0;
3419
3420  back_to_square_n:
3421 #endif
3422
3423   while (n--)
3424     {
3425       unsigned char c = *src++;
3426
3427       if (BYTE_ASCII_P (c))
3428         {               /* Processing ASCII character */
3429           ch = 0;
3430           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3431           char_boundary = 1;
3432         }
3433       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3434         { /* Processing Leading Byte */
3435           ch = 0;
3436           charset = CHARSET_BY_LEADING_BYTE (c);
3437           if (LEADING_BYTE_PREFIX_P(c))
3438             ch = c;
3439           char_boundary = 0;
3440         }
3441       else
3442         {                       /* Processing Non-ASCII character */
3443           char_boundary = 1;
3444           if (EQ (charset, Vcharset_control_1))
3445             {
3446               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3447             }
3448           else
3449             {
3450               switch (XCHARSET_REP_BYTES (charset))
3451                 {
3452                 case 2:
3453                   encode_ucs4 (charset, c, 0, dst);
3454                   break;
3455                 case 3:
3456                   if (XCHARSET_PRIVATE_P (charset))
3457                     {
3458                       encode_ucs4 (charset, c, 0, dst);
3459                       ch = 0;
3460                     }
3461                   else if (ch)
3462                     {
3463 #ifdef ENABLE_COMPOSITE_CHARS
3464                       if (EQ (charset, Vcharset_composite))
3465                         {
3466                           if (in_composite)
3467                             {
3468                               /* #### Bother! We don't know how to
3469                                  handle this yet. */
3470                               Dynarr_add (dst, 0);
3471                               Dynarr_add (dst, 0);
3472                               Dynarr_add (dst, 0);
3473                               Dynarr_add (dst, '~');
3474                             }
3475                           else
3476                             {
3477                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3478                                                        ch & 0x7F, c & 0x7F);
3479                               Lisp_Object lstr = composite_char_string (emch);
3480                               saved_n = n;
3481                               saved_src = src;
3482                               in_composite = 1;
3483                               src = XSTRING_DATA   (lstr);
3484                               n   = XSTRING_LENGTH (lstr);
3485                             }
3486                         }
3487                       else
3488 #endif /* ENABLE_COMPOSITE_CHARS */
3489                         {
3490                           encode_ucs4(charset, ch, c, dst);
3491                         }
3492                       ch = 0;
3493                     }
3494                   else
3495                     {
3496                       ch = c;
3497                       char_boundary = 0;
3498                     }
3499                   break;
3500                 case 4:
3501                   if (ch)
3502                     {
3503                       encode_ucs4 (charset, ch, c, dst);
3504                       ch = 0;
3505                     }
3506                   else
3507                     {
3508                       ch = c;
3509                       char_boundary = 0;
3510                     }
3511                   break;
3512                 default:
3513                   abort ();
3514                 }
3515             }
3516         }
3517     }
3518
3519 #ifdef ENABLE_COMPOSITE_CHARS
3520   if (in_composite)
3521     {
3522       n = saved_n;
3523       src = saved_src;
3524       in_composite = 0;
3525       goto back_to_square_n; /* Wheeeeeeeee ..... */
3526     }
3527 #endif /* ENABLE_COMPOSITE_CHARS */
3528
3529   str->flags = flags;
3530   str->ch = ch;
3531   str->iso2022.current_char_boundary = char_boundary;
3532   str->iso2022.current_charset = charset;
3533
3534   /* Verbum caro factum est! */
3535 }
3536
3537 \f
3538 /************************************************************************/
3539 /*                           UTF-8 methods                              */
3540 /************************************************************************/
3541
3542 static int
3543 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3544                     unsigned int n)
3545 {
3546   while (n--)
3547     {
3548       unsigned char c = *src++;
3549       switch (st->utf8.in_byte)
3550         {
3551         case 0:
3552           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3553             return 0;
3554           else if (c >= 0xfc)
3555             st->utf8.in_byte = 5;
3556           else if (c >= 0xf8)
3557             st->utf8.in_byte = 4;
3558           else if (c >= 0xf0)
3559             st->utf8.in_byte = 3;
3560           else if (c >= 0xe0)
3561             st->utf8.in_byte = 2;
3562           else if (c >= 0xc0)
3563             st->utf8.in_byte = 1;
3564           else if (c >= 0x80)
3565             return 0;
3566           break;
3567         default:
3568           if ((c & 0xc0) != 0x80)
3569             return 0;
3570           else
3571             st->utf8.in_byte--;
3572         }
3573     }
3574   return CODING_CATEGORY_UTF8_MASK;
3575 }
3576
3577 static void
3578 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3579                     unsigned_char_dynarr *dst, unsigned int n)
3580 {
3581   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3582   unsigned int flags  = str->flags;
3583   unsigned int ch     = str->ch;
3584   eol_type_t eol_type = str->eol_type;
3585
3586   while (n--)
3587     {
3588       unsigned char c = *src++;
3589       switch (flags)
3590         {
3591         case 0:
3592           if ( c >= 0xfc )
3593             {
3594               ch = c & 0x01;
3595               flags = 5;
3596             }
3597           else if ( c >= 0xf8 )
3598             {
3599               ch = c & 0x03;
3600               flags = 4;
3601             }
3602           else if ( c >= 0xf0 )
3603             {
3604               ch = c & 0x07;
3605               flags = 3;
3606             }
3607           else if ( c >= 0xe0 )
3608             {
3609               ch = c & 0x0f;
3610               flags = 2;
3611             }
3612           else if ( c >= 0xc0 )
3613             {
3614               ch = c & 0x1f;
3615               flags = 1;
3616             }
3617           else
3618             {
3619               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3620               decode_ucs4 (c, dst);
3621             }
3622           break;
3623         case 1:
3624           ch = ( ch << 6 ) | ( c & 0x3f );
3625           decode_ucs4 (ch, dst);
3626           ch = 0;
3627           flags = 0;
3628           break;
3629         default:
3630           ch = ( ch << 6 ) | ( c & 0x3f );
3631           flags--;
3632         }
3633     label_continue_loop:;
3634     }
3635
3636   if (flags & CODING_STATE_END)
3637     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3638
3639   str->flags = flags;
3640   str->ch    = ch;
3641 }
3642
3643 #ifndef UTF2000
3644 static void
3645 encode_utf8 (Lisp_Object charset,
3646              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3647 {
3648   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3649   if ( code <= 0x7f )
3650     {
3651       Dynarr_add (dst, code);
3652     }
3653   else if ( code <= 0x7ff )
3654     {
3655       Dynarr_add (dst, (code >> 6) | 0xc0);
3656       Dynarr_add (dst, (code & 0x3f) | 0x80);
3657     }
3658   else if ( code <= 0xffff )
3659     {
3660       Dynarr_add (dst,  (code >> 12) | 0xe0);
3661       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3662       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3663     }
3664   else if ( code <= 0x1fffff )
3665     {
3666       Dynarr_add (dst,  (code >> 18) | 0xf0);
3667       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3668       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3669       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3670     }
3671   else if ( code <= 0x3ffffff )
3672     {
3673       Dynarr_add (dst,  (code >> 24) | 0xf8);
3674       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3675       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3676       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3677       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3678     }
3679   else
3680     {
3681       Dynarr_add (dst,  (code >> 30) | 0xfc);
3682       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3683       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3684       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3685       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3686       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3687     }
3688 }
3689 #endif
3690
3691 static void
3692 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3693                     unsigned_char_dynarr *dst, unsigned int n)
3694 {
3695   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3696   unsigned int flags  = str->flags;
3697   unsigned int ch     = str->ch;
3698   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3699   unsigned char char_boundary = str->iso2022.current_char_boundary;
3700 #ifdef UTF2000
3701
3702   while (n--)
3703     {
3704       unsigned char c = *src++;
3705       switch (char_boundary)
3706         {
3707         case 0:
3708           if ( c >= 0xfc )
3709             {
3710               Dynarr_add (dst, c);
3711               char_boundary = 5;
3712             }
3713           else if ( c >= 0xf8 )
3714             {
3715               Dynarr_add (dst, c);
3716               char_boundary = 4;
3717             }
3718           else if ( c >= 0xf0 )
3719             {
3720               Dynarr_add (dst, c);
3721               char_boundary = 3;
3722             }
3723           else if ( c >= 0xe0 )
3724             {
3725               Dynarr_add (dst, c);
3726               char_boundary = 2;
3727             }
3728           else if ( c >= 0xc0 )
3729             {
3730               Dynarr_add (dst, c);
3731               char_boundary = 1;
3732             }
3733           else
3734             {
3735               if (c == '\n')
3736                 {
3737                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3738                     Dynarr_add (dst, '\r');
3739                   if (eol_type != EOL_CR)
3740                     Dynarr_add (dst, c);
3741                 }
3742               else
3743                 Dynarr_add (dst, c);
3744               char_boundary = 0;
3745             }
3746           break;
3747         case 1:
3748           Dynarr_add (dst, c);
3749           char_boundary = 0;
3750           break;
3751         default:
3752           Dynarr_add (dst, c);
3753           char_boundary--;
3754         }
3755     }
3756 #else /* not UTF2000 */
3757   Lisp_Object charset = str->iso2022.current_charset;
3758
3759 #ifdef ENABLE_COMPOSITE_CHARS
3760   /* flags for handling composite chars.  We do a little switcharoo
3761      on the source while we're outputting the composite char. */
3762   unsigned int saved_n = 0;
3763   CONST unsigned char *saved_src = NULL;
3764   int in_composite = 0;
3765
3766  back_to_square_n:
3767 #endif /* ENABLE_COMPOSITE_CHARS */
3768
3769   while (n--)
3770     {
3771       unsigned char c = *src++;
3772
3773       if (BYTE_ASCII_P (c))
3774         {               /* Processing ASCII character */
3775           ch = 0;
3776           if (c == '\n')
3777             {
3778               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3779                 Dynarr_add (dst, '\r');
3780               if (eol_type != EOL_CR)
3781                 Dynarr_add (dst, c);
3782             }
3783           else
3784             encode_utf8 (Vcharset_ascii, c, 0, dst);
3785           char_boundary = 1;
3786         }
3787       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3788         { /* Processing Leading Byte */
3789           ch = 0;
3790           charset = CHARSET_BY_LEADING_BYTE (c);
3791           if (LEADING_BYTE_PREFIX_P(c))
3792             ch = c;
3793           char_boundary = 0;
3794         }
3795       else
3796         {                       /* Processing Non-ASCII character */
3797           char_boundary = 1;
3798           if (EQ (charset, Vcharset_control_1))
3799             {
3800               encode_utf8 (Vcharset_control_1, c, 0, dst);
3801             }
3802           else
3803             {
3804               switch (XCHARSET_REP_BYTES (charset))
3805                 {
3806                 case 2:
3807                   encode_utf8 (charset, c, 0, dst);
3808                   break;
3809                 case 3:
3810                   if (XCHARSET_PRIVATE_P (charset))
3811                     {
3812                       encode_utf8 (charset, c, 0, dst);
3813                       ch = 0;
3814                     }
3815                   else if (ch)
3816                     {
3817 #ifdef ENABLE_COMPOSITE_CHARS
3818                       if (EQ (charset, Vcharset_composite))
3819                         {
3820                           if (in_composite)
3821                             {
3822                               /* #### Bother! We don't know how to
3823                                  handle this yet. */
3824                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
3825                             }
3826                           else
3827                             {
3828                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3829                                                        ch & 0x7F, c & 0x7F);
3830                               Lisp_Object lstr = composite_char_string (emch);
3831                               saved_n = n;
3832                               saved_src = src;
3833                               in_composite = 1;
3834                               src = XSTRING_DATA   (lstr);
3835                               n   = XSTRING_LENGTH (lstr);
3836                             }
3837                         }
3838                       else
3839 #endif /* ENABLE_COMPOSITE_CHARS */
3840                         {
3841                           encode_utf8 (charset, ch, c, dst);
3842                         }
3843                       ch = 0;
3844                     }
3845                   else
3846                     {
3847                       ch = c;
3848                       char_boundary = 0;
3849                     }
3850                   break;
3851                 case 4:
3852                   if (ch)
3853                     {
3854                       encode_utf8 (charset, ch, c, dst);
3855                       ch = 0;
3856                     }
3857                   else
3858                     {
3859                       ch = c;
3860                       char_boundary = 0;
3861                     }
3862                   break;
3863                 default:
3864                   abort ();
3865                 }
3866             }
3867         }
3868     }
3869
3870 #ifdef ENABLE_COMPOSITE_CHARS
3871   if (in_composite)
3872     {
3873       n = saved_n;
3874       src = saved_src;
3875       in_composite = 0;
3876       goto back_to_square_n; /* Wheeeeeeeee ..... */
3877     }
3878 #endif
3879
3880 #endif /* not UTF2000 */
3881   str->flags = flags;
3882   str->ch    = ch;
3883   str->iso2022.current_char_boundary = char_boundary;
3884 #ifndef UTF2000
3885   str->iso2022.current_charset = charset;
3886 #endif
3887
3888   /* Verbum caro factum est! */
3889 }
3890
3891 \f
3892 /************************************************************************/
3893 /*                           ISO2022 methods                            */
3894 /************************************************************************/
3895
3896 /* The following note describes the coding system ISO2022 briefly.
3897    Since the intention of this note is to help understand the
3898    functions in this file, some parts are NOT ACCURATE or OVERLY
3899    SIMPLIFIED.  For thorough understanding, please refer to the
3900    original document of ISO2022.
3901
3902    ISO2022 provides many mechanisms to encode several character sets
3903    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3904    is encoded using bytes less than 128.  This may make the encoded
3905    text a little bit longer, but the text passes more easily through
3906    several gateways, some of which strip off MSB (Most Signigant Bit).
3907
3908    There are two kinds of character sets: control character set and
3909    graphic character set.  The former contains control characters such
3910    as `newline' and `escape' to provide control functions (control
3911    functions are also provided by escape sequences).  The latter
3912    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3913    two control character sets and many graphic character sets.
3914
3915    Graphic character sets are classified into one of the following
3916    four classes, according to the number of bytes (DIMENSION) and
3917    number of characters in one dimension (CHARS) of the set:
3918    - DIMENSION1_CHARS94
3919    - DIMENSION1_CHARS96
3920    - DIMENSION2_CHARS94
3921    - DIMENSION2_CHARS96
3922
3923    In addition, each character set is assigned an identification tag,
3924    unique for each set, called "final character" (denoted as <F>
3925    hereafter).  The <F> of each character set is decided by ECMA(*)
3926    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3927    (0x30..0x3F are for private use only).
3928
3929    Note (*): ECMA = European Computer Manufacturers Association
3930
3931    Here are examples of graphic character set [NAME(<F>)]:
3932         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3933         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3934         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3935         o DIMENSION2_CHARS96 -- none for the moment
3936
3937    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3938         C0 [0x00..0x1F] -- control character plane 0
3939         GL [0x20..0x7F] -- graphic character plane 0
3940         C1 [0x80..0x9F] -- control character plane 1
3941         GR [0xA0..0xFF] -- graphic character plane 1
3942
3943    A control character set is directly designated and invoked to C0 or
3944    C1 by an escape sequence.  The most common case is that:
3945    - ISO646's  control character set is designated/invoked to C0, and
3946    - ISO6429's control character set is designated/invoked to C1,
3947    and usually these designations/invocations are omitted in encoded
3948    text.  In a 7-bit environment, only C0 can be used, and a control
3949    character for C1 is encoded by an appropriate escape sequence to
3950    fit into the environment.  All control characters for C1 are
3951    defined to have corresponding escape sequences.
3952
3953    A graphic character set is at first designated to one of four
3954    graphic registers (G0 through G3), then these graphic registers are
3955    invoked to GL or GR.  These designations and invocations can be
3956    done independently.  The most common case is that G0 is invoked to
3957    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3958    these invocations and designations are omitted in encoded text.
3959    In a 7-bit environment, only GL can be used.
3960
3961    When a graphic character set of CHARS94 is invoked to GL, codes
3962    0x20 and 0x7F of the GL area work as control characters SPACE and
3963    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3964    be used.
3965
3966    There are two ways of invocation: locking-shift and single-shift.
3967    With locking-shift, the invocation lasts until the next different
3968    invocation, whereas with single-shift, the invocation affects the
3969    following character only and doesn't affect the locking-shift
3970    state.  Invocations are done by the following control characters or
3971    escape sequences:
3972
3973    ----------------------------------------------------------------------
3974    abbrev  function                  cntrl escape seq   description
3975    ----------------------------------------------------------------------
3976    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3977    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3978    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3979    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3980    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3981    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3982    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3983    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3984    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3985    ----------------------------------------------------------------------
3986    (*) These are not used by any known coding system.
3987
3988    Control characters for these functions are defined by macros
3989    ISO_CODE_XXX in `coding.h'.
3990
3991    Designations are done by the following escape sequences:
3992    ----------------------------------------------------------------------
3993    escape sequence      description
3994    ----------------------------------------------------------------------
3995    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3996    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3997    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3998    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3999    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4000    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4001    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4002    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4003    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4004    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4005    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4006    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4007    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4008    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4009    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4010    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4011    ----------------------------------------------------------------------
4012
4013    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4014    of dimension 1, chars 94, and final character <F>, etc...
4015
4016    Note (*): Although these designations are not allowed in ISO2022,
4017    Emacs accepts them on decoding, and produces them on encoding
4018    CHARS96 character sets in a coding system which is characterized as
4019    7-bit environment, non-locking-shift, and non-single-shift.
4020
4021    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4022    '(' can be omitted.  We refer to this as "short-form" hereafter.
4023
4024    Now you may notice that there are a lot of ways for encoding the
4025    same multilingual text in ISO2022.  Actually, there exist many
4026    coding systems such as Compound Text (used in X11's inter client
4027    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4028    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4029    localized platforms), and all of these are variants of ISO2022.
4030
4031    In addition to the above, Emacs handles two more kinds of escape
4032    sequences: ISO6429's direction specification and Emacs' private
4033    sequence for specifying character composition.
4034
4035    ISO6429's direction specification takes the following form:
4036         o CSI ']'      -- end of the current direction
4037         o CSI '0' ']'  -- end of the current direction
4038         o CSI '1' ']'  -- start of left-to-right text
4039         o CSI '2' ']'  -- start of right-to-left text
4040    The control character CSI (0x9B: control sequence introducer) is
4041    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4042
4043    Character composition specification takes the following form:
4044         o ESC '0' -- start character composition
4045         o ESC '1' -- end character composition
4046    Since these are not standard escape sequences of any ISO standard,
4047    their use with these meanings is restricted to Emacs only.  */
4048
4049 static void
4050 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4051 {
4052   int i;
4053
4054   for (i = 0; i < 4; i++)
4055     {
4056       if (!NILP (coding_system))
4057         iso->charset[i] =
4058           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4059       else
4060         iso->charset[i] = Qt;
4061       iso->invalid_designated[i] = 0;
4062     }
4063   iso->esc = ISO_ESC_NOTHING;
4064   iso->esc_bytes_index = 0;
4065   iso->register_left = 0;
4066   iso->register_right = 1;
4067   iso->switched_dir_and_no_valid_charset_yet = 0;
4068   iso->invalid_switch_dir = 0;
4069   iso->output_direction_sequence = 0;
4070   iso->output_literally = 0;
4071 #ifdef ENABLE_COMPOSITE_CHARS
4072   if (iso->composite_chars)
4073     Dynarr_reset (iso->composite_chars);
4074 #endif
4075 }
4076
4077 static int
4078 fit_to_be_escape_quoted (unsigned char c)
4079 {
4080   switch (c)
4081     {
4082     case ISO_CODE_ESC:
4083     case ISO_CODE_CSI:
4084     case ISO_CODE_SS2:
4085     case ISO_CODE_SS3:
4086     case ISO_CODE_SO:
4087     case ISO_CODE_SI:
4088       return 1;
4089
4090     default:
4091       return 0;
4092     }
4093 }
4094
4095 /* Parse one byte of an ISO2022 escape sequence.
4096    If the result is an invalid escape sequence, return 0 and
4097    do not change anything in STR.  Otherwise, if the result is
4098    an incomplete escape sequence, update ISO2022.ESC and
4099    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4100    all the state variables (but not ISO2022.ESC_BYTES) and
4101    return 1.
4102
4103    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4104    or invocation of an invalid character set and treat that as
4105    an unrecognized escape sequence. */
4106
4107 static int
4108 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4109                    unsigned char c, unsigned int *flags,
4110                    int check_invalid_charsets)
4111 {
4112   /* (1) If we're at the end of a designation sequence, CS is the
4113      charset being designated and REG is the register to designate
4114      it to.
4115
4116      (2) If we're at the end of a locking-shift sequence, REG is
4117      the register to invoke and HALF (0 == left, 1 == right) is
4118      the half to invoke it into.
4119
4120      (3) If we're at the end of a single-shift sequence, REG is
4121      the register to invoke. */
4122   Lisp_Object cs = Qnil;
4123   int reg, half;
4124
4125   /* NOTE: This code does goto's all over the fucking place.
4126      The reason for this is that we're basically implementing
4127      a state machine here, and hierarchical languages like C
4128      don't really provide a clean way of doing this. */
4129
4130   if (! (*flags & CODING_STATE_ESCAPE))
4131     /* At beginning of escape sequence; we need to reset our
4132        escape-state variables. */
4133     iso->esc = ISO_ESC_NOTHING;
4134
4135   iso->output_literally = 0;
4136   iso->output_direction_sequence = 0;
4137
4138   switch (iso->esc)
4139     {
4140     case ISO_ESC_NOTHING:
4141       iso->esc_bytes_index = 0;
4142       switch (c)
4143         {
4144         case ISO_CODE_ESC:      /* Start escape sequence */
4145           *flags |= CODING_STATE_ESCAPE;
4146           iso->esc = ISO_ESC;
4147           goto not_done;
4148
4149         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4150           *flags |= CODING_STATE_ESCAPE;
4151           iso->esc = ISO_ESC_5_11;
4152           goto not_done;
4153
4154         case ISO_CODE_SO:       /* locking shift 1 */
4155           reg = 1; half = 0;
4156           goto locking_shift;
4157         case ISO_CODE_SI:       /* locking shift 0 */
4158           reg = 0; half = 0;
4159           goto locking_shift;
4160
4161         case ISO_CODE_SS2:      /* single shift */
4162           reg = 2;
4163           goto single_shift;
4164         case ISO_CODE_SS3:      /* single shift */
4165           reg = 3;
4166           goto single_shift;
4167
4168         default:                        /* Other control characters */
4169           return 0;
4170         }
4171
4172     case ISO_ESC:
4173       switch (c)
4174         {
4175           /**** single shift ****/
4176
4177         case 'N':       /* single shift 2 */
4178           reg = 2;
4179           goto single_shift;
4180         case 'O':       /* single shift 3 */
4181           reg = 3;
4182           goto single_shift;
4183
4184           /**** locking shift ****/
4185
4186         case '~':       /* locking shift 1 right */
4187           reg = 1; half = 1;
4188           goto locking_shift;
4189         case 'n':       /* locking shift 2 */
4190           reg = 2; half = 0;
4191           goto locking_shift;
4192         case '}':       /* locking shift 2 right */
4193           reg = 2; half = 1;
4194           goto locking_shift;
4195         case 'o':       /* locking shift 3 */
4196           reg = 3; half = 0;
4197           goto locking_shift;
4198         case '|':       /* locking shift 3 right */
4199           reg = 3; half = 1;
4200           goto locking_shift;
4201
4202 #ifdef ENABLE_COMPOSITE_CHARS
4203           /**** composite ****/
4204
4205         case '0':
4206           iso->esc = ISO_ESC_START_COMPOSITE;
4207           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4208             CODING_STATE_COMPOSITE;
4209           return 1;
4210
4211         case '1':
4212           iso->esc = ISO_ESC_END_COMPOSITE;
4213           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4214             ~CODING_STATE_COMPOSITE;
4215           return 1;
4216 #endif /* ENABLE_COMPOSITE_CHARS */
4217
4218           /**** directionality ****/
4219
4220         case '[':
4221           iso->esc = ISO_ESC_5_11;
4222           goto not_done;
4223
4224           /**** designation ****/
4225
4226         case '$':       /* multibyte charset prefix */
4227           iso->esc = ISO_ESC_2_4;
4228           goto not_done;
4229
4230         default:
4231           if (0x28 <= c && c <= 0x2F)
4232             {
4233               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4234               goto not_done;
4235             }
4236
4237           /* This function is called with CODESYS equal to nil when
4238              doing coding-system detection. */
4239           if (!NILP (codesys)
4240               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4241               && fit_to_be_escape_quoted (c))
4242             {
4243               iso->esc = ISO_ESC_LITERAL;
4244               *flags &= CODING_STATE_ISO2022_LOCK;
4245               return 1;
4246             }
4247
4248           /* bzzzt! */
4249           return 0;
4250         }
4251
4252
4253
4254       /**** directionality ****/
4255
4256     case ISO_ESC_5_11:          /* ISO6429 direction control */
4257       if (c == ']')
4258         {
4259           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4260           goto directionality;
4261         }
4262       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4263       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4264       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4265       else               return 0;
4266       goto not_done;
4267
4268     case ISO_ESC_5_11_0:
4269       if (c == ']')
4270         {
4271           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4272           goto directionality;
4273         }
4274       return 0;
4275
4276     case ISO_ESC_5_11_1:
4277       if (c == ']')
4278         {
4279           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4280           goto directionality;
4281         }
4282       return 0;
4283
4284     case ISO_ESC_5_11_2:
4285       if (c == ']')
4286         {
4287           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4288           goto directionality;
4289         }
4290       return 0;
4291
4292     directionality:
4293       iso->esc = ISO_ESC_DIRECTIONALITY;
4294       /* Various junk here to attempt to preserve the direction sequences
4295          literally in the text if they would otherwise be swallowed due
4296          to invalid designations that don't show up as actual charset
4297          changes in the text. */
4298       if (iso->invalid_switch_dir)
4299         {
4300           /* We already inserted a direction switch literally into the
4301              text.  We assume (#### this may not be right) that the
4302              next direction switch is the one going the other way,
4303              and we need to output that literally as well. */
4304           iso->output_literally = 1;
4305           iso->invalid_switch_dir = 0;
4306         }
4307       else
4308         {
4309           int jj;
4310
4311           /* If we are in the thrall of an invalid designation,
4312            then stick the directionality sequence literally into the
4313            output stream so it ends up in the original text again. */
4314           for (jj = 0; jj < 4; jj++)
4315             if (iso->invalid_designated[jj])
4316               break;
4317           if (jj < 4)
4318             {
4319               iso->output_literally = 1;
4320               iso->invalid_switch_dir = 1;
4321             }
4322           else
4323             /* Indicate that we haven't yet seen a valid designation,
4324                so that if a switch-dir is directly followed by an
4325                invalid designation, both get inserted literally. */
4326             iso->switched_dir_and_no_valid_charset_yet = 1;
4327         }
4328       return 1;
4329
4330
4331       /**** designation ****/
4332
4333     case ISO_ESC_2_4:
4334       if (0x28 <= c && c <= 0x2F)
4335         {
4336           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4337           goto not_done;
4338         }
4339       if (0x40 <= c && c <= 0x42)
4340         {
4341           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4342                                       *flags & CODING_STATE_R2L ?
4343                                       CHARSET_RIGHT_TO_LEFT :
4344                                       CHARSET_LEFT_TO_RIGHT);
4345           reg = 0;
4346           goto designated;
4347         }
4348       return 0;
4349
4350     default:
4351       {
4352         int type =-1;
4353
4354         if (c < '0' || c > '~')
4355           return 0; /* bad final byte */
4356
4357         if (iso->esc >= ISO_ESC_2_8 &&
4358             iso->esc <= ISO_ESC_2_15)
4359           {
4360             type = ((iso->esc >= ISO_ESC_2_12) ?
4361                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4362             reg = (iso->esc - ISO_ESC_2_8) & 3;
4363           }
4364         else if (iso->esc >= ISO_ESC_2_4_8 &&
4365                  iso->esc <= ISO_ESC_2_4_15)
4366           {
4367             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4368                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4369             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4370           }
4371         else
4372           {
4373             /* Can this ever be reached? -slb */
4374             abort();
4375           }
4376
4377         cs = CHARSET_BY_ATTRIBUTES (type, c,
4378                                     *flags & CODING_STATE_R2L ?
4379                                     CHARSET_RIGHT_TO_LEFT :
4380                                     CHARSET_LEFT_TO_RIGHT);
4381         goto designated;
4382       }
4383     }
4384
4385  not_done:
4386   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4387   return -1;
4388
4389  single_shift:
4390   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4391     /* can't invoke something that ain't there. */
4392     return 0;
4393   iso->esc = ISO_ESC_SINGLE_SHIFT;
4394   *flags &= CODING_STATE_ISO2022_LOCK;
4395   if (reg == 2)
4396     *flags |= CODING_STATE_SS2;
4397   else
4398     *flags |= CODING_STATE_SS3;
4399   return 1;
4400
4401  locking_shift:
4402   if (check_invalid_charsets &&
4403       !CHARSETP (iso->charset[reg]))
4404     /* can't invoke something that ain't there. */
4405     return 0;
4406   if (half)
4407     iso->register_right = reg;
4408   else
4409     iso->register_left = reg;
4410   *flags &= CODING_STATE_ISO2022_LOCK;
4411   iso->esc = ISO_ESC_LOCKING_SHIFT;
4412   return 1;
4413
4414  designated:
4415   if (NILP (cs) && check_invalid_charsets)
4416     {
4417       iso->invalid_designated[reg] = 1;
4418       iso->charset[reg] = Vcharset_ascii;
4419       iso->esc = ISO_ESC_DESIGNATE;
4420       *flags &= CODING_STATE_ISO2022_LOCK;
4421       iso->output_literally = 1;
4422       if (iso->switched_dir_and_no_valid_charset_yet)
4423         {
4424           /* We encountered a switch-direction followed by an
4425              invalid designation.  Ensure that the switch-direction
4426              gets outputted; otherwise it will probably get eaten
4427              when the text is written out again. */
4428           iso->switched_dir_and_no_valid_charset_yet = 0;
4429           iso->output_direction_sequence = 1;
4430           /* And make sure that the switch-dir going the other
4431              way gets outputted, as well. */
4432           iso->invalid_switch_dir = 1;
4433         }
4434       return 1;
4435     }
4436   /* This function is called with CODESYS equal to nil when
4437      doing coding-system detection. */
4438   if (!NILP (codesys))
4439     {
4440       charset_conversion_spec_dynarr *dyn =
4441         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4442
4443       if (dyn)
4444         {
4445           int i;
4446
4447           for (i = 0; i < Dynarr_length (dyn); i++)
4448             {
4449               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4450               if (EQ (cs, spec->from_charset))
4451                 cs = spec->to_charset;
4452             }
4453         }
4454     }
4455
4456   iso->charset[reg] = cs;
4457   iso->esc = ISO_ESC_DESIGNATE;
4458   *flags &= CODING_STATE_ISO2022_LOCK;
4459   if (iso->invalid_designated[reg])
4460     {
4461       iso->invalid_designated[reg] = 0;
4462       iso->output_literally = 1;
4463     }
4464   if (iso->switched_dir_and_no_valid_charset_yet)
4465     iso->switched_dir_and_no_valid_charset_yet = 0;
4466   return 1;
4467 }
4468
4469 static int
4470 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4471                        unsigned int n)
4472 {
4473   int mask;
4474
4475   /* #### There are serious deficiencies in the recognition mechanism
4476      here.  This needs to be much smarter if it's going to cut it.
4477      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4478      it should be detected as Latin-1.
4479      All the ISO2022 stuff in this file should be synced up with the
4480      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4481      Perhaps we should wait till R2L works in FSF Emacs? */
4482
4483   if (!st->iso2022.initted)
4484     {
4485       reset_iso2022 (Qnil, &st->iso2022.iso);
4486       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4487                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4488                           CODING_CATEGORY_ISO_8_1_MASK |
4489                           CODING_CATEGORY_ISO_8_2_MASK |
4490                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4491       st->iso2022.flags = 0;
4492       st->iso2022.high_byte_count = 0;
4493       st->iso2022.saw_single_shift = 0;
4494       st->iso2022.initted = 1;
4495     }
4496
4497   mask = st->iso2022.mask;
4498
4499   while (n--)
4500     {
4501       int c = *src++;
4502       if (c >= 0xA0)
4503         {
4504           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4505           st->iso2022.high_byte_count++;
4506         }
4507       else
4508         {
4509           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4510             {
4511               if (st->iso2022.high_byte_count & 1)
4512                 /* odd number of high bytes; assume not iso-8-2 */
4513                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4514             }
4515           st->iso2022.high_byte_count = 0;
4516           st->iso2022.saw_single_shift = 0;
4517           if (c > 0x80)
4518             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4519         }
4520       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4521           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4522         { /* control chars */
4523           switch (c)
4524             {
4525               /* Allow and ignore control characters that you might
4526                  reasonably see in a text file */
4527             case '\r':
4528             case '\n':
4529             case '\t':
4530             case  7: /* bell */
4531             case  8: /* backspace */
4532             case 11: /* vertical tab */
4533             case 12: /* form feed */
4534             case 26: /* MS-DOS C-z junk */
4535             case 31: /* '^_' -- for info */
4536               goto label_continue_loop;
4537
4538             default:
4539               break;
4540             }
4541         }
4542
4543       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4544           || BYTE_C1_P (c))
4545         {
4546           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4547                                  &st->iso2022.flags, 0))
4548             {
4549               switch (st->iso2022.iso.esc)
4550                 {
4551                 case ISO_ESC_DESIGNATE:
4552                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4553                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4554                   break;
4555                 case ISO_ESC_LOCKING_SHIFT:
4556                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4557                   goto ran_out_of_chars;
4558                 case ISO_ESC_SINGLE_SHIFT:
4559                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4560                   st->iso2022.saw_single_shift = 1;
4561                   break;
4562                 default:
4563                   break;
4564                 }
4565             }
4566           else
4567             {
4568               mask = 0;
4569               goto ran_out_of_chars;
4570             }
4571         }
4572     label_continue_loop:;
4573     }
4574
4575  ran_out_of_chars:
4576
4577   return mask;
4578 }
4579
4580 static int
4581 postprocess_iso2022_mask (int mask)
4582 {
4583   /* #### kind of cheesy */
4584   /* If seven-bit ISO is allowed, then assume that the encoding is
4585      entirely seven-bit and turn off the eight-bit ones. */
4586   if (mask & CODING_CATEGORY_ISO_7_MASK)
4587     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4588                CODING_CATEGORY_ISO_8_1_MASK |
4589                CODING_CATEGORY_ISO_8_2_MASK);
4590   return mask;
4591 }
4592
4593 /* If FLAGS is a null pointer or specifies right-to-left motion,
4594    output a switch-dir-to-left-to-right sequence to DST.
4595    Also update FLAGS if it is not a null pointer.
4596    If INTERNAL_P is set, we are outputting in internal format and
4597    need to handle the CSI differently. */
4598
4599 static void
4600 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4601                                  unsigned_char_dynarr *dst,
4602                                  unsigned int *flags,
4603                                  int internal_p)
4604 {
4605   if (!flags || (*flags & CODING_STATE_R2L))
4606     {
4607       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4608         {
4609           Dynarr_add (dst, ISO_CODE_ESC);
4610           Dynarr_add (dst, '[');
4611         }
4612       else if (internal_p)
4613         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4614       else
4615         Dynarr_add (dst, ISO_CODE_CSI);
4616       Dynarr_add (dst, '0');
4617       Dynarr_add (dst, ']');
4618       if (flags)
4619         *flags &= ~CODING_STATE_R2L;
4620     }
4621 }
4622
4623 /* If FLAGS is a null pointer or specifies a direction different from
4624    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4625    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4626    sequence to DST.  Also update FLAGS if it is not a null pointer.
4627    If INTERNAL_P is set, we are outputting in internal format and
4628    need to handle the CSI differently. */
4629
4630 static void
4631 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4632                           unsigned_char_dynarr *dst, unsigned int *flags,
4633                           int internal_p)
4634 {
4635   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4636       direction == CHARSET_LEFT_TO_RIGHT)
4637     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4638   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4639            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4640            direction == CHARSET_RIGHT_TO_LEFT)
4641     {
4642       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4643         {
4644           Dynarr_add (dst, ISO_CODE_ESC);
4645           Dynarr_add (dst, '[');
4646         }
4647       else if (internal_p)
4648         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4649       else
4650         Dynarr_add (dst, ISO_CODE_CSI);
4651       Dynarr_add (dst, '2');
4652       Dynarr_add (dst, ']');
4653       if (flags)
4654         *flags |= CODING_STATE_R2L;
4655     }
4656 }
4657
4658 /* Convert ISO2022-format data to internal format. */
4659
4660 static void
4661 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4662                        unsigned_char_dynarr *dst, unsigned int n)
4663 {
4664   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4665   unsigned int flags  = str->flags;
4666   unsigned int ch     = str->ch;
4667   eol_type_t eol_type = str->eol_type;
4668 #ifdef ENABLE_COMPOSITE_CHARS
4669   unsigned_char_dynarr *real_dst = dst;
4670 #endif
4671   Lisp_Object coding_system;
4672
4673   XSETCODING_SYSTEM (coding_system, str->codesys);
4674
4675 #ifdef ENABLE_COMPOSITE_CHARS
4676   if (flags & CODING_STATE_COMPOSITE)
4677     dst = str->iso2022.composite_chars;
4678 #endif /* ENABLE_COMPOSITE_CHARS */
4679
4680   while (n--)
4681     {
4682       unsigned char c = *src++;
4683       if (flags & CODING_STATE_ESCAPE)
4684         {       /* Within ESC sequence */
4685           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4686                                           c, &flags, 1);
4687
4688           if (retval)
4689             {
4690               switch (str->iso2022.esc)
4691                 {
4692 #ifdef ENABLE_COMPOSITE_CHARS
4693                 case ISO_ESC_START_COMPOSITE:
4694                   if (str->iso2022.composite_chars)
4695                     Dynarr_reset (str->iso2022.composite_chars);
4696                   else
4697                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4698                   dst = str->iso2022.composite_chars;
4699                   break;
4700                 case ISO_ESC_END_COMPOSITE:
4701                   {
4702                     Bufbyte comstr[MAX_EMCHAR_LEN];
4703                     Bytecount len;
4704                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4705                                                          Dynarr_length (dst));
4706                     dst = real_dst;
4707                     len = set_charptr_emchar (comstr, emch);
4708                     Dynarr_add_many (dst, comstr, len);
4709                     break;
4710                   }
4711 #endif /* ENABLE_COMPOSITE_CHARS */
4712
4713                 case ISO_ESC_LITERAL:
4714                   DECODE_ADD_BINARY_CHAR (c, dst);
4715                   break;
4716
4717                 default:
4718                   /* Everything else handled already */
4719                   break;
4720                 }
4721             }
4722
4723           /* Attempted error recovery. */
4724           if (str->iso2022.output_direction_sequence)
4725             ensure_correct_direction (flags & CODING_STATE_R2L ?
4726                                       CHARSET_RIGHT_TO_LEFT :
4727                                       CHARSET_LEFT_TO_RIGHT,
4728                                       str->codesys, dst, 0, 1);
4729           /* More error recovery. */
4730           if (!retval || str->iso2022.output_literally)
4731             {
4732               /* Output the (possibly invalid) sequence */
4733               int i;
4734               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4735                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4736               flags &= CODING_STATE_ISO2022_LOCK;
4737               if (!retval)
4738                 n++, src--;/* Repeat the loop with the same character. */
4739               else
4740                 {
4741                   /* No sense in reprocessing the final byte of the
4742                      escape sequence; it could mess things up anyway.
4743                      Just add it now. */
4744                   DECODE_ADD_BINARY_CHAR (c, dst);
4745                 }
4746             }
4747           ch = 0;
4748         }
4749       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4750         { /* Control characters */
4751
4752           /***** Error-handling *****/
4753
4754           /* If we were in the middle of a character, dump out the
4755              partial character. */
4756           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4757
4758           /* If we just saw a single-shift character, dump it out.
4759              This may dump out the wrong sort of single-shift character,
4760              but least it will give an indication that something went
4761              wrong. */
4762           if (flags & CODING_STATE_SS2)
4763             {
4764               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4765               flags &= ~CODING_STATE_SS2;
4766             }
4767           if (flags & CODING_STATE_SS3)
4768             {
4769               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4770               flags &= ~CODING_STATE_SS3;
4771             }
4772
4773           /***** Now handle the control characters. *****/
4774
4775           /* Handle CR/LF */
4776           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4777
4778           flags &= CODING_STATE_ISO2022_LOCK;
4779
4780           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4781             DECODE_ADD_BINARY_CHAR (c, dst);
4782         }
4783       else
4784         {                       /* Graphic characters */
4785           Lisp_Object charset;
4786 #ifndef UTF2000
4787           int lb;
4788 #endif
4789           int reg;
4790
4791           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4792
4793           /* Now determine the charset. */
4794           reg = ((flags & CODING_STATE_SS2) ? 2
4795                  : (flags & CODING_STATE_SS3) ? 3
4796                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4797                  : str->iso2022.register_left);
4798           charset = str->iso2022.charset[reg];
4799
4800           /* Error checking: */
4801           if (! CHARSETP (charset)
4802               || str->iso2022.invalid_designated[reg]
4803               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4804                   && XCHARSET_CHARS (charset) == 94))
4805             /* Mrmph.  We are trying to invoke a register that has no
4806                or an invalid charset in it, or trying to add a character
4807                outside the range of the charset.  Insert that char literally
4808                to preserve it for the output. */
4809             {
4810               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4811               DECODE_ADD_BINARY_CHAR (c, dst);
4812             }
4813
4814           else
4815             {
4816               /* Things are probably hunky-dorey. */
4817
4818               /* Fetch reverse charset, maybe. */
4819               if (((flags & CODING_STATE_R2L) &&
4820                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4821                   ||
4822                   (!(flags & CODING_STATE_R2L) &&
4823                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4824                 {
4825                   Lisp_Object new_charset =
4826                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4827                   if (!NILP (new_charset))
4828                     charset = new_charset;
4829                 }
4830
4831 #ifndef UTF2000
4832               lb = XCHARSET_LEADING_BYTE (charset);
4833 #endif
4834               switch (XCHARSET_REP_BYTES (charset))
4835                 {
4836                 case 1: /* ASCII */
4837                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4838                   Dynarr_add (dst, c & 0x7F);
4839                   break;
4840
4841                 case 2: /* one-byte official */
4842                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4843 #ifdef UTF2000
4844                   DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0), dst);
4845 #else
4846                   Dynarr_add (dst, lb);
4847                   Dynarr_add (dst, c | 0x80);
4848 #endif
4849                   break;
4850
4851                 case 3: /* one-byte private or two-byte official */
4852                   if (XCHARSET_PRIVATE_P (charset))
4853                     {
4854                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4855 #ifdef UTF2000
4856                       DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0),
4857                                           dst);
4858 #else
4859                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4860                       Dynarr_add (dst, lb);
4861                       Dynarr_add (dst, c | 0x80);
4862 #endif
4863                     }
4864                   else
4865                     {
4866                       if (ch)
4867                         {
4868 #ifdef UTF2000
4869                           DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
4870                                                         ch & 0x7F,
4871                                                         c & 0x7F), dst);
4872 #else
4873                           Dynarr_add (dst, lb);
4874                           Dynarr_add (dst, ch | 0x80);
4875                           Dynarr_add (dst, c | 0x80);
4876 #endif
4877                           ch = 0;
4878                         }
4879                       else
4880                         ch = c;
4881                     }
4882                   break;
4883
4884                 default:        /* two-byte private */
4885                   if (ch)
4886                     {
4887 #ifdef UTF2000
4888                       DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
4889                                                     ch & 0x7F,
4890                                                     c & 0x7F), dst);
4891 #else
4892                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4893                       Dynarr_add (dst, lb);
4894                       Dynarr_add (dst, ch | 0x80);
4895                       Dynarr_add (dst, c | 0x80);
4896 #endif
4897                       ch = 0;
4898                     }
4899                   else
4900                     ch = c;
4901                 }
4902             }
4903
4904           if (!ch)
4905             flags &= CODING_STATE_ISO2022_LOCK;
4906         }
4907
4908     label_continue_loop:;
4909     }
4910
4911   if (flags & CODING_STATE_END)
4912     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4913
4914   str->flags = flags;
4915   str->ch    = ch;
4916 }
4917
4918
4919 /***** ISO2022 encoder *****/
4920
4921 /* Designate CHARSET into register REG. */
4922
4923 static void
4924 iso2022_designate (Lisp_Object charset, unsigned char reg,
4925                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4926 {
4927   static CONST char inter94[] = "()*+";
4928   static CONST char inter96[] = ",-./";
4929   unsigned int type;
4930   unsigned char final;
4931   Lisp_Object old_charset = str->iso2022.charset[reg];
4932
4933   str->iso2022.charset[reg] = charset;
4934   if (!CHARSETP (charset))
4935     /* charset might be an initial nil or t. */
4936     return;
4937   type = XCHARSET_TYPE (charset);
4938   final = XCHARSET_FINAL (charset);
4939   if (!str->iso2022.force_charset_on_output[reg] &&
4940       CHARSETP (old_charset) &&
4941       XCHARSET_TYPE (old_charset) == type &&
4942       XCHARSET_FINAL (old_charset) == final)
4943     return;
4944
4945   str->iso2022.force_charset_on_output[reg] = 0;
4946
4947   {
4948     charset_conversion_spec_dynarr *dyn =
4949       str->codesys->iso2022.output_conv;
4950
4951     if (dyn)
4952       {
4953         int i;
4954
4955         for (i = 0; i < Dynarr_length (dyn); i++)
4956           {
4957             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4958             if (EQ (charset, spec->from_charset))
4959                 charset = spec->to_charset;
4960           }
4961       }
4962   }
4963
4964   Dynarr_add (dst, ISO_CODE_ESC);
4965   switch (type)
4966     {
4967     case CHARSET_TYPE_94:
4968       Dynarr_add (dst, inter94[reg]);
4969       break;
4970     case CHARSET_TYPE_96:
4971       Dynarr_add (dst, inter96[reg]);
4972       break;
4973     case CHARSET_TYPE_94X94:
4974       Dynarr_add (dst, '$');
4975       if (reg != 0
4976           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4977           || final < '@'
4978           || final > 'B')
4979         Dynarr_add (dst, inter94[reg]);
4980       break;
4981     case CHARSET_TYPE_96X96:
4982       Dynarr_add (dst, '$');
4983       Dynarr_add (dst, inter96[reg]);
4984       break;
4985     }
4986   Dynarr_add (dst, final);
4987 }
4988
4989 static void
4990 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4991 {
4992   if (str->iso2022.register_left != 0)
4993     {
4994       Dynarr_add (dst, ISO_CODE_SI);
4995       str->iso2022.register_left = 0;
4996     }
4997 }
4998
4999 static void
5000 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5001 {
5002   if (str->iso2022.register_left != 1)
5003     {
5004       Dynarr_add (dst, ISO_CODE_SO);
5005       str->iso2022.register_left = 1;
5006     }
5007 }
5008
5009 /* Convert internally-formatted data to ISO2022 format. */
5010
5011 static void
5012 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5013                        unsigned_char_dynarr *dst, unsigned int n)
5014 {
5015   unsigned char charmask, c;
5016   unsigned char char_boundary;
5017   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5018   unsigned int flags          = str->flags;
5019   Emchar ch                   = str->ch;
5020   Lisp_Coding_System *codesys = str->codesys;
5021   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5022   int i;
5023   Lisp_Object charset;
5024   int half;
5025 #ifdef UTF2000
5026   unsigned int byte1, byte2;
5027 #endif
5028
5029 #ifdef ENABLE_COMPOSITE_CHARS
5030   /* flags for handling composite chars.  We do a little switcharoo
5031      on the source while we're outputting the composite char. */
5032   unsigned int saved_n = 0;
5033   CONST unsigned char *saved_src = NULL;
5034   int in_composite = 0;
5035 #endif /* ENABLE_COMPOSITE_CHARS */
5036
5037   char_boundary = str->iso2022.current_char_boundary;
5038   charset = str->iso2022.current_charset;
5039   half = str->iso2022.current_half;
5040
5041 #ifdef ENABLE_COMPOSITE_CHARS
5042  back_to_square_n:
5043 #endif
5044 #ifdef UTF2000
5045   while (n--)
5046     {
5047       c = *src++;
5048
5049       switch (char_boundary)
5050         {
5051         case 0:
5052           if ( c >= 0xfc )
5053             {
5054               ch = c & 0x01;
5055               char_boundary = 5;
5056             }
5057           else if ( c >= 0xf8 )
5058             {
5059               ch = c & 0x03;
5060               char_boundary = 4;
5061             }
5062           else if ( c >= 0xf0 )
5063             {
5064               ch = c & 0x07;
5065               char_boundary = 3;
5066             }
5067           else if ( c >= 0xe0 )
5068             {
5069               ch = c & 0x0f;
5070               char_boundary = 2;
5071             }
5072           else if ( c >= 0xc0 )
5073             {
5074               ch = c & 0x1f;
5075               char_boundary = 1;
5076             }
5077           else
5078             {
5079               ch = 0;
5080
5081               restore_left_to_right_direction (codesys, dst, &flags, 0);
5082
5083               /* Make sure G0 contains ASCII */
5084               if ((c > ' ' && c < ISO_CODE_DEL) ||
5085                   !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5086                 {
5087                   ensure_normal_shift (str, dst);
5088                   iso2022_designate (Vcharset_ascii, 0, str, dst);
5089                 }
5090
5091               /* If necessary, restore everything to the default state
5092                  at end-of-line */
5093               if (c == '\n' &&
5094                   !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5095                 {
5096                   restore_left_to_right_direction (codesys, dst, &flags, 0);
5097
5098                   ensure_normal_shift (str, dst);
5099
5100                   for (i = 0; i < 4; i++)
5101                     {
5102                       Lisp_Object initial_charset =
5103                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5104                       iso2022_designate (initial_charset, i, str, dst);
5105                     }
5106                 }
5107               if (c == '\n')
5108                 {
5109                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5110                     Dynarr_add (dst, '\r');
5111                   if (eol_type != EOL_CR)
5112                     Dynarr_add (dst, c);
5113                 }
5114               else
5115                 {
5116                   if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5117                       && fit_to_be_escape_quoted (c))
5118                     Dynarr_add (dst, ISO_CODE_ESC);
5119                   Dynarr_add (dst, c);
5120                 }
5121               char_boundary = 0;
5122             }
5123           break;
5124         case 1:
5125           ch = ( ch << 6 ) | ( c & 0x3f );
5126
5127           char_boundary = 0;
5128           if ( (0x80 <= ch) && (ch <= 0x9f) )
5129             {
5130               charmask = (half == 0 ? 0x00 : 0x80);
5131
5132               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5133                   && fit_to_be_escape_quoted (ch))
5134                 Dynarr_add (dst, ISO_CODE_ESC);
5135               /* you asked for it ... */
5136               Dynarr_add (dst, ch);
5137             }
5138           else
5139             {
5140               int reg;
5141
5142               BREAKUP_CHAR (ch, charset, byte1, byte2);
5143               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5144                                         codesys, dst, &flags, 0);
5145
5146               /* Now determine which register to use. */
5147               reg = -1;
5148               for (i = 0; i < 4; i++)
5149                 {
5150                   if (EQ (charset, str->iso2022.charset[i]) ||
5151                       EQ (charset,
5152                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5153                     {
5154                       reg = i;
5155                       break;
5156                     }
5157                 }
5158
5159               if (reg == -1)
5160                 {
5161                   if (XCHARSET_GRAPHIC (charset) != 0)
5162                     {
5163                       if (!NILP (str->iso2022.charset[1]) &&
5164                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5165                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5166                         reg = 1;
5167                       else if (!NILP (str->iso2022.charset[2]))
5168                         reg = 2;
5169                       else if (!NILP (str->iso2022.charset[3]))
5170                         reg = 3;
5171                       else
5172                         reg = 0;
5173                     }
5174                   else
5175                     reg = 0;
5176                 }
5177
5178               iso2022_designate (charset, reg, str, dst);
5179
5180               /* Now invoke that register. */
5181               switch (reg)
5182                 {
5183                 case 0:
5184                   ensure_normal_shift (str, dst);
5185                   half = 0;
5186                   break;
5187
5188                 case 1:
5189                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5190                     {
5191                       ensure_shift_out (str, dst);
5192                       half = 0;
5193                     }
5194                   else
5195                     half = 1;
5196                   break;
5197
5198                 case 2:
5199                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5200                     {
5201                       Dynarr_add (dst, ISO_CODE_ESC);
5202                       Dynarr_add (dst, 'N');
5203                       half = 0;
5204                     }
5205                   else
5206                     {
5207                       Dynarr_add (dst, ISO_CODE_SS2);
5208                       half = 1;
5209                     }
5210                   break;
5211
5212                 case 3:
5213                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5214                     {
5215                       Dynarr_add (dst, ISO_CODE_ESC);
5216                       Dynarr_add (dst, 'O');
5217                       half = 0;
5218                     }
5219                   else
5220                     {
5221                       Dynarr_add (dst, ISO_CODE_SS3);
5222                       half = 1;
5223                     }
5224                   break;
5225
5226                 default:
5227                   abort ();
5228                 }
5229
5230               charmask = (half == 0 ? 0x00 : 0x80);
5231
5232               switch (XCHARSET_DIMENSION (charset))
5233                 {
5234                 case 1:
5235                   Dynarr_add (dst, byte1 | charmask);
5236                   break;
5237                 case 2:
5238                   Dynarr_add (dst, byte1 | charmask);
5239                   Dynarr_add (dst, byte2 | charmask);
5240                   break;
5241                 default:
5242                   abort ();
5243                 }
5244             }
5245           ch =0;
5246           break;
5247         default:
5248           ch = ( ch << 6 ) | ( c & 0x3f );
5249           char_boundary--;
5250         }
5251     }
5252 #else /* not UTF2000 */
5253
5254   while (n--)
5255     {
5256       c = *src++;
5257
5258       if (BYTE_ASCII_P (c))
5259         {               /* Processing ASCII character */
5260           ch = 0;
5261
5262           restore_left_to_right_direction (codesys, dst, &flags, 0);
5263
5264           /* Make sure G0 contains ASCII */
5265           if ((c > ' ' && c < ISO_CODE_DEL) ||
5266               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5267             {
5268               ensure_normal_shift (str, dst);
5269               iso2022_designate (Vcharset_ascii, 0, str, dst);
5270             }
5271
5272           /* If necessary, restore everything to the default state
5273              at end-of-line */
5274           if (c == '\n' &&
5275               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5276             {
5277               restore_left_to_right_direction (codesys, dst, &flags, 0);
5278
5279               ensure_normal_shift (str, dst);
5280
5281               for (i = 0; i < 4; i++)
5282                 {
5283                   Lisp_Object initial_charset =
5284                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5285                   iso2022_designate (initial_charset, i, str, dst);
5286                 }
5287             }
5288           if (c == '\n')
5289             {
5290               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5291                 Dynarr_add (dst, '\r');
5292               if (eol_type != EOL_CR)
5293                 Dynarr_add (dst, c);
5294             }
5295           else
5296             {
5297               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5298                   && fit_to_be_escape_quoted (c))
5299                 Dynarr_add (dst, ISO_CODE_ESC);
5300               Dynarr_add (dst, c);
5301             }
5302           char_boundary = 1;
5303         }
5304
5305       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5306         { /* Processing Leading Byte */
5307           ch = 0;
5308           charset = CHARSET_BY_LEADING_BYTE (c);
5309           if (LEADING_BYTE_PREFIX_P(c))
5310             ch = c;
5311           else if (!EQ (charset, Vcharset_control_1)
5312 #ifdef ENABLE_COMPOSITE_CHARS
5313                    && !EQ (charset, Vcharset_composite)
5314 #endif
5315                    )
5316             {
5317               int reg;
5318
5319               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5320                                         codesys, dst, &flags, 0);
5321
5322               /* Now determine which register to use. */
5323               reg = -1;
5324               for (i = 0; i < 4; i++)
5325                 {
5326                   if (EQ (charset, str->iso2022.charset[i]) ||
5327                       EQ (charset,
5328                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5329                     {
5330                       reg = i;
5331                       break;
5332                     }
5333                 }
5334
5335               if (reg == -1)
5336                 {
5337                   if (XCHARSET_GRAPHIC (charset) != 0)
5338                     {
5339                       if (!NILP (str->iso2022.charset[1]) &&
5340                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5341                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5342                         reg = 1;
5343                       else if (!NILP (str->iso2022.charset[2]))
5344                         reg = 2;
5345                       else if (!NILP (str->iso2022.charset[3]))
5346                         reg = 3;
5347                       else
5348                         reg = 0;
5349                     }
5350                   else
5351                     reg = 0;
5352                 }
5353
5354               iso2022_designate (charset, reg, str, dst);
5355
5356               /* Now invoke that register. */
5357               switch (reg)
5358                 {
5359                 case 0:
5360                   ensure_normal_shift (str, dst);
5361                   half = 0;
5362                   break;
5363
5364                 case 1:
5365                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5366                     {
5367                       ensure_shift_out (str, dst);
5368                       half = 0;
5369                     }
5370                   else
5371                     half = 1;
5372                   break;
5373
5374                 case 2:
5375                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5376                     {
5377                       Dynarr_add (dst, ISO_CODE_ESC);
5378                       Dynarr_add (dst, 'N');
5379                       half = 0;
5380                     }
5381                   else
5382                     {
5383                       Dynarr_add (dst, ISO_CODE_SS2);
5384                       half = 1;
5385                     }
5386                   break;
5387
5388                 case 3:
5389                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5390                     {
5391                       Dynarr_add (dst, ISO_CODE_ESC);
5392                       Dynarr_add (dst, 'O');
5393                       half = 0;
5394                     }
5395                   else
5396                     {
5397                       Dynarr_add (dst, ISO_CODE_SS3);
5398                       half = 1;
5399                     }
5400                   break;
5401
5402                 default:
5403                   abort ();
5404                 }
5405             }
5406           char_boundary = 0;
5407         }
5408       else
5409         {                       /* Processing Non-ASCII character */
5410           charmask = (half == 0 ? 0x7F : 0xFF);
5411           char_boundary = 1;
5412           if (EQ (charset, Vcharset_control_1))
5413             {
5414               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5415                   && fit_to_be_escape_quoted (c))
5416                 Dynarr_add (dst, ISO_CODE_ESC);
5417               /* you asked for it ... */
5418               Dynarr_add (dst, c - 0x20);
5419             }
5420           else
5421             {
5422               switch (XCHARSET_REP_BYTES (charset))
5423                 {
5424                 case 2:
5425                   Dynarr_add (dst, c & charmask);
5426                   break;
5427                 case 3:
5428                   if (XCHARSET_PRIVATE_P (charset))
5429                     {
5430                       Dynarr_add (dst, c & charmask);
5431                       ch = 0;
5432                     }
5433                   else if (ch)
5434                     {
5435 #ifdef ENABLE_COMPOSITE_CHARS
5436                       if (EQ (charset, Vcharset_composite))
5437                         {
5438                           if (in_composite)
5439                             {
5440                               /* #### Bother! We don't know how to
5441                                  handle this yet. */
5442                               Dynarr_add (dst, '~');
5443                             }
5444                           else
5445                             {
5446                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5447                                                        ch & 0x7F, c & 0x7F);
5448                               Lisp_Object lstr = composite_char_string (emch);
5449                               saved_n = n;
5450                               saved_src = src;
5451                               in_composite = 1;
5452                               src = XSTRING_DATA   (lstr);
5453                               n   = XSTRING_LENGTH (lstr);
5454                               Dynarr_add (dst, ISO_CODE_ESC);
5455                               Dynarr_add (dst, '0'); /* start composing */
5456                             }
5457                         }
5458                       else
5459 #endif /* ENABLE_COMPOSITE_CHARS */
5460                         {
5461                           Dynarr_add (dst, ch & charmask);
5462                           Dynarr_add (dst, c & charmask);
5463                         }
5464                       ch = 0;
5465                     }
5466                   else
5467                     {
5468                       ch = c;
5469                       char_boundary = 0;
5470                     }
5471                   break;
5472                 case 4:
5473                   if (ch)
5474                     {
5475                       Dynarr_add (dst, ch & charmask);
5476                       Dynarr_add (dst, c & charmask);
5477                       ch = 0;
5478                     }
5479                   else
5480                     {
5481                       ch = c;
5482                       char_boundary = 0;
5483                     }
5484                   break;
5485                 default:
5486                   abort ();
5487                 }
5488             }
5489         }
5490     }
5491 #endif /* not UTF2000 */
5492
5493 #ifdef ENABLE_COMPOSITE_CHARS
5494   if (in_composite)
5495     {
5496       n = saved_n;
5497       src = saved_src;
5498       in_composite = 0;
5499       Dynarr_add (dst, ISO_CODE_ESC);
5500       Dynarr_add (dst, '1'); /* end composing */
5501       goto back_to_square_n; /* Wheeeeeeeee ..... */
5502     }
5503 #endif /* ENABLE_COMPOSITE_CHARS */
5504
5505 #ifdef UTF2000
5506   if ( (char_boundary == 0) && flags & CODING_STATE_END)
5507 #else
5508   if (char_boundary && flags & CODING_STATE_END)
5509 #endif
5510     {
5511       restore_left_to_right_direction (codesys, dst, &flags, 0);
5512       ensure_normal_shift (str, dst);
5513       for (i = 0; i < 4; i++)
5514         {
5515           Lisp_Object initial_charset =
5516             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5517           iso2022_designate (initial_charset, i, str, dst);
5518         }
5519     }
5520
5521   str->flags = flags;
5522   str->ch    = ch;
5523   str->iso2022.current_char_boundary = char_boundary;
5524   str->iso2022.current_charset = charset;
5525   str->iso2022.current_half = half;
5526
5527   /* Verbum caro factum est! */
5528 }
5529 #endif /* MULE */
5530 \f
5531 /************************************************************************/
5532 /*                     No-conversion methods                            */
5533 /************************************************************************/
5534
5535 /* This is used when reading in "binary" files -- i.e. files that may
5536    contain all 256 possible byte values and that are not to be
5537    interpreted as being in any particular decoding. */
5538 static void
5539 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5540                              unsigned_char_dynarr *dst, unsigned int n)
5541 {
5542   unsigned char c;
5543   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5544   unsigned int flags  = str->flags;
5545   unsigned int ch     = str->ch;
5546   eol_type_t eol_type = str->eol_type;
5547
5548   while (n--)
5549     {
5550       c = *src++;
5551
5552       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5553       DECODE_ADD_BINARY_CHAR (c, dst);
5554     label_continue_loop:;
5555     }
5556
5557   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5558
5559   str->flags = flags;
5560   str->ch    = ch;
5561 }
5562
5563 static void
5564 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5565                              unsigned_char_dynarr *dst, unsigned int n)
5566 {
5567   unsigned char c;
5568   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5569   unsigned int flags  = str->flags;
5570   unsigned int ch     = str->ch;
5571   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5572 #ifdef UTF2000
5573   unsigned char char_boundary = str->iso2022.current_char_boundary;
5574 #endif
5575
5576   while (n--)
5577     {
5578       c = *src++;
5579 #ifdef UTF2000
5580       switch (char_boundary)
5581         {
5582         case 0:
5583           if ( c >= 0xfc )
5584             {
5585               ch = c & 0x01;
5586               char_boundary = 5;
5587             }
5588           else if ( c >= 0xf8 )
5589             {
5590               ch = c & 0x03;
5591               char_boundary = 4;
5592             }
5593           else if ( c >= 0xf0 )
5594             {
5595               ch = c & 0x07;
5596               char_boundary = 3;
5597             }
5598           else if ( c >= 0xe0 )
5599             {
5600               ch = c & 0x0f;
5601               char_boundary = 2;
5602             }
5603           else if ( c >= 0xc0 )
5604             {
5605               ch = c & 0x1f;
5606               char_boundary = 1;
5607             }
5608           else
5609             {
5610               ch = 0;
5611
5612               if (c == '\n')
5613                 {
5614                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5615                     Dynarr_add (dst, '\r');
5616                   if (eol_type != EOL_CR)
5617                     Dynarr_add (dst, c);
5618                 }
5619               else
5620                 Dynarr_add (dst, c);
5621               char_boundary = 0;
5622             }
5623           break;
5624         case 1:
5625           ch = ( ch << 6 ) | ( c & 0x3f );
5626           switch ( str->codesys->fixed.size )
5627             {
5628             case 1:
5629               Dynarr_add (dst, ch & 0xff);
5630               break;
5631             case 2:
5632               Dynarr_add (dst, (ch >> 8) & 0xff);
5633               Dynarr_add (dst,  ch       & 0xff);
5634               break;
5635             case 3:
5636               Dynarr_add (dst, (ch >> 16) & 0xff);
5637               Dynarr_add (dst, (ch >>  8) & 0xff);
5638               Dynarr_add (dst,  ch        & 0xff);
5639               break;
5640             case 4:
5641               Dynarr_add (dst, (ch >> 24) & 0xff);
5642               Dynarr_add (dst, (ch >> 16) & 0xff);
5643               Dynarr_add (dst, (ch >>  8) & 0xff);
5644               Dynarr_add (dst,  ch        & 0xff);
5645               break;
5646             default:
5647               fprintf(stderr, "It seems %d bytes stream.\n",
5648                       str->codesys->fixed.size);
5649               abort ();
5650             }
5651           char_boundary = 0;
5652           break;
5653         default:
5654           ch = ( ch << 6 ) | ( c & 0x3f );
5655           char_boundary--;
5656         }
5657 #else /* not UTF2000 */
5658       if (c == '\n')
5659         {
5660           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5661             Dynarr_add (dst, '\r');
5662           if (eol_type != EOL_CR)
5663             Dynarr_add (dst, '\n');
5664           ch = 0;
5665         }
5666       else if (BYTE_ASCII_P (c))
5667         {
5668           assert (ch == 0);
5669           Dynarr_add (dst, c);
5670         }
5671       else if (BUFBYTE_LEADING_BYTE_P (c))
5672         {
5673           assert (ch == 0);
5674           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5675               c == LEADING_BYTE_CONTROL_1)
5676             ch = c;
5677           else
5678             Dynarr_add (dst, '~'); /* untranslatable character */
5679         }
5680       else
5681         {
5682           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5683             Dynarr_add (dst, c);
5684           else if (ch == LEADING_BYTE_CONTROL_1)
5685             {
5686               assert (c < 0xC0);
5687               Dynarr_add (dst, c - 0x20);
5688             }
5689           /* else it should be the second or third byte of an
5690              untranslatable character, so ignore it */
5691           ch = 0;
5692         }
5693 #endif /* not UTF2000 */
5694     }
5695
5696   str->flags = flags;
5697   str->ch    = ch;
5698 #ifdef UTF2000
5699   str->iso2022.current_char_boundary = char_boundary;
5700 #endif
5701 }
5702
5703 \f
5704 /************************************************************************/
5705 /*                   Simple internal/external functions                 */
5706 /************************************************************************/
5707
5708 static Extbyte_dynarr *conversion_out_dynarr;
5709 static Bufbyte_dynarr *conversion_in_dynarr;
5710
5711 /* Determine coding system from coding format */
5712
5713 /* #### not correct for all values of `fmt'! */
5714 static Lisp_Object
5715 external_data_format_to_coding_system (enum external_data_format fmt)
5716 {
5717   switch (fmt)
5718     {
5719     case FORMAT_FILENAME:
5720     case FORMAT_TERMINAL:
5721       if (EQ (Vfile_name_coding_system, Qnil) ||
5722           EQ (Vfile_name_coding_system, Qbinary))
5723         return Qnil;
5724       else
5725         return Fget_coding_system (Vfile_name_coding_system);
5726 #ifdef MULE
5727     case FORMAT_CTEXT:
5728       return Fget_coding_system (Qctext);
5729 #endif
5730     default:
5731       return Qnil;
5732     }
5733 }
5734
5735 Extbyte *
5736 convert_to_external_format (CONST Bufbyte *ptr,
5737                             Bytecount len,
5738                             Extcount *len_out,
5739                             enum external_data_format fmt)
5740 {
5741   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5742
5743   if (!conversion_out_dynarr)
5744     conversion_out_dynarr = Dynarr_new (Extbyte);
5745   else
5746     Dynarr_reset (conversion_out_dynarr);
5747
5748   if (NILP (coding_system))
5749     {
5750       CONST Bufbyte *end = ptr + len;
5751
5752       for (; ptr < end;)
5753         {
5754 #ifdef UTF2000
5755           Bufbyte c =
5756             (*ptr < 0xc0) ? *ptr :
5757             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5758 #else
5759           Bufbyte c =
5760             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5761             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5762             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5763             '~';
5764 #endif
5765           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5766           INC_CHARPTR (ptr);
5767         }
5768
5769 #ifdef ERROR_CHECK_BUFPOS
5770       assert (ptr == end);
5771 #endif
5772     }
5773   else
5774     {
5775       Lisp_Object instream, outstream, da_outstream;
5776       Lstream *istr, *ostr;
5777       struct gcpro gcpro1, gcpro2, gcpro3;
5778       char tempbuf[1024]; /* some random amount */
5779
5780       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5781       da_outstream = make_dynarr_output_stream
5782         ((unsigned_char_dynarr *) conversion_out_dynarr);
5783       outstream =
5784         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5785       istr = XLSTREAM (instream);
5786       ostr = XLSTREAM (outstream);
5787       GCPRO3 (instream, outstream, da_outstream);
5788       while (1)
5789         {
5790           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5791           if (!size_in_bytes)
5792             break;
5793           Lstream_write (ostr, tempbuf, size_in_bytes);
5794         }
5795       Lstream_close (istr);
5796       Lstream_close (ostr);
5797       UNGCPRO;
5798       Lstream_delete (istr);
5799       Lstream_delete (ostr);
5800       Lstream_delete (XLSTREAM (da_outstream));
5801     }
5802
5803   *len_out = Dynarr_length (conversion_out_dynarr);
5804   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5805   return Dynarr_atp (conversion_out_dynarr, 0);
5806 }
5807
5808 Bufbyte *
5809 convert_from_external_format (CONST Extbyte *ptr,
5810                               Extcount len,
5811                               Bytecount *len_out,
5812                               enum external_data_format fmt)
5813 {
5814   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5815
5816   if (!conversion_in_dynarr)
5817     conversion_in_dynarr = Dynarr_new (Bufbyte);
5818   else
5819     Dynarr_reset (conversion_in_dynarr);
5820
5821   if (NILP (coding_system))
5822     {
5823       CONST Extbyte *end = ptr + len;
5824       for (; ptr < end; ptr++)
5825         {
5826           Extbyte c = *ptr;
5827           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5828         }
5829     }
5830   else
5831     {
5832       Lisp_Object instream, outstream, da_outstream;
5833       Lstream *istr, *ostr;
5834       struct gcpro gcpro1, gcpro2, gcpro3;
5835       char tempbuf[1024]; /* some random amount */
5836
5837       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5838       da_outstream = make_dynarr_output_stream
5839         ((unsigned_char_dynarr *) conversion_in_dynarr);
5840       outstream =
5841         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5842       istr = XLSTREAM (instream);
5843       ostr = XLSTREAM (outstream);
5844       GCPRO3 (instream, outstream, da_outstream);
5845       while (1)
5846         {
5847           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5848           if (!size_in_bytes)
5849             break;
5850           Lstream_write (ostr, tempbuf, size_in_bytes);
5851         }
5852       Lstream_close (istr);
5853       Lstream_close (ostr);
5854       UNGCPRO;
5855       Lstream_delete (istr);
5856       Lstream_delete (ostr);
5857       Lstream_delete (XLSTREAM (da_outstream));
5858     }
5859
5860   *len_out = Dynarr_length (conversion_in_dynarr);
5861   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5862   return Dynarr_atp (conversion_in_dynarr, 0);
5863 }
5864
5865 \f
5866 /************************************************************************/
5867 /*                             Initialization                           */
5868 /************************************************************************/
5869
5870 void
5871 syms_of_file_coding (void)
5872 {
5873   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5874   deferror (&Qcoding_system_error, "coding-system-error",
5875             "Coding-system error", Qio_error);
5876
5877   DEFSUBR (Fcoding_system_p);
5878   DEFSUBR (Ffind_coding_system);
5879   DEFSUBR (Fget_coding_system);
5880   DEFSUBR (Fcoding_system_list);
5881   DEFSUBR (Fcoding_system_name);
5882   DEFSUBR (Fmake_coding_system);
5883   DEFSUBR (Fcopy_coding_system);
5884   DEFSUBR (Fdefine_coding_system_alias);
5885   DEFSUBR (Fsubsidiary_coding_system);
5886
5887   DEFSUBR (Fcoding_system_type);
5888   DEFSUBR (Fcoding_system_doc_string);
5889 #ifdef MULE
5890   DEFSUBR (Fcoding_system_charset);
5891 #endif
5892   DEFSUBR (Fcoding_system_property);
5893
5894   DEFSUBR (Fcoding_category_list);
5895   DEFSUBR (Fset_coding_priority_list);
5896   DEFSUBR (Fcoding_priority_list);
5897   DEFSUBR (Fset_coding_category_system);
5898   DEFSUBR (Fcoding_category_system);
5899
5900   DEFSUBR (Fdetect_coding_region);
5901   DEFSUBR (Fdecode_coding_region);
5902   DEFSUBR (Fencode_coding_region);
5903 #ifdef MULE
5904   DEFSUBR (Fdecode_shift_jis_char);
5905   DEFSUBR (Fencode_shift_jis_char);
5906   DEFSUBR (Fdecode_big5_char);
5907   DEFSUBR (Fencode_big5_char);
5908   DEFSUBR (Fset_ucs_char);
5909   DEFSUBR (Fucs_char);
5910   DEFSUBR (Fset_char_ucs);
5911   DEFSUBR (Fchar_ucs);
5912 #endif /* MULE */
5913   defsymbol (&Qcoding_system_p, "coding-system-p");
5914   defsymbol (&Qno_conversion, "no-conversion");
5915   defsymbol (&Qraw_text, "raw-text");
5916 #ifdef MULE
5917   defsymbol (&Qbig5, "big5");
5918   defsymbol (&Qshift_jis, "shift-jis");
5919   defsymbol (&Qucs4, "ucs-4");
5920   defsymbol (&Qutf8, "utf-8");
5921   defsymbol (&Qccl, "ccl");
5922   defsymbol (&Qiso2022, "iso2022");
5923 #endif /* MULE */
5924   defsymbol (&Qmnemonic, "mnemonic");
5925   defsymbol (&Qeol_type, "eol-type");
5926   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5927   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5928
5929   defsymbol (&Qcr, "cr");
5930   defsymbol (&Qlf, "lf");
5931   defsymbol (&Qcrlf, "crlf");
5932   defsymbol (&Qeol_cr, "eol-cr");
5933   defsymbol (&Qeol_lf, "eol-lf");
5934   defsymbol (&Qeol_crlf, "eol-crlf");
5935 #ifdef MULE
5936   defsymbol (&Qcharset_g0, "charset-g0");
5937   defsymbol (&Qcharset_g1, "charset-g1");
5938   defsymbol (&Qcharset_g2, "charset-g2");
5939   defsymbol (&Qcharset_g3, "charset-g3");
5940   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5941   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5942   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5943   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5944   defsymbol (&Qno_iso6429, "no-iso6429");
5945   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5946   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5947
5948   defsymbol (&Qshort, "short");
5949   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5950   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5951   defsymbol (&Qseven, "seven");
5952   defsymbol (&Qlock_shift, "lock-shift");
5953   defsymbol (&Qescape_quoted, "escape-quoted");
5954 #endif /* MULE */
5955   defsymbol (&Qencode, "encode");
5956   defsymbol (&Qdecode, "decode");
5957
5958 #ifdef MULE
5959   defsymbol (&Qctext, "ctext");
5960   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5961              "shift-jis");
5962   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5963              "big5");
5964   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5965              "ucs-4");
5966   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5967              "utf-8");
5968   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5969              "iso-7");
5970   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5971              "iso-8-designate");
5972   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5973              "iso-8-1");
5974   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5975              "iso-8-2");
5976   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5977              "iso-lock-shift");
5978 #endif /* MULE */
5979   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5980              "no-conversion");
5981 }
5982
5983 void
5984 lstream_type_create_file_coding (void)
5985 {
5986   LSTREAM_HAS_METHOD (decoding, reader);
5987   LSTREAM_HAS_METHOD (decoding, writer);
5988   LSTREAM_HAS_METHOD (decoding, rewinder);
5989   LSTREAM_HAS_METHOD (decoding, seekable_p);
5990   LSTREAM_HAS_METHOD (decoding, flusher);
5991   LSTREAM_HAS_METHOD (decoding, closer);
5992   LSTREAM_HAS_METHOD (decoding, marker);
5993
5994   LSTREAM_HAS_METHOD (encoding, reader);
5995   LSTREAM_HAS_METHOD (encoding, writer);
5996   LSTREAM_HAS_METHOD (encoding, rewinder);
5997   LSTREAM_HAS_METHOD (encoding, seekable_p);
5998   LSTREAM_HAS_METHOD (encoding, flusher);
5999   LSTREAM_HAS_METHOD (encoding, closer);
6000   LSTREAM_HAS_METHOD (encoding, marker);
6001 }
6002
6003 void
6004 vars_of_file_coding (void)
6005 {
6006   int i;
6007
6008   /* Initialize to something reasonable ... */
6009   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6010     {
6011       coding_category_system[i] = Qnil;
6012       coding_category_by_priority[i] = i;
6013     }
6014
6015   Fprovide (intern ("file-coding"));
6016
6017   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6018 Coding system used for TTY keyboard input.
6019 Not used under a windowing system.
6020 */ );
6021   Vkeyboard_coding_system = Qnil;
6022
6023   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6024 Coding system used for TTY display output.
6025 Not used under a windowing system.
6026 */ );
6027   Vterminal_coding_system = Qnil;
6028
6029   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6030 Overriding coding system used when writing a file or process.
6031 You should *bind* this, not set it.  If this is non-nil, it specifies
6032 the coding system that will be used when a file or process is read
6033 in, and overrides `buffer-file-coding-system-for-read',
6034 `insert-file-contents-pre-hook', etc.  Use those variables instead of
6035 this one for permanent changes to the environment.
6036 */ );
6037   Vcoding_system_for_read = Qnil;
6038
6039   DEFVAR_LISP ("coding-system-for-write",
6040                &Vcoding_system_for_write /*
6041 Overriding coding system used when writing a file or process.
6042 You should *bind* this, not set it.  If this is non-nil, it specifies
6043 the coding system that will be used when a file or process is wrote
6044 in, and overrides `buffer-file-coding-system',
6045 `write-region-pre-hook', etc.  Use those variables instead of this one
6046 for permanent changes to the environment.
6047 */ );
6048   Vcoding_system_for_write = Qnil;
6049
6050   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6051 Coding system used to convert pathnames when accessing files.
6052 */ );
6053   Vfile_name_coding_system = Qnil;
6054
6055   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6056 Non-nil means the buffer contents are regarded as multi-byte form
6057 of characters, not a binary code.  This affects the display, file I/O,
6058 and behaviors of various editing commands.
6059
6060 Setting this to nil does not do anything.
6061 */ );
6062   enable_multibyte_characters = 1;
6063 }
6064
6065 void
6066 complex_vars_of_file_coding (void)
6067 {
6068   staticpro (&Vcoding_system_hash_table);
6069   Vcoding_system_hash_table =
6070     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6071
6072   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6073
6074 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
6075 {                                               \
6076   struct codesys_prop csp;                      \
6077   csp.sym = (Sym);                              \
6078   csp.prop_type = (Prop_Type);                  \
6079   Dynarr_add (the_codesys_prop_dynarr, csp);    \
6080 } while (0)
6081
6082   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
6083   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
6084   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
6085   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
6086   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
6087   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
6088   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
6089 #ifdef MULE
6090   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6091   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6092   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6093   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6094   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6095   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6096   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6097   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6098   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6099   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6100   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6101   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6102   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6103   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6104   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6105   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6106   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6107
6108   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
6109   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
6110 #endif /* MULE */
6111   /* Need to create this here or we're really screwed. */
6112   Fmake_coding_system
6113     (Qraw_text, Qno_conversion,
6114      build_string ("Raw text, which means it converts only line-break-codes."),
6115      list2 (Qmnemonic, build_string ("Raw")));
6116
6117   Fmake_coding_system
6118     (Qbinary, Qno_conversion,
6119      build_string ("Binary, which means it does not convert anything."),
6120      list4 (Qeol_type, Qlf,
6121             Qmnemonic, build_string ("Binary")));
6122
6123 #ifdef UTF2000
6124   Fmake_coding_system
6125     (Qutf8, Qutf8,
6126      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6127      list2 (Qmnemonic, build_string ("UTF8")));
6128 #endif
6129
6130   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6131
6132   /* Need this for bootstrapping */
6133   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6134     Fget_coding_system (Qraw_text);
6135
6136 #ifdef UTF2000
6137   coding_category_system[CODING_CATEGORY_UTF8]
6138    = Fget_coding_system (Qutf8);
6139 #endif
6140
6141 #ifdef MULE
6142   {
6143     unsigned int i;
6144
6145     for (i = 0; i < 65536; i++)
6146       ucs_to_mule_table[i] = Qnil;
6147   }
6148   staticpro (&mule_to_ucs_table);
6149   mule_to_ucs_table = Fmake_char_table(Qgeneric);
6150 #endif /* MULE */
6151 }