src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28 #include "buffer.h"
  29 #include "elhash.h"
  30 #include "insdel.h"
  31 #include "lstream.h"
  32 #ifdef MULE
  33 #include "mule-ccl.h"
  34 #include "chartab.h"
  35 #endif
  36 #include "file-coding.h"
  37
  38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  39
  40 Lisp_Object Vkeyboard_coding_system;
  41 Lisp_Object Vterminal_coding_system;
  42 Lisp_Object Vcoding_system_for_read;
  43 Lisp_Object Vcoding_system_for_write;
  44 Lisp_Object Vfile_name_coding_system;
  45
  46 /* Table of symbols identifying each coding category. */
  47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  48
  49 /* Coding system currently associated with each coding category. */
  50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  51
  52 /* Table of all coding categories in decreasing order of priority.
  53    This describes a permutation of the possible coding categories. */
  54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  55
  56 Lisp_Object Qcoding_system_p;
  57
  58 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  59 /* Qinternal in general.c */
  60
  61 Lisp_Object Qmnemonic, Qeol_type;
  62 Lisp_Object Qcr, Qcrlf, Qlf;
  63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  64 Lisp_Object Qpost_read_conversion;
  65 Lisp_Object Qpre_write_conversion;
  66
  67 #ifdef MULE
  68 Lisp_Object Qucs4, Qutf8;
  69 Lisp_Object Qbig5, Qshift_jis;
  70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  73 Lisp_Object Qno_iso6429;
  74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  75 Lisp_Object Qctext, Qescape_quoted;
  76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  77 #endif
  78 Lisp_Object Qencode, Qdecode;
  79
  80 Lisp_Object Vcoding_system_hash_table;
  81
  82 int enable_multibyte_characters;
  83
  84 #ifdef MULE
  85 /* Additional information used by the ISO2022 decoder and detector. */
  86 struct iso2022_decoder
  87 {
  88   /* CHARSET holds the character sets currently assigned to the G0
  89      through G3 variables.  It is initialized from the array
  90      INITIAL_CHARSET in CODESYS. */
  91   Lisp_Object charset[4];
  92
  93   /* Which registers are currently invoked into the left (GL) and
  94      right (GR) halves of the 8-bit encoding space? */
  95   int register_left, register_right;
  96
  97   /* ISO_ESC holds a value indicating part of an escape sequence
  98      that has already been seen. */
  99   enum iso_esc_flag esc;
 100
 101   /* This records the bytes we've seen so far in an escape sequence,
 102      in case the sequence is invalid (we spit out the bytes unchanged). */
 103   unsigned char esc_bytes[8];
 104
 105   /* Index for next byte to store in ISO escape sequence. */
 106   int esc_bytes_index;
 107
 108 #ifdef ENABLE_COMPOSITE_CHARS
 109   /* Stuff seen so far when composing a string. */
 110   unsigned_char_dynarr *composite_chars;
 111 #endif
 112
 113   /* If we saw an invalid designation sequence for a particular
 114      register, we flag it here and switch to ASCII.  The next time we
 115      see a valid designation for this register, we turn off the flag
 116      and do the designation normally, but pretend the sequence was
 117      invalid.  The effect of all this is that (most of the time) the
 118      escape sequences for both the switch to the unknown charset, and
 119      the switch back to the known charset, get inserted literally into
 120      the buffer and saved out as such.  The hope is that we can
 121      preserve the escape sequences so that the resulting written out
 122      file makes sense.  If we don't do any of this, the designation
 123      to the invalid charset will be preserved but that switch back
 124      to the known charset will probably get eaten because it was
 125      the same charset that was already present in the register. */
 126   unsigned char invalid_designated[4];
 127
 128   /* We try to do similar things as above for direction-switching
 129      sequences.  If we encountered a direction switch while an
 130      invalid designation was present, or an invalid designation
 131      just after a direction switch (i.e. no valid designation
 132      encountered yet), we insert the direction-switch escape
 133      sequence literally into the output stream, and later on
 134      insert the corresponding direction-restoring escape sequence
 135      literally also. */
 136   unsigned int switched_dir_and_no_valid_charset_yet :1;
 137   unsigned int invalid_switch_dir :1;
 138
 139   /* Tells the decoder to output the escape sequence literally
 140      even though it was valid.  Used in the games we play to
 141      avoid lossage when we encounter invalid designations. */
 142   unsigned int output_literally :1;
 143   /* We encountered a direction switch followed by an invalid
 144      designation.  We didn't output the direction switch
 145      literally because we didn't know about the invalid designation;
 146      but we have to do so now. */
 147   unsigned int output_direction_sequence :1;
 148 };
 149 #endif /* MULE */
 150 EXFUN (Fcopy_coding_system, 2);
 151 #ifdef MULE
 152 struct detection_state;
 153 static int detect_coding_sjis (struct detection_state *st,
 154                                CONST unsigned char *src,
 155                                unsigned int n);
 156 static void decode_coding_sjis (Lstream *decoding,
 157                                 CONST unsigned char *src,
 158                                 unsigned_char_dynarr *dst,
 159                                 unsigned int n);
 160 static void encode_coding_sjis (Lstream *encoding,
 161                                 CONST unsigned char *src,
 162                                 unsigned_char_dynarr *dst,
 163                                 unsigned int n);
 164 static int detect_coding_big5 (struct detection_state *st,
 165                                CONST unsigned char *src,
 166                                unsigned int n);
 167 static void decode_coding_big5 (Lstream *decoding,
 168                                 CONST unsigned char *src,
 169                                 unsigned_char_dynarr *dst, unsigned int n);
 170 static void encode_coding_big5 (Lstream *encoding,
 171                                 CONST unsigned char *src,
 172                                 unsigned_char_dynarr *dst, unsigned int n);
 173 static int detect_coding_ucs4 (struct detection_state *st,
 174                                CONST unsigned char *src,
 175                                unsigned int n);
 176 static void decode_coding_ucs4 (Lstream *decoding,
 177                                 CONST unsigned char *src,
 178                                 unsigned_char_dynarr *dst, unsigned int n);
 179 static void encode_coding_ucs4 (Lstream *encoding,
 180                                 CONST unsigned char *src,
 181                                 unsigned_char_dynarr *dst, unsigned int n);
 182 static int detect_coding_utf8 (struct detection_state *st,
 183                                CONST unsigned char *src,
 184                                unsigned int n);
 185 static void decode_coding_utf8 (Lstream *decoding,
 186                                 CONST unsigned char *src,
 187                                 unsigned_char_dynarr *dst, unsigned int n);
 188 static void encode_coding_utf8 (Lstream *encoding,
 189                                 CONST unsigned char *src,
 190                                 unsigned_char_dynarr *dst, unsigned int n);
 191 static int postprocess_iso2022_mask (int mask);
 192 static void reset_iso2022 (Lisp_Object coding_system,
 193                            struct iso2022_decoder *iso);
 194 static int detect_coding_iso2022 (struct detection_state *st,
 195                                   CONST unsigned char *src,
 196                                   unsigned int n);
 197 static void decode_coding_iso2022 (Lstream *decoding,
 198                                    CONST unsigned char *src,
 199                                    unsigned_char_dynarr *dst, unsigned int n);
 200 static void encode_coding_iso2022 (Lstream *encoding,
 201                                    CONST unsigned char *src,
 202                                    unsigned_char_dynarr *dst, unsigned int n);
 203 #endif /* MULE */
 204 static void decode_coding_no_conversion (Lstream *decoding,
 205                                          CONST unsigned char *src,
 206                                          unsigned_char_dynarr *dst,
 207                                          unsigned int n);
 208 static void encode_coding_no_conversion (Lstream *encoding,
 209                                          CONST unsigned char *src,
 210                                          unsigned_char_dynarr *dst,
 211                                          unsigned int n);
 212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 213                          unsigned_char_dynarr *dst, unsigned int n);
 214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 215                          unsigned_char_dynarr *dst, unsigned int n);
 216
 217 typedef struct codesys_prop codesys_prop;
 218 struct codesys_prop
 219 {
 220   Lisp_Object sym;
 221   int prop_type;
 222 };
 223
 224 typedef struct
 225 {
 226   Dynarr_declare (codesys_prop);
 227 } codesys_prop_dynarr;
 228
 229 codesys_prop_dynarr *the_codesys_prop_dynarr;
 230
 231 enum codesys_prop_enum
 232 {
 233   CODESYS_PROP_ALL_OK,
 234   CODESYS_PROP_ISO2022,
 235   CODESYS_PROP_CCL
 236 };
 237
 238 \f
 239 /************************************************************************/
 240 /*                       Coding system functions                        */
 241 /************************************************************************/
 242
 243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 245 static void finalize_coding_system (void *header, int for_disksave);
 246
 247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 248                                mark_coding_system, print_coding_system,
 249                                finalize_coding_system,
 250                                0, 0, struct Lisp_Coding_System);
 251
 252 static Lisp_Object
 253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 254 {
 255   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 256
 257   markobj (CODING_SYSTEM_NAME (codesys));
 258   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 259   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 260   markobj (CODING_SYSTEM_EOL_LF (codesys));
 261   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 262   markobj (CODING_SYSTEM_EOL_CR (codesys));
 263
 264   switch (CODING_SYSTEM_TYPE (codesys))
 265     {
 266 #ifdef MULE
 267       int i;
 268     case CODESYS_ISO2022:
 269       for (i = 0; i < 4; i++)
 270         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 271       if (codesys->iso2022.input_conv)
 272         {
 273           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 274             {
 275               struct charset_conversion_spec *ccs =
 276                 Dynarr_atp (codesys->iso2022.input_conv, i);
 277               markobj (ccs->from_charset);
 278               markobj (ccs->to_charset);
 279             }
 280         }
 281       if (codesys->iso2022.output_conv)
 282         {
 283           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 284             {
 285               struct charset_conversion_spec *ccs =
 286                 Dynarr_atp (codesys->iso2022.output_conv, i);
 287               markobj (ccs->from_charset);
 288               markobj (ccs->to_charset);
 289             }
 290         }
 291       break;
 292
 293     case CODESYS_CCL:
 294       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 295       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 296       break;
 297 #endif /* MULE */
 298     default:
 299       break;
 300     }
 301
 302   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 303   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 304 }
 305
 306 static void
 307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 308                      int escapeflag)
 309 {
 310   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 311   if (print_readably)
 312     error ("printing unreadable object #<coding_system 0x%x>",
 313            c->header.uid);
 314
 315   write_c_string ("#<coding_system ", printcharfun);
 316   print_internal (c->name, printcharfun, 1);
 317   write_c_string (">", printcharfun);
 318 }
 319
 320 static void
 321 finalize_coding_system (void *header, int for_disksave)
 322 {
 323   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 324   /* Since coding systems never go away, this function is not
 325      necessary.  But it would be necessary if we changed things
 326      so that coding systems could go away. */
 327   if (!for_disksave) /* see comment in lstream.c */
 328     {
 329       switch (CODING_SYSTEM_TYPE (c))
 330         {
 331 #ifdef MULE
 332         case CODESYS_ISO2022:
 333           if (c->iso2022.input_conv)
 334             {
 335               Dynarr_free (c->iso2022.input_conv);
 336               c->iso2022.input_conv = 0;
 337             }
 338           if (c->iso2022.output_conv)
 339             {
 340               Dynarr_free (c->iso2022.output_conv);
 341               c->iso2022.output_conv = 0;
 342             }
 343           break;
 344 #endif /* MULE */
 345         default:
 346           break;
 347         }
 348     }
 349 }
 350
 351 static enum eol_type
 352 symbol_to_eol_type (Lisp_Object symbol)
 353 {
 354   CHECK_SYMBOL (symbol);
 355   if (NILP (symbol))      return EOL_AUTODETECT;
 356   if (EQ (symbol, Qlf))   return EOL_LF;
 357   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 358   if (EQ (symbol, Qcr))   return EOL_CR;
 359
 360   signal_simple_error ("Unrecognized eol type", symbol);
 361   return EOL_AUTODETECT; /* not reached */
 362 }
 363
 364 static Lisp_Object
 365 eol_type_to_symbol (enum eol_type type)
 366 {
 367   switch (type)
 368     {
 369     default: abort ();
 370     case EOL_LF:         return Qlf;
 371     case EOL_CRLF:       return Qcrlf;
 372     case EOL_CR:         return Qcr;
 373     case EOL_AUTODETECT: return Qnil;
 374     }
 375 }
 376
 377 static void
 378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 379 {
 380   Lisp_Object codesys_obj;
 381   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 382   char *codesys_name = (char *) alloca (len + 7);
 383   int mlen = -1;
 384   char *codesys_mnemonic=0;
 385
 386   Lisp_Object codesys_name_sym, sub_codesys_obj;
 387
 388   /* kludge */
 389
 390   XSETCODING_SYSTEM (codesys_obj, codesys);
 391
 392   memcpy (codesys_name,
 393           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 394
 395   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 396     {
 397       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 398       codesys_mnemonic = (char *) alloca (mlen + 7);
 399       memcpy (codesys_mnemonic,
 400               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 401     }
 402
 403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 404   strcpy (codesys_name + len, "-" op_sys);                                      \
 405   if (mlen != -1)                                                               \
 406     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 407   codesys_name_sym = intern (codesys_name);                                     \
 408   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 409   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 410   if (mlen != -1)                                                               \
 411     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 412       build_string (codesys_mnemonic);                                          \
 413   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 414 } while (0)
 415
 416   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 417   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 418   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 419 }
 420
 421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 422 Return t if OBJECT is a coding system.
 423 A coding system is an object that defines how text containing multiple
 424 character sets is encoded into a stream of (typically 8-bit) bytes.
 425 The coding system is used to decode the stream into a series of
 426 characters (which may be from multiple charsets) when the text is read
 427 from a file or process, and is used to encode the text back into the
 428 same format when it is written out to a file or process.
 429
 430 For example, many ISO2022-compliant coding systems (such as Compound
 431 Text, which is used for inter-client data under the X Window System)
 432 use escape sequences to switch between different charsets -- Japanese
 433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 435 `make-coding-system' for more information.
 436
 437 Coding systems are normally identified using a symbol, and the
 438 symbol is accepted in place of the actual coding system object whenever
 439 a coding system is called for. (This is similar to how faces work.)
 440 */
 441        (object))
 442 {
 443   return CODING_SYSTEMP (object) ? Qt : Qnil;
 444 }
 445
 446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 447 Retrieve the coding system of the given name.
 448
 449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 450 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 451 If there is no such coding system, nil is returned.  Otherwise the
 452 associated coding system object is returned.
 453 */
 454        (coding_system_or_name))
 455 {
 456   if (CODING_SYSTEMP (coding_system_or_name))
 457     return coding_system_or_name;
 458
 459   if (NILP (coding_system_or_name))
 460     coding_system_or_name = Qbinary;
 461   else
 462     CHECK_SYMBOL (coding_system_or_name);
 463
 464   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 465 }
 466
 467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 468 Retrieve the coding system of the given name.
 469 Same as `find-coding-system' except that if there is no such
 470 coding system, an error is signaled instead of returning nil.
 471 */
 472        (name))
 473 {
 474   Lisp_Object coding_system = Ffind_coding_system (name);
 475
 476   if (NILP (coding_system))
 477     signal_simple_error ("No such coding system", name);
 478   return coding_system;
 479 }
 480
 481 /* We store the coding systems in hash tables with the names as the key and the
 482    actual coding system object as the value.  Occasionally we need to use them
 483    in a list format.  These routines provide us with that. */
 484 struct coding_system_list_closure
 485 {
 486   Lisp_Object *coding_system_list;
 487 };
 488
 489 static int
 490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 491                                   void *coding_system_list_closure)
 492 {
 493   /* This function can GC */
 494   struct coding_system_list_closure *cscl =
 495     (struct coding_system_list_closure *) coding_system_list_closure;
 496   Lisp_Object *coding_system_list = cscl->coding_system_list;
 497
 498   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 499                                *coding_system_list);
 500   return 0;
 501 }
 502
 503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 504 Return a list of the names of all defined coding systems.
 505 */
 506        ())
 507 {
 508   Lisp_Object coding_system_list = Qnil;
 509   struct gcpro gcpro1;
 510   struct coding_system_list_closure coding_system_list_closure;
 511
 512   GCPRO1 (coding_system_list);
 513   coding_system_list_closure.coding_system_list = &coding_system_list;
 514   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 515                  &coding_system_list_closure);
 516   UNGCPRO;
 517
 518   return coding_system_list;
 519 }
 520
 521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 522 Return the name of the given coding system.
 523 */
 524        (coding_system))
 525 {
 526   coding_system = Fget_coding_system (coding_system);
 527   return XCODING_SYSTEM_NAME (coding_system);
 528 }
 529
 530 static Lisp_Coding_System *
 531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 532 {
 533   Lisp_Coding_System *codesys =
 534     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 535
 536   zero_lcrecord (codesys);
 537   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 538   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 539   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 540   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 541   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 542   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 543   CODING_SYSTEM_TYPE     (codesys) = type;
 544   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 545 #ifdef MULE
 546   if (type == CODESYS_ISO2022)
 547     {
 548       int i;
 549       for (i = 0; i < 4; i++)
 550         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 551     }
 552   else if (type == CODESYS_CCL)
 553     {
 554       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 555       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 556     }
 557 #endif /* MULE */
 558   CODING_SYSTEM_NAME (codesys) = name;
 559
 560   return codesys;
 561 }
 562
 563 #ifdef MULE
 564 /* Given a list of charset conversion specs as specified in a Lisp
 565    program, parse it into STORE_HERE. */
 566
 567 static void
 568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 569                                 Lisp_Object spec_list)
 570 {
 571   Lisp_Object rest;
 572
 573   EXTERNAL_LIST_LOOP (rest, spec_list)
 574     {
 575       Lisp_Object car = XCAR (rest);
 576       Lisp_Object from, to;
 577       struct charset_conversion_spec spec;
 578
 579       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 580         signal_simple_error ("Invalid charset conversion spec", car);
 581       from = Fget_charset (XCAR (car));
 582       to = Fget_charset (XCAR (XCDR (car)));
 583       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 584         signal_simple_error_2
 585           ("Attempted conversion between different charset types",
 586            from, to);
 587       spec.from_charset = from;
 588       spec.to_charset = to;
 589
 590       Dynarr_add (store_here, spec);
 591     }
 592 }
 593
 594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 595    specs, return the equivalent as the Lisp programmer would see it.
 596
 597    If LOAD_HERE is 0, return Qnil. */
 598
 599 static Lisp_Object
 600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 601 {
 602   int i;
 603   Lisp_Object result;
 604
 605   if (!load_here)
 606     return Qnil;
 607   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 608     {
 609       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 610       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 611     }
 612
 613   return Fnreverse (result);
 614 }
 615
 616 #endif /* MULE */
 617
 618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 619 Register symbol NAME as a coding system.
 620
 621 TYPE describes the conversion method used and should be one of
 622
 623 nil or 'undecided
 624      Automatic conversion.  XEmacs attempts to detect the coding system
 625      used in the file.
 626 'no-conversion
 627      No conversion.  Use this for binary files and such.  On output,
 628      graphic characters that are not in ASCII or Latin-1 will be
 629      replaced by a ?. (For a no-conversion-encoded buffer, these
 630      characters will only be present if you explicitly insert them.)
 631 'shift-jis
 632      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 633 'ucs-4
 634      ISO 10646 UCS-4 encoding.
 635 'utf-8
 636      ISO 10646 UTF-8 encoding.
 637 'iso2022
 638      Any ISO2022-compliant encoding.  Among other things, this includes
 639      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 640      standard Unix encoding for Japanese and other languages), and
 641      Compound Text (the encoding used in X11).  You can specify more
 642      specific information about the conversion with the FLAGS argument.
 643 'big5
 644      Big5 (the encoding commonly used for Taiwanese).
 645 'ccl
 646      The conversion is performed using a user-written pseudo-code
 647      program.  CCL (Code Conversion Language) is the name of this
 648      pseudo-code.
 649 'internal
 650      Write out or read in the raw contents of the memory representing
 651      the buffer's text.  This is primarily useful for debugging
 652      purposes, and is only enabled when XEmacs has been compiled with
 653      DEBUG_XEMACS defined (via the --debug configure option).
 654      WARNING: Reading in a file using 'internal conversion can result
 655      in an internal inconsistency in the memory representing a
 656      buffer's text, which will produce unpredictable results and may
 657      cause XEmacs to crash.  Under normal circumstances you should
 658      never use 'internal conversion.
 659
 660 DOC-STRING is a string describing the coding system.
 661
 662 PROPS is a property list, describing the specific nature of the
 663 character set.  Recognized properties are:
 664
 665 'mnemonic
 666      String to be displayed in the modeline when this coding system is
 667      active.
 668
 669 'eol-type
 670      End-of-line conversion to be used.  It should be one of
 671
 672         nil
 673                 Automatically detect the end-of-line type (LF, CRLF,
 674                 or CR).  Also generate subsidiary coding systems named
 675                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 676                 identical to this coding system but have an EOL-TYPE
 677                 value of 'lf, 'crlf, and 'cr, respectively.
 678         'lf
 679                 The end of a line is marked externally using ASCII LF.
 680                 Since this is also the way that XEmacs represents an
 681                 end-of-line internally, specifying this option results
 682                 in no end-of-line conversion.  This is the standard
 683                 format for Unix text files.
 684         'crlf
 685                 The end of a line is marked externally using ASCII
 686                 CRLF.  This is the standard format for MS-DOS text
 687                 files.
 688         'cr
 689                 The end of a line is marked externally using ASCII CR.
 690                 This is the standard format for Macintosh text files.
 691         t
 692                 Automatically detect the end-of-line type but do not
 693                 generate subsidiary coding systems.  (This value is
 694                 converted to nil when stored internally, and
 695                 `coding-system-property' will return nil.)
 696
 697 'post-read-conversion
 698      Function called after a file has been read in, to perform the
 699      decoding.  Called with two arguments, BEG and END, denoting
 700      a region of the current buffer to be decoded.
 701
 702 'pre-write-conversion
 703      Function called before a file is written out, to perform the
 704      encoding.  Called with two arguments, BEG and END, denoting
 705      a region of the current buffer to be encoded.
 706
 707
 708 The following additional properties are recognized if TYPE is 'iso2022:
 709
 710 'charset-g0
 711 'charset-g1
 712 'charset-g2
 713 'charset-g3
 714      The character set initially designated to the G0 - G3 registers.
 715      The value should be one of
 716
 717           -- A charset object (designate that character set)
 718           -- nil (do not ever use this register)
 719           -- t (no character set is initially designated to
 720                 the register, but may be later on; this automatically
 721                 sets the corresponding `force-g*-on-output' property)
 722
 723 'force-g0-on-output
 724 'force-g1-on-output
 725 'force-g2-on-output
 726 'force-g2-on-output
 727      If non-nil, send an explicit designation sequence on output before
 728      using the specified register.
 729
 730 'short
 731      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 732      "ESC $ B" on output in place of the full designation sequences
 733      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 734
 735 'no-ascii-eol
 736      If non-nil, don't designate ASCII to G0 at each end of line on output.
 737      Setting this to non-nil also suppresses other state-resetting that
 738      normally happens at the end of a line.
 739
 740 'no-ascii-cntl
 741      If non-nil, don't designate ASCII to G0 before control chars on output.
 742
 743 'seven
 744      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 745      environment.
 746
 747 'lock-shift
 748      If non-nil, use locking-shift (SO/SI) instead of single-shift
 749      or designation by escape sequence.
 750
 751 'no-iso6429
 752      If non-nil, don't use ISO6429's direction specification.
 753
 754 'escape-quoted
 755      If non-nil, literal control characters that are the same as
 756      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 757      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 758      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 759      so that they can be properly distinguished from an escape sequence.
 760      (Note that doing this results in a non-portable encoding.) This
 761      encoding flag is used for byte-compiled files.  Note that ESC
 762      is a good choice for a quoting character because there are no
 763      escape sequences whose second byte is a character from the Control-0
 764      or Control-1 character sets; this is explicitly disallowed by the
 765      ISO2022 standard.
 766
 767 'input-charset-conversion
 768      A list of conversion specifications, specifying conversion of
 769      characters in one charset to another when decoding is performed.
 770      Each specification is a list of two elements: the source charset,
 771      and the destination charset.
 772
 773 'output-charset-conversion
 774      A list of conversion specifications, specifying conversion of
 775      characters in one charset to another when encoding is performed.
 776      The form of each specification is the same as for
 777      'input-charset-conversion.
 778
 779
 780 The following additional properties are recognized (and required)
 781 if TYPE is 'ccl:
 782
 783 'decode
 784      CCL program used for decoding (converting to internal format).
 785
 786 'encode
 787      CCL program used for encoding (converting to external format).
 788 */
 789        (name, type, doc_string, props))
 790 {
 791   Lisp_Coding_System *codesys;
 792   Lisp_Object rest, key, value;
 793   enum coding_system_type ty;
 794   int need_to_setup_eol_systems = 1;
 795
 796   /* Convert type to constant */
 797   if (NILP (type) || EQ (type, Qundecided))
 798                                       { ty = CODESYS_AUTODETECT; }
 799 #ifdef MULE
 800   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 801   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 802   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 803   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 804   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 805   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 806 #endif
 807   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 808 #ifdef DEBUG_XEMACS
 809   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 810 #endif
 811   else
 812     signal_simple_error ("Invalid coding system type", type);
 813
 814   CHECK_SYMBOL (name);
 815
 816   codesys = allocate_coding_system (ty, name);
 817
 818   if (NILP (doc_string))
 819     doc_string = build_string ("");
 820   else
 821     CHECK_STRING (doc_string);
 822   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 823
 824   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 825     {
 826       if (EQ (key, Qmnemonic))
 827         {
 828           if (!NILP (value))
 829             CHECK_STRING (value);
 830           CODING_SYSTEM_MNEMONIC (codesys) = value;
 831         }
 832
 833       else if (EQ (key, Qeol_type))
 834         {
 835           need_to_setup_eol_systems = NILP (value);
 836           if (EQ (value, Qt))
 837             value = Qnil;
 838           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 839         }
 840
 841       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 842       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 843 #ifdef MULE
 844       else if (ty == CODESYS_ISO2022)
 845         {
 846 #define FROB_INITIAL_CHARSET(charset_num) \
 847   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 848     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 849
 850           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 851           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 852           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 853           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 854
 855 #define FROB_FORCE_CHARSET(charset_num) \
 856   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 857
 858           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 859           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 860           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 861           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 862
 863 #define FROB_BOOLEAN_PROPERTY(prop) \
 864   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 865
 866           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 867           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 868           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 869           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 870           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 871           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 872           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 873
 874           else if (EQ (key, Qinput_charset_conversion))
 875             {
 876               codesys->iso2022.input_conv =
 877                 Dynarr_new (charset_conversion_spec);
 878               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 879                                               value);
 880             }
 881           else if (EQ (key, Qoutput_charset_conversion))
 882             {
 883               codesys->iso2022.output_conv =
 884                 Dynarr_new (charset_conversion_spec);
 885               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 886                                               value);
 887             }
 888           else
 889             signal_simple_error ("Unrecognized property", key);
 890         }
 891       else if (EQ (type, Qccl))
 892         {
 893           if (EQ (key, Qdecode))
 894             {
 895               CHECK_VECTOR (value);
 896               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 897             }
 898           else if (EQ (key, Qencode))
 899             {
 900               CHECK_VECTOR (value);
 901               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 902             }
 903           else
 904             signal_simple_error ("Unrecognized property", key);
 905         }
 906 #endif /* MULE */
 907       else
 908         signal_simple_error ("Unrecognized property", key);
 909     }
 910
 911   if (need_to_setup_eol_systems)
 912     setup_eol_coding_systems (codesys);
 913
 914   {
 915     Lisp_Object codesys_obj;
 916     XSETCODING_SYSTEM (codesys_obj, codesys);
 917     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 918     return codesys_obj;
 919   }
 920 }
 921
 922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 923 Copy OLD-CODING-SYSTEM to NEW-NAME.
 924 If NEW-NAME does not name an existing coding system, a new one will
 925 be created.
 926 */
 927        (old_coding_system, new_name))
 928 {
 929   Lisp_Object new_coding_system;
 930   old_coding_system = Fget_coding_system (old_coding_system);
 931   new_coding_system = Ffind_coding_system (new_name);
 932   if (NILP (new_coding_system))
 933     {
 934       XSETCODING_SYSTEM (new_coding_system,
 935                          allocate_coding_system
 936                          (XCODING_SYSTEM_TYPE (old_coding_system),
 937                           new_name));
 938       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 939     }
 940
 941   {
 942     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 943     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 944     memcpy (((char *) to  ) + sizeof (to->header),
 945             ((char *) from) + sizeof (from->header),
 946             sizeof (*from) - sizeof (from->header));
 947     to->name = new_name;
 948   }
 949   return new_coding_system;
 950 }
 951
 952 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
 953 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
 954 */
 955        (alias, coding_system))
 956 {
 957   CHECK_SYMBOL (alias);
 958   if (!NILP (Ffind_coding_system (alias)))
 959     signal_simple_error ("Symbol already names a coding system", alias);
 960   coding_system = Fget_coding_system (coding_system);
 961   Fputhash (alias, coding_system, Vcoding_system_hash_table);
 962
 963   /* Set up aliases for subsidiaries. */
 964   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
 965     {
 966       Lisp_Object str;
 967       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
 968 #define FROB(type, name)                                                        \
 969       do {                                                                      \
 970         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
 971         if (!NILP (subsidiary))                                                 \
 972           Fdefine_coding_system_alias                                           \
 973             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
 974       } while (0)
 975       FROB (LF,   "-unix");
 976       FROB (CRLF, "-dos");
 977       FROB (CR,   "-mac");
 978 #undef FROB
 979     }
 980   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
 981      but it doesn't look intentional, so I'd rather return something
 982      meaningful or nothing at all. */
 983   return Qnil;
 984 }
 985
 986 static Lisp_Object
 987 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
 988 {
 989   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
 990   Lisp_Object new_coding_system;
 991
 992   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
 993     return coding_system;
 994
 995   switch (type)
 996     {
 997     case EOL_AUTODETECT: return coding_system;
 998     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
 999     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1000     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1001     default:       abort ();
1002     }
1003
1004   return NILP (new_coding_system) ? coding_system : new_coding_system;
1005 }
1006
1007 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1008 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1009 */
1010        (coding_system, eol_type))
1011 {
1012   coding_system = Fget_coding_system (coding_system);
1013
1014   return subsidiary_coding_system (coding_system,
1015                                    symbol_to_eol_type (eol_type));
1016 }
1017
1018 \f
1019 /************************************************************************/
1020 /*                         Coding system accessors                      */
1021 /************************************************************************/
1022
1023 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1024 Return the doc string for CODING-SYSTEM.
1025 */
1026        (coding_system))
1027 {
1028   coding_system = Fget_coding_system (coding_system);
1029   return XCODING_SYSTEM_DOC_STRING (coding_system);
1030 }
1031
1032 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1033 Return the type of CODING-SYSTEM.
1034 */
1035        (coding_system))
1036 {
1037   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1038     {
1039     default: abort ();
1040     case CODESYS_AUTODETECT:    return Qundecided;
1041 #ifdef MULE
1042     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1043     case CODESYS_ISO2022:       return Qiso2022;
1044     case CODESYS_BIG5:          return Qbig5;
1045     case CODESYS_UCS4:          return Qucs4;
1046     case CODESYS_UTF8:          return Qutf8;
1047     case CODESYS_CCL:           return Qccl;
1048 #endif
1049     case CODESYS_NO_CONVERSION: return Qno_conversion;
1050 #ifdef DEBUG_XEMACS
1051     case CODESYS_INTERNAL:      return Qinternal;
1052 #endif
1053     }
1054 }
1055
1056 #ifdef MULE
1057 static
1058 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1059 {
1060   Lisp_Object cs
1061     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1062
1063   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1064 }
1065
1066 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1067 Return initial charset of CODING-SYSTEM designated to GNUM.
1068 GNUM allows 0 .. 3.
1069 */
1070        (coding_system, gnum))
1071 {
1072   coding_system = Fget_coding_system (coding_system);
1073   CHECK_INT (gnum);
1074
1075   return coding_system_charset (coding_system, XINT (gnum));
1076 }
1077 #endif /* MULE */
1078
1079 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1080 Return the PROP property of CODING-SYSTEM.
1081 */
1082        (coding_system, prop))
1083 {
1084   int i, ok = 0;
1085   enum coding_system_type type;
1086
1087   coding_system = Fget_coding_system (coding_system);
1088   CHECK_SYMBOL (prop);
1089   type = XCODING_SYSTEM_TYPE (coding_system);
1090
1091   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1092     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1093       {
1094         ok = 1;
1095         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1096           {
1097           case CODESYS_PROP_ALL_OK:
1098             break;
1099 #ifdef MULE
1100           case CODESYS_PROP_ISO2022:
1101             if (type != CODESYS_ISO2022)
1102               signal_simple_error
1103                 ("Property only valid in ISO2022 coding systems",
1104                  prop);
1105             break;
1106
1107           case CODESYS_PROP_CCL:
1108             if (type != CODESYS_CCL)
1109               signal_simple_error
1110                 ("Property only valid in CCL coding systems",
1111                  prop);
1112             break;
1113 #endif /* MULE */
1114           default:
1115             abort ();
1116           }
1117       }
1118
1119   if (!ok)
1120     signal_simple_error ("Unrecognized property", prop);
1121
1122   if (EQ (prop, Qname))
1123     return XCODING_SYSTEM_NAME (coding_system);
1124   else if (EQ (prop, Qtype))
1125     return Fcoding_system_type (coding_system);
1126   else if (EQ (prop, Qdoc_string))
1127     return XCODING_SYSTEM_DOC_STRING (coding_system);
1128   else if (EQ (prop, Qmnemonic))
1129     return XCODING_SYSTEM_MNEMONIC (coding_system);
1130   else if (EQ (prop, Qeol_type))
1131     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1132   else if (EQ (prop, Qeol_lf))
1133     return XCODING_SYSTEM_EOL_LF (coding_system);
1134   else if (EQ (prop, Qeol_crlf))
1135     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1136   else if (EQ (prop, Qeol_cr))
1137     return XCODING_SYSTEM_EOL_CR (coding_system);
1138   else if (EQ (prop, Qpost_read_conversion))
1139     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1140   else if (EQ (prop, Qpre_write_conversion))
1141     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1142 #ifdef MULE
1143   else if (type == CODESYS_ISO2022)
1144     {
1145       if (EQ (prop, Qcharset_g0))
1146         return coding_system_charset (coding_system, 0);
1147       else if (EQ (prop, Qcharset_g1))
1148         return coding_system_charset (coding_system, 1);
1149       else if (EQ (prop, Qcharset_g2))
1150         return coding_system_charset (coding_system, 2);
1151       else if (EQ (prop, Qcharset_g3))
1152         return coding_system_charset (coding_system, 3);
1153
1154 #define FORCE_CHARSET(charset_num) \
1155   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1156    (coding_system, charset_num) ? Qt : Qnil)
1157
1158       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1159       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1160       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1161       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1162
1163 #define LISP_BOOLEAN(prop) \
1164   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1165
1166       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1167       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1168       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1169       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1170       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1171       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1172       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1173
1174       else if (EQ (prop, Qinput_charset_conversion))
1175         return
1176           unparse_charset_conversion_specs
1177             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1178       else if (EQ (prop, Qoutput_charset_conversion))
1179         return
1180           unparse_charset_conversion_specs
1181             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1182       else
1183         abort ();
1184     }
1185   else if (type == CODESYS_CCL)
1186     {
1187       if (EQ (prop, Qdecode))
1188         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1189       else if (EQ (prop, Qencode))
1190         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1191       else
1192         abort ();
1193     }
1194 #endif /* MULE */
1195   else
1196     abort ();
1197
1198   return Qnil; /* not reached */
1199 }
1200
1201 \f
1202 /************************************************************************/
1203 /*                       Coding category functions                      */
1204 /************************************************************************/
1205
1206 static int
1207 decode_coding_category (Lisp_Object symbol)
1208 {
1209   int i;
1210
1211   CHECK_SYMBOL (symbol);
1212   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1213     if (EQ (coding_category_symbol[i], symbol))
1214       return i;
1215
1216   signal_simple_error ("Unrecognized coding category", symbol);
1217   return 0; /* not reached */
1218 }
1219
1220 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1221 Return a list of all recognized coding categories.
1222 */
1223        ())
1224 {
1225   int i;
1226   Lisp_Object list = Qnil;
1227
1228   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1229     list = Fcons (coding_category_symbol[i], list);
1230   return list;
1231 }
1232
1233 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1234 Change the priority order of the coding categories.
1235 LIST should be list of coding categories, in descending order of
1236 priority.  Unspecified coding categories will be lower in priority
1237 than all specified ones, in the same relative order they were in
1238 previously.
1239 */
1240        (list))
1241 {
1242   int category_to_priority[CODING_CATEGORY_LAST + 1];
1243   int i, j;
1244   Lisp_Object rest;
1245
1246   /* First generate a list that maps coding categories to priorities. */
1247
1248   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1249     category_to_priority[i] = -1;
1250
1251   /* Highest priority comes from the specified list. */
1252   i = 0;
1253   EXTERNAL_LIST_LOOP (rest, list)
1254     {
1255       int cat = decode_coding_category (XCAR (rest));
1256
1257       if (category_to_priority[cat] >= 0)
1258         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1259       category_to_priority[cat] = i++;
1260     }
1261
1262   /* Now go through the existing categories by priority to retrieve
1263      the categories not yet specified and preserve their priority
1264      order. */
1265   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1266     {
1267       int cat = coding_category_by_priority[j];
1268       if (category_to_priority[cat] < 0)
1269         category_to_priority[cat] = i++;
1270     }
1271
1272   /* Now we need to construct the inverse of the mapping we just
1273      constructed. */
1274
1275   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1276     coding_category_by_priority[category_to_priority[i]] = i;
1277
1278   /* Phew!  That was confusing. */
1279   return Qnil;
1280 }
1281
1282 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1283 Return a list of coding categories in descending order of priority.
1284 */
1285        ())
1286 {
1287   int i;
1288   Lisp_Object list = Qnil;
1289
1290   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1291     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1292                   list);
1293   return list;
1294 }
1295
1296 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1297 Change the coding system associated with a coding category.
1298 */
1299        (coding_category, coding_system))
1300 {
1301   int cat = decode_coding_category (coding_category);
1302
1303   coding_system = Fget_coding_system (coding_system);
1304   coding_category_system[cat] = coding_system;
1305   return Qnil;
1306 }
1307
1308 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1309 Return the coding system associated with a coding category.
1310 */
1311        (coding_category))
1312 {
1313   int cat = decode_coding_category (coding_category);
1314   Lisp_Object sys = coding_category_system[cat];
1315
1316   if (!NILP (sys))
1317     return XCODING_SYSTEM_NAME (sys);
1318   return Qnil;
1319 }
1320
1321 \f
1322 /************************************************************************/
1323 /*                     Detecting the encoding of data                   */
1324 /************************************************************************/
1325
1326 struct detection_state
1327 {
1328   enum eol_type eol_type;
1329   int seen_non_ascii;
1330   int mask;
1331 #ifdef MULE
1332   struct
1333     {
1334       int mask;
1335       int in_second_byte;
1336     }
1337   big5;
1338
1339   struct
1340     {
1341       int mask;
1342       int in_second_byte;
1343     }
1344   shift_jis;
1345
1346   struct
1347     {
1348       int mask;
1349       int in_byte;
1350   }
1351   ucs4;
1352
1353   struct
1354     {
1355       int mask;
1356       int in_byte;
1357     }
1358   utf8;
1359
1360   struct
1361     {
1362       int mask;
1363       int initted;
1364       struct iso2022_decoder iso;
1365       unsigned int flags;
1366       int high_byte_count;
1367       unsigned int saw_single_shift:1;
1368     }
1369   iso2022;
1370 #endif
1371   struct
1372     {
1373       int seen_anything;
1374       int just_saw_cr;
1375     }
1376   eol;
1377 };
1378
1379 static int
1380 acceptable_control_char_p (int c)
1381 {
1382   switch (c)
1383     {
1384       /* Allow and ignore control characters that you might
1385          reasonably see in a text file */
1386     case '\r':
1387     case '\n':
1388     case '\t':
1389     case  7: /* bell */
1390     case  8: /* backspace */
1391     case 11: /* vertical tab */
1392     case 12: /* form feed */
1393     case 26: /* MS-DOS C-z junk */
1394     case 31: /* '^_' -- for info */
1395       return 1;
1396     default:
1397       return 0;
1398     }
1399 }
1400
1401 static int
1402 mask_has_at_most_one_bit_p (int mask)
1403 {
1404   /* Perhaps the only thing useful you learn from intensive Microsoft
1405      technical interviews */
1406   return (mask & (mask - 1)) == 0;
1407 }
1408
1409 static enum eol_type
1410 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1411                  unsigned int n)
1412 {
1413   int c;
1414
1415   while (n--)
1416     {
1417       c = *src++;
1418       if (c == '\r')
1419         st->eol.just_saw_cr = 1;
1420       else
1421         {
1422           if (c == '\n')
1423             {
1424               if (st->eol.just_saw_cr)
1425                 return EOL_CRLF;
1426               else if (st->eol.seen_anything)
1427                 return EOL_LF;
1428             }
1429           else if (st->eol.just_saw_cr)
1430             return EOL_CR;
1431           st->eol.just_saw_cr = 0;
1432         }
1433       st->eol.seen_anything = 1;
1434     }
1435
1436   return EOL_AUTODETECT;
1437 }
1438
1439 /* Attempt to determine the encoding and EOL type of the given text.
1440    Before calling this function for the first type, you must initialize
1441    st->eol_type as appropriate and initialize st->mask to ~0.
1442
1443    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1444    not yet known.
1445
1446    st->mask holds the determined coding category mask, or ~0 if only
1447    ASCII has been seen so far.
1448
1449    Returns:
1450
1451    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1452         is present in st->mask
1453    1 == definitive answers are here for both st->eol_type and st->mask
1454 */
1455
1456 static int
1457 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1458                     unsigned int n, int just_do_eol)
1459 {
1460   int c;
1461
1462   if (st->eol_type == EOL_AUTODETECT)
1463     st->eol_type = detect_eol_type (st, src, n);
1464
1465   if (just_do_eol)
1466     return st->eol_type != EOL_AUTODETECT;
1467
1468   if (!st->seen_non_ascii)
1469     {
1470       for (; n; n--, src++)
1471         {
1472           c = *src;
1473           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1474             {
1475               st->seen_non_ascii = 1;
1476 #ifdef MULE
1477               st->shift_jis.mask = ~0;
1478               st->big5.mask = ~0;
1479               st->ucs4.mask = ~0;
1480               st->utf8.mask = ~0;
1481               st->iso2022.mask = ~0;
1482 #endif
1483               break;
1484             }
1485         }
1486     }
1487
1488   if (!n)
1489     return 0;
1490 #ifdef MULE
1491   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1492     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1493   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1494     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1495   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1496     st->big5.mask = detect_coding_big5 (st, src, n);
1497   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1498     st->utf8.mask = detect_coding_utf8 (st, src, n);
1499   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1500     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1501
1502   st->mask
1503     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1504     | st->utf8.mask | st->ucs4.mask;
1505 #endif
1506   {
1507     int retval = mask_has_at_most_one_bit_p (st->mask);
1508     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1509     return retval && st->eol_type != EOL_AUTODETECT;
1510   }
1511 }
1512
1513 static Lisp_Object
1514 coding_system_from_mask (int mask)
1515 {
1516   if (mask == ~0)
1517     {
1518       /* If the file was entirely or basically ASCII, use the
1519          default value of `buffer-file-coding-system'. */
1520       Lisp_Object retval =
1521         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1522       if (!NILP (retval))
1523         {
1524           retval = Ffind_coding_system (retval);
1525           if (NILP (retval))
1526             {
1527               warn_when_safe
1528                 (Qbad_variable, Qwarning,
1529                  "Invalid `default-buffer-file-coding-system', set to nil");
1530               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1531             }
1532         }
1533       if (NILP (retval))
1534         retval = Fget_coding_system (Qraw_text);
1535       return retval;
1536     }
1537   else
1538     {
1539       int i;
1540       int cat = -1;
1541 #ifdef MULE
1542       mask = postprocess_iso2022_mask (mask);
1543 #endif
1544       /* Look through the coding categories by priority and find
1545          the first one that is allowed. */
1546       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1547         {
1548           cat = coding_category_by_priority[i];
1549           if ((mask & (1 << cat)) &&
1550               !NILP (coding_category_system[cat]))
1551             break;
1552         }
1553       if (cat >= 0)
1554         return coding_category_system[cat];
1555       else
1556         return Fget_coding_system (Qraw_text);
1557     }
1558 }
1559
1560 /* Given a seekable read stream and potential coding system and EOL type
1561    as specified, do any autodetection that is called for.  If the
1562    coding system and/or EOL type are not autodetect, they will be left
1563    alone; but this function will never return an autodetect coding system
1564    or EOL type.
1565
1566    This function does not automatically fetch subsidiary coding systems;
1567    that should be unnecessary with the explicit eol-type argument. */
1568
1569 void
1570 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1571                               enum eol_type *eol_type_in_out)
1572 {
1573   struct detection_state decst;
1574
1575   if (*eol_type_in_out == EOL_AUTODETECT)
1576     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1577
1578   xzero (decst);
1579   decst.eol_type = *eol_type_in_out;
1580   decst.mask = ~0;
1581
1582   /* If autodetection is called for, do it now. */
1583   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1584       *eol_type_in_out == EOL_AUTODETECT)
1585     {
1586
1587       while (1)
1588         {
1589           unsigned char random_buffer[4096];
1590           int nread;
1591
1592           nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1593           if (!nread)
1594             break;
1595           if (detect_coding_type (&decst, random_buffer, nread,
1596                                   XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1597                                   CODESYS_AUTODETECT))
1598             break;
1599         }
1600
1601       *eol_type_in_out = decst.eol_type;
1602       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1603         *codesys_in_out = coding_system_from_mask (decst.mask);
1604     }
1605
1606   /* If we absolutely can't determine the EOL type, just assume LF. */
1607   if (*eol_type_in_out == EOL_AUTODETECT)
1608     *eol_type_in_out = EOL_LF;
1609
1610   Lstream_rewind (stream);
1611 }
1612
1613 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1614 Detect coding system of the text in the region between START and END.
1615 Returned a list of possible coding systems ordered by priority.
1616 If only ASCII characters are found, it returns 'undecided or one of
1617 its subsidiary coding systems according to a detected end-of-line
1618 type.  Optional arg BUFFER defaults to the current buffer.
1619 */
1620        (start, end, buffer))
1621 {
1622   Lisp_Object val = Qnil;
1623   struct buffer *buf = decode_buffer (buffer, 0);
1624   Bufpos b, e;
1625   Lisp_Object instream, lb_instream;
1626   Lstream *istr, *lb_istr;
1627   struct detection_state decst;
1628   struct gcpro gcpro1, gcpro2;
1629
1630   get_buffer_range_char (buf, start, end, &b, &e, 0);
1631   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1632   lb_istr = XLSTREAM (lb_instream);
1633   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1634   istr = XLSTREAM (instream);
1635   GCPRO2 (instream, lb_instream);
1636   xzero (decst);
1637   decst.eol_type = EOL_AUTODETECT;
1638   decst.mask = ~0;
1639   while (1)
1640     {
1641       unsigned char random_buffer[4096];
1642       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1643
1644       if (!nread)
1645         break;
1646       if (detect_coding_type (&decst, random_buffer, nread, 0))
1647         break;
1648     }
1649
1650   if (decst.mask == ~0)
1651     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1652                                     decst.eol_type);
1653   else
1654     {
1655       int i;
1656
1657       val = Qnil;
1658 #ifdef MULE
1659       decst.mask = postprocess_iso2022_mask (decst.mask);
1660 #endif
1661       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1662         {
1663           int sys = coding_category_by_priority[i];
1664           if (decst.mask & (1 << sys))
1665             {
1666               Lisp_Object codesys = coding_category_system[sys];
1667               if (!NILP (codesys))
1668                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1669               val = Fcons (codesys, val);
1670             }
1671         }
1672     }
1673   Lstream_close (istr);
1674   UNGCPRO;
1675   Lstream_delete (istr);
1676   Lstream_delete (lb_istr);
1677   return val;
1678 }
1679
1680 \f
1681 /************************************************************************/
1682 /*           Converting to internal Mule format ("decoding")            */
1683 /************************************************************************/
1684
1685 /* A decoding stream is a stream used for decoding text (i.e.
1686    converting from some external format to internal format).
1687    The decoding-stream object keeps track of the actual coding
1688    stream, the stream that is at the other end, and data that
1689    needs to be persistent across the lifetime of the stream. */
1690
1691 /* Handle the EOL stuff related to just-read-in character C.
1692    EOL_TYPE is the EOL type of the coding stream.
1693    FLAGS is the current value of FLAGS in the coding stream, and may
1694    be modified by this macro.  (The macro only looks at the
1695    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1696    bytes are to be written.  You need to also define a local goto
1697    label "label_continue_loop" that is at the end of the main
1698    character-reading loop.
1699
1700    If C is a CR character, then this macro handles it entirely and
1701    jumps to label_continue_loop.  Otherwise, this macro does not add
1702    anything to DST, and continues normally.  You should continue
1703    processing C normally after this macro. */
1704
1705 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1706 do {                                                            \
1707   if (c == '\r')                                                \
1708     {                                                           \
1709       if (eol_type == EOL_CR)                                   \
1710         Dynarr_add (dst, '\n');                                 \
1711       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1712         Dynarr_add (dst, c);                                    \
1713       else                                                      \
1714         flags |= CODING_STATE_CR;                               \
1715       goto label_continue_loop;                                 \
1716     }                                                           \
1717   else if (flags & CODING_STATE_CR)                             \
1718     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1719       if (c != '\n')                                            \
1720         Dynarr_add (dst, '\r');                                 \
1721       flags &= ~CODING_STATE_CR;                                \
1722     }                                                           \
1723 } while (0)
1724
1725 /* C should be a binary character in the range 0 - 255; convert
1726    to internal format and add to Dynarr DST. */
1727
1728 #ifdef UTF2000
1729 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1730 do {                                            \
1731   if (BYTE_ASCII_P (c))                         \
1732     Dynarr_add (dst, c);                        \
1733   else                                          \
1734     {                                           \
1735       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1736       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1737     }                                           \
1738 } while (0)
1739
1740 INLINE void
1741 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1742 {
1743   if ( c <= 0x7f )
1744     {
1745       Dynarr_add (dst, c);
1746     }
1747   else if ( c <= 0x7ff )
1748     {
1749       Dynarr_add (dst, (c >> 6) | 0xc0);
1750       Dynarr_add (dst, (c & 0x3f) | 0x80);
1751     }
1752   else if ( c <= 0xffff )
1753     {
1754       Dynarr_add (dst,  (c >> 12) | 0xe0);
1755       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1756       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1757     }
1758   else if ( c <= 0x1fffff )
1759     {
1760       Dynarr_add (dst,  (c >> 18) | 0xf0);
1761       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1762       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1763       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1764     }
1765   else if ( c <= 0x3ffffff )
1766     {
1767       Dynarr_add (dst,  (c >> 24) | 0xf8);
1768       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1769       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1770       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1771       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1772     }
1773   else
1774     {
1775       Dynarr_add (dst,  (c >> 30) | 0xfc);
1776       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1777       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1778       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1779       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1780       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1781     }
1782 }
1783 #else
1784 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1785 do {                                            \
1786   if (BYTE_ASCII_P (c))                         \
1787     Dynarr_add (dst, c);                        \
1788   else if (BYTE_C1_P (c))                       \
1789     {                                           \
1790       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1791       Dynarr_add (dst, c + 0x20);               \
1792     }                                           \
1793   else                                          \
1794     {                                           \
1795       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1796       Dynarr_add (dst, c);                      \
1797     }                                           \
1798 } while (0)
1799 #endif
1800
1801 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1802 do {                                    \
1803   if (ch)                               \
1804     {                                   \
1805       DECODE_ADD_BINARY_CHAR (ch, dst); \
1806       ch = 0;                           \
1807     }                                   \
1808 } while (0)
1809
1810 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1811 do {                                    \
1812   if (flags & CODING_STATE_END)         \
1813     {                                   \
1814       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1815       if (flags & CODING_STATE_CR)      \
1816         Dynarr_add (dst, '\r');         \
1817     }                                   \
1818 } while (0)
1819
1820 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1821
1822 struct decoding_stream
1823 {
1824   /* Coding system that governs the conversion. */
1825   Lisp_Coding_System *codesys;
1826
1827   /* Stream that we read the encoded data from or
1828      write the decoded data to. */
1829   Lstream *other_end;
1830
1831   /* If we are reading, then we can return only a fixed amount of
1832      data, so if the conversion resulted in too much data, we store it
1833      here for retrieval the next time around. */
1834   unsigned_char_dynarr *runoff;
1835
1836   /* FLAGS holds flags indicating the current state of the decoding.
1837      Some of these flags are dependent on the coding system. */
1838   unsigned int flags;
1839
1840   /* CH holds a partially built-up character.  Since we only deal
1841      with one- and two-byte characters at the moment, we only use
1842      this to store the first byte of a two-byte character. */
1843   unsigned int ch;
1844
1845   /* EOL_TYPE specifies the type of end-of-line conversion that
1846      currently applies.  We need to keep this separate from the
1847      EOL type stored in CODESYS because the latter might indicate
1848      automatic EOL-type detection while the former will always
1849      indicate a particular EOL type. */
1850   enum eol_type eol_type;
1851 #ifdef MULE
1852   /* Additional ISO2022 information.  We define the structure above
1853      because it's also needed by the detection routines. */
1854   struct iso2022_decoder iso2022;
1855
1856   /* Additional information (the state of the running CCL program)
1857      used by the CCL decoder. */
1858   struct ccl_program ccl;
1859 #endif
1860   struct detection_state decst;
1861 };
1862
1863 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1864 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1865 static int decoding_rewinder   (Lstream *stream);
1866 static int decoding_seekable_p (Lstream *stream);
1867 static int decoding_flusher    (Lstream *stream);
1868 static int decoding_closer     (Lstream *stream);
1869
1870 static Lisp_Object decoding_marker (Lisp_Object stream,
1871                                     void (*markobj) (Lisp_Object));
1872
1873 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1874                                sizeof (struct decoding_stream));
1875
1876 static Lisp_Object
1877 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1878 {
1879   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1880   Lisp_Object str_obj;
1881
1882   /* We do not need to mark the coding systems or charsets stored
1883      within the stream because they are stored in a global list
1884      and automatically marked. */
1885
1886   XSETLSTREAM (str_obj, str);
1887   markobj (str_obj);
1888   if (str->imp->marker)
1889     return (str->imp->marker) (str_obj, markobj);
1890   else
1891     return Qnil;
1892 }
1893
1894 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1895    so we read data from the other end, decode it, and store it into DATA. */
1896
1897 static int
1898 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1899 {
1900   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1901   unsigned char *orig_data = data;
1902   int read_size;
1903   int error_occurred = 0;
1904
1905   /* We need to interface to mule_decode(), which expects to take some
1906      amount of data and store the result into a Dynarr.  We have
1907      mule_decode() store into str->runoff, and take data from there
1908      as necessary. */
1909
1910   /* We loop until we have enough data, reading chunks from the other
1911      end and decoding it. */
1912   while (1)
1913     {
1914       /* Take data from the runoff if we can.  Make sure to take at
1915          most SIZE bytes, and delete the data from the runoff. */
1916       if (Dynarr_length (str->runoff) > 0)
1917         {
1918           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1919           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1920           Dynarr_delete_many (str->runoff, 0, chunk);
1921           data += chunk;
1922           size -= chunk;
1923         }
1924
1925       if (size == 0)
1926         break; /* No more room for data */
1927
1928       if (str->flags & CODING_STATE_END)
1929         /* This means that on the previous iteration, we hit the EOF on
1930            the other end.  We loop once more so that mule_decode() can
1931            output any final stuff it may be holding, or any "go back
1932            to a sane state" escape sequences. (This latter makes sense
1933            during encoding.) */
1934         break;
1935
1936       /* Exhausted the runoff, so get some more.  DATA has at least
1937          SIZE bytes left of storage in it, so it's OK to read directly
1938          into it.  (We'll be overwriting above, after we've decoded it
1939          into the runoff.) */
1940       read_size = Lstream_read (str->other_end, data, size);
1941       if (read_size < 0)
1942         {
1943           error_occurred = 1;
1944           break;
1945         }
1946       if (read_size == 0)
1947         /* There might be some more end data produced in the translation.
1948            See the comment above. */
1949         str->flags |= CODING_STATE_END;
1950       mule_decode (stream, data, str->runoff, read_size);
1951     }
1952
1953   if (data - orig_data == 0)
1954     return error_occurred ? -1 : 0;
1955   else
1956     return data - orig_data;
1957 }
1958
1959 static int
1960 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1961 {
1962   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1963   int retval;
1964
1965   /* Decode all our data into the runoff, and then attempt to write
1966      it all out to the other end.  Remove whatever chunk we succeeded
1967      in writing. */
1968   mule_decode (stream, data, str->runoff, size);
1969   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1970                           Dynarr_length (str->runoff));
1971   if (retval > 0)
1972     Dynarr_delete_many (str->runoff, 0, retval);
1973   /* Do NOT return retval.  The return value indicates how much
1974      of the incoming data was written, not how many bytes were
1975      written. */
1976   return size;
1977 }
1978
1979 static void
1980 reset_decoding_stream (struct decoding_stream *str)
1981 {
1982 #ifdef MULE
1983   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1984     {
1985       Lisp_Object coding_system;
1986       XSETCODING_SYSTEM (coding_system, str->codesys);
1987       reset_iso2022 (coding_system, &str->iso2022);
1988     }
1989   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1990     {
1991       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1992     }
1993 #endif /* MULE */
1994   str->flags = str->ch = 0;
1995 }
1996
1997 static int
1998 decoding_rewinder (Lstream *stream)
1999 {
2000   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2001   reset_decoding_stream (str);
2002   Dynarr_reset (str->runoff);
2003   return Lstream_rewind (str->other_end);
2004 }
2005
2006 static int
2007 decoding_seekable_p (Lstream *stream)
2008 {
2009   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2010   return Lstream_seekable_p (str->other_end);
2011 }
2012
2013 static int
2014 decoding_flusher (Lstream *stream)
2015 {
2016   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2017   return Lstream_flush (str->other_end);
2018 }
2019
2020 static int
2021 decoding_closer (Lstream *stream)
2022 {
2023   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2024   if (stream->flags & LSTREAM_FL_WRITE)
2025     {
2026       str->flags |= CODING_STATE_END;
2027       decoding_writer (stream, 0, 0);
2028     }
2029   Dynarr_free (str->runoff);
2030 #ifdef MULE
2031 #ifdef ENABLE_COMPOSITE_CHARS
2032   if (str->iso2022.composite_chars)
2033     Dynarr_free (str->iso2022.composite_chars);
2034 #endif
2035 #endif
2036   return Lstream_close (str->other_end);
2037 }
2038
2039 Lisp_Object
2040 decoding_stream_coding_system (Lstream *stream)
2041 {
2042   Lisp_Object coding_system;
2043   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2044
2045   XSETCODING_SYSTEM (coding_system, str->codesys);
2046   return subsidiary_coding_system (coding_system, str->eol_type);
2047 }
2048
2049 void
2050 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2051 {
2052   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2053   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2054   str->codesys = cs;
2055   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2056     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2057   reset_decoding_stream (str);
2058 }
2059
2060 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2061    stream for writing, no automatic code detection will be performed.
2062    The reason for this is that automatic code detection requires a
2063    seekable input.  Things will also fail if you open a decoding
2064    stream for reading using a non-fully-specified coding system and
2065    a non-seekable input stream. */
2066
2067 static Lisp_Object
2068 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2069                         CONST char *mode)
2070 {
2071   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2072   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2073   Lisp_Object obj;
2074
2075   xzero (*str);
2076   str->other_end = stream;
2077   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2078   str->eol_type = EOL_AUTODETECT;
2079   if (!strcmp (mode, "r")
2080       && Lstream_seekable_p (stream))
2081     /* We can determine the coding system now. */
2082     determine_real_coding_system (stream, &codesys, &str->eol_type);
2083   set_decoding_stream_coding_system (lstr, codesys);
2084   str->decst.eol_type = str->eol_type;
2085   str->decst.mask = ~0;
2086   XSETLSTREAM (obj, lstr);
2087   return obj;
2088 }
2089
2090 Lisp_Object
2091 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2092 {
2093   return make_decoding_stream_1 (stream, codesys, "r");
2094 }
2095
2096 Lisp_Object
2097 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2098 {
2099   return make_decoding_stream_1 (stream, codesys, "w");
2100 }
2101
2102 /* Note: the decode_coding_* functions all take the same
2103    arguments as mule_decode(), which is to say some SRC data of
2104    size N, which is to be stored into dynamic array DST.
2105    DECODING is the stream within which the decoding is
2106    taking place, but no data is actually read from or
2107    written to that stream; that is handled in decoding_reader()
2108    or decoding_writer().  This allows the same functions to
2109    be used for both reading and writing. */
2110
2111 static void
2112 mule_decode (Lstream *decoding, CONST unsigned char *src,
2113              unsigned_char_dynarr *dst, unsigned int n)
2114 {
2115   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2116
2117   /* If necessary, do encoding-detection now.  We do this when
2118      we're a writing stream or a non-seekable reading stream,
2119      meaning that we can't just process the whole input,
2120      rewind, and start over. */
2121
2122   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2123       str->eol_type == EOL_AUTODETECT)
2124     {
2125       Lisp_Object codesys;
2126
2127       XSETCODING_SYSTEM (codesys, str->codesys);
2128       detect_coding_type (&str->decst, src, n,
2129                           CODING_SYSTEM_TYPE (str->codesys) !=
2130                           CODESYS_AUTODETECT);
2131       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2132           str->decst.mask != ~0)
2133         /* #### This is cheesy.  What we really ought to do is
2134            buffer up a certain amount of data so as to get a
2135            less random result. */
2136         codesys = coding_system_from_mask (str->decst.mask);
2137       str->eol_type = str->decst.eol_type;
2138       if (XCODING_SYSTEM (codesys) != str->codesys)
2139         {
2140           /* Preserve the CODING_STATE_END flag in case it was set.
2141              If we erase it, bad things might happen. */
2142           int was_end = str->flags & CODING_STATE_END;
2143           set_decoding_stream_coding_system (decoding, codesys);
2144           if (was_end)
2145             str->flags |= CODING_STATE_END;
2146         }
2147     }
2148
2149   switch (CODING_SYSTEM_TYPE (str->codesys))
2150     {
2151 #ifdef DEBUG_XEMACS
2152     case CODESYS_INTERNAL:
2153       Dynarr_add_many (dst, src, n);
2154       break;
2155 #endif
2156     case CODESYS_AUTODETECT:
2157       /* If we got this far and still haven't decided on the coding
2158          system, then do no conversion. */
2159     case CODESYS_NO_CONVERSION:
2160       decode_coding_no_conversion (decoding, src, dst, n);
2161       break;
2162 #ifdef MULE
2163     case CODESYS_SHIFT_JIS:
2164       decode_coding_sjis (decoding, src, dst, n);
2165       break;
2166     case CODESYS_BIG5:
2167       decode_coding_big5 (decoding, src, dst, n);
2168       break;
2169     case CODESYS_UCS4:
2170       decode_coding_ucs4 (decoding, src, dst, n);
2171       break;
2172     case CODESYS_UTF8:
2173       decode_coding_utf8 (decoding, src, dst, n);
2174       break;
2175     case CODESYS_CCL:
2176       ccl_driver (&str->ccl, src, dst, n, 0);
2177       break;
2178     case CODESYS_ISO2022:
2179       decode_coding_iso2022 (decoding, src, dst, n);
2180       break;
2181 #endif /* MULE */
2182     default:
2183       abort ();
2184     }
2185 }
2186
2187 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2188 Decode the text between START and END which is encoded in CODING-SYSTEM.
2189 This is useful if you've read in encoded text from a file without decoding
2190 it (e.g. you read in a JIS-formatted file but used the `binary' or
2191 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2192 Return length of decoded text.
2193 BUFFER defaults to the current buffer if unspecified.
2194 */
2195        (start, end, coding_system, buffer))
2196 {
2197   Bufpos b, e;
2198   struct buffer *buf = decode_buffer (buffer, 0);
2199   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2200   Lstream *istr, *ostr;
2201   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2202
2203   get_buffer_range_char (buf, start, end, &b, &e, 0);
2204
2205   barf_if_buffer_read_only (buf, b, e);
2206
2207   coding_system = Fget_coding_system (coding_system);
2208   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2209   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2210   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2211                                               coding_system);
2212   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2213                                            Fget_coding_system (Qbinary));
2214   istr = XLSTREAM (instream);
2215   ostr = XLSTREAM (outstream);
2216   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2217
2218   /* The chain of streams looks like this:
2219
2220      [BUFFER] <----- send through
2221                      ------> [ENCODE AS BINARY]
2222                              ------> [DECODE AS SPECIFIED]
2223                                      ------> [BUFFER]
2224    */
2225
2226   while (1)
2227     {
2228       char tempbuf[1024]; /* some random amount */
2229       Bufpos newpos, even_newer_pos;
2230       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2231       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2232
2233       if (!size_in_bytes)
2234         break;
2235       newpos = lisp_buffer_stream_startpos (istr);
2236       Lstream_write (ostr, tempbuf, size_in_bytes);
2237       even_newer_pos = lisp_buffer_stream_startpos (istr);
2238       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2239                            even_newer_pos, 0);
2240     }
2241   Lstream_close (istr);
2242   Lstream_close (ostr);
2243   UNGCPRO;
2244   Lstream_delete (istr);
2245   Lstream_delete (ostr);
2246   Lstream_delete (XLSTREAM (de_outstream));
2247   Lstream_delete (XLSTREAM (lb_outstream));
2248   return Qnil;
2249 }
2250
2251 \f
2252 /************************************************************************/
2253 /*           Converting to an external encoding ("encoding")            */
2254 /************************************************************************/
2255
2256 /* An encoding stream is an output stream.  When you create the
2257    stream, you specify the coding system that governs the encoding
2258    and another stream that the resulting encoded data is to be
2259    sent to, and then start sending data to it. */
2260
2261 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2262
2263 struct encoding_stream
2264 {
2265   /* Coding system that governs the conversion. */
2266   Lisp_Coding_System *codesys;
2267
2268   /* Stream that we read the encoded data from or
2269      write the decoded data to. */
2270   Lstream *other_end;
2271
2272   /* If we are reading, then we can return only a fixed amount of
2273      data, so if the conversion resulted in too much data, we store it
2274      here for retrieval the next time around. */
2275   unsigned_char_dynarr *runoff;
2276
2277   /* FLAGS holds flags indicating the current state of the encoding.
2278      Some of these flags are dependent on the coding system. */
2279   unsigned int flags;
2280
2281   /* CH holds a partially built-up character.  Since we only deal
2282      with one- and two-byte characters at the moment, we only use
2283      this to store the first byte of a two-byte character. */
2284   unsigned int ch;
2285 #ifdef MULE
2286   /* Additional information used by the ISO2022 encoder. */
2287   struct
2288     {
2289       /* CHARSET holds the character sets currently assigned to the G0
2290          through G3 registers.  It is initialized from the array
2291          INITIAL_CHARSET in CODESYS. */
2292       Lisp_Object charset[4];
2293
2294       /* Which registers are currently invoked into the left (GL) and
2295          right (GR) halves of the 8-bit encoding space? */
2296       int register_left, register_right;
2297
2298       /* Whether we need to explicitly designate the charset in the
2299          G? register before using it.  It is initialized from the
2300          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2301       unsigned char force_charset_on_output[4];
2302
2303       /* Other state variables that need to be preserved across
2304          invocations. */
2305       Lisp_Object current_charset;
2306       int current_half;
2307       int current_char_boundary;
2308     } iso2022;
2309
2310   /* Additional information (the state of the running CCL program)
2311      used by the CCL encoder. */
2312   struct ccl_program ccl;
2313 #endif /* MULE */
2314 };
2315
2316 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2317 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2318                             size_t size);
2319 static int encoding_rewinder   (Lstream *stream);
2320 static int encoding_seekable_p (Lstream *stream);
2321 static int encoding_flusher    (Lstream *stream);
2322 static int encoding_closer     (Lstream *stream);
2323
2324 static Lisp_Object encoding_marker (Lisp_Object stream,
2325                                     void (*markobj) (Lisp_Object));
2326
2327 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2328                                sizeof (struct encoding_stream));
2329
2330 static Lisp_Object
2331 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2332 {
2333   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2334   Lisp_Object str_obj;
2335
2336   /* We do not need to mark the coding systems or charsets stored
2337      within the stream because they are stored in a global list
2338      and automatically marked. */
2339
2340   XSETLSTREAM (str_obj, str);
2341   markobj (str_obj);
2342   if (str->imp->marker)
2343     return (str->imp->marker) (str_obj, markobj);
2344   else
2345     return Qnil;
2346 }
2347
2348 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2349    so we read data from the other end, encode it, and store it into DATA. */
2350
2351 static int
2352 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2353 {
2354   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2355   unsigned char *orig_data = data;
2356   int read_size;
2357   int error_occurred = 0;
2358
2359   /* We need to interface to mule_encode(), which expects to take some
2360      amount of data and store the result into a Dynarr.  We have
2361      mule_encode() store into str->runoff, and take data from there
2362      as necessary. */
2363
2364   /* We loop until we have enough data, reading chunks from the other
2365      end and encoding it. */
2366   while (1)
2367     {
2368       /* Take data from the runoff if we can.  Make sure to take at
2369          most SIZE bytes, and delete the data from the runoff. */
2370       if (Dynarr_length (str->runoff) > 0)
2371         {
2372           int chunk = min ((int) size, Dynarr_length (str->runoff));
2373           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2374           Dynarr_delete_many (str->runoff, 0, chunk);
2375           data += chunk;
2376           size -= chunk;
2377         }
2378
2379       if (size == 0)
2380         break; /* No more room for data */
2381
2382       if (str->flags & CODING_STATE_END)
2383         /* This means that on the previous iteration, we hit the EOF on
2384            the other end.  We loop once more so that mule_encode() can
2385            output any final stuff it may be holding, or any "go back
2386            to a sane state" escape sequences. (This latter makes sense
2387            during encoding.) */
2388         break;
2389
2390       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2391          left of storage in it, so it's OK to read directly into it.
2392          (We'll be overwriting above, after we've encoded it into the
2393          runoff.) */
2394       read_size = Lstream_read (str->other_end, data, size);
2395       if (read_size < 0)
2396         {
2397           error_occurred = 1;
2398           break;
2399         }
2400       if (read_size == 0)
2401         /* There might be some more end data produced in the translation.
2402            See the comment above. */
2403         str->flags |= CODING_STATE_END;
2404       mule_encode (stream, data, str->runoff, read_size);
2405     }
2406
2407   if (data == orig_data)
2408     return error_occurred ? -1 : 0;
2409   else
2410     return data - orig_data;
2411 }
2412
2413 static int
2414 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2415 {
2416   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2417   int retval;
2418
2419   /* Encode all our data into the runoff, and then attempt to write
2420      it all out to the other end.  Remove whatever chunk we succeeded
2421      in writing. */
2422   mule_encode (stream, data, str->runoff, size);
2423   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2424                           Dynarr_length (str->runoff));
2425   if (retval > 0)
2426     Dynarr_delete_many (str->runoff, 0, retval);
2427   /* Do NOT return retval.  The return value indicates how much
2428      of the incoming data was written, not how many bytes were
2429      written. */
2430   return size;
2431 }
2432
2433 static void
2434 reset_encoding_stream (struct encoding_stream *str)
2435 {
2436 #ifdef MULE
2437   switch (CODING_SYSTEM_TYPE (str->codesys))
2438     {
2439     case CODESYS_ISO2022:
2440       {
2441         int i;
2442
2443         for (i = 0; i < 4; i++)
2444           {
2445             str->iso2022.charset[i] =
2446               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2447             str->iso2022.force_charset_on_output[i] =
2448               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2449           }
2450         str->iso2022.register_left = 0;
2451         str->iso2022.register_right = 1;
2452         str->iso2022.current_charset = Qnil;
2453         str->iso2022.current_half = 0;
2454         str->iso2022.current_char_boundary = 1;
2455         break;
2456       }
2457     case CODESYS_CCL:
2458       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2459       break;
2460     default:
2461       break;
2462     }
2463 #endif /* MULE */
2464
2465   str->flags = str->ch = 0;
2466 }
2467
2468 static int
2469 encoding_rewinder (Lstream *stream)
2470 {
2471   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2472   reset_encoding_stream (str);
2473   Dynarr_reset (str->runoff);
2474   return Lstream_rewind (str->other_end);
2475 }
2476
2477 static int
2478 encoding_seekable_p (Lstream *stream)
2479 {
2480   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2481   return Lstream_seekable_p (str->other_end);
2482 }
2483
2484 static int
2485 encoding_flusher (Lstream *stream)
2486 {
2487   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2488   return Lstream_flush (str->other_end);
2489 }
2490
2491 static int
2492 encoding_closer (Lstream *stream)
2493 {
2494   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2495   if (stream->flags & LSTREAM_FL_WRITE)
2496     {
2497       str->flags |= CODING_STATE_END;
2498       encoding_writer (stream, 0, 0);
2499     }
2500   Dynarr_free (str->runoff);
2501   return Lstream_close (str->other_end);
2502 }
2503
2504 Lisp_Object
2505 encoding_stream_coding_system (Lstream *stream)
2506 {
2507   Lisp_Object coding_system;
2508   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2509
2510   XSETCODING_SYSTEM (coding_system, str->codesys);
2511   return coding_system;
2512 }
2513
2514 void
2515 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2516 {
2517   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2518   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2519   str->codesys = cs;
2520   reset_encoding_stream (str);
2521 }
2522
2523 static Lisp_Object
2524 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2525                         CONST char *mode)
2526 {
2527   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2528   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2529   Lisp_Object obj;
2530
2531   xzero (*str);
2532   str->runoff = Dynarr_new (unsigned_char);
2533   str->other_end = stream;
2534   set_encoding_stream_coding_system (lstr, codesys);
2535   XSETLSTREAM (obj, lstr);
2536   return obj;
2537 }
2538
2539 Lisp_Object
2540 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2541 {
2542   return make_encoding_stream_1 (stream, codesys, "r");
2543 }
2544
2545 Lisp_Object
2546 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2547 {
2548   return make_encoding_stream_1 (stream, codesys, "w");
2549 }
2550
2551 /* Convert N bytes of internally-formatted data stored in SRC to an
2552    external format, according to the encoding stream ENCODING.
2553    Store the encoded data into DST. */
2554
2555 static void
2556 mule_encode (Lstream *encoding, CONST unsigned char *src,
2557              unsigned_char_dynarr *dst, unsigned int n)
2558 {
2559   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2560
2561   switch (CODING_SYSTEM_TYPE (str->codesys))
2562     {
2563 #ifdef DEBUG_XEMACS
2564     case CODESYS_INTERNAL:
2565       Dynarr_add_many (dst, src, n);
2566       break;
2567 #endif
2568     case CODESYS_AUTODETECT:
2569       /* If we got this far and still haven't decided on the coding
2570          system, then do no conversion. */
2571     case CODESYS_NO_CONVERSION:
2572       encode_coding_no_conversion (encoding, src, dst, n);
2573       break;
2574 #ifdef MULE
2575     case CODESYS_SHIFT_JIS:
2576       encode_coding_sjis (encoding, src, dst, n);
2577       break;
2578     case CODESYS_BIG5:
2579       encode_coding_big5 (encoding, src, dst, n);
2580       break;
2581     case CODESYS_UCS4:
2582       encode_coding_ucs4 (encoding, src, dst, n);
2583       break;
2584     case CODESYS_UTF8:
2585       encode_coding_utf8 (encoding, src, dst, n);
2586       break;
2587     case CODESYS_CCL:
2588       ccl_driver (&str->ccl, src, dst, n, 0);
2589       break;
2590     case CODESYS_ISO2022:
2591       encode_coding_iso2022 (encoding, src, dst, n);
2592       break;
2593 #endif /* MULE */
2594     default:
2595       abort ();
2596     }
2597 }
2598
2599 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2600 Encode the text between START and END using CODING-SYSTEM.
2601 This will, for example, convert Japanese characters into stuff such as
2602 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2603 text.  BUFFER defaults to the current buffer if unspecified.
2604 */
2605        (start, end, coding_system, buffer))
2606 {
2607   Bufpos b, e;
2608   struct buffer *buf = decode_buffer (buffer, 0);
2609   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2610   Lstream *istr, *ostr;
2611   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2612
2613   get_buffer_range_char (buf, start, end, &b, &e, 0);
2614
2615   barf_if_buffer_read_only (buf, b, e);
2616
2617   coding_system = Fget_coding_system (coding_system);
2618   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2619   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2620   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2621                                               Fget_coding_system (Qbinary));
2622   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2623                                            coding_system);
2624   istr = XLSTREAM (instream);
2625   ostr = XLSTREAM (outstream);
2626   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2627   /* The chain of streams looks like this:
2628
2629      [BUFFER] <----- send through
2630                      ------> [ENCODE AS SPECIFIED]
2631                              ------> [DECODE AS BINARY]
2632                                      ------> [BUFFER]
2633    */
2634   while (1)
2635     {
2636       char tempbuf[1024]; /* some random amount */
2637       Bufpos newpos, even_newer_pos;
2638       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2639       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2640
2641       if (!size_in_bytes)
2642         break;
2643       newpos = lisp_buffer_stream_startpos (istr);
2644       Lstream_write (ostr, tempbuf, size_in_bytes);
2645       even_newer_pos = lisp_buffer_stream_startpos (istr);
2646       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2647                            even_newer_pos, 0);
2648     }
2649
2650   {
2651     Charcount retlen =
2652       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2653     Lstream_close (istr);
2654     Lstream_close (ostr);
2655     UNGCPRO;
2656     Lstream_delete (istr);
2657     Lstream_delete (ostr);
2658     Lstream_delete (XLSTREAM (de_outstream));
2659     Lstream_delete (XLSTREAM (lb_outstream));
2660     return make_int (retlen);
2661   }
2662 }
2663
2664 #ifdef MULE
2665 \f
2666 /************************************************************************/
2667 /*                          Shift-JIS methods                           */
2668 /************************************************************************/
2669
2670 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2671    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2672    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2673    encoded by "position-code + 0x80".  A character of JISX0208
2674    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2675    position-codes are divided and shifted so that it fit in the range
2676    below.
2677
2678    --- CODE RANGE of Shift-JIS ---
2679    (character set)      (range)
2680    ASCII                0x00 .. 0x7F
2681    JISX0201-Kana        0xA0 .. 0xDF
2682    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2683             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2684    -------------------------------
2685
2686 */
2687
2688 /* Is this the first byte of a Shift-JIS two-byte char? */
2689
2690 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2691   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2692
2693 /* Is this the second byte of a Shift-JIS two-byte char? */
2694
2695 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2696   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2697
2698 #define BYTE_SJIS_KATAKANA_P(c) \
2699   ((c) >= 0xA1 && (c) <= 0xDF)
2700
2701 static int
2702 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2703                     unsigned int n)
2704 {
2705   int c;
2706
2707   while (n--)
2708     {
2709       c = *src++;
2710       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2711         return 0;
2712       if (st->shift_jis.in_second_byte)
2713         {
2714           st->shift_jis.in_second_byte = 0;
2715           if (c < 0x40)
2716             return 0;
2717         }
2718       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2719         st->shift_jis.in_second_byte = 1;
2720     }
2721   return CODING_CATEGORY_SHIFT_JIS_MASK;
2722 }
2723
2724 /* Convert Shift-JIS data to internal format. */
2725
2726 static void
2727 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2728                     unsigned_char_dynarr *dst, unsigned int n)
2729 {
2730   unsigned char c;
2731   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2732   unsigned int flags  = str->flags;
2733   unsigned int ch     = str->ch;
2734   eol_type_t eol_type = str->eol_type;
2735
2736   while (n--)
2737     {
2738       c = *src++;
2739
2740       if (ch)
2741         {
2742           /* Previous character was first byte of Shift-JIS Kanji char. */
2743           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2744             {
2745               unsigned char e1, e2;
2746
2747               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2748               DECODE_SJIS (ch, c, e1, e2);
2749               Dynarr_add (dst, e1);
2750               Dynarr_add (dst, e2);
2751             }
2752           else
2753             {
2754               DECODE_ADD_BINARY_CHAR (ch, dst);
2755               DECODE_ADD_BINARY_CHAR (c, dst);
2756             }
2757           ch = 0;
2758         }
2759       else
2760         {
2761           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2762           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2763             ch = c;
2764           else if (BYTE_SJIS_KATAKANA_P (c))
2765             {
2766               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2767               Dynarr_add (dst, c);
2768             }
2769           else
2770             DECODE_ADD_BINARY_CHAR (c, dst);
2771         }
2772     label_continue_loop:;
2773     }
2774
2775   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2776
2777   str->flags = flags;
2778   str->ch    = ch;
2779 }
2780
2781 /* Convert internally-formatted data to Shift-JIS. */
2782
2783 static void
2784 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2785                     unsigned_char_dynarr *dst, unsigned int n)
2786 {
2787   unsigned char c;
2788   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2789   unsigned int flags  = str->flags;
2790   unsigned int ch     = str->ch;
2791   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2792
2793   while (n--)
2794     {
2795       c = *src++;
2796       if (c == '\n')
2797         {
2798           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2799             Dynarr_add (dst, '\r');
2800           if (eol_type != EOL_CR)
2801             Dynarr_add (dst, '\n');
2802           ch = 0;
2803         }
2804       else if (BYTE_ASCII_P (c))
2805         {
2806           Dynarr_add (dst, c);
2807           ch = 0;
2808         }
2809       else if (BUFBYTE_LEADING_BYTE_P (c))
2810         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2811               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2812               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2813       else if (ch)
2814         {
2815           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2816             {
2817               Dynarr_add (dst, c);
2818               ch = 0;
2819             }
2820           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2821                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2822             ch = c;
2823           else
2824             {
2825               unsigned char j1, j2;
2826               ENCODE_SJIS (ch, c, j1, j2);
2827               Dynarr_add (dst, j1);
2828               Dynarr_add (dst, j2);
2829               ch = 0;
2830             }
2831         }
2832     }
2833
2834   str->flags = flags;
2835   str->ch    = ch;
2836 }
2837
2838 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2839 Decode a JISX0208 character of Shift-JIS coding-system.
2840 CODE is the character code in Shift-JIS as a cons of type bytes.
2841 Return the corresponding character.
2842 */
2843        (code))
2844 {
2845   unsigned char c1, c2, s1, s2;
2846
2847   CHECK_CONS (code);
2848   CHECK_INT (XCAR (code));
2849   CHECK_INT (XCDR (code));
2850   s1 = XINT (XCAR (code));
2851   s2 = XINT (XCDR (code));
2852   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2853       BYTE_SJIS_TWO_BYTE_2_P (s2))
2854     {
2855       DECODE_SJIS (s1, s2, c1, c2);
2856       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2857                                    c1 & 0x7F, c2 & 0x7F));
2858     }
2859   else
2860     return Qnil;
2861 }
2862
2863 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2864 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2865 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2866 */
2867        (ch))
2868 {
2869   Lisp_Object charset;
2870   int c1, c2, s1, s2;
2871
2872   CHECK_CHAR_COERCE_INT (ch);
2873   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2874   if (EQ (charset, Vcharset_japanese_jisx0208))
2875     {
2876       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2877       return Fcons (make_int (s1), make_int (s2));
2878     }
2879   else
2880     return Qnil;
2881 }
2882
2883 \f
2884 /************************************************************************/
2885 /*                            Big5 methods                              */
2886 /************************************************************************/
2887
2888 /* BIG5 is a coding system encoding two character sets: ASCII and
2889    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2890    character set and is encoded in two-byte.
2891
2892    --- CODE RANGE of BIG5 ---
2893    (character set)      (range)
2894    ASCII                0x00 .. 0x7F
2895    Big5 (1st byte)      0xA1 .. 0xFE
2896         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2897    --------------------------
2898
2899    Since the number of characters in Big5 is larger than maximum
2900    characters in Emacs' charset (96x96), it can't be handled as one
2901    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2902    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
2903    contains frequently used characters and the latter contains less
2904    frequently used characters.  */
2905
2906 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2907   ((c) >= 0xA1 && (c) <= 0xFE)
2908
2909 /* Is this the second byte of a Shift-JIS two-byte char? */
2910
2911 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2912   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2913
2914 /* Number of Big5 characters which have the same code in 1st byte.  */
2915
2916 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2917
2918 /* Code conversion macros.  These are macros because they are used in
2919    inner loops during code conversion.
2920
2921    Note that temporary variables in macros introduce the classic
2922    dynamic-scoping problems with variable names.  We use capital-
2923    lettered variables in the assumption that XEmacs does not use
2924    capital letters in variables except in a very formalized way
2925    (e.g. Qstring). */
2926
2927 /* Convert Big5 code (b1, b2) into its internal string representation
2928    (lb, c1, c2). */
2929
2930 /* There is a much simpler way to split the Big5 charset into two.
2931    For the moment I'm going to leave the algorithm as-is because it
2932    claims to separate out the most-used characters into a single
2933    charset, which perhaps will lead to optimizations in various
2934    places.
2935
2936    The way the algorithm works is something like this:
2937
2938    Big5 can be viewed as a 94x157 charset, where the row is
2939    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2940    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
2941    the split between low and high column numbers is apparently
2942    meaningless; ascending rows produce less and less frequent chars.
2943    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2944    the first charset, and the upper half (0xC9 .. 0xFE) to the
2945    second.  To do the conversion, we convert the character into
2946    a single number where 0 .. 156 is the first row, 157 .. 313
2947    is the second, etc.  That way, the characters are ordered by
2948    decreasing frequency.  Then we just chop the space in two
2949    and coerce the result into a 94x94 space.
2950    */
2951
2952 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
2953 {                                                                       \
2954   int B1 = b1, B2 = b2;                                                 \
2955   unsigned int I                                                        \
2956     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
2957                                                                         \
2958   if (B1 < 0xC9)                                                        \
2959     {                                                                   \
2960       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
2961     }                                                                   \
2962   else                                                                  \
2963     {                                                                   \
2964       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
2965       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
2966     }                                                                   \
2967   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
2968   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
2969 } while (0)
2970
2971 /* Convert the internal string representation of a Big5 character
2972    (lb, c1, c2) into Big5 code (b1, b2). */
2973
2974 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
2975 {                                                                       \
2976   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
2977                                                                         \
2978   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
2979     {                                                                   \
2980       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
2981     }                                                                   \
2982   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
2983   b2 = I % BIG5_SAME_ROW;                                               \
2984   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
2985 } while (0)
2986
2987 static int
2988 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2989                     unsigned int n)
2990 {
2991   int c;
2992
2993   while (n--)
2994     {
2995       c = *src++;
2996       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2997           (c >= 0x80 && c <= 0xA0))
2998         return 0;
2999       if (st->big5.in_second_byte)
3000         {
3001           st->big5.in_second_byte = 0;
3002           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3003             return 0;
3004         }
3005       else if (c >= 0xA1)
3006         st->big5.in_second_byte = 1;
3007     }
3008   return CODING_CATEGORY_BIG5_MASK;
3009 }
3010
3011 /* Convert Big5 data to internal format. */
3012
3013 static void
3014 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3015                     unsigned_char_dynarr *dst, unsigned int n)
3016 {
3017   unsigned char c;
3018   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3019   unsigned int flags  = str->flags;
3020   unsigned int ch     = str->ch;
3021   eol_type_t eol_type = str->eol_type;
3022
3023   while (n--)
3024     {
3025       c = *src++;
3026       if (ch)
3027         {
3028           /* Previous character was first byte of Big5 char. */
3029           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3030             {
3031               unsigned char b1, b2, b3;
3032               DECODE_BIG5 (ch, c, b1, b2, b3);
3033               Dynarr_add (dst, b1);
3034               Dynarr_add (dst, b2);
3035               Dynarr_add (dst, b3);
3036             }
3037           else
3038             {
3039               DECODE_ADD_BINARY_CHAR (ch, dst);
3040               DECODE_ADD_BINARY_CHAR (c, dst);
3041             }
3042           ch = 0;
3043         }
3044       else
3045         {
3046           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3047           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3048             ch = c;
3049           else
3050             DECODE_ADD_BINARY_CHAR (c, dst);
3051         }
3052     label_continue_loop:;
3053     }
3054
3055   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3056
3057   str->flags = flags;
3058   str->ch    = ch;
3059 }
3060
3061 /* Convert internally-formatted data to Big5. */
3062
3063 static void
3064 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3065                     unsigned_char_dynarr *dst, unsigned int n)
3066 {
3067   unsigned char c;
3068   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3069   unsigned int flags  = str->flags;
3070   unsigned int ch     = str->ch;
3071   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3072
3073   while (n--)
3074     {
3075       c = *src++;
3076       if (c == '\n')
3077         {
3078           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3079             Dynarr_add (dst, '\r');
3080           if (eol_type != EOL_CR)
3081             Dynarr_add (dst, '\n');
3082         }
3083       else if (BYTE_ASCII_P (c))
3084         {
3085           /* ASCII. */
3086           Dynarr_add (dst, c);
3087         }
3088       else if (BUFBYTE_LEADING_BYTE_P (c))
3089         {
3090           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3091               c == LEADING_BYTE_CHINESE_BIG5_2)
3092             {
3093               /* A recognized leading byte. */
3094               ch = c;
3095               continue; /* not done with this character. */
3096             }
3097           /* otherwise just ignore this character. */
3098         }
3099       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3100                ch == LEADING_BYTE_CHINESE_BIG5_2)
3101         {
3102           /* Previous char was a recognized leading byte. */
3103           ch = (ch << 8) | c;
3104           continue; /* not done with this character. */
3105         }
3106       else if (ch)
3107         {
3108           /* Encountering second byte of a Big5 character. */
3109           unsigned char b1, b2;
3110
3111           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3112           Dynarr_add (dst, b1);
3113           Dynarr_add (dst, b2);
3114         }
3115
3116       ch = 0;
3117     }
3118
3119   str->flags = flags;
3120   str->ch    = ch;
3121 }
3122
3123
3124 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3125 Decode a Big5 character CODE of BIG5 coding-system.
3126 CODE is the character code in BIG5, a cons of two integers.
3127 Return the corresponding character.
3128 */
3129        (code))
3130 {
3131   unsigned char c1, c2, b1, b2;
3132
3133   CHECK_CONS (code);
3134   CHECK_INT (XCAR (code));
3135   CHECK_INT (XCDR (code));
3136   b1 = XINT (XCAR (code));
3137   b2 = XINT (XCDR (code));
3138   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3139       BYTE_BIG5_TWO_BYTE_2_P (b2))
3140     {
3141       int leading_byte;
3142       Lisp_Object charset;
3143       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3144       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3145       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3146     }
3147   else
3148     return Qnil;
3149 }
3150
3151 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3152 Encode the Big5 character CH to BIG5 coding-system.
3153 Return the corresponding character code in Big5.
3154 */
3155        (ch))
3156 {
3157   Lisp_Object charset;
3158   int c1, c2, b1, b2;
3159
3160   CHECK_CHAR_COERCE_INT (ch);
3161   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3162   if (EQ (charset, Vcharset_chinese_big5_1) ||
3163       EQ (charset, Vcharset_chinese_big5_2))
3164     {
3165       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3166                    b1, b2);
3167       return Fcons (make_int (b1), make_int (b2));
3168     }
3169   else
3170     return Qnil;
3171 }
3172
3173 \f
3174 /************************************************************************/
3175 /*                           UCS-4 methods                              */
3176 /*                                                                      */
3177 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3178 /*                                                                      */
3179 /************************************************************************/
3180
3181 Lisp_Object ucs_to_mule_table[65536];
3182 Lisp_Object mule_to_ucs_table;
3183
3184 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3185 Map UCS-4 code CODE to Mule character CHARACTER.
3186
3187 Return T on success, NIL on failure.
3188 */
3189        (code, character))
3190 {
3191   unsigned int c;
3192
3193   CHECK_CHAR (character);
3194   CHECK_INT (code);
3195   c = XINT (code);
3196
3197   if (c < sizeof (ucs_to_mule_table))
3198     {
3199       ucs_to_mule_table[c] = character;
3200       return Qt;
3201     }
3202   else
3203     return Qnil;
3204 }
3205
3206 static Lisp_Object
3207 ucs_to_char (unsigned long code)
3208 {
3209   if (code < sizeof (ucs_to_mule_table))
3210     {
3211       return ucs_to_mule_table[code];
3212     }
3213   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3214     {
3215       unsigned int c;
3216
3217       code -= 0xe00000;
3218       c = code % (94 * 94);
3219       return make_char
3220         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3221                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3222                      CHARSET_LEFT_TO_RIGHT),
3223                     c / 94 + 33, c % 94 + 33));
3224     }
3225   else
3226     return Qnil;
3227 }
3228
3229 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3230 Return Mule character corresponding to UCS code CODE (a positive integer).
3231 */
3232        (code))
3233 {
3234   CHECK_NATNUM (code);
3235   return ucs_to_char (XINT (code));
3236 }
3237
3238 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3239 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3240 */
3241        (character, code))
3242 {
3243   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3244           Fset_char_ucs is more restrictive on index arg, but should
3245           check code arg in a char_table method. */
3246   CHECK_CHAR (character);
3247   CHECK_NATNUM (code);
3248   return Fput_char_table (character, code, mule_to_ucs_table);
3249 }
3250
3251 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3252 Return the UCS code (a positive integer) corresponding to CHARACTER.
3253 */
3254        (character))
3255 {
3256   return Fget_char_table (character, mule_to_ucs_table);
3257 }
3258
3259 #ifdef UTF2000
3260 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3261 #else
3262 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3263    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3264    is not found, instead.
3265    #### do something more appropriate (use blob?)
3266         Danger, Will Robinson!  Data loss.  Should we signal user? */
3267 static void
3268 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3269 {
3270   Lisp_Object chr = ucs_to_char (ch);
3271
3272   if (! NILP (chr))
3273     {
3274       Bufbyte work[MAX_EMCHAR_LEN];
3275       int len;
3276
3277       ch = XCHAR (chr);
3278       len = (ch < 128) ?
3279         simple_set_charptr_emchar (work, ch) :
3280         non_ascii_set_charptr_emchar (work, ch);
3281       Dynarr_add_many (dst, work, len);
3282     }
3283   else
3284     {
3285       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3286       Dynarr_add (dst, 34 + 128);
3287       Dynarr_add (dst, 46 + 128);
3288     }
3289 }
3290 #endif
3291
3292 static unsigned long
3293 mule_char_to_ucs4 (Lisp_Object charset,
3294                    unsigned char h, unsigned char l)
3295 {
3296   Lisp_Object code
3297     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3298                        mule_to_ucs_table);
3299
3300   if (INTP (code))
3301     {
3302       return XINT (code);
3303     }
3304   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3305             (XCHARSET_CHARS (charset) == 94) )
3306     {
3307       unsigned char final = XCHARSET_FINAL (charset);
3308
3309       if ( ('@' <= final) && (final < 0x7f) )
3310         {
3311           return 0xe00000 + (final - '@') * 94 * 94
3312             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3313         }
3314       else
3315         {
3316           return '?';
3317         }
3318     }
3319   else
3320     {
3321       return '?';
3322     }
3323 }
3324
3325 static void
3326 encode_ucs4 (Lisp_Object charset,
3327              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3328 {
3329   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3330   Dynarr_add (dst,  code >> 24);
3331   Dynarr_add (dst, (code >> 16) & 255);
3332   Dynarr_add (dst, (code >>  8) & 255);
3333   Dynarr_add (dst,  code        & 255);
3334 }
3335
3336 static int
3337 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3338                     unsigned int n)
3339 {
3340   while (n--)
3341     {
3342       int c = *src++;
3343       switch (st->ucs4.in_byte)
3344         {
3345         case 0:
3346           if (c >= 128)
3347             return 0;
3348           else
3349             st->ucs4.in_byte++;
3350           break;
3351         case 3:
3352           st->ucs4.in_byte = 0;
3353           break;
3354         default:
3355           st->ucs4.in_byte++;
3356         }
3357     }
3358   return CODING_CATEGORY_UCS4_MASK;
3359 }
3360
3361 static void
3362 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3363                     unsigned_char_dynarr *dst, unsigned int n)
3364 {
3365   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3366   unsigned int flags = str->flags;
3367   unsigned int ch    = str->ch;
3368
3369   while (n--)
3370     {
3371       unsigned char c = *src++;
3372       switch (flags)
3373         {
3374         case 0:
3375           ch = c;
3376           flags = 3;
3377           break;
3378         case 1:
3379           decode_ucs4 ( ( ch << 8 ) | c, dst);
3380           ch = 0;
3381           flags = 0;
3382           break;
3383         default:
3384           ch = ( ch << 8 ) | c;
3385           flags--;
3386         }
3387     }
3388   if (flags & CODING_STATE_END)
3389     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3390
3391   str->flags = flags;
3392   str->ch    = ch;
3393 }
3394
3395 static void
3396 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3397                     unsigned_char_dynarr *dst, unsigned int n)
3398 {
3399   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3400   unsigned int flags = str->flags;
3401   unsigned int ch = str->ch;
3402   unsigned char char_boundary = str->iso2022.current_char_boundary;
3403   Lisp_Object charset = str->iso2022.current_charset;
3404
3405 #ifdef ENABLE_COMPOSITE_CHARS
3406   /* flags for handling composite chars.  We do a little switcharoo
3407      on the source while we're outputting the composite char. */
3408   unsigned int saved_n = 0;
3409   CONST unsigned char *saved_src = NULL;
3410   int in_composite = 0;
3411
3412  back_to_square_n:
3413 #endif
3414
3415   while (n--)
3416     {
3417       unsigned char c = *src++;
3418
3419       if (BYTE_ASCII_P (c))
3420         {               /* Processing ASCII character */
3421           ch = 0;
3422           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3423           char_boundary = 1;
3424         }
3425       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3426         { /* Processing Leading Byte */
3427           ch = 0;
3428           charset = CHARSET_BY_LEADING_BYTE (c);
3429           if (LEADING_BYTE_PREFIX_P(c))
3430             ch = c;
3431           char_boundary = 0;
3432         }
3433       else
3434         {                       /* Processing Non-ASCII character */
3435           char_boundary = 1;
3436           if (EQ (charset, Vcharset_control_1))
3437             {
3438               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3439             }
3440           else
3441             {
3442               switch (XCHARSET_REP_BYTES (charset))
3443                 {
3444                 case 2:
3445                   encode_ucs4 (charset, c, 0, dst);
3446                   break;
3447                 case 3:
3448                   if (XCHARSET_PRIVATE_P (charset))
3449                     {
3450                       encode_ucs4 (charset, c, 0, dst);
3451                       ch = 0;
3452                     }
3453                   else if (ch)
3454                     {
3455 #ifdef ENABLE_COMPOSITE_CHARS
3456                       if (EQ (charset, Vcharset_composite))
3457                         {
3458                           if (in_composite)
3459                             {
3460                               /* #### Bother! We don't know how to
3461                                  handle this yet. */
3462                               Dynarr_add (dst, 0);
3463                               Dynarr_add (dst, 0);
3464                               Dynarr_add (dst, 0);
3465                               Dynarr_add (dst, '~');
3466                             }
3467                           else
3468                             {
3469                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3470                                                        ch & 0x7F, c & 0x7F);
3471                               Lisp_Object lstr = composite_char_string (emch);
3472                               saved_n = n;
3473                               saved_src = src;
3474                               in_composite = 1;
3475                               src = XSTRING_DATA   (lstr);
3476                               n   = XSTRING_LENGTH (lstr);
3477                             }
3478                         }
3479                       else
3480 #endif /* ENABLE_COMPOSITE_CHARS */
3481                         {
3482                           encode_ucs4(charset, ch, c, dst);
3483                         }
3484                       ch = 0;
3485                     }
3486                   else
3487                     {
3488                       ch = c;
3489                       char_boundary = 0;
3490                     }
3491                   break;
3492                 case 4:
3493                   if (ch)
3494                     {
3495                       encode_ucs4 (charset, ch, c, dst);
3496                       ch = 0;
3497                     }
3498                   else
3499                     {
3500                       ch = c;
3501                       char_boundary = 0;
3502                     }
3503                   break;
3504                 default:
3505                   abort ();
3506                 }
3507             }
3508         }
3509     }
3510
3511 #ifdef ENABLE_COMPOSITE_CHARS
3512   if (in_composite)
3513     {
3514       n = saved_n;
3515       src = saved_src;
3516       in_composite = 0;
3517       goto back_to_square_n; /* Wheeeeeeeee ..... */
3518     }
3519 #endif /* ENABLE_COMPOSITE_CHARS */
3520
3521   str->flags = flags;
3522   str->ch = ch;
3523   str->iso2022.current_char_boundary = char_boundary;
3524   str->iso2022.current_charset = charset;
3525
3526   /* Verbum caro factum est! */
3527 }
3528
3529 \f
3530 /************************************************************************/
3531 /*                           UTF-8 methods                              */
3532 /************************************************************************/
3533
3534 static int
3535 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3536                     unsigned int n)
3537 {
3538   while (n--)
3539     {
3540       unsigned char c = *src++;
3541       switch (st->utf8.in_byte)
3542         {
3543         case 0:
3544           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3545             return 0;
3546           else if (c >= 0xfc)
3547             st->utf8.in_byte = 5;
3548           else if (c >= 0xf8)
3549             st->utf8.in_byte = 4;
3550           else if (c >= 0xf0)
3551             st->utf8.in_byte = 3;
3552           else if (c >= 0xe0)
3553             st->utf8.in_byte = 2;
3554           else if (c >= 0xc0)
3555             st->utf8.in_byte = 1;
3556           else if (c >= 0x80)
3557             return 0;
3558           break;
3559         default:
3560           if ((c & 0xc0) != 0x80)
3561             return 0;
3562           else
3563             st->utf8.in_byte--;
3564         }
3565     }
3566   return CODING_CATEGORY_UTF8_MASK;
3567 }
3568
3569 static void
3570 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3571                     unsigned_char_dynarr *dst, unsigned int n)
3572 {
3573   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3574   unsigned int flags  = str->flags;
3575   unsigned int ch     = str->ch;
3576   eol_type_t eol_type = str->eol_type;
3577
3578   while (n--)
3579     {
3580       unsigned char c = *src++;
3581       switch (flags)
3582         {
3583         case 0:
3584           if ( c >= 0xfc )
3585             {
3586               ch = c & 0x01;
3587               flags = 5;
3588             }
3589           else if ( c >= 0xf8 )
3590             {
3591               ch = c & 0x03;
3592               flags = 4;
3593             }
3594           else if ( c >= 0xf0 )
3595             {
3596               ch = c & 0x07;
3597               flags = 3;
3598             }
3599           else if ( c >= 0xe0 )
3600             {
3601               ch = c & 0x0f;
3602               flags = 2;
3603             }
3604           else if ( c >= 0xc0 )
3605             {
3606               ch = c & 0x1f;
3607               flags = 1;
3608             }
3609           else
3610             {
3611               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3612               decode_ucs4 (c, dst);
3613             }
3614           break;
3615         case 1:
3616           ch = ( ch << 6 ) | ( c & 0x3f );
3617           decode_ucs4 (ch, dst);
3618           ch = 0;
3619           flags = 0;
3620           break;
3621         default:
3622           ch = ( ch << 6 ) | ( c & 0x3f );
3623           flags--;
3624         }
3625     label_continue_loop:;
3626     }
3627
3628   if (flags & CODING_STATE_END)
3629     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3630
3631   str->flags = flags;
3632   str->ch    = ch;
3633 }
3634
3635 static void
3636 encode_utf8 (Lisp_Object charset,
3637              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3638 {
3639   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3640   if ( code <= 0x7f )
3641     {
3642       Dynarr_add (dst, code);
3643     }
3644   else if ( code <= 0x7ff )
3645     {
3646       Dynarr_add (dst, (code >> 6) | 0xc0);
3647       Dynarr_add (dst, (code & 0x3f) | 0x80);
3648     }
3649   else if ( code <= 0xffff )
3650     {
3651       Dynarr_add (dst,  (code >> 12) | 0xe0);
3652       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3653       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3654     }
3655   else if ( code <= 0x1fffff )
3656     {
3657       Dynarr_add (dst,  (code >> 18) | 0xf0);
3658       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3659       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3660       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3661     }
3662   else if ( code <= 0x3ffffff )
3663     {
3664       Dynarr_add (dst,  (code >> 24) | 0xf8);
3665       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3666       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3667       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3668       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3669     }
3670   else
3671     {
3672       Dynarr_add (dst,  (code >> 30) | 0xfc);
3673       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3674       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3675       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3676       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3677       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3678     }
3679 }
3680
3681 static void
3682 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3683                     unsigned_char_dynarr *dst, unsigned int n)
3684 {
3685   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3686   unsigned int flags  = str->flags;
3687   unsigned int ch     = str->ch;
3688   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3689   unsigned char char_boundary = str->iso2022.current_char_boundary;
3690   Lisp_Object charset = str->iso2022.current_charset;
3691
3692 #ifdef ENABLE_COMPOSITE_CHARS
3693   /* flags for handling composite chars.  We do a little switcharoo
3694      on the source while we're outputting the composite char. */
3695   unsigned int saved_n = 0;
3696   CONST unsigned char *saved_src = NULL;
3697   int in_composite = 0;
3698
3699  back_to_square_n:
3700 #endif /* ENABLE_COMPOSITE_CHARS */
3701
3702   while (n--)
3703     {
3704       unsigned char c = *src++;
3705
3706       if (BYTE_ASCII_P (c))
3707         {               /* Processing ASCII character */
3708           ch = 0;
3709           if (c == '\n')
3710             {
3711               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3712                 Dynarr_add (dst, '\r');
3713               if (eol_type != EOL_CR)
3714                 Dynarr_add (dst, c);
3715             }
3716           else
3717             encode_utf8 (Vcharset_ascii, c, 0, dst);
3718           char_boundary = 1;
3719         }
3720       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3721         { /* Processing Leading Byte */
3722           ch = 0;
3723           charset = CHARSET_BY_LEADING_BYTE (c);
3724           if (LEADING_BYTE_PREFIX_P(c))
3725             ch = c;
3726           char_boundary = 0;
3727         }
3728       else
3729         {                       /* Processing Non-ASCII character */
3730           char_boundary = 1;
3731           if (EQ (charset, Vcharset_control_1))
3732             {
3733               encode_utf8 (Vcharset_control_1, c, 0, dst);
3734             }
3735           else
3736             {
3737               switch (XCHARSET_REP_BYTES (charset))
3738                 {
3739                 case 2:
3740                   encode_utf8 (charset, c, 0, dst);
3741                   break;
3742                 case 3:
3743                   if (XCHARSET_PRIVATE_P (charset))
3744                     {
3745                       encode_utf8 (charset, c, 0, dst);
3746                       ch = 0;
3747                     }
3748                   else if (ch)
3749                     {
3750 #ifdef ENABLE_COMPOSITE_CHARS
3751                       if (EQ (charset, Vcharset_composite))
3752                         {
3753                           if (in_composite)
3754                             {
3755                               /* #### Bother! We don't know how to
3756                                  handle this yet. */
3757                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
3758                             }
3759                           else
3760                             {
3761                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3762                                                        ch & 0x7F, c & 0x7F);
3763                               Lisp_Object lstr = composite_char_string (emch);
3764                               saved_n = n;
3765                               saved_src = src;
3766                               in_composite = 1;
3767                               src = XSTRING_DATA   (lstr);
3768                               n   = XSTRING_LENGTH (lstr);
3769                             }
3770                         }
3771                       else
3772 #endif /* ENABLE_COMPOSITE_CHARS */
3773                         {
3774                           encode_utf8 (charset, ch, c, dst);
3775                         }
3776                       ch = 0;
3777                     }
3778                   else
3779                     {
3780                       ch = c;
3781                       char_boundary = 0;
3782                     }
3783                   break;
3784                 case 4:
3785                   if (ch)
3786                     {
3787                       encode_utf8 (charset, ch, c, dst);
3788                       ch = 0;
3789                     }
3790                   else
3791                     {
3792                       ch = c;
3793                       char_boundary = 0;
3794                     }
3795                   break;
3796                 default:
3797                   abort ();
3798                 }
3799             }
3800         }
3801     }
3802
3803 #ifdef ENABLE_COMPOSITE_CHARS
3804   if (in_composite)
3805     {
3806       n = saved_n;
3807       src = saved_src;
3808       in_composite = 0;
3809       goto back_to_square_n; /* Wheeeeeeeee ..... */
3810     }
3811 #endif
3812
3813   str->flags = flags;
3814   str->ch    = ch;
3815   str->iso2022.current_char_boundary = char_boundary;
3816   str->iso2022.current_charset = charset;
3817
3818   /* Verbum caro factum est! */
3819 }
3820
3821 \f
3822 /************************************************************************/
3823 /*                           ISO2022 methods                            */
3824 /************************************************************************/
3825
3826 /* The following note describes the coding system ISO2022 briefly.
3827    Since the intention of this note is to help understand the
3828    functions in this file, some parts are NOT ACCURATE or OVERLY
3829    SIMPLIFIED.  For thorough understanding, please refer to the
3830    original document of ISO2022.
3831
3832    ISO2022 provides many mechanisms to encode several character sets
3833    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3834    is encoded using bytes less than 128.  This may make the encoded
3835    text a little bit longer, but the text passes more easily through
3836    several gateways, some of which strip off MSB (Most Signigant Bit).
3837
3838    There are two kinds of character sets: control character set and
3839    graphic character set.  The former contains control characters such
3840    as `newline' and `escape' to provide control functions (control
3841    functions are also provided by escape sequences).  The latter
3842    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3843    two control character sets and many graphic character sets.
3844
3845    Graphic character sets are classified into one of the following
3846    four classes, according to the number of bytes (DIMENSION) and
3847    number of characters in one dimension (CHARS) of the set:
3848    - DIMENSION1_CHARS94
3849    - DIMENSION1_CHARS96
3850    - DIMENSION2_CHARS94
3851    - DIMENSION2_CHARS96
3852
3853    In addition, each character set is assigned an identification tag,
3854    unique for each set, called "final character" (denoted as <F>
3855    hereafter).  The <F> of each character set is decided by ECMA(*)
3856    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3857    (0x30..0x3F are for private use only).
3858
3859    Note (*): ECMA = European Computer Manufacturers Association
3860
3861    Here are examples of graphic character set [NAME(<F>)]:
3862         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3863         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3864         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3865         o DIMENSION2_CHARS96 -- none for the moment
3866
3867    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3868         C0 [0x00..0x1F] -- control character plane 0
3869         GL [0x20..0x7F] -- graphic character plane 0
3870         C1 [0x80..0x9F] -- control character plane 1
3871         GR [0xA0..0xFF] -- graphic character plane 1
3872
3873    A control character set is directly designated and invoked to C0 or
3874    C1 by an escape sequence.  The most common case is that:
3875    - ISO646's  control character set is designated/invoked to C0, and
3876    - ISO6429's control character set is designated/invoked to C1,
3877    and usually these designations/invocations are omitted in encoded
3878    text.  In a 7-bit environment, only C0 can be used, and a control
3879    character for C1 is encoded by an appropriate escape sequence to
3880    fit into the environment.  All control characters for C1 are
3881    defined to have corresponding escape sequences.
3882
3883    A graphic character set is at first designated to one of four
3884    graphic registers (G0 through G3), then these graphic registers are
3885    invoked to GL or GR.  These designations and invocations can be
3886    done independently.  The most common case is that G0 is invoked to
3887    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3888    these invocations and designations are omitted in encoded text.
3889    In a 7-bit environment, only GL can be used.
3890
3891    When a graphic character set of CHARS94 is invoked to GL, codes
3892    0x20 and 0x7F of the GL area work as control characters SPACE and
3893    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3894    be used.
3895
3896    There are two ways of invocation: locking-shift and single-shift.
3897    With locking-shift, the invocation lasts until the next different
3898    invocation, whereas with single-shift, the invocation affects the
3899    following character only and doesn't affect the locking-shift
3900    state.  Invocations are done by the following control characters or
3901    escape sequences:
3902
3903    ----------------------------------------------------------------------
3904    abbrev  function                  cntrl escape seq   description
3905    ----------------------------------------------------------------------
3906    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3907    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3908    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3909    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3910    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3911    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3912    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3913    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3914    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3915    ----------------------------------------------------------------------
3916    (*) These are not used by any known coding system.
3917
3918    Control characters for these functions are defined by macros
3919    ISO_CODE_XXX in `coding.h'.
3920
3921    Designations are done by the following escape sequences:
3922    ----------------------------------------------------------------------
3923    escape sequence      description
3924    ----------------------------------------------------------------------
3925    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3926    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3927    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3928    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3929    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3930    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3931    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3932    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3933    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3934    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3935    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3936    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3937    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3938    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3939    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3940    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3941    ----------------------------------------------------------------------
3942
3943    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3944    of dimension 1, chars 94, and final character <F>, etc...
3945
3946    Note (*): Although these designations are not allowed in ISO2022,
3947    Emacs accepts them on decoding, and produces them on encoding
3948    CHARS96 character sets in a coding system which is characterized as
3949    7-bit environment, non-locking-shift, and non-single-shift.
3950
3951    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3952    '(' can be omitted.  We refer to this as "short-form" hereafter.
3953
3954    Now you may notice that there are a lot of ways for encoding the
3955    same multilingual text in ISO2022.  Actually, there exist many
3956    coding systems such as Compound Text (used in X11's inter client
3957    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3958    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3959    localized platforms), and all of these are variants of ISO2022.
3960
3961    In addition to the above, Emacs handles two more kinds of escape
3962    sequences: ISO6429's direction specification and Emacs' private
3963    sequence for specifying character composition.
3964
3965    ISO6429's direction specification takes the following form:
3966         o CSI ']'      -- end of the current direction
3967         o CSI '0' ']'  -- end of the current direction
3968         o CSI '1' ']'  -- start of left-to-right text
3969         o CSI '2' ']'  -- start of right-to-left text
3970    The control character CSI (0x9B: control sequence introducer) is
3971    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3972
3973    Character composition specification takes the following form:
3974         o ESC '0' -- start character composition
3975         o ESC '1' -- end character composition
3976    Since these are not standard escape sequences of any ISO standard,
3977    their use with these meanings is restricted to Emacs only.  */
3978
3979 static void
3980 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3981 {
3982   int i;
3983
3984   for (i = 0; i < 4; i++)
3985     {
3986       if (!NILP (coding_system))
3987         iso->charset[i] =
3988           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3989       else
3990         iso->charset[i] = Qt;
3991       iso->invalid_designated[i] = 0;
3992     }
3993   iso->esc = ISO_ESC_NOTHING;
3994   iso->esc_bytes_index = 0;
3995   iso->register_left = 0;
3996   iso->register_right = 1;
3997   iso->switched_dir_and_no_valid_charset_yet = 0;
3998   iso->invalid_switch_dir = 0;
3999   iso->output_direction_sequence = 0;
4000   iso->output_literally = 0;
4001 #ifdef ENABLE_COMPOSITE_CHARS
4002   if (iso->composite_chars)
4003     Dynarr_reset (iso->composite_chars);
4004 #endif
4005 }
4006
4007 static int
4008 fit_to_be_escape_quoted (unsigned char c)
4009 {
4010   switch (c)
4011     {
4012     case ISO_CODE_ESC:
4013     case ISO_CODE_CSI:
4014     case ISO_CODE_SS2:
4015     case ISO_CODE_SS3:
4016     case ISO_CODE_SO:
4017     case ISO_CODE_SI:
4018       return 1;
4019
4020     default:
4021       return 0;
4022     }
4023 }
4024
4025 /* Parse one byte of an ISO2022 escape sequence.
4026    If the result is an invalid escape sequence, return 0 and
4027    do not change anything in STR.  Otherwise, if the result is
4028    an incomplete escape sequence, update ISO2022.ESC and
4029    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4030    all the state variables (but not ISO2022.ESC_BYTES) and
4031    return 1.
4032
4033    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4034    or invocation of an invalid character set and treat that as
4035    an unrecognized escape sequence. */
4036
4037 static int
4038 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4039                    unsigned char c, unsigned int *flags,
4040                    int check_invalid_charsets)
4041 {
4042   /* (1) If we're at the end of a designation sequence, CS is the
4043      charset being designated and REG is the register to designate
4044      it to.
4045
4046      (2) If we're at the end of a locking-shift sequence, REG is
4047      the register to invoke and HALF (0 == left, 1 == right) is
4048      the half to invoke it into.
4049
4050      (3) If we're at the end of a single-shift sequence, REG is
4051      the register to invoke. */
4052   Lisp_Object cs = Qnil;
4053   int reg, half;
4054
4055   /* NOTE: This code does goto's all over the fucking place.
4056      The reason for this is that we're basically implementing
4057      a state machine here, and hierarchical languages like C
4058      don't really provide a clean way of doing this. */
4059
4060   if (! (*flags & CODING_STATE_ESCAPE))
4061     /* At beginning of escape sequence; we need to reset our
4062        escape-state variables. */
4063     iso->esc = ISO_ESC_NOTHING;
4064
4065   iso->output_literally = 0;
4066   iso->output_direction_sequence = 0;
4067
4068   switch (iso->esc)
4069     {
4070     case ISO_ESC_NOTHING:
4071       iso->esc_bytes_index = 0;
4072       switch (c)
4073         {
4074         case ISO_CODE_ESC:      /* Start escape sequence */
4075           *flags |= CODING_STATE_ESCAPE;
4076           iso->esc = ISO_ESC;
4077           goto not_done;
4078
4079         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4080           *flags |= CODING_STATE_ESCAPE;
4081           iso->esc = ISO_ESC_5_11;
4082           goto not_done;
4083
4084         case ISO_CODE_SO:       /* locking shift 1 */
4085           reg = 1; half = 0;
4086           goto locking_shift;
4087         case ISO_CODE_SI:       /* locking shift 0 */
4088           reg = 0; half = 0;
4089           goto locking_shift;
4090
4091         case ISO_CODE_SS2:      /* single shift */
4092           reg = 2;
4093           goto single_shift;
4094         case ISO_CODE_SS3:      /* single shift */
4095           reg = 3;
4096           goto single_shift;
4097
4098         default:                        /* Other control characters */
4099           return 0;
4100         }
4101
4102     case ISO_ESC:
4103       switch (c)
4104         {
4105           /**** single shift ****/
4106
4107         case 'N':       /* single shift 2 */
4108           reg = 2;
4109           goto single_shift;
4110         case 'O':       /* single shift 3 */
4111           reg = 3;
4112           goto single_shift;
4113
4114           /**** locking shift ****/
4115
4116         case '~':       /* locking shift 1 right */
4117           reg = 1; half = 1;
4118           goto locking_shift;
4119         case 'n':       /* locking shift 2 */
4120           reg = 2; half = 0;
4121           goto locking_shift;
4122         case '}':       /* locking shift 2 right */
4123           reg = 2; half = 1;
4124           goto locking_shift;
4125         case 'o':       /* locking shift 3 */
4126           reg = 3; half = 0;
4127           goto locking_shift;
4128         case '|':       /* locking shift 3 right */
4129           reg = 3; half = 1;
4130           goto locking_shift;
4131
4132 #ifdef ENABLE_COMPOSITE_CHARS
4133           /**** composite ****/
4134
4135         case '0':
4136           iso->esc = ISO_ESC_START_COMPOSITE;
4137           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4138             CODING_STATE_COMPOSITE;
4139           return 1;
4140
4141         case '1':
4142           iso->esc = ISO_ESC_END_COMPOSITE;
4143           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4144             ~CODING_STATE_COMPOSITE;
4145           return 1;
4146 #endif /* ENABLE_COMPOSITE_CHARS */
4147
4148           /**** directionality ****/
4149
4150         case '[':
4151           iso->esc = ISO_ESC_5_11;
4152           goto not_done;
4153
4154           /**** designation ****/
4155
4156         case '$':       /* multibyte charset prefix */
4157           iso->esc = ISO_ESC_2_4;
4158           goto not_done;
4159
4160         default:
4161           if (0x28 <= c && c <= 0x2F)
4162             {
4163               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4164               goto not_done;
4165             }
4166
4167           /* This function is called with CODESYS equal to nil when
4168              doing coding-system detection. */
4169           if (!NILP (codesys)
4170               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4171               && fit_to_be_escape_quoted (c))
4172             {
4173               iso->esc = ISO_ESC_LITERAL;
4174               *flags &= CODING_STATE_ISO2022_LOCK;
4175               return 1;
4176             }
4177
4178           /* bzzzt! */
4179           return 0;
4180         }
4181
4182
4183
4184       /**** directionality ****/
4185
4186     case ISO_ESC_5_11:          /* ISO6429 direction control */
4187       if (c == ']')
4188         {
4189           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4190           goto directionality;
4191         }
4192       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4193       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4194       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4195       else               return 0;
4196       goto not_done;
4197
4198     case ISO_ESC_5_11_0:
4199       if (c == ']')
4200         {
4201           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4202           goto directionality;
4203         }
4204       return 0;
4205
4206     case ISO_ESC_5_11_1:
4207       if (c == ']')
4208         {
4209           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4210           goto directionality;
4211         }
4212       return 0;
4213
4214     case ISO_ESC_5_11_2:
4215       if (c == ']')
4216         {
4217           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4218           goto directionality;
4219         }
4220       return 0;
4221
4222     directionality:
4223       iso->esc = ISO_ESC_DIRECTIONALITY;
4224       /* Various junk here to attempt to preserve the direction sequences
4225          literally in the text if they would otherwise be swallowed due
4226          to invalid designations that don't show up as actual charset
4227          changes in the text. */
4228       if (iso->invalid_switch_dir)
4229         {
4230           /* We already inserted a direction switch literally into the
4231              text.  We assume (#### this may not be right) that the
4232              next direction switch is the one going the other way,
4233              and we need to output that literally as well. */
4234           iso->output_literally = 1;
4235           iso->invalid_switch_dir = 0;
4236         }
4237       else
4238         {
4239           int jj;
4240
4241           /* If we are in the thrall of an invalid designation,
4242            then stick the directionality sequence literally into the
4243            output stream so it ends up in the original text again. */
4244           for (jj = 0; jj < 4; jj++)
4245             if (iso->invalid_designated[jj])
4246               break;
4247           if (jj < 4)
4248             {
4249               iso->output_literally = 1;
4250               iso->invalid_switch_dir = 1;
4251             }
4252           else
4253             /* Indicate that we haven't yet seen a valid designation,
4254                so that if a switch-dir is directly followed by an
4255                invalid designation, both get inserted literally. */
4256             iso->switched_dir_and_no_valid_charset_yet = 1;
4257         }
4258       return 1;
4259
4260
4261       /**** designation ****/
4262
4263     case ISO_ESC_2_4:
4264       if (0x28 <= c && c <= 0x2F)
4265         {
4266           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4267           goto not_done;
4268         }
4269       if (0x40 <= c && c <= 0x42)
4270         {
4271           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4272                                       *flags & CODING_STATE_R2L ?
4273                                       CHARSET_RIGHT_TO_LEFT :
4274                                       CHARSET_LEFT_TO_RIGHT);
4275           reg = 0;
4276           goto designated;
4277         }
4278       return 0;
4279
4280     default:
4281       {
4282         int type =-1;
4283
4284         if (c < '0' || c > '~')
4285           return 0; /* bad final byte */
4286
4287         if (iso->esc >= ISO_ESC_2_8 &&
4288             iso->esc <= ISO_ESC_2_15)
4289           {
4290             type = ((iso->esc >= ISO_ESC_2_12) ?
4291                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4292             reg = (iso->esc - ISO_ESC_2_8) & 3;
4293           }
4294         else if (iso->esc >= ISO_ESC_2_4_8 &&
4295                  iso->esc <= ISO_ESC_2_4_15)
4296           {
4297             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4298                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4299             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4300           }
4301         else
4302           {
4303             /* Can this ever be reached? -slb */
4304             abort();
4305           }
4306
4307         cs = CHARSET_BY_ATTRIBUTES (type, c,
4308                                     *flags & CODING_STATE_R2L ?
4309                                     CHARSET_RIGHT_TO_LEFT :
4310                                     CHARSET_LEFT_TO_RIGHT);
4311         goto designated;
4312       }
4313     }
4314
4315  not_done:
4316   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4317   return -1;
4318
4319  single_shift:
4320   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4321     /* can't invoke something that ain't there. */
4322     return 0;
4323   iso->esc = ISO_ESC_SINGLE_SHIFT;
4324   *flags &= CODING_STATE_ISO2022_LOCK;
4325   if (reg == 2)
4326     *flags |= CODING_STATE_SS2;
4327   else
4328     *flags |= CODING_STATE_SS3;
4329   return 1;
4330
4331  locking_shift:
4332   if (check_invalid_charsets &&
4333       !CHARSETP (iso->charset[reg]))
4334     /* can't invoke something that ain't there. */
4335     return 0;
4336   if (half)
4337     iso->register_right = reg;
4338   else
4339     iso->register_left = reg;
4340   *flags &= CODING_STATE_ISO2022_LOCK;
4341   iso->esc = ISO_ESC_LOCKING_SHIFT;
4342   return 1;
4343
4344  designated:
4345   if (NILP (cs) && check_invalid_charsets)
4346     {
4347       iso->invalid_designated[reg] = 1;
4348       iso->charset[reg] = Vcharset_ascii;
4349       iso->esc = ISO_ESC_DESIGNATE;
4350       *flags &= CODING_STATE_ISO2022_LOCK;
4351       iso->output_literally = 1;
4352       if (iso->switched_dir_and_no_valid_charset_yet)
4353         {
4354           /* We encountered a switch-direction followed by an
4355              invalid designation.  Ensure that the switch-direction
4356              gets outputted; otherwise it will probably get eaten
4357              when the text is written out again. */
4358           iso->switched_dir_and_no_valid_charset_yet = 0;
4359           iso->output_direction_sequence = 1;
4360           /* And make sure that the switch-dir going the other
4361              way gets outputted, as well. */
4362           iso->invalid_switch_dir = 1;
4363         }
4364       return 1;
4365     }
4366   /* This function is called with CODESYS equal to nil when
4367      doing coding-system detection. */
4368   if (!NILP (codesys))
4369     {
4370       charset_conversion_spec_dynarr *dyn =
4371         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4372
4373       if (dyn)
4374         {
4375           int i;
4376
4377           for (i = 0; i < Dynarr_length (dyn); i++)
4378             {
4379               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4380               if (EQ (cs, spec->from_charset))
4381                 cs = spec->to_charset;
4382             }
4383         }
4384     }
4385
4386   iso->charset[reg] = cs;
4387   iso->esc = ISO_ESC_DESIGNATE;
4388   *flags &= CODING_STATE_ISO2022_LOCK;
4389   if (iso->invalid_designated[reg])
4390     {
4391       iso->invalid_designated[reg] = 0;
4392       iso->output_literally = 1;
4393     }
4394   if (iso->switched_dir_and_no_valid_charset_yet)
4395     iso->switched_dir_and_no_valid_charset_yet = 0;
4396   return 1;
4397 }
4398
4399 static int
4400 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4401                        unsigned int n)
4402 {
4403   int mask;
4404
4405   /* #### There are serious deficiencies in the recognition mechanism
4406      here.  This needs to be much smarter if it's going to cut it.
4407      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4408      it should be detected as Latin-1.
4409      All the ISO2022 stuff in this file should be synced up with the
4410      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4411      Perhaps we should wait till R2L works in FSF Emacs? */
4412
4413   if (!st->iso2022.initted)
4414     {
4415       reset_iso2022 (Qnil, &st->iso2022.iso);
4416       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4417                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4418                           CODING_CATEGORY_ISO_8_1_MASK |
4419                           CODING_CATEGORY_ISO_8_2_MASK |
4420                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4421       st->iso2022.flags = 0;
4422       st->iso2022.high_byte_count = 0;
4423       st->iso2022.saw_single_shift = 0;
4424       st->iso2022.initted = 1;
4425     }
4426
4427   mask = st->iso2022.mask;
4428
4429   while (n--)
4430     {
4431       int c = *src++;
4432       if (c >= 0xA0)
4433         {
4434           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4435           st->iso2022.high_byte_count++;
4436         }
4437       else
4438         {
4439           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4440             {
4441               if (st->iso2022.high_byte_count & 1)
4442                 /* odd number of high bytes; assume not iso-8-2 */
4443                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4444             }
4445           st->iso2022.high_byte_count = 0;
4446           st->iso2022.saw_single_shift = 0;
4447           if (c > 0x80)
4448             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4449         }
4450       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4451           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4452         { /* control chars */
4453           switch (c)
4454             {
4455               /* Allow and ignore control characters that you might
4456                  reasonably see in a text file */
4457             case '\r':
4458             case '\n':
4459             case '\t':
4460             case  7: /* bell */
4461             case  8: /* backspace */
4462             case 11: /* vertical tab */
4463             case 12: /* form feed */
4464             case 26: /* MS-DOS C-z junk */
4465             case 31: /* '^_' -- for info */
4466               goto label_continue_loop;
4467
4468             default:
4469               break;
4470             }
4471         }
4472
4473       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4474           || BYTE_C1_P (c))
4475         {
4476           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4477                                  &st->iso2022.flags, 0))
4478             {
4479               switch (st->iso2022.iso.esc)
4480                 {
4481                 case ISO_ESC_DESIGNATE:
4482                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4483                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4484                   break;
4485                 case ISO_ESC_LOCKING_SHIFT:
4486                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4487                   goto ran_out_of_chars;
4488                 case ISO_ESC_SINGLE_SHIFT:
4489                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4490                   st->iso2022.saw_single_shift = 1;
4491                   break;
4492                 default:
4493                   break;
4494                 }
4495             }
4496           else
4497             {
4498               mask = 0;
4499               goto ran_out_of_chars;
4500             }
4501         }
4502     label_continue_loop:;
4503     }
4504
4505  ran_out_of_chars:
4506
4507   return mask;
4508 }
4509
4510 static int
4511 postprocess_iso2022_mask (int mask)
4512 {
4513   /* #### kind of cheesy */
4514   /* If seven-bit ISO is allowed, then assume that the encoding is
4515      entirely seven-bit and turn off the eight-bit ones. */
4516   if (mask & CODING_CATEGORY_ISO_7_MASK)
4517     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4518                CODING_CATEGORY_ISO_8_1_MASK |
4519                CODING_CATEGORY_ISO_8_2_MASK);
4520   return mask;
4521 }
4522
4523 /* If FLAGS is a null pointer or specifies right-to-left motion,
4524    output a switch-dir-to-left-to-right sequence to DST.
4525    Also update FLAGS if it is not a null pointer.
4526    If INTERNAL_P is set, we are outputting in internal format and
4527    need to handle the CSI differently. */
4528
4529 static void
4530 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4531                                  unsigned_char_dynarr *dst,
4532                                  unsigned int *flags,
4533                                  int internal_p)
4534 {
4535   if (!flags || (*flags & CODING_STATE_R2L))
4536     {
4537       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4538         {
4539           Dynarr_add (dst, ISO_CODE_ESC);
4540           Dynarr_add (dst, '[');
4541         }
4542       else if (internal_p)
4543         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4544       else
4545         Dynarr_add (dst, ISO_CODE_CSI);
4546       Dynarr_add (dst, '0');
4547       Dynarr_add (dst, ']');
4548       if (flags)
4549         *flags &= ~CODING_STATE_R2L;
4550     }
4551 }
4552
4553 /* If FLAGS is a null pointer or specifies a direction different from
4554    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4555    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4556    sequence to DST.  Also update FLAGS if it is not a null pointer.
4557    If INTERNAL_P is set, we are outputting in internal format and
4558    need to handle the CSI differently. */
4559
4560 static void
4561 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4562                           unsigned_char_dynarr *dst, unsigned int *flags,
4563                           int internal_p)
4564 {
4565   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4566       direction == CHARSET_LEFT_TO_RIGHT)
4567     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4568   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4569            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4570            direction == CHARSET_RIGHT_TO_LEFT)
4571     {
4572       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4573         {
4574           Dynarr_add (dst, ISO_CODE_ESC);
4575           Dynarr_add (dst, '[');
4576         }
4577       else if (internal_p)
4578         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4579       else
4580         Dynarr_add (dst, ISO_CODE_CSI);
4581       Dynarr_add (dst, '2');
4582       Dynarr_add (dst, ']');
4583       if (flags)
4584         *flags |= CODING_STATE_R2L;
4585     }
4586 }
4587
4588 /* Convert ISO2022-format data to internal format. */
4589
4590 static void
4591 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4592                        unsigned_char_dynarr *dst, unsigned int n)
4593 {
4594   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4595   unsigned int flags  = str->flags;
4596   unsigned int ch     = str->ch;
4597   eol_type_t eol_type = str->eol_type;
4598 #ifdef ENABLE_COMPOSITE_CHARS
4599   unsigned_char_dynarr *real_dst = dst;
4600 #endif
4601   Lisp_Object coding_system;
4602
4603   XSETCODING_SYSTEM (coding_system, str->codesys);
4604
4605 #ifdef ENABLE_COMPOSITE_CHARS
4606   if (flags & CODING_STATE_COMPOSITE)
4607     dst = str->iso2022.composite_chars;
4608 #endif /* ENABLE_COMPOSITE_CHARS */
4609
4610   while (n--)
4611     {
4612       unsigned char c = *src++;
4613       if (flags & CODING_STATE_ESCAPE)
4614         {       /* Within ESC sequence */
4615           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4616                                           c, &flags, 1);
4617
4618           if (retval)
4619             {
4620               switch (str->iso2022.esc)
4621                 {
4622 #ifdef ENABLE_COMPOSITE_CHARS
4623                 case ISO_ESC_START_COMPOSITE:
4624                   if (str->iso2022.composite_chars)
4625                     Dynarr_reset (str->iso2022.composite_chars);
4626                   else
4627                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4628                   dst = str->iso2022.composite_chars;
4629                   break;
4630                 case ISO_ESC_END_COMPOSITE:
4631                   {
4632                     Bufbyte comstr[MAX_EMCHAR_LEN];
4633                     Bytecount len;
4634                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4635                                                          Dynarr_length (dst));
4636                     dst = real_dst;
4637                     len = set_charptr_emchar (comstr, emch);
4638                     Dynarr_add_many (dst, comstr, len);
4639                     break;
4640                   }
4641 #endif /* ENABLE_COMPOSITE_CHARS */
4642
4643                 case ISO_ESC_LITERAL:
4644                   DECODE_ADD_BINARY_CHAR (c, dst);
4645                   break;
4646
4647                 default:
4648                   /* Everything else handled already */
4649                   break;
4650                 }
4651             }
4652
4653           /* Attempted error recovery. */
4654           if (str->iso2022.output_direction_sequence)
4655             ensure_correct_direction (flags & CODING_STATE_R2L ?
4656                                       CHARSET_RIGHT_TO_LEFT :
4657                                       CHARSET_LEFT_TO_RIGHT,
4658                                       str->codesys, dst, 0, 1);
4659           /* More error recovery. */
4660           if (!retval || str->iso2022.output_literally)
4661             {
4662               /* Output the (possibly invalid) sequence */
4663               int i;
4664               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4665                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4666               flags &= CODING_STATE_ISO2022_LOCK;
4667               if (!retval)
4668                 n++, src--;/* Repeat the loop with the same character. */
4669               else
4670                 {
4671                   /* No sense in reprocessing the final byte of the
4672                      escape sequence; it could mess things up anyway.
4673                      Just add it now. */
4674                   DECODE_ADD_BINARY_CHAR (c, dst);
4675                 }
4676             }
4677           ch = 0;
4678         }
4679       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4680         { /* Control characters */
4681
4682           /***** Error-handling *****/
4683
4684           /* If we were in the middle of a character, dump out the
4685              partial character. */
4686           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4687
4688           /* If we just saw a single-shift character, dump it out.
4689              This may dump out the wrong sort of single-shift character,
4690              but least it will give an indication that something went
4691              wrong. */
4692           if (flags & CODING_STATE_SS2)
4693             {
4694               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4695               flags &= ~CODING_STATE_SS2;
4696             }
4697           if (flags & CODING_STATE_SS3)
4698             {
4699               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4700               flags &= ~CODING_STATE_SS3;
4701             }
4702
4703           /***** Now handle the control characters. *****/
4704
4705           /* Handle CR/LF */
4706           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4707
4708           flags &= CODING_STATE_ISO2022_LOCK;
4709
4710           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4711             DECODE_ADD_BINARY_CHAR (c, dst);
4712         }
4713       else
4714         {                       /* Graphic characters */
4715           Lisp_Object charset;
4716 #ifndef UTF2000
4717           int lb;
4718 #endif
4719           int reg;
4720
4721           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4722
4723           /* Now determine the charset. */
4724           reg = ((flags & CODING_STATE_SS2) ? 2
4725                  : (flags & CODING_STATE_SS3) ? 3
4726                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4727                  : str->iso2022.register_left);
4728           charset = str->iso2022.charset[reg];
4729
4730           /* Error checking: */
4731           if (! CHARSETP (charset)
4732               || str->iso2022.invalid_designated[reg]
4733               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4734                   && XCHARSET_CHARS (charset) == 94))
4735             /* Mrmph.  We are trying to invoke a register that has no
4736                or an invalid charset in it, or trying to add a character
4737                outside the range of the charset.  Insert that char literally
4738                to preserve it for the output. */
4739             {
4740               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4741               DECODE_ADD_BINARY_CHAR (c, dst);
4742             }
4743
4744           else
4745             {
4746               /* Things are probably hunky-dorey. */
4747
4748               /* Fetch reverse charset, maybe. */
4749               if (((flags & CODING_STATE_R2L) &&
4750                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4751                   ||
4752                   (!(flags & CODING_STATE_R2L) &&
4753                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4754                 {
4755                   Lisp_Object new_charset =
4756                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4757                   if (!NILP (new_charset))
4758                     charset = new_charset;
4759                 }
4760
4761 #ifndef UTF2000
4762               lb = XCHARSET_LEADING_BYTE (charset);
4763 #endif
4764               switch (XCHARSET_REP_BYTES (charset))
4765                 {
4766                 case 1: /* ASCII */
4767                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4768                   Dynarr_add (dst, c & 0x7F);
4769                   break;
4770
4771                 case 2: /* one-byte official */
4772                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4773 #ifdef UTF2000
4774                   DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c, 0), dst);
4775 #else
4776                   Dynarr_add (dst, lb);
4777                   Dynarr_add (dst, c | 0x80);
4778 #endif
4779                   break;
4780
4781                 case 3: /* one-byte private or two-byte official */
4782                   if (XCHARSET_PRIVATE_P (charset))
4783                     {
4784                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4785 #ifdef UTF2000
4786                       DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c, 0), dst);
4787 #else
4788                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4789                       Dynarr_add (dst, lb);
4790                       Dynarr_add (dst, c | 0x80);
4791 #endif
4792                     }
4793                   else
4794                     {
4795                       if (ch)
4796                         {
4797 #ifdef UTF2000
4798                           DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, ch, c), dst);
4799 #else
4800                           Dynarr_add (dst, lb);
4801                           Dynarr_add (dst, ch | 0x80);
4802                           Dynarr_add (dst, c | 0x80);
4803 #endif
4804                           ch = 0;
4805                         }
4806                       else
4807                         ch = c;
4808                     }
4809                   break;
4810
4811                 default:        /* two-byte private */
4812                   if (ch)
4813                     {
4814 #ifdef UTF2000
4815                       DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, ch, c), dst);
4816 #else
4817                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4818                       Dynarr_add (dst, lb);
4819                       Dynarr_add (dst, ch | 0x80);
4820                       Dynarr_add (dst, c | 0x80);
4821 #endif
4822                       ch = 0;
4823                     }
4824                   else
4825                     ch = c;
4826                 }
4827             }
4828
4829           if (!ch)
4830             flags &= CODING_STATE_ISO2022_LOCK;
4831         }
4832
4833     label_continue_loop:;
4834     }
4835
4836   if (flags & CODING_STATE_END)
4837     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4838
4839   str->flags = flags;
4840   str->ch    = ch;
4841 }
4842
4843
4844 /***** ISO2022 encoder *****/
4845
4846 /* Designate CHARSET into register REG. */
4847
4848 static void
4849 iso2022_designate (Lisp_Object charset, unsigned char reg,
4850                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4851 {
4852   static CONST char inter94[] = "()*+";
4853   static CONST char inter96[] = ",-./";
4854   unsigned int type;
4855   unsigned char final;
4856   Lisp_Object old_charset = str->iso2022.charset[reg];
4857
4858   str->iso2022.charset[reg] = charset;
4859   if (!CHARSETP (charset))
4860     /* charset might be an initial nil or t. */
4861     return;
4862   type = XCHARSET_TYPE (charset);
4863   final = XCHARSET_FINAL (charset);
4864   if (!str->iso2022.force_charset_on_output[reg] &&
4865       CHARSETP (old_charset) &&
4866       XCHARSET_TYPE (old_charset) == type &&
4867       XCHARSET_FINAL (old_charset) == final)
4868     return;
4869
4870   str->iso2022.force_charset_on_output[reg] = 0;
4871
4872   {
4873     charset_conversion_spec_dynarr *dyn =
4874       str->codesys->iso2022.output_conv;
4875
4876     if (dyn)
4877       {
4878         int i;
4879
4880         for (i = 0; i < Dynarr_length (dyn); i++)
4881           {
4882             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4883             if (EQ (charset, spec->from_charset))
4884                 charset = spec->to_charset;
4885           }
4886       }
4887   }
4888
4889   Dynarr_add (dst, ISO_CODE_ESC);
4890   switch (type)
4891     {
4892     case CHARSET_TYPE_94:
4893       Dynarr_add (dst, inter94[reg]);
4894       break;
4895     case CHARSET_TYPE_96:
4896       Dynarr_add (dst, inter96[reg]);
4897       break;
4898     case CHARSET_TYPE_94X94:
4899       Dynarr_add (dst, '$');
4900       if (reg != 0
4901           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4902           || final < '@'
4903           || final > 'B')
4904         Dynarr_add (dst, inter94[reg]);
4905       break;
4906     case CHARSET_TYPE_96X96:
4907       Dynarr_add (dst, '$');
4908       Dynarr_add (dst, inter96[reg]);
4909       break;
4910     }
4911   Dynarr_add (dst, final);
4912 }
4913
4914 static void
4915 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4916 {
4917   if (str->iso2022.register_left != 0)
4918     {
4919       Dynarr_add (dst, ISO_CODE_SI);
4920       str->iso2022.register_left = 0;
4921     }
4922 }
4923
4924 static void
4925 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4926 {
4927   if (str->iso2022.register_left != 1)
4928     {
4929       Dynarr_add (dst, ISO_CODE_SO);
4930       str->iso2022.register_left = 1;
4931     }
4932 }
4933
4934 /* Convert internally-formatted data to ISO2022 format. */
4935
4936 static void
4937 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4938                        unsigned_char_dynarr *dst, unsigned int n)
4939 {
4940   unsigned char charmask, c;
4941   unsigned char char_boundary;
4942   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4943   unsigned int flags          = str->flags;
4944   unsigned int ch             = str->ch;
4945   Lisp_Coding_System *codesys = str->codesys;
4946   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4947   int i;
4948   Lisp_Object charset;
4949   int half;
4950
4951 #ifdef ENABLE_COMPOSITE_CHARS
4952   /* flags for handling composite chars.  We do a little switcharoo
4953      on the source while we're outputting the composite char. */
4954   unsigned int saved_n = 0;
4955   CONST unsigned char *saved_src = NULL;
4956   int in_composite = 0;
4957 #endif /* ENABLE_COMPOSITE_CHARS */
4958
4959   char_boundary = str->iso2022.current_char_boundary;
4960   charset = str->iso2022.current_charset;
4961   half = str->iso2022.current_half;
4962
4963 #ifdef ENABLE_COMPOSITE_CHARS
4964  back_to_square_n:
4965 #endif
4966   while (n--)
4967     {
4968       c = *src++;
4969
4970       if (BYTE_ASCII_P (c))
4971         {               /* Processing ASCII character */
4972           ch = 0;
4973
4974           restore_left_to_right_direction (codesys, dst, &flags, 0);
4975
4976           /* Make sure G0 contains ASCII */
4977           if ((c > ' ' && c < ISO_CODE_DEL) ||
4978               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4979             {
4980               ensure_normal_shift (str, dst);
4981               iso2022_designate (Vcharset_ascii, 0, str, dst);
4982             }
4983
4984           /* If necessary, restore everything to the default state
4985              at end-of-line */
4986           if (c == '\n' &&
4987               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4988             {
4989               restore_left_to_right_direction (codesys, dst, &flags, 0);
4990
4991               ensure_normal_shift (str, dst);
4992
4993               for (i = 0; i < 4; i++)
4994                 {
4995                   Lisp_Object initial_charset =
4996                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4997                   iso2022_designate (initial_charset, i, str, dst);
4998                 }
4999             }
5000           if (c == '\n')
5001             {
5002               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5003                 Dynarr_add (dst, '\r');
5004               if (eol_type != EOL_CR)
5005                 Dynarr_add (dst, c);
5006             }
5007           else
5008             {
5009               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5010                   && fit_to_be_escape_quoted (c))
5011                 Dynarr_add (dst, ISO_CODE_ESC);
5012               Dynarr_add (dst, c);
5013             }
5014           char_boundary = 1;
5015         }
5016
5017 #ifdef UTF2000
5018       else if (BUFBYTE_FIRST_BYTE_P (c))
5019         {
5020         }
5021 #else
5022       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5023         { /* Processing Leading Byte */
5024           ch = 0;
5025           charset = CHARSET_BY_LEADING_BYTE (c);
5026           if (LEADING_BYTE_PREFIX_P(c))
5027             ch = c;
5028           else if (!EQ (charset, Vcharset_control_1)
5029 #ifdef ENABLE_COMPOSITE_CHARS
5030                    && !EQ (charset, Vcharset_composite)
5031 #endif
5032                    )
5033             {
5034               int reg;
5035
5036               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5037                                         codesys, dst, &flags, 0);
5038
5039               /* Now determine which register to use. */
5040               reg = -1;
5041               for (i = 0; i < 4; i++)
5042                 {
5043                   if (EQ (charset, str->iso2022.charset[i]) ||
5044                       EQ (charset,
5045                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5046                     {
5047                       reg = i;
5048                       break;
5049                     }
5050                 }
5051
5052               if (reg == -1)
5053                 {
5054                   if (XCHARSET_GRAPHIC (charset) != 0)
5055                     {
5056                       if (!NILP (str->iso2022.charset[1]) &&
5057                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5058                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5059                         reg = 1;
5060                       else if (!NILP (str->iso2022.charset[2]))
5061                         reg = 2;
5062                       else if (!NILP (str->iso2022.charset[3]))
5063                         reg = 3;
5064                       else
5065                         reg = 0;
5066                     }
5067                   else
5068                     reg = 0;
5069                 }
5070
5071               iso2022_designate (charset, reg, str, dst);
5072
5073               /* Now invoke that register. */
5074               switch (reg)
5075                 {
5076                 case 0:
5077                   ensure_normal_shift (str, dst);
5078                   half = 0;
5079                   break;
5080
5081                 case 1:
5082                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5083                     {
5084                       ensure_shift_out (str, dst);
5085                       half = 0;
5086                     }
5087                   else
5088                     half = 1;
5089                   break;
5090
5091                 case 2:
5092                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5093                     {
5094                       Dynarr_add (dst, ISO_CODE_ESC);
5095                       Dynarr_add (dst, 'N');
5096                       half = 0;
5097                     }
5098                   else
5099                     {
5100                       Dynarr_add (dst, ISO_CODE_SS2);
5101                       half = 1;
5102                     }
5103                   break;
5104
5105                 case 3:
5106                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5107                     {
5108                       Dynarr_add (dst, ISO_CODE_ESC);
5109                       Dynarr_add (dst, 'O');
5110                       half = 0;
5111                     }
5112                   else
5113                     {
5114                       Dynarr_add (dst, ISO_CODE_SS3);
5115                       half = 1;
5116                     }
5117                   break;
5118
5119                 default:
5120                   abort ();
5121                 }
5122             }
5123           char_boundary = 0;
5124         }
5125 #endif /* not UTF2000 */
5126       else
5127         {                       /* Processing Non-ASCII character */
5128           charmask = (half == 0 ? 0x7F : 0xFF);
5129           char_boundary = 1;
5130           if (EQ (charset, Vcharset_control_1))
5131             {
5132               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5133                   && fit_to_be_escape_quoted (c))
5134                 Dynarr_add (dst, ISO_CODE_ESC);
5135               /* you asked for it ... */
5136               Dynarr_add (dst, c - 0x20);
5137             }
5138           else
5139             {
5140               switch (XCHARSET_REP_BYTES (charset))
5141                 {
5142                 case 2:
5143                   Dynarr_add (dst, c & charmask);
5144                   break;
5145                 case 3:
5146                   if (XCHARSET_PRIVATE_P (charset))
5147                     {
5148                       Dynarr_add (dst, c & charmask);
5149                       ch = 0;
5150                     }
5151                   else if (ch)
5152                     {
5153 #ifdef ENABLE_COMPOSITE_CHARS
5154                       if (EQ (charset, Vcharset_composite))
5155                         {
5156                           if (in_composite)
5157                             {
5158                               /* #### Bother! We don't know how to
5159                                  handle this yet. */
5160                               Dynarr_add (dst, '~');
5161                             }
5162                           else
5163                             {
5164                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5165                                                        ch & 0x7F, c & 0x7F);
5166                               Lisp_Object lstr = composite_char_string (emch);
5167                               saved_n = n;
5168                               saved_src = src;
5169                               in_composite = 1;
5170                               src = XSTRING_DATA   (lstr);
5171                               n   = XSTRING_LENGTH (lstr);
5172                               Dynarr_add (dst, ISO_CODE_ESC);
5173                               Dynarr_add (dst, '0'); /* start composing */
5174                             }
5175                         }
5176                       else
5177 #endif /* ENABLE_COMPOSITE_CHARS */
5178                         {
5179                           Dynarr_add (dst, ch & charmask);
5180                           Dynarr_add (dst, c & charmask);
5181                         }
5182                       ch = 0;
5183                     }
5184                   else
5185                     {
5186                       ch = c;
5187                       char_boundary = 0;
5188                     }
5189                   break;
5190                 case 4:
5191                   if (ch)
5192                     {
5193                       Dynarr_add (dst, ch & charmask);
5194                       Dynarr_add (dst, c & charmask);
5195                       ch = 0;
5196                     }
5197                   else
5198                     {
5199                       ch = c;
5200                       char_boundary = 0;
5201                     }
5202                   break;
5203                 default:
5204                   abort ();
5205                 }
5206             }
5207         }
5208     }
5209
5210 #ifdef ENABLE_COMPOSITE_CHARS
5211   if (in_composite)
5212     {
5213       n = saved_n;
5214       src = saved_src;
5215       in_composite = 0;
5216       Dynarr_add (dst, ISO_CODE_ESC);
5217       Dynarr_add (dst, '1'); /* end composing */
5218       goto back_to_square_n; /* Wheeeeeeeee ..... */
5219     }
5220 #endif /* ENABLE_COMPOSITE_CHARS */
5221
5222   if (char_boundary && flags & CODING_STATE_END)
5223     {
5224       restore_left_to_right_direction (codesys, dst, &flags, 0);
5225       ensure_normal_shift (str, dst);
5226       for (i = 0; i < 4; i++)
5227         {
5228           Lisp_Object initial_charset =
5229             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5230           iso2022_designate (initial_charset, i, str, dst);
5231         }
5232     }
5233
5234   str->flags = flags;
5235   str->ch    = ch;
5236   str->iso2022.current_char_boundary = char_boundary;
5237   str->iso2022.current_charset = charset;
5238   str->iso2022.current_half = half;
5239
5240   /* Verbum caro factum est! */
5241 }
5242 #endif /* MULE */
5243 \f
5244 /************************************************************************/
5245 /*                     No-conversion methods                            */
5246 /************************************************************************/
5247
5248 /* This is used when reading in "binary" files -- i.e. files that may
5249    contain all 256 possible byte values and that are not to be
5250    interpreted as being in any particular decoding. */
5251 static void
5252 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5253                              unsigned_char_dynarr *dst, unsigned int n)
5254 {
5255   unsigned char c;
5256   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5257   unsigned int flags  = str->flags;
5258   unsigned int ch     = str->ch;
5259   eol_type_t eol_type = str->eol_type;
5260
5261   while (n--)
5262     {
5263       c = *src++;
5264
5265       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5266       DECODE_ADD_BINARY_CHAR (c, dst);
5267     label_continue_loop:;
5268     }
5269
5270   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5271
5272   str->flags = flags;
5273   str->ch    = ch;
5274 }
5275
5276 static void
5277 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5278                              unsigned_char_dynarr *dst, unsigned int n)
5279 {
5280   unsigned char c;
5281   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5282   unsigned int flags  = str->flags;
5283   unsigned int ch     = str->ch;
5284   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5285
5286   while (n--)
5287     {
5288       c = *src++;
5289       if (c == '\n')
5290         {
5291           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5292             Dynarr_add (dst, '\r');
5293           if (eol_type != EOL_CR)
5294             Dynarr_add (dst, '\n');
5295           ch = 0;
5296         }
5297       else if (BYTE_ASCII_P (c))
5298         {
5299           assert (ch == 0);
5300           Dynarr_add (dst, c);
5301         }
5302       else if (BUFBYTE_LEADING_BYTE_P (c))
5303         {
5304           assert (ch == 0);
5305           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5306               c == LEADING_BYTE_CONTROL_1)
5307             ch = c;
5308           else
5309             Dynarr_add (dst, '~'); /* untranslatable character */
5310         }
5311       else
5312         {
5313           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5314             Dynarr_add (dst, c);
5315           else if (ch == LEADING_BYTE_CONTROL_1)
5316             {
5317               assert (c < 0xC0);
5318               Dynarr_add (dst, c - 0x20);
5319             }
5320           /* else it should be the second or third byte of an
5321              untranslatable character, so ignore it */
5322           ch = 0;
5323         }
5324     }
5325
5326   str->flags = flags;
5327   str->ch    = ch;
5328 }
5329
5330 \f
5331 /************************************************************************/
5332 /*                   Simple internal/external functions                 */
5333 /************************************************************************/
5334
5335 static Extbyte_dynarr *conversion_out_dynarr;
5336 static Bufbyte_dynarr *conversion_in_dynarr;
5337
5338 /* Determine coding system from coding format */
5339
5340 /* #### not correct for all values of `fmt'! */
5341 static Lisp_Object
5342 external_data_format_to_coding_system (enum external_data_format fmt)
5343 {
5344   switch (fmt)
5345     {
5346     case FORMAT_FILENAME:
5347     case FORMAT_TERMINAL:
5348       if (EQ (Vfile_name_coding_system, Qnil) ||
5349           EQ (Vfile_name_coding_system, Qbinary))
5350         return Qnil;
5351       else
5352         return Fget_coding_system (Vfile_name_coding_system);
5353 #ifdef MULE
5354     case FORMAT_CTEXT:
5355       return Fget_coding_system (Qctext);
5356 #endif
5357     default:
5358       return Qnil;
5359     }
5360 }
5361
5362 Extbyte *
5363 convert_to_external_format (CONST Bufbyte *ptr,
5364                             Bytecount len,
5365                             Extcount *len_out,
5366                             enum external_data_format fmt)
5367 {
5368   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5369
5370   if (!conversion_out_dynarr)
5371     conversion_out_dynarr = Dynarr_new (Extbyte);
5372   else
5373     Dynarr_reset (conversion_out_dynarr);
5374
5375   if (NILP (coding_system))
5376     {
5377       CONST Bufbyte *end = ptr + len;
5378
5379       for (; ptr < end;)
5380         {
5381 #ifdef UTF2000
5382           Bufbyte c =
5383             (*ptr < 0xc0) ? *ptr :
5384             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5385 #else
5386           Bufbyte c =
5387             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5388             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5389             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5390             '~';
5391 #endif
5392           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5393           INC_CHARPTR (ptr);
5394         }
5395
5396 #ifdef ERROR_CHECK_BUFPOS
5397       assert (ptr == end);
5398 #endif
5399     }
5400   else
5401     {
5402       Lisp_Object instream, outstream, da_outstream;
5403       Lstream *istr, *ostr;
5404       struct gcpro gcpro1, gcpro2, gcpro3;
5405       char tempbuf[1024]; /* some random amount */
5406
5407       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5408       da_outstream = make_dynarr_output_stream
5409         ((unsigned_char_dynarr *) conversion_out_dynarr);
5410       outstream =
5411         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5412       istr = XLSTREAM (instream);
5413       ostr = XLSTREAM (outstream);
5414       GCPRO3 (instream, outstream, da_outstream);
5415       while (1)
5416         {
5417           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5418           if (!size_in_bytes)
5419             break;
5420           Lstream_write (ostr, tempbuf, size_in_bytes);
5421         }
5422       Lstream_close (istr);
5423       Lstream_close (ostr);
5424       UNGCPRO;
5425       Lstream_delete (istr);
5426       Lstream_delete (ostr);
5427       Lstream_delete (XLSTREAM (da_outstream));
5428     }
5429
5430   *len_out = Dynarr_length (conversion_out_dynarr);
5431   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5432   return Dynarr_atp (conversion_out_dynarr, 0);
5433 }
5434
5435 Bufbyte *
5436 convert_from_external_format (CONST Extbyte *ptr,
5437                               Extcount len,
5438                               Bytecount *len_out,
5439                               enum external_data_format fmt)
5440 {
5441   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5442
5443   if (!conversion_in_dynarr)
5444     conversion_in_dynarr = Dynarr_new (Bufbyte);
5445   else
5446     Dynarr_reset (conversion_in_dynarr);
5447
5448   if (NILP (coding_system))
5449     {
5450       CONST Extbyte *end = ptr + len;
5451       for (; ptr < end; ptr++)
5452         {
5453           Extbyte c = *ptr;
5454           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5455         }
5456     }
5457   else
5458     {
5459       Lisp_Object instream, outstream, da_outstream;
5460       Lstream *istr, *ostr;
5461       struct gcpro gcpro1, gcpro2, gcpro3;
5462       char tempbuf[1024]; /* some random amount */
5463
5464       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5465       da_outstream = make_dynarr_output_stream
5466         ((unsigned_char_dynarr *) conversion_in_dynarr);
5467       outstream =
5468         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5469       istr = XLSTREAM (instream);
5470       ostr = XLSTREAM (outstream);
5471       GCPRO3 (instream, outstream, da_outstream);
5472       while (1)
5473         {
5474           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5475           if (!size_in_bytes)
5476             break;
5477           Lstream_write (ostr, tempbuf, size_in_bytes);
5478         }
5479       Lstream_close (istr);
5480       Lstream_close (ostr);
5481       UNGCPRO;
5482       Lstream_delete (istr);
5483       Lstream_delete (ostr);
5484       Lstream_delete (XLSTREAM (da_outstream));
5485     }
5486
5487   *len_out = Dynarr_length (conversion_in_dynarr);
5488   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5489   return Dynarr_atp (conversion_in_dynarr, 0);
5490 }
5491
5492 \f
5493 /************************************************************************/
5494 /*                             Initialization                           */
5495 /************************************************************************/
5496
5497 void
5498 syms_of_file_coding (void)
5499 {
5500   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5501   deferror (&Qcoding_system_error, "coding-system-error",
5502             "Coding-system error", Qio_error);
5503
5504   DEFSUBR (Fcoding_system_p);
5505   DEFSUBR (Ffind_coding_system);
5506   DEFSUBR (Fget_coding_system);
5507   DEFSUBR (Fcoding_system_list);
5508   DEFSUBR (Fcoding_system_name);
5509   DEFSUBR (Fmake_coding_system);
5510   DEFSUBR (Fcopy_coding_system);
5511   DEFSUBR (Fdefine_coding_system_alias);
5512   DEFSUBR (Fsubsidiary_coding_system);
5513
5514   DEFSUBR (Fcoding_system_type);
5515   DEFSUBR (Fcoding_system_doc_string);
5516 #ifdef MULE
5517   DEFSUBR (Fcoding_system_charset);
5518 #endif
5519   DEFSUBR (Fcoding_system_property);
5520
5521   DEFSUBR (Fcoding_category_list);
5522   DEFSUBR (Fset_coding_priority_list);
5523   DEFSUBR (Fcoding_priority_list);
5524   DEFSUBR (Fset_coding_category_system);
5525   DEFSUBR (Fcoding_category_system);
5526
5527   DEFSUBR (Fdetect_coding_region);
5528   DEFSUBR (Fdecode_coding_region);
5529   DEFSUBR (Fencode_coding_region);
5530 #ifdef MULE
5531   DEFSUBR (Fdecode_shift_jis_char);
5532   DEFSUBR (Fencode_shift_jis_char);
5533   DEFSUBR (Fdecode_big5_char);
5534   DEFSUBR (Fencode_big5_char);
5535   DEFSUBR (Fset_ucs_char);
5536   DEFSUBR (Fucs_char);
5537   DEFSUBR (Fset_char_ucs);
5538   DEFSUBR (Fchar_ucs);
5539 #endif /* MULE */
5540   defsymbol (&Qcoding_system_p, "coding-system-p");
5541   defsymbol (&Qno_conversion, "no-conversion");
5542   defsymbol (&Qraw_text, "raw-text");
5543 #ifdef MULE
5544   defsymbol (&Qbig5, "big5");
5545   defsymbol (&Qshift_jis, "shift-jis");
5546   defsymbol (&Qucs4, "ucs-4");
5547   defsymbol (&Qutf8, "utf-8");
5548   defsymbol (&Qccl, "ccl");
5549   defsymbol (&Qiso2022, "iso2022");
5550 #endif /* MULE */
5551   defsymbol (&Qmnemonic, "mnemonic");
5552   defsymbol (&Qeol_type, "eol-type");
5553   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5554   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5555
5556   defsymbol (&Qcr, "cr");
5557   defsymbol (&Qlf, "lf");
5558   defsymbol (&Qcrlf, "crlf");
5559   defsymbol (&Qeol_cr, "eol-cr");
5560   defsymbol (&Qeol_lf, "eol-lf");
5561   defsymbol (&Qeol_crlf, "eol-crlf");
5562 #ifdef MULE
5563   defsymbol (&Qcharset_g0, "charset-g0");
5564   defsymbol (&Qcharset_g1, "charset-g1");
5565   defsymbol (&Qcharset_g2, "charset-g2");
5566   defsymbol (&Qcharset_g3, "charset-g3");
5567   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5568   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5569   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5570   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5571   defsymbol (&Qno_iso6429, "no-iso6429");
5572   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5573   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5574
5575   defsymbol (&Qshort, "short");
5576   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5577   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5578   defsymbol (&Qseven, "seven");
5579   defsymbol (&Qlock_shift, "lock-shift");
5580   defsymbol (&Qescape_quoted, "escape-quoted");
5581 #endif /* MULE */
5582   defsymbol (&Qencode, "encode");
5583   defsymbol (&Qdecode, "decode");
5584
5585 #ifdef MULE
5586   defsymbol (&Qctext, "ctext");
5587   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5588              "shift-jis");
5589   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5590              "big5");
5591   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5592              "ucs-4");
5593   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5594              "utf-8");
5595   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5596              "iso-7");
5597   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5598              "iso-8-designate");
5599   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5600              "iso-8-1");
5601   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5602              "iso-8-2");
5603   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5604              "iso-lock-shift");
5605 #endif /* MULE */
5606   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5607              "no-conversion");
5608 }
5609
5610 void
5611 lstream_type_create_file_coding (void)
5612 {
5613   LSTREAM_HAS_METHOD (decoding, reader);
5614   LSTREAM_HAS_METHOD (decoding, writer);
5615   LSTREAM_HAS_METHOD (decoding, rewinder);
5616   LSTREAM_HAS_METHOD (decoding, seekable_p);
5617   LSTREAM_HAS_METHOD (decoding, flusher);
5618   LSTREAM_HAS_METHOD (decoding, closer);
5619   LSTREAM_HAS_METHOD (decoding, marker);
5620
5621   LSTREAM_HAS_METHOD (encoding, reader);
5622   LSTREAM_HAS_METHOD (encoding, writer);
5623   LSTREAM_HAS_METHOD (encoding, rewinder);
5624   LSTREAM_HAS_METHOD (encoding, seekable_p);
5625   LSTREAM_HAS_METHOD (encoding, flusher);
5626   LSTREAM_HAS_METHOD (encoding, closer);
5627   LSTREAM_HAS_METHOD (encoding, marker);
5628 }
5629
5630 void
5631 vars_of_file_coding (void)
5632 {
5633   int i;
5634
5635   /* Initialize to something reasonable ... */
5636   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5637     {
5638       coding_category_system[i] = Qnil;
5639       coding_category_by_priority[i] = i;
5640     }
5641
5642   Fprovide (intern ("file-coding"));
5643
5644   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5645 Coding system used for TTY keyboard input.
5646 Not used under a windowing system.
5647 */ );
5648   Vkeyboard_coding_system = Qnil;
5649
5650   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5651 Coding system used for TTY display output.
5652 Not used under a windowing system.
5653 */ );
5654   Vterminal_coding_system = Qnil;
5655
5656   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5657 Overriding coding system used when writing a file or process.
5658 You should *bind* this, not set it.  If this is non-nil, it specifies
5659 the coding system that will be used when a file or process is read
5660 in, and overrides `buffer-file-coding-system-for-read',
5661 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5662 this one for permanent changes to the environment.
5663 */ );
5664   Vcoding_system_for_read = Qnil;
5665
5666   DEFVAR_LISP ("coding-system-for-write",
5667                &Vcoding_system_for_write /*
5668 Overriding coding system used when writing a file or process.
5669 You should *bind* this, not set it.  If this is non-nil, it specifies
5670 the coding system that will be used when a file or process is wrote
5671 in, and overrides `buffer-file-coding-system',
5672 `write-region-pre-hook', etc.  Use those variables instead of this one
5673 for permanent changes to the environment.
5674 */ );
5675   Vcoding_system_for_write = Qnil;
5676
5677   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5678 Coding system used to convert pathnames when accessing files.
5679 */ );
5680   Vfile_name_coding_system = Qnil;
5681
5682   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5683 Non-nil means the buffer contents are regarded as multi-byte form
5684 of characters, not a binary code.  This affects the display, file I/O,
5685 and behaviors of various editing commands.
5686
5687 Setting this to nil does not do anything.
5688 */ );
5689   enable_multibyte_characters = 1;
5690 }
5691
5692 void
5693 complex_vars_of_file_coding (void)
5694 {
5695   staticpro (&Vcoding_system_hash_table);
5696   Vcoding_system_hash_table =
5697     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5698
5699   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5700
5701 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5702 {                                               \
5703   struct codesys_prop csp;                      \
5704   csp.sym = (Sym);                              \
5705   csp.prop_type = (Prop_Type);                  \
5706   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5707 } while (0)
5708
5709   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5710   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5711   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5712   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5713   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5714   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5715   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5716 #ifdef MULE
5717   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5718   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5719   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5720   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5721   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5722   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5723   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5724   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5725   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5726   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5727   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5728   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5729   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5730   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5731   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5732   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5733   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5734
5735   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5736   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5737 #endif /* MULE */
5738   /* Need to create this here or we're really screwed. */
5739   Fmake_coding_system
5740     (Qraw_text, Qno_conversion,
5741      build_string ("Raw text, which means it converts only line-break-codes."),
5742      list2 (Qmnemonic, build_string ("Raw")));
5743
5744   Fmake_coding_system
5745     (Qbinary, Qno_conversion,
5746      build_string ("Binary, which means it does not convert anything."),
5747      list4 (Qeol_type, Qlf,
5748             Qmnemonic, build_string ("Binary")));
5749
5750   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5751
5752   /* Need this for bootstrapping */
5753   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5754     Fget_coding_system (Qraw_text);
5755
5756 #ifdef MULE
5757   {
5758     unsigned int i;
5759
5760     for (i = 0; i < 65536; i++)
5761       ucs_to_mule_table[i] = Qnil;
5762   }
5763   staticpro (&mule_to_ucs_table);
5764   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5765 #endif /* MULE */
5766 }