src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28 #include "buffer.h"
  29 #include "elhash.h"
  30 #include "insdel.h"
  31 #include "lstream.h"
  32 #ifdef MULE
  33 #include "mule-ccl.h"
  34 #include "chartab.h"
  35 #endif
  36 #include "file-coding.h"
  37
  38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  39
  40 Lisp_Object Vkeyboard_coding_system;
  41 Lisp_Object Vterminal_coding_system;
  42 Lisp_Object Vcoding_system_for_read;
  43 Lisp_Object Vcoding_system_for_write;
  44 Lisp_Object Vfile_name_coding_system;
  45
  46 /* Table of symbols identifying each coding category. */
  47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  48
  49 /* Coding system currently associated with each coding category. */
  50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  51
  52 /* Table of all coding categories in decreasing order of priority.
  53    This describes a permutation of the possible coding categories. */
  54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  55
  56 Lisp_Object Qcoding_system_p;
  57
  58 Lisp_Object Qno_conversion, Qccl, Qiso2022;
  59 /* Qinternal in general.c */
  60
  61 Lisp_Object Qmnemonic, Qeol_type;
  62 Lisp_Object Qcr, Qcrlf, Qlf;
  63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  64 Lisp_Object Qpost_read_conversion;
  65 Lisp_Object Qpre_write_conversion;
  66
  67 #ifdef MULE
  68 Lisp_Object Qucs4, Qutf8;
  69 Lisp_Object Qbig5, Qshift_jis;
  70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  73 Lisp_Object Qno_iso6429;
  74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  75 Lisp_Object Qctext, Qescape_quoted;
  76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  77 #endif
  78 Lisp_Object Qencode, Qdecode;
  79
  80 Lisp_Object Vcoding_system_hash_table;
  81
  82 int enable_multibyte_characters;
  83
  84 #ifdef MULE
  85 /* Additional information used by the ISO2022 decoder and detector. */
  86 struct iso2022_decoder
  87 {
  88   /* CHARSET holds the character sets currently assigned to the G0
  89      through G3 variables.  It is initialized from the array
  90      INITIAL_CHARSET in CODESYS. */
  91   Lisp_Object charset[4];
  92
  93   /* Which registers are currently invoked into the left (GL) and
  94      right (GR) halves of the 8-bit encoding space? */
  95   int register_left, register_right;
  96
  97   /* ISO_ESC holds a value indicating part of an escape sequence
  98      that has already been seen. */
  99   enum iso_esc_flag esc;
 100
 101   /* This records the bytes we've seen so far in an escape sequence,
 102      in case the sequence is invalid (we spit out the bytes unchanged). */
 103   unsigned char esc_bytes[8];
 104
 105   /* Index for next byte to store in ISO escape sequence. */
 106   int esc_bytes_index;
 107
 108 #ifdef ENABLE_COMPOSITE_CHARS
 109   /* Stuff seen so far when composing a string. */
 110   unsigned_char_dynarr *composite_chars;
 111 #endif
 112
 113   /* If we saw an invalid designation sequence for a particular
 114      register, we flag it here and switch to ASCII.  The next time we
 115      see a valid designation for this register, we turn off the flag
 116      and do the designation normally, but pretend the sequence was
 117      invalid.  The effect of all this is that (most of the time) the
 118      escape sequences for both the switch to the unknown charset, and
 119      the switch back to the known charset, get inserted literally into
 120      the buffer and saved out as such.  The hope is that we can
 121      preserve the escape sequences so that the resulting written out
 122      file makes sense.  If we don't do any of this, the designation
 123      to the invalid charset will be preserved but that switch back
 124      to the known charset will probably get eaten because it was
 125      the same charset that was already present in the register. */
 126   unsigned char invalid_designated[4];
 127
 128   /* We try to do similar things as above for direction-switching
 129      sequences.  If we encountered a direction switch while an
 130      invalid designation was present, or an invalid designation
 131      just after a direction switch (i.e. no valid designation
 132      encountered yet), we insert the direction-switch escape
 133      sequence literally into the output stream, and later on
 134      insert the corresponding direction-restoring escape sequence
 135      literally also. */
 136   unsigned int switched_dir_and_no_valid_charset_yet :1;
 137   unsigned int invalid_switch_dir :1;
 138
 139   /* Tells the decoder to output the escape sequence literally
 140      even though it was valid.  Used in the games we play to
 141      avoid lossage when we encounter invalid designations. */
 142   unsigned int output_literally :1;
 143   /* We encountered a direction switch followed by an invalid
 144      designation.  We didn't output the direction switch
 145      literally because we didn't know about the invalid designation;
 146      but we have to do so now. */
 147   unsigned int output_direction_sequence :1;
 148 };
 149 #endif /* MULE */
 150 EXFUN (Fcopy_coding_system, 2);
 151 #ifdef MULE
 152 struct detection_state;
 153 static int detect_coding_sjis (struct detection_state *st,
 154                                CONST unsigned char *src,
 155                                unsigned int n);
 156 static void decode_coding_sjis (Lstream *decoding,
 157                                 CONST unsigned char *src,
 158                                 unsigned_char_dynarr *dst,
 159                                 unsigned int n);
 160 static void encode_coding_sjis (Lstream *encoding,
 161                                 CONST unsigned char *src,
 162                                 unsigned_char_dynarr *dst,
 163                                 unsigned int n);
 164 static int detect_coding_big5 (struct detection_state *st,
 165                                CONST unsigned char *src,
 166                                unsigned int n);
 167 static void decode_coding_big5 (Lstream *decoding,
 168                                 CONST unsigned char *src,
 169                                 unsigned_char_dynarr *dst, unsigned int n);
 170 static void encode_coding_big5 (Lstream *encoding,
 171                                 CONST unsigned char *src,
 172                                 unsigned_char_dynarr *dst, unsigned int n);
 173 static int detect_coding_ucs4 (struct detection_state *st,
 174                                CONST unsigned char *src,
 175                                unsigned int n);
 176 static void decode_coding_ucs4 (Lstream *decoding,
 177                                 CONST unsigned char *src,
 178                                 unsigned_char_dynarr *dst, unsigned int n);
 179 static void encode_coding_ucs4 (Lstream *encoding,
 180                                 CONST unsigned char *src,
 181                                 unsigned_char_dynarr *dst, unsigned int n);
 182 static int detect_coding_utf8 (struct detection_state *st,
 183                                CONST unsigned char *src,
 184                                unsigned int n);
 185 static void decode_coding_utf8 (Lstream *decoding,
 186                                 CONST unsigned char *src,
 187                                 unsigned_char_dynarr *dst, unsigned int n);
 188 static void encode_coding_utf8 (Lstream *encoding,
 189                                 CONST unsigned char *src,
 190                                 unsigned_char_dynarr *dst, unsigned int n);
 191 static int postprocess_iso2022_mask (int mask);
 192 static void reset_iso2022 (Lisp_Object coding_system,
 193                            struct iso2022_decoder *iso);
 194 static int detect_coding_iso2022 (struct detection_state *st,
 195                                   CONST unsigned char *src,
 196                                   unsigned int n);
 197 static void decode_coding_iso2022 (Lstream *decoding,
 198                                    CONST unsigned char *src,
 199                                    unsigned_char_dynarr *dst, unsigned int n);
 200 static void encode_coding_iso2022 (Lstream *encoding,
 201                                    CONST unsigned char *src,
 202                                    unsigned_char_dynarr *dst, unsigned int n);
 203 #endif /* MULE */
 204 static void decode_coding_no_conversion (Lstream *decoding,
 205                                          CONST unsigned char *src,
 206                                          unsigned_char_dynarr *dst,
 207                                          unsigned int n);
 208 static void encode_coding_no_conversion (Lstream *encoding,
 209                                          CONST unsigned char *src,
 210                                          unsigned_char_dynarr *dst,
 211                                          unsigned int n);
 212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 213                          unsigned_char_dynarr *dst, unsigned int n);
 214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 215                          unsigned_char_dynarr *dst, unsigned int n);
 216
 217 typedef struct codesys_prop codesys_prop;
 218 struct codesys_prop
 219 {
 220   Lisp_Object sym;
 221   int prop_type;
 222 };
 223
 224 typedef struct
 225 {
 226   Dynarr_declare (codesys_prop);
 227 } codesys_prop_dynarr;
 228
 229 codesys_prop_dynarr *the_codesys_prop_dynarr;
 230
 231 enum codesys_prop_enum
 232 {
 233   CODESYS_PROP_ALL_OK,
 234   CODESYS_PROP_ISO2022,
 235   CODESYS_PROP_CCL
 236 };
 237
 238 \f
 239 /************************************************************************/
 240 /*                       Coding system functions                        */
 241 /************************************************************************/
 242
 243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 245 static void finalize_coding_system (void *header, int for_disksave);
 246
 247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 248                                mark_coding_system, print_coding_system,
 249                                finalize_coding_system,
 250                                0, 0, struct Lisp_Coding_System);
 251
 252 static Lisp_Object
 253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 254 {
 255   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 256
 257   markobj (CODING_SYSTEM_NAME (codesys));
 258   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 259   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 260   markobj (CODING_SYSTEM_EOL_LF (codesys));
 261   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 262   markobj (CODING_SYSTEM_EOL_CR (codesys));
 263
 264   switch (CODING_SYSTEM_TYPE (codesys))
 265     {
 266 #ifdef MULE
 267       int i;
 268     case CODESYS_ISO2022:
 269       for (i = 0; i < 4; i++)
 270         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 271       if (codesys->iso2022.input_conv)
 272         {
 273           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 274             {
 275               struct charset_conversion_spec *ccs =
 276                 Dynarr_atp (codesys->iso2022.input_conv, i);
 277               markobj (ccs->from_charset);
 278               markobj (ccs->to_charset);
 279             }
 280         }
 281       if (codesys->iso2022.output_conv)
 282         {
 283           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 284             {
 285               struct charset_conversion_spec *ccs =
 286                 Dynarr_atp (codesys->iso2022.output_conv, i);
 287               markobj (ccs->from_charset);
 288               markobj (ccs->to_charset);
 289             }
 290         }
 291       break;
 292
 293     case CODESYS_CCL:
 294       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 295       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 296       break;
 297 #endif /* MULE */
 298     default:
 299       break;
 300     }
 301
 302   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 303   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 304 }
 305
 306 static void
 307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 308                      int escapeflag)
 309 {
 310   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 311   if (print_readably)
 312     error ("printing unreadable object #<coding_system 0x%x>",
 313            c->header.uid);
 314
 315   write_c_string ("#<coding_system ", printcharfun);
 316   print_internal (c->name, printcharfun, 1);
 317   write_c_string (">", printcharfun);
 318 }
 319
 320 static void
 321 finalize_coding_system (void *header, int for_disksave)
 322 {
 323   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 324   /* Since coding systems never go away, this function is not
 325      necessary.  But it would be necessary if we changed things
 326      so that coding systems could go away. */
 327   if (!for_disksave) /* see comment in lstream.c */
 328     {
 329       switch (CODING_SYSTEM_TYPE (c))
 330         {
 331 #ifdef MULE
 332         case CODESYS_ISO2022:
 333           if (c->iso2022.input_conv)
 334             {
 335               Dynarr_free (c->iso2022.input_conv);
 336               c->iso2022.input_conv = 0;
 337             }
 338           if (c->iso2022.output_conv)
 339             {
 340               Dynarr_free (c->iso2022.output_conv);
 341               c->iso2022.output_conv = 0;
 342             }
 343           break;
 344 #endif /* MULE */
 345         default:
 346           break;
 347         }
 348     }
 349 }
 350
 351 static enum eol_type
 352 symbol_to_eol_type (Lisp_Object symbol)
 353 {
 354   CHECK_SYMBOL (symbol);
 355   if (NILP (symbol))      return EOL_AUTODETECT;
 356   if (EQ (symbol, Qlf))   return EOL_LF;
 357   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 358   if (EQ (symbol, Qcr))   return EOL_CR;
 359
 360   signal_simple_error ("Unrecognized eol type", symbol);
 361   return EOL_AUTODETECT; /* not reached */
 362 }
 363
 364 static Lisp_Object
 365 eol_type_to_symbol (enum eol_type type)
 366 {
 367   switch (type)
 368     {
 369     default: abort ();
 370     case EOL_LF:         return Qlf;
 371     case EOL_CRLF:       return Qcrlf;
 372     case EOL_CR:         return Qcr;
 373     case EOL_AUTODETECT: return Qnil;
 374     }
 375 }
 376
 377 static void
 378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 379 {
 380   Lisp_Object codesys_obj;
 381   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 382   char *codesys_name = (char *) alloca (len + 7);
 383   int mlen = -1;
 384   char *codesys_mnemonic=0;
 385
 386   Lisp_Object codesys_name_sym, sub_codesys_obj;
 387
 388   /* kludge */
 389
 390   XSETCODING_SYSTEM (codesys_obj, codesys);
 391
 392   memcpy (codesys_name,
 393           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 394
 395   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 396     {
 397       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 398       codesys_mnemonic = (char *) alloca (mlen + 7);
 399       memcpy (codesys_mnemonic,
 400               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 401     }
 402
 403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 404   strcpy (codesys_name + len, "-" op_sys);                                      \
 405   if (mlen != -1)                                                               \
 406     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 407   codesys_name_sym = intern (codesys_name);                                     \
 408   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 409   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 410   if (mlen != -1)                                                               \
 411     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 412       build_string (codesys_mnemonic);                                          \
 413   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 414 } while (0)
 415
 416   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 417   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 418   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 419 }
 420
 421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 422 Return t if OBJECT is a coding system.
 423 A coding system is an object that defines how text containing multiple
 424 character sets is encoded into a stream of (typically 8-bit) bytes.
 425 The coding system is used to decode the stream into a series of
 426 characters (which may be from multiple charsets) when the text is read
 427 from a file or process, and is used to encode the text back into the
 428 same format when it is written out to a file or process.
 429
 430 For example, many ISO2022-compliant coding systems (such as Compound
 431 Text, which is used for inter-client data under the X Window System)
 432 use escape sequences to switch between different charsets -- Japanese
 433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 435 `make-coding-system' for more information.
 436
 437 Coding systems are normally identified using a symbol, and the
 438 symbol is accepted in place of the actual coding system object whenever
 439 a coding system is called for. (This is similar to how faces work.)
 440 */
 441        (object))
 442 {
 443   return CODING_SYSTEMP (object) ? Qt : Qnil;
 444 }
 445
 446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 447 Retrieve the coding system of the given name.
 448
 449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 450 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 451 If there is no such coding system, nil is returned.  Otherwise the
 452 associated coding system object is returned.
 453 */
 454        (coding_system_or_name))
 455 {
 456   if (CODING_SYSTEMP (coding_system_or_name))
 457     return coding_system_or_name;
 458
 459   if (NILP (coding_system_or_name))
 460     coding_system_or_name = Qbinary;
 461   else
 462     CHECK_SYMBOL (coding_system_or_name);
 463
 464   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 465 }
 466
 467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 468 Retrieve the coding system of the given name.
 469 Same as `find-coding-system' except that if there is no such
 470 coding system, an error is signaled instead of returning nil.
 471 */
 472        (name))
 473 {
 474   Lisp_Object coding_system = Ffind_coding_system (name);
 475
 476   if (NILP (coding_system))
 477     signal_simple_error ("No such coding system", name);
 478   return coding_system;
 479 }
 480
 481 /* We store the coding systems in hash tables with the names as the key and the
 482    actual coding system object as the value.  Occasionally we need to use them
 483    in a list format.  These routines provide us with that. */
 484 struct coding_system_list_closure
 485 {
 486   Lisp_Object *coding_system_list;
 487 };
 488
 489 static int
 490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 491                                   void *coding_system_list_closure)
 492 {
 493   /* This function can GC */
 494   struct coding_system_list_closure *cscl =
 495     (struct coding_system_list_closure *) coding_system_list_closure;
 496   Lisp_Object *coding_system_list = cscl->coding_system_list;
 497
 498   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 499                                *coding_system_list);
 500   return 0;
 501 }
 502
 503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 504 Return a list of the names of all defined coding systems.
 505 */
 506        ())
 507 {
 508   Lisp_Object coding_system_list = Qnil;
 509   struct gcpro gcpro1;
 510   struct coding_system_list_closure coding_system_list_closure;
 511
 512   GCPRO1 (coding_system_list);
 513   coding_system_list_closure.coding_system_list = &coding_system_list;
 514   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 515                  &coding_system_list_closure);
 516   UNGCPRO;
 517
 518   return coding_system_list;
 519 }
 520
 521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 522 Return the name of the given coding system.
 523 */
 524        (coding_system))
 525 {
 526   coding_system = Fget_coding_system (coding_system);
 527   return XCODING_SYSTEM_NAME (coding_system);
 528 }
 529
 530 static Lisp_Coding_System *
 531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 532 {
 533   Lisp_Coding_System *codesys =
 534     alloc_lcrecord_type (Lisp_Coding_System, lrecord_coding_system);
 535
 536   zero_lcrecord (codesys);
 537   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 538   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 539   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 540   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 541   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 542   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 543   CODING_SYSTEM_TYPE     (codesys) = type;
 544   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 545 #ifdef MULE
 546   if (type == CODESYS_ISO2022)
 547     {
 548       int i;
 549       for (i = 0; i < 4; i++)
 550         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 551     }
 552   else if (type == CODESYS_CCL)
 553     {
 554       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 555       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 556     }
 557 #endif /* MULE */
 558   CODING_SYSTEM_NAME (codesys) = name;
 559
 560   return codesys;
 561 }
 562
 563 #ifdef MULE
 564 /* Given a list of charset conversion specs as specified in a Lisp
 565    program, parse it into STORE_HERE. */
 566
 567 static void
 568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 569                                 Lisp_Object spec_list)
 570 {
 571   Lisp_Object rest;
 572
 573   EXTERNAL_LIST_LOOP (rest, spec_list)
 574     {
 575       Lisp_Object car = XCAR (rest);
 576       Lisp_Object from, to;
 577       struct charset_conversion_spec spec;
 578
 579       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 580         signal_simple_error ("Invalid charset conversion spec", car);
 581       from = Fget_charset (XCAR (car));
 582       to = Fget_charset (XCAR (XCDR (car)));
 583       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 584         signal_simple_error_2
 585           ("Attempted conversion between different charset types",
 586            from, to);
 587       spec.from_charset = from;
 588       spec.to_charset = to;
 589
 590       Dynarr_add (store_here, spec);
 591     }
 592 }
 593
 594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 595    specs, return the equivalent as the Lisp programmer would see it.
 596
 597    If LOAD_HERE is 0, return Qnil. */
 598
 599 static Lisp_Object
 600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 601 {
 602   int i;
 603   Lisp_Object result;
 604
 605   if (!load_here)
 606     return Qnil;
 607   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 608     {
 609       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 610       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 611     }
 612
 613   return Fnreverse (result);
 614 }
 615
 616 #endif /* MULE */
 617
 618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 619 Register symbol NAME as a coding system.
 620
 621 TYPE describes the conversion method used and should be one of
 622
 623 nil or 'undecided
 624      Automatic conversion.  XEmacs attempts to detect the coding system
 625      used in the file.
 626 'no-conversion
 627      No conversion.  Use this for binary files and such.  On output,
 628      graphic characters that are not in ASCII or Latin-1 will be
 629      replaced by a ?. (For a no-conversion-encoded buffer, these
 630      characters will only be present if you explicitly insert them.)
 631 'shift-jis
 632      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 633 'ucs-4
 634      ISO 10646 UCS-4 encoding.
 635 'utf-8
 636      ISO 10646 UTF-8 encoding.
 637 'iso2022
 638      Any ISO2022-compliant encoding.  Among other things, this includes
 639      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 640      standard Unix encoding for Japanese and other languages), and
 641      Compound Text (the encoding used in X11).  You can specify more
 642      specific information about the conversion with the FLAGS argument.
 643 'big5
 644      Big5 (the encoding commonly used for Taiwanese).
 645 'ccl
 646      The conversion is performed using a user-written pseudo-code
 647      program.  CCL (Code Conversion Language) is the name of this
 648      pseudo-code.
 649 'internal
 650      Write out or read in the raw contents of the memory representing
 651      the buffer's text.  This is primarily useful for debugging
 652      purposes, and is only enabled when XEmacs has been compiled with
 653      DEBUG_XEMACS defined (via the --debug configure option).
 654      WARNING: Reading in a file using 'internal conversion can result
 655      in an internal inconsistency in the memory representing a
 656      buffer's text, which will produce unpredictable results and may
 657      cause XEmacs to crash.  Under normal circumstances you should
 658      never use 'internal conversion.
 659
 660 DOC-STRING is a string describing the coding system.
 661
 662 PROPS is a property list, describing the specific nature of the
 663 character set.  Recognized properties are:
 664
 665 'mnemonic
 666      String to be displayed in the modeline when this coding system is
 667      active.
 668
 669 'eol-type
 670      End-of-line conversion to be used.  It should be one of
 671
 672         nil
 673                 Automatically detect the end-of-line type (LF, CRLF,
 674                 or CR).  Also generate subsidiary coding systems named
 675                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 676                 identical to this coding system but have an EOL-TYPE
 677                 value of 'lf, 'crlf, and 'cr, respectively.
 678         'lf
 679                 The end of a line is marked externally using ASCII LF.
 680                 Since this is also the way that XEmacs represents an
 681                 end-of-line internally, specifying this option results
 682                 in no end-of-line conversion.  This is the standard
 683                 format for Unix text files.
 684         'crlf
 685                 The end of a line is marked externally using ASCII
 686                 CRLF.  This is the standard format for MS-DOS text
 687                 files.
 688         'cr
 689                 The end of a line is marked externally using ASCII CR.
 690                 This is the standard format for Macintosh text files.
 691         t
 692                 Automatically detect the end-of-line type but do not
 693                 generate subsidiary coding systems.  (This value is
 694                 converted to nil when stored internally, and
 695                 `coding-system-property' will return nil.)
 696
 697 'post-read-conversion
 698      Function called after a file has been read in, to perform the
 699      decoding.  Called with two arguments, BEG and END, denoting
 700      a region of the current buffer to be decoded.
 701
 702 'pre-write-conversion
 703      Function called before a file is written out, to perform the
 704      encoding.  Called with two arguments, BEG and END, denoting
 705      a region of the current buffer to be encoded.
 706
 707
 708 The following additional properties are recognized if TYPE is 'iso2022:
 709
 710 'charset-g0
 711 'charset-g1
 712 'charset-g2
 713 'charset-g3
 714      The character set initially designated to the G0 - G3 registers.
 715      The value should be one of
 716
 717           -- A charset object (designate that character set)
 718           -- nil (do not ever use this register)
 719           -- t (no character set is initially designated to
 720                 the register, but may be later on; this automatically
 721                 sets the corresponding `force-g*-on-output' property)
 722
 723 'force-g0-on-output
 724 'force-g1-on-output
 725 'force-g2-on-output
 726 'force-g2-on-output
 727      If non-nil, send an explicit designation sequence on output before
 728      using the specified register.
 729
 730 'short
 731      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 732      "ESC $ B" on output in place of the full designation sequences
 733      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 734
 735 'no-ascii-eol
 736      If non-nil, don't designate ASCII to G0 at each end of line on output.
 737      Setting this to non-nil also suppresses other state-resetting that
 738      normally happens at the end of a line.
 739
 740 'no-ascii-cntl
 741      If non-nil, don't designate ASCII to G0 before control chars on output.
 742
 743 'seven
 744      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 745      environment.
 746
 747 'lock-shift
 748      If non-nil, use locking-shift (SO/SI) instead of single-shift
 749      or designation by escape sequence.
 750
 751 'no-iso6429
 752      If non-nil, don't use ISO6429's direction specification.
 753
 754 'escape-quoted
 755      If non-nil, literal control characters that are the same as
 756      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 757      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 758      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 759      so that they can be properly distinguished from an escape sequence.
 760      (Note that doing this results in a non-portable encoding.) This
 761      encoding flag is used for byte-compiled files.  Note that ESC
 762      is a good choice for a quoting character because there are no
 763      escape sequences whose second byte is a character from the Control-0
 764      or Control-1 character sets; this is explicitly disallowed by the
 765      ISO2022 standard.
 766
 767 'input-charset-conversion
 768      A list of conversion specifications, specifying conversion of
 769      characters in one charset to another when decoding is performed.
 770      Each specification is a list of two elements: the source charset,
 771      and the destination charset.
 772
 773 'output-charset-conversion
 774      A list of conversion specifications, specifying conversion of
 775      characters in one charset to another when encoding is performed.
 776      The form of each specification is the same as for
 777      'input-charset-conversion.
 778
 779
 780 The following additional properties are recognized (and required)
 781 if TYPE is 'ccl:
 782
 783 'decode
 784      CCL program used for decoding (converting to internal format).
 785
 786 'encode
 787      CCL program used for encoding (converting to external format).
 788 */
 789        (name, type, doc_string, props))
 790 {
 791   Lisp_Coding_System *codesys;
 792   Lisp_Object rest, key, value;
 793   enum coding_system_type ty;
 794   int need_to_setup_eol_systems = 1;
 795
 796   /* Convert type to constant */
 797   if (NILP (type) || EQ (type, Qundecided))
 798                                       { ty = CODESYS_AUTODETECT; }
 799 #ifdef MULE
 800   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 801   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 802   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 803   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 804   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 805   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 806 #endif
 807   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 808 #ifdef DEBUG_XEMACS
 809   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 810 #endif
 811   else
 812     signal_simple_error ("Invalid coding system type", type);
 813
 814   CHECK_SYMBOL (name);
 815
 816   codesys = allocate_coding_system (ty, name);
 817
 818   if (NILP (doc_string))
 819     doc_string = build_string ("");
 820   else
 821     CHECK_STRING (doc_string);
 822   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 823
 824   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 825     {
 826       if (EQ (key, Qmnemonic))
 827         {
 828           if (!NILP (value))
 829             CHECK_STRING (value);
 830           CODING_SYSTEM_MNEMONIC (codesys) = value;
 831         }
 832
 833       else if (EQ (key, Qeol_type))
 834         {
 835           need_to_setup_eol_systems = NILP (value);
 836           if (EQ (value, Qt))
 837             value = Qnil;
 838           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 839         }
 840
 841       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 842       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 843 #ifdef MULE
 844       else if (ty == CODESYS_ISO2022)
 845         {
 846 #define FROB_INITIAL_CHARSET(charset_num) \
 847   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 848     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 849
 850           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 851           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 852           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 853           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 854
 855 #define FROB_FORCE_CHARSET(charset_num) \
 856   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 857
 858           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 859           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 860           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 861           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 862
 863 #define FROB_BOOLEAN_PROPERTY(prop) \
 864   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 865
 866           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 867           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 868           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 869           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 870           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 871           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 872           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 873
 874           else if (EQ (key, Qinput_charset_conversion))
 875             {
 876               codesys->iso2022.input_conv =
 877                 Dynarr_new (charset_conversion_spec);
 878               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 879                                               value);
 880             }
 881           else if (EQ (key, Qoutput_charset_conversion))
 882             {
 883               codesys->iso2022.output_conv =
 884                 Dynarr_new (charset_conversion_spec);
 885               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 886                                               value);
 887             }
 888           else
 889             signal_simple_error ("Unrecognized property", key);
 890         }
 891       else if (EQ (type, Qccl))
 892         {
 893           if (EQ (key, Qdecode))
 894             {
 895               CHECK_VECTOR (value);
 896               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 897             }
 898           else if (EQ (key, Qencode))
 899             {
 900               CHECK_VECTOR (value);
 901               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 902             }
 903           else
 904             signal_simple_error ("Unrecognized property", key);
 905         }
 906 #endif /* MULE */
 907       else
 908         signal_simple_error ("Unrecognized property", key);
 909     }
 910
 911   if (need_to_setup_eol_systems)
 912     setup_eol_coding_systems (codesys);
 913
 914   {
 915     Lisp_Object codesys_obj;
 916     XSETCODING_SYSTEM (codesys_obj, codesys);
 917     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 918     return codesys_obj;
 919   }
 920 }
 921
 922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 923 Copy OLD-CODING-SYSTEM to NEW-NAME.
 924 If NEW-NAME does not name an existing coding system, a new one will
 925 be created.
 926 */
 927        (old_coding_system, new_name))
 928 {
 929   Lisp_Object new_coding_system;
 930   old_coding_system = Fget_coding_system (old_coding_system);
 931   new_coding_system = Ffind_coding_system (new_name);
 932   if (NILP (new_coding_system))
 933     {
 934       XSETCODING_SYSTEM (new_coding_system,
 935                          allocate_coding_system
 936                          (XCODING_SYSTEM_TYPE (old_coding_system),
 937                           new_name));
 938       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 939     }
 940
 941   {
 942     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 943     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 944     memcpy (((char *) to  ) + sizeof (to->header),
 945             ((char *) from) + sizeof (from->header),
 946             sizeof (*from) - sizeof (from->header));
 947     to->name = new_name;
 948   }
 949   return new_coding_system;
 950 }
 951
 952 static Lisp_Object
 953 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
 954 {
 955   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
 956   Lisp_Object new_coding_system;
 957
 958   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
 959     return coding_system;
 960
 961   switch (type)
 962     {
 963     case EOL_AUTODETECT: return coding_system;
 964     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
 965     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
 966     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
 967     default:       abort ();
 968     }
 969
 970   return NILP (new_coding_system) ? coding_system : new_coding_system;
 971 }
 972
 973 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
 974 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
 975 */
 976        (coding_system, eol_type))
 977 {
 978   coding_system = Fget_coding_system (coding_system);
 979
 980   return subsidiary_coding_system (coding_system,
 981                                    symbol_to_eol_type (eol_type));
 982 }
 983
 984 \f
 985 /************************************************************************/
 986 /*                         Coding system accessors                      */
 987 /************************************************************************/
 988
 989 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
 990 Return the doc string for CODING-SYSTEM.
 991 */
 992        (coding_system))
 993 {
 994   coding_system = Fget_coding_system (coding_system);
 995   return XCODING_SYSTEM_DOC_STRING (coding_system);
 996 }
 997
 998 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
 999 Return the type of CODING-SYSTEM.
1000 */
1001        (coding_system))
1002 {
1003   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1004     {
1005     default: abort ();
1006     case CODESYS_AUTODETECT:    return Qundecided;
1007 #ifdef MULE
1008     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1009     case CODESYS_ISO2022:       return Qiso2022;
1010     case CODESYS_BIG5:          return Qbig5;
1011     case CODESYS_UCS4:          return Qucs4;
1012     case CODESYS_UTF8:          return Qutf8;
1013     case CODESYS_CCL:           return Qccl;
1014 #endif
1015     case CODESYS_NO_CONVERSION: return Qno_conversion;
1016 #ifdef DEBUG_XEMACS
1017     case CODESYS_INTERNAL:      return Qinternal;
1018 #endif
1019     }
1020 }
1021
1022 #ifdef MULE
1023 static
1024 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1025 {
1026   Lisp_Object cs
1027     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1028
1029   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1030 }
1031
1032 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1033 Return initial charset of CODING-SYSTEM designated to GNUM.
1034 GNUM allows 0 .. 3.
1035 */
1036        (coding_system, gnum))
1037 {
1038   coding_system = Fget_coding_system (coding_system);
1039   CHECK_INT (gnum);
1040
1041   return coding_system_charset (coding_system, XINT (gnum));
1042 }
1043 #endif /* MULE */
1044
1045 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1046 Return the PROP property of CODING-SYSTEM.
1047 */
1048        (coding_system, prop))
1049 {
1050   int i, ok = 0;
1051   enum coding_system_type type;
1052
1053   coding_system = Fget_coding_system (coding_system);
1054   CHECK_SYMBOL (prop);
1055   type = XCODING_SYSTEM_TYPE (coding_system);
1056
1057   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1058     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1059       {
1060         ok = 1;
1061         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1062           {
1063           case CODESYS_PROP_ALL_OK:
1064             break;
1065 #ifdef MULE
1066           case CODESYS_PROP_ISO2022:
1067             if (type != CODESYS_ISO2022)
1068               signal_simple_error
1069                 ("Property only valid in ISO2022 coding systems",
1070                  prop);
1071             break;
1072
1073           case CODESYS_PROP_CCL:
1074             if (type != CODESYS_CCL)
1075               signal_simple_error
1076                 ("Property only valid in CCL coding systems",
1077                  prop);
1078             break;
1079 #endif /* MULE */
1080           default:
1081             abort ();
1082           }
1083       }
1084
1085   if (!ok)
1086     signal_simple_error ("Unrecognized property", prop);
1087
1088   if (EQ (prop, Qname))
1089     return XCODING_SYSTEM_NAME (coding_system);
1090   else if (EQ (prop, Qtype))
1091     return Fcoding_system_type (coding_system);
1092   else if (EQ (prop, Qdoc_string))
1093     return XCODING_SYSTEM_DOC_STRING (coding_system);
1094   else if (EQ (prop, Qmnemonic))
1095     return XCODING_SYSTEM_MNEMONIC (coding_system);
1096   else if (EQ (prop, Qeol_type))
1097     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1098   else if (EQ (prop, Qeol_lf))
1099     return XCODING_SYSTEM_EOL_LF (coding_system);
1100   else if (EQ (prop, Qeol_crlf))
1101     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1102   else if (EQ (prop, Qeol_cr))
1103     return XCODING_SYSTEM_EOL_CR (coding_system);
1104   else if (EQ (prop, Qpost_read_conversion))
1105     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1106   else if (EQ (prop, Qpre_write_conversion))
1107     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1108 #ifdef MULE
1109   else if (type == CODESYS_ISO2022)
1110     {
1111       if (EQ (prop, Qcharset_g0))
1112         return coding_system_charset (coding_system, 0);
1113       else if (EQ (prop, Qcharset_g1))
1114         return coding_system_charset (coding_system, 1);
1115       else if (EQ (prop, Qcharset_g2))
1116         return coding_system_charset (coding_system, 2);
1117       else if (EQ (prop, Qcharset_g3))
1118         return coding_system_charset (coding_system, 3);
1119
1120 #define FORCE_CHARSET(charset_num) \
1121   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1122    (coding_system, charset_num) ? Qt : Qnil)
1123
1124       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1125       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1126       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1127       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1128
1129 #define LISP_BOOLEAN(prop) \
1130   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1131
1132       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1133       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1134       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1135       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1136       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1137       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1138       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1139
1140       else if (EQ (prop, Qinput_charset_conversion))
1141         return
1142           unparse_charset_conversion_specs
1143             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1144       else if (EQ (prop, Qoutput_charset_conversion))
1145         return
1146           unparse_charset_conversion_specs
1147             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1148       else
1149         abort ();
1150     }
1151   else if (type == CODESYS_CCL)
1152     {
1153       if (EQ (prop, Qdecode))
1154         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1155       else if (EQ (prop, Qencode))
1156         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1157       else
1158         abort ();
1159     }
1160 #endif /* MULE */
1161   else
1162     abort ();
1163
1164   return Qnil; /* not reached */
1165 }
1166
1167 \f
1168 /************************************************************************/
1169 /*                       Coding category functions                      */
1170 /************************************************************************/
1171
1172 static int
1173 decode_coding_category (Lisp_Object symbol)
1174 {
1175   int i;
1176
1177   CHECK_SYMBOL (symbol);
1178   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1179     if (EQ (coding_category_symbol[i], symbol))
1180       return i;
1181
1182   signal_simple_error ("Unrecognized coding category", symbol);
1183   return 0; /* not reached */
1184 }
1185
1186 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1187 Return a list of all recognized coding categories.
1188 */
1189        ())
1190 {
1191   int i;
1192   Lisp_Object list = Qnil;
1193
1194   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1195     list = Fcons (coding_category_symbol[i], list);
1196   return list;
1197 }
1198
1199 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1200 Change the priority order of the coding categories.
1201 LIST should be list of coding categories, in descending order of
1202 priority.  Unspecified coding categories will be lower in priority
1203 than all specified ones, in the same relative order they were in
1204 previously.
1205 */
1206        (list))
1207 {
1208   int category_to_priority[CODING_CATEGORY_LAST + 1];
1209   int i, j;
1210   Lisp_Object rest;
1211
1212   /* First generate a list that maps coding categories to priorities. */
1213
1214   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1215     category_to_priority[i] = -1;
1216
1217   /* Highest priority comes from the specified list. */
1218   i = 0;
1219   EXTERNAL_LIST_LOOP (rest, list)
1220     {
1221       int cat = decode_coding_category (XCAR (rest));
1222
1223       if (category_to_priority[cat] >= 0)
1224         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1225       category_to_priority[cat] = i++;
1226     }
1227
1228   /* Now go through the existing categories by priority to retrieve
1229      the categories not yet specified and preserve their priority
1230      order. */
1231   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1232     {
1233       int cat = coding_category_by_priority[j];
1234       if (category_to_priority[cat] < 0)
1235         category_to_priority[cat] = i++;
1236     }
1237
1238   /* Now we need to construct the inverse of the mapping we just
1239      constructed. */
1240
1241   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1242     coding_category_by_priority[category_to_priority[i]] = i;
1243
1244   /* Phew!  That was confusing. */
1245   return Qnil;
1246 }
1247
1248 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1249 Return a list of coding categories in descending order of priority.
1250 */
1251        ())
1252 {
1253   int i;
1254   Lisp_Object list = Qnil;
1255
1256   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1257     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1258                   list);
1259   return list;
1260 }
1261
1262 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1263 Change the coding system associated with a coding category.
1264 */
1265        (coding_category, coding_system))
1266 {
1267   int cat = decode_coding_category (coding_category);
1268
1269   coding_system = Fget_coding_system (coding_system);
1270   coding_category_system[cat] = coding_system;
1271   return Qnil;
1272 }
1273
1274 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1275 Return the coding system associated with a coding category.
1276 */
1277        (coding_category))
1278 {
1279   int cat = decode_coding_category (coding_category);
1280   Lisp_Object sys = coding_category_system[cat];
1281
1282   if (!NILP (sys))
1283     return XCODING_SYSTEM_NAME (sys);
1284   return Qnil;
1285 }
1286
1287 \f
1288 /************************************************************************/
1289 /*                     Detecting the encoding of data                   */
1290 /************************************************************************/
1291
1292 struct detection_state
1293 {
1294   enum eol_type eol_type;
1295   int seen_non_ascii;
1296   int mask;
1297 #ifdef MULE
1298   struct
1299     {
1300       int mask;
1301       int in_second_byte;
1302     }
1303   big5;
1304
1305   struct
1306     {
1307       int mask;
1308       int in_second_byte;
1309     }
1310   shift_jis;
1311
1312   struct
1313     {
1314       int mask;
1315       int in_byte;
1316   }
1317   ucs4;
1318
1319   struct
1320     {
1321       int mask;
1322       int in_byte;
1323     }
1324   utf8;
1325
1326   struct
1327     {
1328       int mask;
1329       int initted;
1330       struct iso2022_decoder iso;
1331       unsigned int flags;
1332       int high_byte_count;
1333       unsigned int saw_single_shift:1;
1334     }
1335   iso2022;
1336 #endif
1337   struct
1338     {
1339       int seen_anything;
1340       int just_saw_cr;
1341     }
1342   eol;
1343 };
1344
1345 static int
1346 acceptable_control_char_p (int c)
1347 {
1348   switch (c)
1349     {
1350       /* Allow and ignore control characters that you might
1351          reasonably see in a text file */
1352     case '\r':
1353     case '\n':
1354     case '\t':
1355     case  7: /* bell */
1356     case  8: /* backspace */
1357     case 11: /* vertical tab */
1358     case 12: /* form feed */
1359     case 26: /* MS-DOS C-z junk */
1360     case 31: /* '^_' -- for info */
1361       return 1;
1362     default:
1363       return 0;
1364     }
1365 }
1366
1367 static int
1368 mask_has_at_most_one_bit_p (int mask)
1369 {
1370   /* Perhaps the only thing useful you learn from intensive Microsoft
1371      technical interviews */
1372   return (mask & (mask - 1)) == 0;
1373 }
1374
1375 static enum eol_type
1376 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1377                  unsigned int n)
1378 {
1379   int c;
1380
1381   while (n--)
1382     {
1383       c = *src++;
1384       if (c == '\r')
1385         st->eol.just_saw_cr = 1;
1386       else
1387         {
1388           if (c == '\n')
1389             {
1390               if (st->eol.just_saw_cr)
1391                 return EOL_CRLF;
1392               else if (st->eol.seen_anything)
1393                 return EOL_LF;
1394             }
1395           else if (st->eol.just_saw_cr)
1396             return EOL_CR;
1397           st->eol.just_saw_cr = 0;
1398         }
1399       st->eol.seen_anything = 1;
1400     }
1401
1402   return EOL_AUTODETECT;
1403 }
1404
1405 /* Attempt to determine the encoding and EOL type of the given text.
1406    Before calling this function for the first type, you must initialize
1407    st->eol_type as appropriate and initialize st->mask to ~0.
1408
1409    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1410    not yet known.
1411
1412    st->mask holds the determined coding category mask, or ~0 if only
1413    ASCII has been seen so far.
1414
1415    Returns:
1416
1417    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1418         is present in st->mask
1419    1 == definitive answers are here for both st->eol_type and st->mask
1420 */
1421
1422 static int
1423 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1424                     unsigned int n, int just_do_eol)
1425 {
1426   int c;
1427
1428   if (st->eol_type == EOL_AUTODETECT)
1429     st->eol_type = detect_eol_type (st, src, n);
1430
1431   if (just_do_eol)
1432     return st->eol_type != EOL_AUTODETECT;
1433
1434   if (!st->seen_non_ascii)
1435     {
1436       for (; n; n--, src++)
1437         {
1438           c = *src;
1439           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1440             {
1441               st->seen_non_ascii = 1;
1442 #ifdef MULE
1443               st->shift_jis.mask = ~0;
1444               st->big5.mask = ~0;
1445               st->ucs4.mask = ~0;
1446               st->utf8.mask = ~0;
1447               st->iso2022.mask = ~0;
1448 #endif
1449               break;
1450             }
1451         }
1452     }
1453
1454   if (!n)
1455     return 0;
1456 #ifdef MULE
1457   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1458     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1459   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1460     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1461   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1462     st->big5.mask = detect_coding_big5 (st, src, n);
1463   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1464     st->utf8.mask = detect_coding_utf8 (st, src, n);
1465   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1466     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1467
1468   st->mask
1469     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1470     | st->utf8.mask | st->ucs4.mask;
1471 #endif
1472   {
1473     int retval = mask_has_at_most_one_bit_p (st->mask);
1474     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1475     return retval && st->eol_type != EOL_AUTODETECT;
1476   }
1477 }
1478
1479 static Lisp_Object
1480 coding_system_from_mask (int mask)
1481 {
1482   if (mask == ~0)
1483     {
1484       /* If the file was entirely or basically ASCII, use the
1485          default value of `buffer-file-coding-system'. */
1486       Lisp_Object retval =
1487         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1488       if (!NILP (retval))
1489         {
1490           retval = Ffind_coding_system (retval);
1491           if (NILP (retval))
1492             {
1493               warn_when_safe
1494                 (Qbad_variable, Qwarning,
1495                  "Invalid `default-buffer-file-coding-system', set to nil");
1496               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1497             }
1498         }
1499       if (NILP (retval))
1500         retval = Fget_coding_system (Qno_conversion);
1501       return retval;
1502     }
1503   else
1504     {
1505       int i;
1506       int cat = -1;
1507 #ifdef MULE
1508       mask = postprocess_iso2022_mask (mask);
1509 #endif
1510       /* Look through the coding categories by priority and find
1511          the first one that is allowed. */
1512       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1513         {
1514           cat = coding_category_by_priority[i];
1515           if ((mask & (1 << cat)) &&
1516               !NILP (coding_category_system[cat]))
1517             break;
1518         }
1519       if (cat >= 0)
1520         return coding_category_system[cat];
1521       else
1522         return Fget_coding_system (Qno_conversion);
1523     }
1524 }
1525
1526 /* Given a seekable read stream and potential coding system and EOL type
1527    as specified, do any autodetection that is called for.  If the
1528    coding system and/or EOL type are not autodetect, they will be left
1529    alone; but this function will never return an autodetect coding system
1530    or EOL type.
1531
1532    This function does not automatically fetch subsidiary coding systems;
1533    that should be unnecessary with the explicit eol-type argument. */
1534
1535 void
1536 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1537                               enum eol_type *eol_type_in_out)
1538 {
1539   struct detection_state decst;
1540
1541   if (*eol_type_in_out == EOL_AUTODETECT)
1542     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1543
1544   xzero (decst);
1545   decst.eol_type = *eol_type_in_out;
1546   decst.mask = ~0;
1547
1548   /* If autodetection is called for, do it now. */
1549   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1550       *eol_type_in_out == EOL_AUTODETECT)
1551     {
1552
1553       while (1)
1554         {
1555           unsigned char random_buffer[4096];
1556           int nread;
1557
1558           nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1559           if (!nread)
1560             break;
1561           if (detect_coding_type (&decst, random_buffer, nread,
1562                                   XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1563                                   CODESYS_AUTODETECT))
1564             break;
1565         }
1566
1567       *eol_type_in_out = decst.eol_type;
1568       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1569         *codesys_in_out = coding_system_from_mask (decst.mask);
1570     }
1571
1572   /* If we absolutely can't determine the EOL type, just assume LF. */
1573   if (*eol_type_in_out == EOL_AUTODETECT)
1574     *eol_type_in_out = EOL_LF;
1575
1576   Lstream_rewind (stream);
1577 }
1578
1579 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1580 Detect coding system of the text in the region between START and END.
1581 Returned a list of possible coding systems ordered by priority.
1582 If only ASCII characters are found, it returns 'undecided or one of
1583 its subsidiary coding systems according to a detected end-of-line
1584 type.  Optional arg BUFFER defaults to the current buffer.
1585 */
1586        (start, end, buffer))
1587 {
1588   Lisp_Object val = Qnil;
1589   struct buffer *buf = decode_buffer (buffer, 0);
1590   Bufpos b, e;
1591   Lisp_Object instream, lb_instream;
1592   Lstream *istr, *lb_istr;
1593   struct detection_state decst;
1594   struct gcpro gcpro1, gcpro2;
1595
1596   get_buffer_range_char (buf, start, end, &b, &e, 0);
1597   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1598   lb_istr = XLSTREAM (lb_instream);
1599   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1600   istr = XLSTREAM (instream);
1601   GCPRO2 (instream, lb_instream);
1602   xzero (decst);
1603   decst.eol_type = EOL_AUTODETECT;
1604   decst.mask = ~0;
1605   while (1)
1606     {
1607       unsigned char random_buffer[4096];
1608       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1609
1610       if (!nread)
1611         break;
1612       if (detect_coding_type (&decst, random_buffer, nread, 0))
1613         break;
1614     }
1615
1616   if (decst.mask == ~0)
1617     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1618                                     decst.eol_type);
1619   else
1620     {
1621       int i;
1622
1623       val = Qnil;
1624 #ifdef MULE
1625       decst.mask = postprocess_iso2022_mask (decst.mask);
1626 #endif
1627       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1628         {
1629           int sys = coding_category_by_priority[i];
1630           if (decst.mask & (1 << sys))
1631             {
1632               Lisp_Object codesys = coding_category_system[sys];
1633               if (!NILP (codesys))
1634                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1635               val = Fcons (codesys, val);
1636             }
1637         }
1638     }
1639   Lstream_close (istr);
1640   UNGCPRO;
1641   Lstream_delete (istr);
1642   Lstream_delete (lb_istr);
1643   return val;
1644 }
1645
1646 \f
1647 /************************************************************************/
1648 /*           Converting to internal Mule format ("decoding")            */
1649 /************************************************************************/
1650
1651 /* A decoding stream is a stream used for decoding text (i.e.
1652    converting from some external format to internal format).
1653    The decoding-stream object keeps track of the actual coding
1654    stream, the stream that is at the other end, and data that
1655    needs to be persistent across the lifetime of the stream. */
1656
1657 /* Handle the EOL stuff related to just-read-in character C.
1658    EOL_TYPE is the EOL type of the coding stream.
1659    FLAGS is the current value of FLAGS in the coding stream, and may
1660    be modified by this macro.  (The macro only looks at the
1661    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1662    bytes are to be written.  You need to also define a local goto
1663    label "label_continue_loop" that is at the end of the main
1664    character-reading loop.
1665
1666    If C is a CR character, then this macro handles it entirely and
1667    jumps to label_continue_loop.  Otherwise, this macro does not add
1668    anything to DST, and continues normally.  You should continue
1669    processing C normally after this macro. */
1670
1671 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1672 do {                                                            \
1673   if (c == '\r')                                                \
1674     {                                                           \
1675       if (eol_type == EOL_CR)                                   \
1676         Dynarr_add (dst, '\n');                                 \
1677       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1678         Dynarr_add (dst, c);                                    \
1679       else                                                      \
1680         flags |= CODING_STATE_CR;                               \
1681       goto label_continue_loop;                                 \
1682     }                                                           \
1683   else if (flags & CODING_STATE_CR)                             \
1684     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1685       if (c != '\n')                                            \
1686         Dynarr_add (dst, '\r');                                 \
1687       flags &= ~CODING_STATE_CR;                                \
1688     }                                                           \
1689 } while (0)
1690
1691 /* C should be a binary character in the range 0 - 255; convert
1692    to internal format and add to Dynarr DST. */
1693
1694 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1695 do {                                            \
1696   if (BYTE_ASCII_P (c))                         \
1697     Dynarr_add (dst, c);                        \
1698   else if (BYTE_C1_P (c))                       \
1699     {                                           \
1700       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1701       Dynarr_add (dst, c + 0x20);               \
1702     }                                           \
1703   else                                          \
1704     {                                           \
1705       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1706       Dynarr_add (dst, c);                      \
1707     }                                           \
1708 } while (0)
1709
1710 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1711 do {                                    \
1712   if (ch)                               \
1713     {                                   \
1714       DECODE_ADD_BINARY_CHAR (ch, dst); \
1715       ch = 0;                           \
1716     }                                   \
1717 } while (0)
1718
1719 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1720 do {                                    \
1721   DECODE_OUTPUT_PARTIAL_CHAR (ch);      \
1722   if ((flags & CODING_STATE_END) &&     \
1723       (flags & CODING_STATE_CR))        \
1724     Dynarr_add (dst, '\r');             \
1725 } while (0)
1726
1727 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1728
1729 struct decoding_stream
1730 {
1731   /* Coding system that governs the conversion. */
1732   Lisp_Coding_System *codesys;
1733
1734   /* Stream that we read the encoded data from or
1735      write the decoded data to. */
1736   Lstream *other_end;
1737
1738   /* If we are reading, then we can return only a fixed amount of
1739      data, so if the conversion resulted in too much data, we store it
1740      here for retrieval the next time around. */
1741   unsigned_char_dynarr *runoff;
1742
1743   /* FLAGS holds flags indicating the current state of the decoding.
1744      Some of these flags are dependent on the coding system. */
1745   unsigned int flags;
1746
1747   /* CH holds a partially built-up character.  Since we only deal
1748      with one- and two-byte characters at the moment, we only use
1749      this to store the first byte of a two-byte character. */
1750   unsigned int ch;
1751
1752   /* EOL_TYPE specifies the type of end-of-line conversion that
1753      currently applies.  We need to keep this separate from the
1754      EOL type stored in CODESYS because the latter might indicate
1755      automatic EOL-type detection while the former will always
1756      indicate a particular EOL type. */
1757   enum eol_type eol_type;
1758 #ifdef MULE
1759   /* Additional ISO2022 information.  We define the structure above
1760      because it's also needed by the detection routines. */
1761   struct iso2022_decoder iso2022;
1762
1763   /* Additional information (the state of the running CCL program)
1764      used by the CCL decoder. */
1765   struct ccl_program ccl;
1766 #endif
1767   struct detection_state decst;
1768 };
1769
1770 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1771 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1772 static int decoding_rewinder   (Lstream *stream);
1773 static int decoding_seekable_p (Lstream *stream);
1774 static int decoding_flusher    (Lstream *stream);
1775 static int decoding_closer     (Lstream *stream);
1776
1777 static Lisp_Object decoding_marker (Lisp_Object stream,
1778                                     void (*markobj) (Lisp_Object));
1779
1780 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1781                                sizeof (struct decoding_stream));
1782
1783 static Lisp_Object
1784 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1785 {
1786   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1787   Lisp_Object str_obj;
1788
1789   /* We do not need to mark the coding systems or charsets stored
1790      within the stream because they are stored in a global list
1791      and automatically marked. */
1792
1793   XSETLSTREAM (str_obj, str);
1794   markobj (str_obj);
1795   if (str->imp->marker)
1796     return (str->imp->marker) (str_obj, markobj);
1797   else
1798     return Qnil;
1799 }
1800
1801 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1802    so we read data from the other end, decode it, and store it into DATA. */
1803
1804 static int
1805 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1806 {
1807   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1808   unsigned char *orig_data = data;
1809   int read_size;
1810   int error_occurred = 0;
1811
1812   /* We need to interface to mule_decode(), which expects to take some
1813      amount of data and store the result into a Dynarr.  We have
1814      mule_decode() store into str->runoff, and take data from there
1815      as necessary. */
1816
1817   /* We loop until we have enough data, reading chunks from the other
1818      end and decoding it. */
1819   while (1)
1820     {
1821       /* Take data from the runoff if we can.  Make sure to take at
1822          most SIZE bytes, and delete the data from the runoff. */
1823       if (Dynarr_length (str->runoff) > 0)
1824         {
1825           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1826           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1827           Dynarr_delete_many (str->runoff, 0, chunk);
1828           data += chunk;
1829           size -= chunk;
1830         }
1831
1832       if (size == 0)
1833         break; /* No more room for data */
1834
1835       if (str->flags & CODING_STATE_END)
1836         /* This means that on the previous iteration, we hit the EOF on
1837            the other end.  We loop once more so that mule_decode() can
1838            output any final stuff it may be holding, or any "go back
1839            to a sane state" escape sequences. (This latter makes sense
1840            during encoding.) */
1841         break;
1842
1843       /* Exhausted the runoff, so get some more.  DATA has at least
1844          SIZE bytes left of storage in it, so it's OK to read directly
1845          into it.  (We'll be overwriting above, after we've decoded it
1846          into the runoff.) */
1847       read_size = Lstream_read (str->other_end, data, size);
1848       if (read_size < 0)
1849         {
1850           error_occurred = 1;
1851           break;
1852         }
1853       if (read_size == 0)
1854         /* There might be some more end data produced in the translation.
1855            See the comment above. */
1856         str->flags |= CODING_STATE_END;
1857       mule_decode (stream, data, str->runoff, read_size);
1858     }
1859
1860   if (data - orig_data == 0)
1861     return error_occurred ? -1 : 0;
1862   else
1863     return data - orig_data;
1864 }
1865
1866 static int
1867 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1868 {
1869   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1870   int retval;
1871
1872   /* Decode all our data into the runoff, and then attempt to write
1873      it all out to the other end.  Remove whatever chunk we succeeded
1874      in writing. */
1875   mule_decode (stream, data, str->runoff, size);
1876   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1877                           Dynarr_length (str->runoff));
1878   if (retval > 0)
1879     Dynarr_delete_many (str->runoff, 0, retval);
1880   /* Do NOT return retval.  The return value indicates how much
1881      of the incoming data was written, not how many bytes were
1882      written. */
1883   return size;
1884 }
1885
1886 static void
1887 reset_decoding_stream (struct decoding_stream *str)
1888 {
1889 #ifdef MULE
1890   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1891     {
1892       Lisp_Object coding_system;
1893       XSETCODING_SYSTEM (coding_system, str->codesys);
1894       reset_iso2022 (coding_system, &str->iso2022);
1895     }
1896   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1897     {
1898       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1899     }
1900 #endif /* MULE */
1901   str->flags = str->ch = 0;
1902 }
1903
1904 static int
1905 decoding_rewinder (Lstream *stream)
1906 {
1907   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1908   reset_decoding_stream (str);
1909   Dynarr_reset (str->runoff);
1910   return Lstream_rewind (str->other_end);
1911 }
1912
1913 static int
1914 decoding_seekable_p (Lstream *stream)
1915 {
1916   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1917   return Lstream_seekable_p (str->other_end);
1918 }
1919
1920 static int
1921 decoding_flusher (Lstream *stream)
1922 {
1923   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1924   return Lstream_flush (str->other_end);
1925 }
1926
1927 static int
1928 decoding_closer (Lstream *stream)
1929 {
1930   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1931   if (stream->flags & LSTREAM_FL_WRITE)
1932     {
1933       str->flags |= CODING_STATE_END;
1934       decoding_writer (stream, 0, 0);
1935     }
1936   Dynarr_free (str->runoff);
1937 #ifdef MULE
1938 #ifdef ENABLE_COMPOSITE_CHARS
1939   if (str->iso2022.composite_chars)
1940     Dynarr_free (str->iso2022.composite_chars);
1941 #endif
1942 #endif
1943   return Lstream_close (str->other_end);
1944 }
1945
1946 Lisp_Object
1947 decoding_stream_coding_system (Lstream *stream)
1948 {
1949   Lisp_Object coding_system;
1950   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1951
1952   XSETCODING_SYSTEM (coding_system, str->codesys);
1953   return subsidiary_coding_system (coding_system, str->eol_type);
1954 }
1955
1956 void
1957 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1958 {
1959   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1960   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1961   str->codesys = cs;
1962   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1963     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1964   reset_decoding_stream (str);
1965 }
1966
1967 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
1968    stream for writing, no automatic code detection will be performed.
1969    The reason for this is that automatic code detection requires a
1970    seekable input.  Things will also fail if you open a decoding
1971    stream for reading using a non-fully-specified coding system and
1972    a non-seekable input stream. */
1973
1974 static Lisp_Object
1975 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1976                         CONST char *mode)
1977 {
1978   Lstream *lstr = Lstream_new (lstream_decoding, mode);
1979   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1980   Lisp_Object obj;
1981
1982   xzero (*str);
1983   str->other_end = stream;
1984   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1985   str->eol_type = EOL_AUTODETECT;
1986   if (!strcmp (mode, "r")
1987       && Lstream_seekable_p (stream))
1988     /* We can determine the coding system now. */
1989     determine_real_coding_system (stream, &codesys, &str->eol_type);
1990   set_decoding_stream_coding_system (lstr, codesys);
1991   str->decst.eol_type = str->eol_type;
1992   str->decst.mask = ~0;
1993   XSETLSTREAM (obj, lstr);
1994   return obj;
1995 }
1996
1997 Lisp_Object
1998 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
1999 {
2000   return make_decoding_stream_1 (stream, codesys, "r");
2001 }
2002
2003 Lisp_Object
2004 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2005 {
2006   return make_decoding_stream_1 (stream, codesys, "w");
2007 }
2008
2009 /* Note: the decode_coding_* functions all take the same
2010    arguments as mule_decode(), which is to say some SRC data of
2011    size N, which is to be stored into dynamic array DST.
2012    DECODING is the stream within which the decoding is
2013    taking place, but no data is actually read from or
2014    written to that stream; that is handled in decoding_reader()
2015    or decoding_writer().  This allows the same functions to
2016    be used for both reading and writing. */
2017
2018 static void
2019 mule_decode (Lstream *decoding, CONST unsigned char *src,
2020              unsigned_char_dynarr *dst, unsigned int n)
2021 {
2022   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2023
2024   /* If necessary, do encoding-detection now.  We do this when
2025      we're a writing stream or a non-seekable reading stream,
2026      meaning that we can't just process the whole input,
2027      rewind, and start over. */
2028
2029   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2030       str->eol_type == EOL_AUTODETECT)
2031     {
2032       Lisp_Object codesys;
2033
2034       XSETCODING_SYSTEM (codesys, str->codesys);
2035       detect_coding_type (&str->decst, src, n,
2036                           CODING_SYSTEM_TYPE (str->codesys) !=
2037                           CODESYS_AUTODETECT);
2038       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2039           str->decst.mask != ~0)
2040         /* #### This is cheesy.  What we really ought to do is
2041            buffer up a certain amount of data so as to get a
2042            less random result. */
2043         codesys = coding_system_from_mask (str->decst.mask);
2044       str->eol_type = str->decst.eol_type;
2045       if (XCODING_SYSTEM (codesys) != str->codesys)
2046         {
2047           /* Preserve the CODING_STATE_END flag in case it was set.
2048              If we erase it, bad things might happen. */
2049           int was_end = str->flags & CODING_STATE_END;
2050           set_decoding_stream_coding_system (decoding, codesys);
2051           if (was_end)
2052             str->flags |= CODING_STATE_END;
2053         }
2054     }
2055
2056   switch (CODING_SYSTEM_TYPE (str->codesys))
2057     {
2058 #ifdef DEBUG_XEMACS
2059     case CODESYS_INTERNAL:
2060       Dynarr_add_many (dst, src, n);
2061       break;
2062 #endif
2063     case CODESYS_AUTODETECT:
2064       /* If we got this far and still haven't decided on the coding
2065          system, then do no conversion. */
2066     case CODESYS_NO_CONVERSION:
2067       decode_coding_no_conversion (decoding, src, dst, n);
2068       break;
2069 #ifdef MULE
2070     case CODESYS_SHIFT_JIS:
2071       decode_coding_sjis (decoding, src, dst, n);
2072       break;
2073     case CODESYS_BIG5:
2074       decode_coding_big5 (decoding, src, dst, n);
2075       break;
2076     case CODESYS_UCS4:
2077       decode_coding_ucs4 (decoding, src, dst, n);
2078       break;
2079     case CODESYS_UTF8:
2080       decode_coding_utf8 (decoding, src, dst, n);
2081       break;
2082     case CODESYS_CCL:
2083       ccl_driver (&str->ccl, src, dst, n, 0);
2084       break;
2085     case CODESYS_ISO2022:
2086       decode_coding_iso2022 (decoding, src, dst, n);
2087       break;
2088 #endif /* MULE */
2089     default:
2090       abort ();
2091     }
2092 }
2093
2094 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2095 Decode the text between START and END which is encoded in CODING-SYSTEM.
2096 This is useful if you've read in encoded text from a file without decoding
2097 it (e.g. you read in a JIS-formatted file but used the `binary' or
2098 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2099 Return length of decoded text.
2100 BUFFER defaults to the current buffer if unspecified.
2101 */
2102        (start, end, coding_system, buffer))
2103 {
2104   Bufpos b, e;
2105   struct buffer *buf = decode_buffer (buffer, 0);
2106   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2107   Lstream *istr, *ostr;
2108   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2109
2110   get_buffer_range_char (buf, start, end, &b, &e, 0);
2111
2112   barf_if_buffer_read_only (buf, b, e);
2113
2114   coding_system = Fget_coding_system (coding_system);
2115   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2116   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2117   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2118                                               coding_system);
2119   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2120                                            Fget_coding_system (Qbinary));
2121   istr = XLSTREAM (instream);
2122   ostr = XLSTREAM (outstream);
2123   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2124
2125   /* The chain of streams looks like this:
2126
2127      [BUFFER] <----- send through
2128                      ------> [ENCODE AS BINARY]
2129                              ------> [DECODE AS SPECIFIED]
2130                                      ------> [BUFFER]
2131    */
2132
2133   while (1)
2134     {
2135       char tempbuf[1024]; /* some random amount */
2136       Bufpos newpos, even_newer_pos;
2137       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2138       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2139
2140       if (!size_in_bytes)
2141         break;
2142       newpos = lisp_buffer_stream_startpos (istr);
2143       Lstream_write (ostr, tempbuf, size_in_bytes);
2144       even_newer_pos = lisp_buffer_stream_startpos (istr);
2145       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2146                            even_newer_pos, 0);
2147     }
2148   Lstream_close (istr);
2149   Lstream_close (ostr);
2150   UNGCPRO;
2151   Lstream_delete (istr);
2152   Lstream_delete (ostr);
2153   Lstream_delete (XLSTREAM (de_outstream));
2154   Lstream_delete (XLSTREAM (lb_outstream));
2155   return Qnil;
2156 }
2157
2158 \f
2159 /************************************************************************/
2160 /*           Converting to an external encoding ("encoding")            */
2161 /************************************************************************/
2162
2163 /* An encoding stream is an output stream.  When you create the
2164    stream, you specify the coding system that governs the encoding
2165    and another stream that the resulting encoded data is to be
2166    sent to, and then start sending data to it. */
2167
2168 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2169
2170 struct encoding_stream
2171 {
2172   /* Coding system that governs the conversion. */
2173   Lisp_Coding_System *codesys;
2174
2175   /* Stream that we read the encoded data from or
2176      write the decoded data to. */
2177   Lstream *other_end;
2178
2179   /* If we are reading, then we can return only a fixed amount of
2180      data, so if the conversion resulted in too much data, we store it
2181      here for retrieval the next time around. */
2182   unsigned_char_dynarr *runoff;
2183
2184   /* FLAGS holds flags indicating the current state of the encoding.
2185      Some of these flags are dependent on the coding system. */
2186   unsigned int flags;
2187
2188   /* CH holds a partially built-up character.  Since we only deal
2189      with one- and two-byte characters at the moment, we only use
2190      this to store the first byte of a two-byte character. */
2191   unsigned int ch;
2192 #ifdef MULE
2193   /* Additional information used by the ISO2022 encoder. */
2194   struct
2195     {
2196       /* CHARSET holds the character sets currently assigned to the G0
2197          through G3 registers.  It is initialized from the array
2198          INITIAL_CHARSET in CODESYS. */
2199       Lisp_Object charset[4];
2200
2201       /* Which registers are currently invoked into the left (GL) and
2202          right (GR) halves of the 8-bit encoding space? */
2203       int register_left, register_right;
2204
2205       /* Whether we need to explicitly designate the charset in the
2206          G? register before using it.  It is initialized from the
2207          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2208       unsigned char force_charset_on_output[4];
2209
2210       /* Other state variables that need to be preserved across
2211          invocations. */
2212       Lisp_Object current_charset;
2213       int current_half;
2214       int current_char_boundary;
2215     } iso2022;
2216
2217   /* Additional information (the state of the running CCL program)
2218      used by the CCL encoder. */
2219   struct ccl_program ccl;
2220 #endif /* MULE */
2221 };
2222
2223 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2224 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2225                             size_t size);
2226 static int encoding_rewinder   (Lstream *stream);
2227 static int encoding_seekable_p (Lstream *stream);
2228 static int encoding_flusher    (Lstream *stream);
2229 static int encoding_closer     (Lstream *stream);
2230
2231 static Lisp_Object encoding_marker (Lisp_Object stream,
2232                                     void (*markobj) (Lisp_Object));
2233
2234 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2235                                sizeof (struct encoding_stream));
2236
2237 static Lisp_Object
2238 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2239 {
2240   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2241   Lisp_Object str_obj;
2242
2243   /* We do not need to mark the coding systems or charsets stored
2244      within the stream because they are stored in a global list
2245      and automatically marked. */
2246
2247   XSETLSTREAM (str_obj, str);
2248   markobj (str_obj);
2249   if (str->imp->marker)
2250     return (str->imp->marker) (str_obj, markobj);
2251   else
2252     return Qnil;
2253 }
2254
2255 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2256    so we read data from the other end, encode it, and store it into DATA. */
2257
2258 static int
2259 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2260 {
2261   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2262   unsigned char *orig_data = data;
2263   int read_size;
2264   int error_occurred = 0;
2265
2266   /* We need to interface to mule_encode(), which expects to take some
2267      amount of data and store the result into a Dynarr.  We have
2268      mule_encode() store into str->runoff, and take data from there
2269      as necessary. */
2270
2271   /* We loop until we have enough data, reading chunks from the other
2272      end and encoding it. */
2273   while (1)
2274     {
2275       /* Take data from the runoff if we can.  Make sure to take at
2276          most SIZE bytes, and delete the data from the runoff. */
2277       if (Dynarr_length (str->runoff) > 0)
2278         {
2279           int chunk = min ((int) size, Dynarr_length (str->runoff));
2280           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2281           Dynarr_delete_many (str->runoff, 0, chunk);
2282           data += chunk;
2283           size -= chunk;
2284         }
2285
2286       if (size == 0)
2287         break; /* No more room for data */
2288
2289       if (str->flags & CODING_STATE_END)
2290         /* This means that on the previous iteration, we hit the EOF on
2291            the other end.  We loop once more so that mule_encode() can
2292            output any final stuff it may be holding, or any "go back
2293            to a sane state" escape sequences. (This latter makes sense
2294            during encoding.) */
2295         break;
2296
2297       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2298          left of storage in it, so it's OK to read directly into it.
2299          (We'll be overwriting above, after we've encoded it into the
2300          runoff.) */
2301       read_size = Lstream_read (str->other_end, data, size);
2302       if (read_size < 0)
2303         {
2304           error_occurred = 1;
2305           break;
2306         }
2307       if (read_size == 0)
2308         /* There might be some more end data produced in the translation.
2309            See the comment above. */
2310         str->flags |= CODING_STATE_END;
2311       mule_encode (stream, data, str->runoff, read_size);
2312     }
2313
2314   if (data == orig_data)
2315     return error_occurred ? -1 : 0;
2316   else
2317     return data - orig_data;
2318 }
2319
2320 static int
2321 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2322 {
2323   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2324   int retval;
2325
2326   /* Encode all our data into the runoff, and then attempt to write
2327      it all out to the other end.  Remove whatever chunk we succeeded
2328      in writing. */
2329   mule_encode (stream, data, str->runoff, size);
2330   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2331                           Dynarr_length (str->runoff));
2332   if (retval > 0)
2333     Dynarr_delete_many (str->runoff, 0, retval);
2334   /* Do NOT return retval.  The return value indicates how much
2335      of the incoming data was written, not how many bytes were
2336      written. */
2337   return size;
2338 }
2339
2340 static void
2341 reset_encoding_stream (struct encoding_stream *str)
2342 {
2343 #ifdef MULE
2344   switch (CODING_SYSTEM_TYPE (str->codesys))
2345     {
2346     case CODESYS_ISO2022:
2347       {
2348         int i;
2349
2350         for (i = 0; i < 4; i++)
2351           {
2352             str->iso2022.charset[i] =
2353               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2354             str->iso2022.force_charset_on_output[i] =
2355               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2356           }
2357         str->iso2022.register_left = 0;
2358         str->iso2022.register_right = 1;
2359         str->iso2022.current_charset = Qnil;
2360         str->iso2022.current_half = 0;
2361         str->iso2022.current_char_boundary = 1;
2362         break;
2363       }
2364     case CODESYS_CCL:
2365       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2366       break;
2367     default:
2368       break;
2369     }
2370 #endif /* MULE */
2371
2372   str->flags = str->ch = 0;
2373 }
2374
2375 static int
2376 encoding_rewinder (Lstream *stream)
2377 {
2378   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2379   reset_encoding_stream (str);
2380   Dynarr_reset (str->runoff);
2381   return Lstream_rewind (str->other_end);
2382 }
2383
2384 static int
2385 encoding_seekable_p (Lstream *stream)
2386 {
2387   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2388   return Lstream_seekable_p (str->other_end);
2389 }
2390
2391 static int
2392 encoding_flusher (Lstream *stream)
2393 {
2394   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2395   return Lstream_flush (str->other_end);
2396 }
2397
2398 static int
2399 encoding_closer (Lstream *stream)
2400 {
2401   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2402   if (stream->flags & LSTREAM_FL_WRITE)
2403     {
2404       str->flags |= CODING_STATE_END;
2405       encoding_writer (stream, 0, 0);
2406     }
2407   Dynarr_free (str->runoff);
2408   return Lstream_close (str->other_end);
2409 }
2410
2411 Lisp_Object
2412 encoding_stream_coding_system (Lstream *stream)
2413 {
2414   Lisp_Object coding_system;
2415   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2416
2417   XSETCODING_SYSTEM (coding_system, str->codesys);
2418   return coding_system;
2419 }
2420
2421 void
2422 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2423 {
2424   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2425   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2426   str->codesys = cs;
2427   reset_encoding_stream (str);
2428 }
2429
2430 static Lisp_Object
2431 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2432                         CONST char *mode)
2433 {
2434   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2435   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2436   Lisp_Object obj;
2437
2438   xzero (*str);
2439   str->runoff = Dynarr_new (unsigned_char);
2440   str->other_end = stream;
2441   set_encoding_stream_coding_system (lstr, codesys);
2442   XSETLSTREAM (obj, lstr);
2443   return obj;
2444 }
2445
2446 Lisp_Object
2447 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2448 {
2449   return make_encoding_stream_1 (stream, codesys, "r");
2450 }
2451
2452 Lisp_Object
2453 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2454 {
2455   return make_encoding_stream_1 (stream, codesys, "w");
2456 }
2457
2458 /* Convert N bytes of internally-formatted data stored in SRC to an
2459    external format, according to the encoding stream ENCODING.
2460    Store the encoded data into DST. */
2461
2462 static void
2463 mule_encode (Lstream *encoding, CONST unsigned char *src,
2464              unsigned_char_dynarr *dst, unsigned int n)
2465 {
2466   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2467
2468   switch (CODING_SYSTEM_TYPE (str->codesys))
2469     {
2470 #ifdef DEBUG_XEMACS
2471     case CODESYS_INTERNAL:
2472       Dynarr_add_many (dst, src, n);
2473       break;
2474 #endif
2475     case CODESYS_AUTODETECT:
2476       /* If we got this far and still haven't decided on the coding
2477          system, then do no conversion. */
2478     case CODESYS_NO_CONVERSION:
2479       encode_coding_no_conversion (encoding, src, dst, n);
2480       break;
2481 #ifdef MULE
2482     case CODESYS_SHIFT_JIS:
2483       encode_coding_sjis (encoding, src, dst, n);
2484       break;
2485     case CODESYS_BIG5:
2486       encode_coding_big5 (encoding, src, dst, n);
2487       break;
2488     case CODESYS_UCS4:
2489       encode_coding_ucs4 (encoding, src, dst, n);
2490       break;
2491     case CODESYS_UTF8:
2492       encode_coding_utf8 (encoding, src, dst, n);
2493       break;
2494     case CODESYS_CCL:
2495       ccl_driver (&str->ccl, src, dst, n, 0);
2496       break;
2497     case CODESYS_ISO2022:
2498       encode_coding_iso2022 (encoding, src, dst, n);
2499       break;
2500 #endif /* MULE */
2501     default:
2502       abort ();
2503     }
2504 }
2505
2506 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2507 Encode the text between START and END using CODING-SYSTEM.
2508 This will, for example, convert Japanese characters into stuff such as
2509 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2510 text.  BUFFER defaults to the current buffer if unspecified.
2511 */
2512        (start, end, coding_system, buffer))
2513 {
2514   Bufpos b, e;
2515   struct buffer *buf = decode_buffer (buffer, 0);
2516   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2517   Lstream *istr, *ostr;
2518   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2519
2520   get_buffer_range_char (buf, start, end, &b, &e, 0);
2521
2522   barf_if_buffer_read_only (buf, b, e);
2523
2524   coding_system = Fget_coding_system (coding_system);
2525   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2526   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2527   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2528                                               Fget_coding_system (Qbinary));
2529   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2530                                            coding_system);
2531   istr = XLSTREAM (instream);
2532   ostr = XLSTREAM (outstream);
2533   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2534   /* The chain of streams looks like this:
2535
2536      [BUFFER] <----- send through
2537                      ------> [ENCODE AS SPECIFIED]
2538                              ------> [DECODE AS BINARY]
2539                                      ------> [BUFFER]
2540    */
2541   while (1)
2542     {
2543       char tempbuf[1024]; /* some random amount */
2544       Bufpos newpos, even_newer_pos;
2545       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2546       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2547
2548       if (!size_in_bytes)
2549         break;
2550       newpos = lisp_buffer_stream_startpos (istr);
2551       Lstream_write (ostr, tempbuf, size_in_bytes);
2552       even_newer_pos = lisp_buffer_stream_startpos (istr);
2553       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2554                            even_newer_pos, 0);
2555     }
2556
2557   {
2558     Charcount retlen =
2559       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2560     Lstream_close (istr);
2561     Lstream_close (ostr);
2562     UNGCPRO;
2563     Lstream_delete (istr);
2564     Lstream_delete (ostr);
2565     Lstream_delete (XLSTREAM (de_outstream));
2566     Lstream_delete (XLSTREAM (lb_outstream));
2567     return make_int (retlen);
2568   }
2569 }
2570
2571 #ifdef MULE
2572 \f
2573 /************************************************************************/
2574 /*                          Shift-JIS methods                           */
2575 /************************************************************************/
2576
2577 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2578    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2579    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2580    encoded by "position-code + 0x80".  A character of JISX0208
2581    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2582    position-codes are divided and shifted so that it fit in the range
2583    below.
2584
2585    --- CODE RANGE of Shift-JIS ---
2586    (character set)      (range)
2587    ASCII                0x00 .. 0x7F
2588    JISX0201-Kana        0xA0 .. 0xDF
2589    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2590             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2591    -------------------------------
2592
2593 */
2594
2595 /* Is this the first byte of a Shift-JIS two-byte char? */
2596
2597 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2598   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2599
2600 /* Is this the second byte of a Shift-JIS two-byte char? */
2601
2602 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2603   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2604
2605 #define BYTE_SJIS_KATAKANA_P(c) \
2606   ((c) >= 0xA1 && (c) <= 0xDF)
2607
2608 static int
2609 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2610                     unsigned int n)
2611 {
2612   int c;
2613
2614   while (n--)
2615     {
2616       c = *src++;
2617       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2618         return 0;
2619       if (st->shift_jis.in_second_byte)
2620         {
2621           st->shift_jis.in_second_byte = 0;
2622           if (c < 0x40)
2623             return 0;
2624         }
2625       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2626         st->shift_jis.in_second_byte = 1;
2627     }
2628   return CODING_CATEGORY_SHIFT_JIS_MASK;
2629 }
2630
2631 /* Convert Shift-JIS data to internal format. */
2632
2633 static void
2634 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2635                     unsigned_char_dynarr *dst, unsigned int n)
2636 {
2637   unsigned char c;
2638   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2639   unsigned int flags  = str->flags;
2640   unsigned int ch     = str->ch;
2641   eol_type_t eol_type = str->eol_type;
2642
2643   while (n--)
2644     {
2645       c = *src++;
2646
2647       if (ch)
2648         {
2649           /* Previous character was first byte of Shift-JIS Kanji char. */
2650           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2651             {
2652               unsigned char e1, e2;
2653
2654               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2655               DECODE_SJIS (ch, c, e1, e2);
2656               Dynarr_add (dst, e1);
2657               Dynarr_add (dst, e2);
2658             }
2659           else
2660             {
2661               DECODE_ADD_BINARY_CHAR (ch, dst);
2662               DECODE_ADD_BINARY_CHAR (c, dst);
2663             }
2664           ch = 0;
2665         }
2666       else
2667         {
2668           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2669           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2670             ch = c;
2671           else if (BYTE_SJIS_KATAKANA_P (c))
2672             {
2673               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2674               Dynarr_add (dst, c);
2675             }
2676           else
2677             DECODE_ADD_BINARY_CHAR (c, dst);
2678         }
2679     label_continue_loop:;
2680     }
2681
2682   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2683
2684   str->flags = flags;
2685   str->ch    = ch;
2686 }
2687
2688 /* Convert internally-formatted data to Shift-JIS. */
2689
2690 static void
2691 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2692                     unsigned_char_dynarr *dst, unsigned int n)
2693 {
2694   unsigned char c;
2695   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2696   unsigned int flags  = str->flags;
2697   unsigned int ch     = str->ch;
2698   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2699
2700   while (n--)
2701     {
2702       c = *src++;
2703       if (c == '\n')
2704         {
2705           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2706             Dynarr_add (dst, '\r');
2707           if (eol_type != EOL_CR)
2708             Dynarr_add (dst, '\n');
2709           ch = 0;
2710         }
2711       else if (BYTE_ASCII_P (c))
2712         {
2713           Dynarr_add (dst, c);
2714           ch = 0;
2715         }
2716       else if (BUFBYTE_LEADING_BYTE_P (c))
2717         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2718               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2719               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2720       else if (ch)
2721         {
2722           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2723             {
2724               Dynarr_add (dst, c);
2725               ch = 0;
2726             }
2727           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2728                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2729             ch = c;
2730           else
2731             {
2732               unsigned char j1, j2;
2733               ENCODE_SJIS (ch, c, j1, j2);
2734               Dynarr_add (dst, j1);
2735               Dynarr_add (dst, j2);
2736               ch = 0;
2737             }
2738         }
2739     }
2740
2741   str->flags = flags;
2742   str->ch    = ch;
2743 }
2744
2745 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2746 Decode a JISX0208 character of Shift-JIS coding-system.
2747 CODE is the character code in Shift-JIS as a cons of type bytes.
2748 Return the corresponding character.
2749 */
2750        (code))
2751 {
2752   unsigned char c1, c2, s1, s2;
2753
2754   CHECK_CONS (code);
2755   CHECK_INT (XCAR (code));
2756   CHECK_INT (XCDR (code));
2757   s1 = XINT (XCAR (code));
2758   s2 = XINT (XCDR (code));
2759   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2760       BYTE_SJIS_TWO_BYTE_2_P (s2))
2761     {
2762       DECODE_SJIS (s1, s2, c1, c2);
2763       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2764                                    c1 & 0x7F, c2 & 0x7F));
2765     }
2766   else
2767     return Qnil;
2768 }
2769
2770 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2771 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2772 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2773 */
2774        (ch))
2775 {
2776   Lisp_Object charset;
2777   int c1, c2, s1, s2;
2778
2779   CHECK_CHAR_COERCE_INT (ch);
2780   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2781   if (EQ (charset, Vcharset_japanese_jisx0208))
2782     {
2783       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2784       return Fcons (make_int (s1), make_int (s2));
2785     }
2786   else
2787     return Qnil;
2788 }
2789
2790 \f
2791 /************************************************************************/
2792 /*                            Big5 methods                              */
2793 /************************************************************************/
2794
2795 /* BIG5 is a coding system encoding two character sets: ASCII and
2796    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2797    character set and is encoded in two-byte.
2798
2799    --- CODE RANGE of BIG5 ---
2800    (character set)      (range)
2801    ASCII                0x00 .. 0x7F
2802    Big5 (1st byte)      0xA1 .. 0xFE
2803         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2804    --------------------------
2805
2806    Since the number of characters in Big5 is larger than maximum
2807    characters in Emacs' charset (96x96), it can't be handled as one
2808    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2809    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
2810    contains frequently used characters and the latter contains less
2811    frequently used characters.  */
2812
2813 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2814   ((c) >= 0xA1 && (c) <= 0xFE)
2815
2816 /* Is this the second byte of a Shift-JIS two-byte char? */
2817
2818 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2819   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2820
2821 /* Number of Big5 characters which have the same code in 1st byte.  */
2822
2823 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2824
2825 /* Code conversion macros.  These are macros because they are used in
2826    inner loops during code conversion.
2827
2828    Note that temporary variables in macros introduce the classic
2829    dynamic-scoping problems with variable names.  We use capital-
2830    lettered variables in the assumption that XEmacs does not use
2831    capital letters in variables except in a very formalized way
2832    (e.g. Qstring). */
2833
2834 /* Convert Big5 code (b1, b2) into its internal string representation
2835    (lb, c1, c2). */
2836
2837 /* There is a much simpler way to split the Big5 charset into two.
2838    For the moment I'm going to leave the algorithm as-is because it
2839    claims to separate out the most-used characters into a single
2840    charset, which perhaps will lead to optimizations in various
2841    places.
2842
2843    The way the algorithm works is something like this:
2844
2845    Big5 can be viewed as a 94x157 charset, where the row is
2846    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2847    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
2848    the split between low and high column numbers is apparently
2849    meaningless; ascending rows produce less and less frequent chars.
2850    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2851    the first charset, and the upper half (0xC9 .. 0xFE) to the
2852    second.  To do the conversion, we convert the character into
2853    a single number where 0 .. 156 is the first row, 157 .. 313
2854    is the second, etc.  That way, the characters are ordered by
2855    decreasing frequency.  Then we just chop the space in two
2856    and coerce the result into a 94x94 space.
2857    */
2858
2859 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
2860 {                                                                       \
2861   int B1 = b1, B2 = b2;                                                 \
2862   unsigned int I                                                        \
2863     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
2864                                                                         \
2865   if (B1 < 0xC9)                                                        \
2866     {                                                                   \
2867       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
2868     }                                                                   \
2869   else                                                                  \
2870     {                                                                   \
2871       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
2872       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
2873     }                                                                   \
2874   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
2875   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
2876 } while (0)
2877
2878 /* Convert the internal string representation of a Big5 character
2879    (lb, c1, c2) into Big5 code (b1, b2). */
2880
2881 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
2882 {                                                                       \
2883   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
2884                                                                         \
2885   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
2886     {                                                                   \
2887       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
2888     }                                                                   \
2889   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
2890   b2 = I % BIG5_SAME_ROW;                                               \
2891   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
2892 } while (0)
2893
2894 static int
2895 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2896                     unsigned int n)
2897 {
2898   int c;
2899
2900   while (n--)
2901     {
2902       c = *src++;
2903       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2904           (c >= 0x80 && c <= 0xA0))
2905         return 0;
2906       if (st->big5.in_second_byte)
2907         {
2908           st->big5.in_second_byte = 0;
2909           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2910             return 0;
2911         }
2912       else if (c >= 0xA1)
2913         st->big5.in_second_byte = 1;
2914     }
2915   return CODING_CATEGORY_BIG5_MASK;
2916 }
2917
2918 /* Convert Big5 data to internal format. */
2919
2920 static void
2921 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2922                     unsigned_char_dynarr *dst, unsigned int n)
2923 {
2924   unsigned char c;
2925   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2926   unsigned int flags  = str->flags;
2927   unsigned int ch     = str->ch;
2928   eol_type_t eol_type = str->eol_type;
2929
2930   while (n--)
2931     {
2932       c = *src++;
2933       if (ch)
2934         {
2935           /* Previous character was first byte of Big5 char. */
2936           if (BYTE_BIG5_TWO_BYTE_2_P (c))
2937             {
2938               unsigned char b1, b2, b3;
2939               DECODE_BIG5 (ch, c, b1, b2, b3);
2940               Dynarr_add (dst, b1);
2941               Dynarr_add (dst, b2);
2942               Dynarr_add (dst, b3);
2943             }
2944           else
2945             {
2946               DECODE_ADD_BINARY_CHAR (ch, dst);
2947               DECODE_ADD_BINARY_CHAR (c, dst);
2948             }
2949           ch = 0;
2950         }
2951       else
2952         {
2953           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2954           if (BYTE_BIG5_TWO_BYTE_1_P (c))
2955             ch = c;
2956           else
2957             DECODE_ADD_BINARY_CHAR (c, dst);
2958         }
2959     label_continue_loop:;
2960     }
2961
2962   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2963
2964   str->flags = flags;
2965   str->ch    = ch;
2966 }
2967
2968 /* Convert internally-formatted data to Big5. */
2969
2970 static void
2971 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2972                     unsigned_char_dynarr *dst, unsigned int n)
2973 {
2974   unsigned char c;
2975   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2976   unsigned int flags  = str->flags;
2977   unsigned int ch     = str->ch;
2978   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2979
2980   while (n--)
2981     {
2982       c = *src++;
2983       if (c == '\n')
2984         {
2985           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2986             Dynarr_add (dst, '\r');
2987           if (eol_type != EOL_CR)
2988             Dynarr_add (dst, '\n');
2989         }
2990       else if (BYTE_ASCII_P (c))
2991         {
2992           /* ASCII. */
2993           Dynarr_add (dst, c);
2994         }
2995       else if (BUFBYTE_LEADING_BYTE_P (c))
2996         {
2997           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
2998               c == LEADING_BYTE_CHINESE_BIG5_2)
2999             {
3000               /* A recognized leading byte. */
3001               ch = c;
3002               continue; /* not done with this character. */
3003             }
3004           /* otherwise just ignore this character. */
3005         }
3006       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3007                ch == LEADING_BYTE_CHINESE_BIG5_2)
3008         {
3009           /* Previous char was a recognized leading byte. */
3010           ch = (ch << 8) | c;
3011           continue; /* not done with this character. */
3012         }
3013       else if (ch)
3014         {
3015           /* Encountering second byte of a Big5 character. */
3016           unsigned char b1, b2;
3017
3018           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3019           Dynarr_add (dst, b1);
3020           Dynarr_add (dst, b2);
3021         }
3022
3023       ch = 0;
3024     }
3025
3026   str->flags = flags;
3027   str->ch    = ch;
3028 }
3029
3030
3031 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3032 Decode a Big5 character CODE of BIG5 coding-system.
3033 CODE is the character code in BIG5, a cons of two integers.
3034 Return the corresponding character.
3035 */
3036        (code))
3037 {
3038   unsigned char c1, c2, b1, b2;
3039
3040   CHECK_CONS (code);
3041   CHECK_INT (XCAR (code));
3042   CHECK_INT (XCDR (code));
3043   b1 = XINT (XCAR (code));
3044   b2 = XINT (XCDR (code));
3045   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3046       BYTE_BIG5_TWO_BYTE_2_P (b2))
3047     {
3048       int leading_byte;
3049       Lisp_Object charset;
3050       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3051       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3052       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3053     }
3054   else
3055     return Qnil;
3056 }
3057
3058 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3059 Encode the Big5 character CH to BIG5 coding-system.
3060 Return the corresponding character code in Big5.
3061 */
3062        (ch))
3063 {
3064   Lisp_Object charset;
3065   int c1, c2, b1, b2;
3066
3067   CHECK_CHAR_COERCE_INT (ch);
3068   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3069   if (EQ (charset, Vcharset_chinese_big5_1) ||
3070       EQ (charset, Vcharset_chinese_big5_2))
3071     {
3072       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3073                    b1, b2);
3074       return Fcons (make_int (b1), make_int (b2));
3075     }
3076   else
3077     return Qnil;
3078 }
3079
3080 \f
3081 /************************************************************************/
3082 /*                           UCS-4 methods                              */
3083 /*                                                                      */
3084 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3085 /*                                                                      */
3086 /************************************************************************/
3087
3088 Lisp_Object ucs_to_mule_table[65536];
3089 Lisp_Object mule_to_ucs_table;
3090
3091 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3092 Map UCS-4 code CODE to Mule character CHARACTER.
3093
3094 Return T on success, NIL on failure.
3095 */
3096        (code, character))
3097 {
3098   unsigned int c;
3099
3100   CHECK_CHAR (character);
3101   CHECK_INT (code);
3102   c = XINT (code);
3103
3104   if (c < sizeof (ucs_to_mule_table))
3105     {
3106       ucs_to_mule_table[c] = character;
3107       return Qt;
3108     }
3109   else
3110     return Qnil;
3111 }
3112
3113 static Lisp_Object
3114 ucs_to_char (unsigned long code)
3115 {
3116   if (code < sizeof (ucs_to_mule_table))
3117     {
3118       return ucs_to_mule_table[code];
3119     }
3120   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3121     {
3122       unsigned int c;
3123
3124       code -= 0xe00000;
3125       c = code % (94 * 94);
3126       return make_char
3127         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3128                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3129                      CHARSET_LEFT_TO_RIGHT),
3130                     c / 94 + 33, c % 94 + 33));
3131     }
3132   else
3133     return Qnil;
3134 }
3135
3136 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3137 Return Mule character corresponding to UCS code CODE (a positive integer).
3138 */
3139        (code))
3140 {
3141   CHECK_NATNUM (code);
3142   return ucs_to_char (XINT (code));
3143 }
3144
3145 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3146 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3147 */
3148        (character, code))
3149 {
3150   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3151           Fset_char_ucs is more restrictive on index arg, but should
3152           check code arg in a char_table method. */
3153   CHECK_CHAR (character);
3154   CHECK_NATNUM (code);
3155   return Fput_char_table (character, code, mule_to_ucs_table);
3156 }
3157
3158 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3159 Return the UCS code (a positive integer) corresponding to CHARACTER.
3160 */
3161        (character))
3162 {
3163   return Fget_char_table (character, mule_to_ucs_table);
3164 }
3165
3166 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3167    JIS X 0208 double-width `=' instead.
3168    #### do something more appropriate (use blob?)
3169         Danger, Will Robinson!  Data loss.  Should we signal user? */
3170 static void
3171 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3172 {
3173   Lisp_Object chr = ucs_to_char (ch);
3174
3175   if (! NILP (chr))
3176     {
3177       Bufbyte work[MAX_EMCHAR_LEN];
3178       int len;
3179
3180       ch = XCHAR (chr);
3181       len = (ch < 128) ?
3182         simple_set_charptr_emchar (work, ch) :
3183         non_ascii_set_charptr_emchar (work, ch);
3184       Dynarr_add_many (dst, work, len);
3185     }
3186   else
3187     {
3188       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3189       Dynarr_add (dst, 34 + 128);
3190       Dynarr_add (dst, 46 + 128);
3191     }
3192 }
3193
3194 static unsigned long
3195 mule_char_to_ucs4 (Lisp_Object charset,
3196                    unsigned char h, unsigned char l)
3197 {
3198   Lisp_Object code
3199     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3200                        mule_to_ucs_table);
3201
3202   if (INTP (code))
3203     {
3204       return XINT (code);
3205     }
3206   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3207             (XCHARSET_CHARS (charset) == 94) )
3208     {
3209       unsigned char final = XCHARSET_FINAL (charset);
3210
3211       if ( ('@' <= final) && (final < 0x7f) )
3212         {
3213           return 0xe00000 + (final - '@') * 94 * 94
3214             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3215         }
3216       else
3217         {
3218           return '?';
3219         }
3220     }
3221   else
3222     {
3223       return '?';
3224     }
3225 }
3226
3227 static void
3228 encode_ucs4 (Lisp_Object charset,
3229              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3230 {
3231   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3232   Dynarr_add (dst,  code >> 24);
3233   Dynarr_add (dst, (code >> 16) & 255);
3234   Dynarr_add (dst, (code >>  8) & 255);
3235   Dynarr_add (dst,  code        & 255);
3236 }
3237
3238 static int
3239 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3240                     unsigned int n)
3241 {
3242   while (n--)
3243     {
3244       int c = *src++;
3245       switch (st->ucs4.in_byte)
3246         {
3247         case 0:
3248           if (c >= 128)
3249             return 0;
3250           else
3251             st->ucs4.in_byte++;
3252           break;
3253         case 3:
3254           st->ucs4.in_byte = 0;
3255           break;
3256         default:
3257           st->ucs4.in_byte++;
3258         }
3259     }
3260   return CODING_CATEGORY_UCS4_MASK;
3261 }
3262
3263 static void
3264 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3265                     unsigned_char_dynarr *dst, unsigned int n)
3266 {
3267   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3268   unsigned int flags = str->flags;
3269   unsigned int ch    = str->ch;
3270
3271   while (n--)
3272     {
3273       unsigned char c = *src++;
3274       switch (flags)
3275         {
3276         case 0:
3277           ch = c;
3278           flags = 3;
3279           break;
3280         case 1:
3281           decode_ucs4 ( ( ch << 8 ) | c, dst);
3282           ch = 0;
3283           flags = 0;
3284           break;
3285         default:
3286           ch = ( ch << 8 ) | c;
3287           flags--;
3288         }
3289     }
3290   if (flags & CODING_STATE_END)
3291     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3292
3293   str->flags = flags;
3294   str->ch    = ch;
3295 }
3296
3297 static void
3298 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3299                     unsigned_char_dynarr *dst, unsigned int n)
3300 {
3301   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3302   unsigned int flags = str->flags;
3303   unsigned int ch = str->ch;
3304   unsigned char char_boundary = str->iso2022.current_char_boundary;
3305   Lisp_Object charset = str->iso2022.current_charset;
3306
3307 #ifdef ENABLE_COMPOSITE_CHARS
3308   /* flags for handling composite chars.  We do a little switcharoo
3309      on the source while we're outputting the composite char. */
3310   unsigned int saved_n = 0;
3311   CONST unsigned char *saved_src = NULL;
3312   int in_composite = 0;
3313
3314  back_to_square_n:
3315 #endif
3316
3317   while (n--)
3318     {
3319       unsigned char c = *src++;
3320
3321       if (BYTE_ASCII_P (c))
3322         {               /* Processing ASCII character */
3323           ch = 0;
3324           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3325           char_boundary = 1;
3326         }
3327       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3328         { /* Processing Leading Byte */
3329           ch = 0;
3330           charset = CHARSET_BY_LEADING_BYTE (c);
3331           if (LEADING_BYTE_PREFIX_P(c))
3332             ch = c;
3333           char_boundary = 0;
3334         }
3335       else
3336         {                       /* Processing Non-ASCII character */
3337           char_boundary = 1;
3338           if (EQ (charset, Vcharset_control_1))
3339             {
3340               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3341             }
3342           else
3343             {
3344               switch (XCHARSET_REP_BYTES (charset))
3345                 {
3346                 case 2:
3347                   encode_ucs4 (charset, c, 0, dst);
3348                   break;
3349                 case 3:
3350                   if (XCHARSET_PRIVATE_P (charset))
3351                     {
3352                       encode_ucs4 (charset, c, 0, dst);
3353                       ch = 0;
3354                     }
3355                   else if (ch)
3356                     {
3357 #ifdef ENABLE_COMPOSITE_CHARS
3358                       if (EQ (charset, Vcharset_composite))
3359                         {
3360                           if (in_composite)
3361                             {
3362                               /* #### Bother! We don't know how to
3363                                  handle this yet. */
3364                               Dynarr_add (dst, 0);
3365                               Dynarr_add (dst, 0);
3366                               Dynarr_add (dst, 0);
3367                               Dynarr_add (dst, '~');
3368                             }
3369                           else
3370                             {
3371                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3372                                                        ch & 0x7F, c & 0x7F);
3373                               Lisp_Object lstr = composite_char_string (emch);
3374                               saved_n = n;
3375                               saved_src = src;
3376                               in_composite = 1;
3377                               src = XSTRING_DATA   (lstr);
3378                               n   = XSTRING_LENGTH (lstr);
3379                             }
3380                         }
3381                       else
3382 #endif /* ENABLE_COMPOSITE_CHARS */
3383                         {
3384                           encode_ucs4(charset, ch, c, dst);
3385                         }
3386                       ch = 0;
3387                     }
3388                   else
3389                     {
3390                       ch = c;
3391                       char_boundary = 0;
3392                     }
3393                   break;
3394                 case 4:
3395                   if (ch)
3396                     {
3397                       encode_ucs4 (charset, ch, c, dst);
3398                       ch = 0;
3399                     }
3400                   else
3401                     {
3402                       ch = c;
3403                       char_boundary = 0;
3404                     }
3405                   break;
3406                 default:
3407                   abort ();
3408                 }
3409             }
3410         }
3411     }
3412
3413 #ifdef ENABLE_COMPOSITE_CHARS
3414   if (in_composite)
3415     {
3416       n = saved_n;
3417       src = saved_src;
3418       in_composite = 0;
3419       goto back_to_square_n; /* Wheeeeeeeee ..... */
3420     }
3421 #endif /* ENABLE_COMPOSITE_CHARS */
3422
3423   str->flags = flags;
3424   str->ch = ch;
3425   str->iso2022.current_char_boundary = char_boundary;
3426   str->iso2022.current_charset = charset;
3427
3428   /* Verbum caro factum est! */
3429 }
3430
3431 \f
3432 /************************************************************************/
3433 /*                           UTF-8 methods                              */
3434 /************************************************************************/
3435
3436 static int
3437 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3438                     unsigned int n)
3439 {
3440   while (n--)
3441     {
3442       unsigned char c = *src++;
3443       switch (st->utf8.in_byte)
3444         {
3445         case 0:
3446           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3447             return 0;
3448           else if (c >= 0xfc)
3449             st->utf8.in_byte = 5;
3450           else if (c >= 0xf8)
3451             st->utf8.in_byte = 4;
3452           else if (c >= 0xf0)
3453             st->utf8.in_byte = 3;
3454           else if (c >= 0xe0)
3455             st->utf8.in_byte = 2;
3456           else if (c >= 0xc0)
3457             st->utf8.in_byte = 1;
3458           else if (c >= 0x80)
3459             return 0;
3460           break;
3461         default:
3462           if ((c & 0xc0) != 0x80)
3463             return 0;
3464           else
3465             st->utf8.in_byte--;
3466         }
3467     }
3468   return CODING_CATEGORY_UTF8_MASK;
3469 }
3470
3471 static void
3472 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3473                     unsigned_char_dynarr *dst, unsigned int n)
3474 {
3475   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3476   unsigned int flags  = str->flags;
3477   unsigned int ch     = str->ch;
3478   eol_type_t eol_type = str->eol_type;
3479
3480   while (n--)
3481     {
3482       unsigned char c = *src++;
3483       switch (flags)
3484         {
3485         case 0:
3486           if ( c >= 0xfc )
3487             {
3488               ch = c & 0x01;
3489               flags = 5;
3490             }
3491           else if ( c >= 0xf8 )
3492             {
3493               ch = c & 0x03;
3494               flags = 4;
3495             }
3496           else if ( c >= 0xf0 )
3497             {
3498               ch = c & 0x07;
3499               flags = 3;
3500             }
3501           else if ( c >= 0xe0 )
3502             {
3503               ch = c & 0x0f;
3504               flags = 2;
3505             }
3506           else if ( c >= 0xc0 )
3507             {
3508               ch = c & 0x1f;
3509               flags = 1;
3510             }
3511           else
3512             {
3513               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3514               decode_ucs4 (c, dst);
3515             }
3516           break;
3517         case 1:
3518           ch = ( ch << 6 ) | ( c & 0x3f );
3519           decode_ucs4 (ch, dst);
3520           ch = 0;
3521           flags = 0;
3522           break;
3523         default:
3524           ch = ( ch << 6 ) | ( c & 0x3f );
3525           flags--;
3526         }
3527     label_continue_loop:;
3528     }
3529
3530   if (flags & CODING_STATE_END)
3531     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3532
3533   str->flags = flags;
3534   str->ch    = ch;
3535 }
3536
3537 static void
3538 encode_utf8 (Lisp_Object charset,
3539              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3540 {
3541   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3542   if ( code <= 0x7f )
3543     {
3544       Dynarr_add (dst, code);
3545     }
3546   else if ( code <= 0x7ff )
3547     {
3548       Dynarr_add (dst, (code >> 6) | 0xc0);
3549       Dynarr_add (dst, (code & 0x3f) | 0x80);
3550     }
3551   else if ( code <= 0xffff )
3552     {
3553       Dynarr_add (dst,  (code >> 12) | 0xe0);
3554       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3555       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3556     }
3557   else if ( code <= 0x1fffff )
3558     {
3559       Dynarr_add (dst,  (code >> 18) | 0xf0);
3560       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3561       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3562       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3563     }
3564   else if ( code <= 0x3ffffff )
3565     {
3566       Dynarr_add (dst,  (code >> 24) | 0xf8);
3567       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3568       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3569       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3570       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3571     }
3572   else
3573     {
3574       Dynarr_add (dst,  (code >> 30) | 0xfc);
3575       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3576       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3577       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3578       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3579       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3580     }
3581 }
3582
3583 static void
3584 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3585                     unsigned_char_dynarr *dst, unsigned int n)
3586 {
3587   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3588   unsigned int flags  = str->flags;
3589   unsigned int ch     = str->ch;
3590   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3591   unsigned char char_boundary = str->iso2022.current_char_boundary;
3592   Lisp_Object charset = str->iso2022.current_charset;
3593
3594 #ifdef ENABLE_COMPOSITE_CHARS
3595   /* flags for handling composite chars.  We do a little switcharoo
3596      on the source while we're outputting the composite char. */
3597   unsigned int saved_n = 0;
3598   CONST unsigned char *saved_src = NULL;
3599   int in_composite = 0;
3600
3601  back_to_square_n:
3602 #endif /* ENABLE_COMPOSITE_CHARS */
3603
3604   while (n--)
3605     {
3606       unsigned char c = *src++;
3607
3608       if (BYTE_ASCII_P (c))
3609         {               /* Processing ASCII character */
3610           ch = 0;
3611           if (c == '\n')
3612             {
3613               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3614                 Dynarr_add (dst, '\r');
3615               if (eol_type != EOL_CR)
3616                 Dynarr_add (dst, c);
3617             }
3618           else
3619             encode_utf8 (Vcharset_ascii, c, 0, dst);
3620           char_boundary = 1;
3621         }
3622       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3623         { /* Processing Leading Byte */
3624           ch = 0;
3625           charset = CHARSET_BY_LEADING_BYTE (c);
3626           if (LEADING_BYTE_PREFIX_P(c))
3627             ch = c;
3628           char_boundary = 0;
3629         }
3630       else
3631         {                       /* Processing Non-ASCII character */
3632           char_boundary = 1;
3633           if (EQ (charset, Vcharset_control_1))
3634             {
3635               encode_utf8 (Vcharset_control_1, c, 0, dst);
3636             }
3637           else
3638             {
3639               switch (XCHARSET_REP_BYTES (charset))
3640                 {
3641                 case 2:
3642                   encode_utf8 (charset, c, 0, dst);
3643                   break;
3644                 case 3:
3645                   if (XCHARSET_PRIVATE_P (charset))
3646                     {
3647                       encode_utf8 (charset, c, 0, dst);
3648                       ch = 0;
3649                     }
3650                   else if (ch)
3651                     {
3652 #ifdef ENABLE_COMPOSITE_CHARS
3653                       if (EQ (charset, Vcharset_composite))
3654                         {
3655                           if (in_composite)
3656                             {
3657                               /* #### Bother! We don't know how to
3658                                  handle this yet. */
3659                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
3660                             }
3661                           else
3662                             {
3663                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3664                                                        ch & 0x7F, c & 0x7F);
3665                               Lisp_Object lstr = composite_char_string (emch);
3666                               saved_n = n;
3667                               saved_src = src;
3668                               in_composite = 1;
3669                               src = XSTRING_DATA   (lstr);
3670                               n   = XSTRING_LENGTH (lstr);
3671                             }
3672                         }
3673                       else
3674 #endif /* ENABLE_COMPOSITE_CHARS */
3675                         {
3676                           encode_utf8 (charset, ch, c, dst);
3677                         }
3678                       ch = 0;
3679                     }
3680                   else
3681                     {
3682                       ch = c;
3683                       char_boundary = 0;
3684                     }
3685                   break;
3686                 case 4:
3687                   if (ch)
3688                     {
3689                       encode_utf8 (charset, ch, c, dst);
3690                       ch = 0;
3691                     }
3692                   else
3693                     {
3694                       ch = c;
3695                       char_boundary = 0;
3696                     }
3697                   break;
3698                 default:
3699                   abort ();
3700                 }
3701             }
3702         }
3703     }
3704
3705 #ifdef ENABLE_COMPOSITE_CHARS
3706   if (in_composite)
3707     {
3708       n = saved_n;
3709       src = saved_src;
3710       in_composite = 0;
3711       goto back_to_square_n; /* Wheeeeeeeee ..... */
3712     }
3713 #endif
3714
3715   str->flags = flags;
3716   str->ch    = ch;
3717   str->iso2022.current_char_boundary = char_boundary;
3718   str->iso2022.current_charset = charset;
3719
3720   /* Verbum caro factum est! */
3721 }
3722
3723 \f
3724 /************************************************************************/
3725 /*                           ISO2022 methods                            */
3726 /************************************************************************/
3727
3728 /* The following note describes the coding system ISO2022 briefly.
3729    Since the intention of this note is to help understand the
3730    functions in this file, some parts are NOT ACCURATE or OVERLY
3731    SIMPLIFIED.  For thorough understanding, please refer to the
3732    original document of ISO2022.
3733
3734    ISO2022 provides many mechanisms to encode several character sets
3735    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3736    is encoded using bytes less than 128.  This may make the encoded
3737    text a little bit longer, but the text passes more easily through
3738    several gateways, some of which strip off MSB (Most Signigant Bit).
3739
3740    There are two kinds of character sets: control character set and
3741    graphic character set.  The former contains control characters such
3742    as `newline' and `escape' to provide control functions (control
3743    functions are also provided by escape sequences).  The latter
3744    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3745    two control character sets and many graphic character sets.
3746
3747    Graphic character sets are classified into one of the following
3748    four classes, according to the number of bytes (DIMENSION) and
3749    number of characters in one dimension (CHARS) of the set:
3750    - DIMENSION1_CHARS94
3751    - DIMENSION1_CHARS96
3752    - DIMENSION2_CHARS94
3753    - DIMENSION2_CHARS96
3754
3755    In addition, each character set is assigned an identification tag,
3756    unique for each set, called "final character" (denoted as <F>
3757    hereafter).  The <F> of each character set is decided by ECMA(*)
3758    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3759    (0x30..0x3F are for private use only).
3760
3761    Note (*): ECMA = European Computer Manufacturers Association
3762
3763    Here are examples of graphic character set [NAME(<F>)]:
3764         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3765         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3766         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3767         o DIMENSION2_CHARS96 -- none for the moment
3768
3769    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3770         C0 [0x00..0x1F] -- control character plane 0
3771         GL [0x20..0x7F] -- graphic character plane 0
3772         C1 [0x80..0x9F] -- control character plane 1
3773         GR [0xA0..0xFF] -- graphic character plane 1
3774
3775    A control character set is directly designated and invoked to C0 or
3776    C1 by an escape sequence.  The most common case is that:
3777    - ISO646's  control character set is designated/invoked to C0, and
3778    - ISO6429's control character set is designated/invoked to C1,
3779    and usually these designations/invocations are omitted in encoded
3780    text.  In a 7-bit environment, only C0 can be used, and a control
3781    character for C1 is encoded by an appropriate escape sequence to
3782    fit into the environment.  All control characters for C1 are
3783    defined to have corresponding escape sequences.
3784
3785    A graphic character set is at first designated to one of four
3786    graphic registers (G0 through G3), then these graphic registers are
3787    invoked to GL or GR.  These designations and invocations can be
3788    done independently.  The most common case is that G0 is invoked to
3789    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3790    these invocations and designations are omitted in encoded text.
3791    In a 7-bit environment, only GL can be used.
3792
3793    When a graphic character set of CHARS94 is invoked to GL, codes
3794    0x20 and 0x7F of the GL area work as control characters SPACE and
3795    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3796    be used.
3797
3798    There are two ways of invocation: locking-shift and single-shift.
3799    With locking-shift, the invocation lasts until the next different
3800    invocation, whereas with single-shift, the invocation affects the
3801    following character only and doesn't affect the locking-shift
3802    state.  Invocations are done by the following control characters or
3803    escape sequences:
3804
3805    ----------------------------------------------------------------------
3806    abbrev  function                  cntrl escape seq   description
3807    ----------------------------------------------------------------------
3808    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3809    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3810    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3811    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3812    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3813    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3814    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3815    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3816    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3817    ----------------------------------------------------------------------
3818    (*) These are not used by any known coding system.
3819
3820    Control characters for these functions are defined by macros
3821    ISO_CODE_XXX in `coding.h'.
3822
3823    Designations are done by the following escape sequences:
3824    ----------------------------------------------------------------------
3825    escape sequence      description
3826    ----------------------------------------------------------------------
3827    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3828    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3829    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3830    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3831    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3832    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3833    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3834    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3835    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3836    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3837    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3838    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3839    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3840    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3841    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3842    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3843    ----------------------------------------------------------------------
3844
3845    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3846    of dimension 1, chars 94, and final character <F>, etc...
3847
3848    Note (*): Although these designations are not allowed in ISO2022,
3849    Emacs accepts them on decoding, and produces them on encoding
3850    CHARS96 character sets in a coding system which is characterized as
3851    7-bit environment, non-locking-shift, and non-single-shift.
3852
3853    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3854    '(' can be omitted.  We refer to this as "short-form" hereafter.
3855
3856    Now you may notice that there are a lot of ways for encoding the
3857    same multilingual text in ISO2022.  Actually, there exist many
3858    coding systems such as Compound Text (used in X11's inter client
3859    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3860    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3861    localized platforms), and all of these are variants of ISO2022.
3862
3863    In addition to the above, Emacs handles two more kinds of escape
3864    sequences: ISO6429's direction specification and Emacs' private
3865    sequence for specifying character composition.
3866
3867    ISO6429's direction specification takes the following form:
3868         o CSI ']'      -- end of the current direction
3869         o CSI '0' ']'  -- end of the current direction
3870         o CSI '1' ']'  -- start of left-to-right text
3871         o CSI '2' ']'  -- start of right-to-left text
3872    The control character CSI (0x9B: control sequence introducer) is
3873    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3874
3875    Character composition specification takes the following form:
3876         o ESC '0' -- start character composition
3877         o ESC '1' -- end character composition
3878    Since these are not standard escape sequences of any ISO standard,
3879    their use with these meanings is restricted to Emacs only.  */
3880
3881 static void
3882 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3883 {
3884   int i;
3885
3886   for (i = 0; i < 4; i++)
3887     {
3888       if (!NILP (coding_system))
3889         iso->charset[i] =
3890           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3891       else
3892         iso->charset[i] = Qt;
3893       iso->invalid_designated[i] = 0;
3894     }
3895   iso->esc = ISO_ESC_NOTHING;
3896   iso->esc_bytes_index = 0;
3897   iso->register_left = 0;
3898   iso->register_right = 1;
3899   iso->switched_dir_and_no_valid_charset_yet = 0;
3900   iso->invalid_switch_dir = 0;
3901   iso->output_direction_sequence = 0;
3902   iso->output_literally = 0;
3903 #ifdef ENABLE_COMPOSITE_CHARS
3904   if (iso->composite_chars)
3905     Dynarr_reset (iso->composite_chars);
3906 #endif
3907 }
3908
3909 static int
3910 fit_to_be_escape_quoted (unsigned char c)
3911 {
3912   switch (c)
3913     {
3914     case ISO_CODE_ESC:
3915     case ISO_CODE_CSI:
3916     case ISO_CODE_SS2:
3917     case ISO_CODE_SS3:
3918     case ISO_CODE_SO:
3919     case ISO_CODE_SI:
3920       return 1;
3921
3922     default:
3923       return 0;
3924     }
3925 }
3926
3927 /* Parse one byte of an ISO2022 escape sequence.
3928    If the result is an invalid escape sequence, return 0 and
3929    do not change anything in STR.  Otherwise, if the result is
3930    an incomplete escape sequence, update ISO2022.ESC and
3931    ISO2022.ESC_BYTES and return -1.  Otherwise, update
3932    all the state variables (but not ISO2022.ESC_BYTES) and
3933    return 1.
3934
3935    If CHECK_INVALID_CHARSETS is non-zero, check for designation
3936    or invocation of an invalid character set and treat that as
3937    an unrecognized escape sequence. */
3938
3939 static int
3940 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3941                    unsigned char c, unsigned int *flags,
3942                    int check_invalid_charsets)
3943 {
3944   /* (1) If we're at the end of a designation sequence, CS is the
3945      charset being designated and REG is the register to designate
3946      it to.
3947
3948      (2) If we're at the end of a locking-shift sequence, REG is
3949      the register to invoke and HALF (0 == left, 1 == right) is
3950      the half to invoke it into.
3951
3952      (3) If we're at the end of a single-shift sequence, REG is
3953      the register to invoke. */
3954   Lisp_Object cs = Qnil;
3955   int reg, half;
3956
3957   /* NOTE: This code does goto's all over the fucking place.
3958      The reason for this is that we're basically implementing
3959      a state machine here, and hierarchical languages like C
3960      don't really provide a clean way of doing this. */
3961
3962   if (! (*flags & CODING_STATE_ESCAPE))
3963     /* At beginning of escape sequence; we need to reset our
3964        escape-state variables. */
3965     iso->esc = ISO_ESC_NOTHING;
3966
3967   iso->output_literally = 0;
3968   iso->output_direction_sequence = 0;
3969
3970   switch (iso->esc)
3971     {
3972     case ISO_ESC_NOTHING:
3973       iso->esc_bytes_index = 0;
3974       switch (c)
3975         {
3976         case ISO_CODE_ESC:      /* Start escape sequence */
3977           *flags |= CODING_STATE_ESCAPE;
3978           iso->esc = ISO_ESC;
3979           goto not_done;
3980
3981         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
3982           *flags |= CODING_STATE_ESCAPE;
3983           iso->esc = ISO_ESC_5_11;
3984           goto not_done;
3985
3986         case ISO_CODE_SO:       /* locking shift 1 */
3987           reg = 1; half = 0;
3988           goto locking_shift;
3989         case ISO_CODE_SI:       /* locking shift 0 */
3990           reg = 0; half = 0;
3991           goto locking_shift;
3992
3993         case ISO_CODE_SS2:      /* single shift */
3994           reg = 2;
3995           goto single_shift;
3996         case ISO_CODE_SS3:      /* single shift */
3997           reg = 3;
3998           goto single_shift;
3999
4000         default:                        /* Other control characters */
4001           return 0;
4002         }
4003
4004     case ISO_ESC:
4005       switch (c)
4006         {
4007           /**** single shift ****/
4008
4009         case 'N':       /* single shift 2 */
4010           reg = 2;
4011           goto single_shift;
4012         case 'O':       /* single shift 3 */
4013           reg = 3;
4014           goto single_shift;
4015
4016           /**** locking shift ****/
4017
4018         case '~':       /* locking shift 1 right */
4019           reg = 1; half = 1;
4020           goto locking_shift;
4021         case 'n':       /* locking shift 2 */
4022           reg = 2; half = 0;
4023           goto locking_shift;
4024         case '}':       /* locking shift 2 right */
4025           reg = 2; half = 1;
4026           goto locking_shift;
4027         case 'o':       /* locking shift 3 */
4028           reg = 3; half = 0;
4029           goto locking_shift;
4030         case '|':       /* locking shift 3 right */
4031           reg = 3; half = 1;
4032           goto locking_shift;
4033
4034 #ifdef ENABLE_COMPOSITE_CHARS
4035           /**** composite ****/
4036
4037         case '0':
4038           iso->esc = ISO_ESC_START_COMPOSITE;
4039           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4040             CODING_STATE_COMPOSITE;
4041           return 1;
4042
4043         case '1':
4044           iso->esc = ISO_ESC_END_COMPOSITE;
4045           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4046             ~CODING_STATE_COMPOSITE;
4047           return 1;
4048 #endif /* ENABLE_COMPOSITE_CHARS */
4049
4050           /**** directionality ****/
4051
4052         case '[':
4053           iso->esc = ISO_ESC_5_11;
4054           goto not_done;
4055
4056           /**** designation ****/
4057
4058         case '$':       /* multibyte charset prefix */
4059           iso->esc = ISO_ESC_2_4;
4060           goto not_done;
4061
4062         default:
4063           if (0x28 <= c && c <= 0x2F)
4064             {
4065               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4066               goto not_done;
4067             }
4068
4069           /* This function is called with CODESYS equal to nil when
4070              doing coding-system detection. */
4071           if (!NILP (codesys)
4072               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4073               && fit_to_be_escape_quoted (c))
4074             {
4075               iso->esc = ISO_ESC_LITERAL;
4076               *flags &= CODING_STATE_ISO2022_LOCK;
4077               return 1;
4078             }
4079
4080           /* bzzzt! */
4081           return 0;
4082         }
4083
4084
4085
4086       /**** directionality ****/
4087
4088     case ISO_ESC_5_11:          /* ISO6429 direction control */
4089       if (c == ']')
4090         {
4091           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4092           goto directionality;
4093         }
4094       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4095       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4096       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4097       else               return 0;
4098       goto not_done;
4099
4100     case ISO_ESC_5_11_0:
4101       if (c == ']')
4102         {
4103           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4104           goto directionality;
4105         }
4106       return 0;
4107
4108     case ISO_ESC_5_11_1:
4109       if (c == ']')
4110         {
4111           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4112           goto directionality;
4113         }
4114       return 0;
4115
4116     case ISO_ESC_5_11_2:
4117       if (c == ']')
4118         {
4119           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4120           goto directionality;
4121         }
4122       return 0;
4123
4124     directionality:
4125       iso->esc = ISO_ESC_DIRECTIONALITY;
4126       /* Various junk here to attempt to preserve the direction sequences
4127          literally in the text if they would otherwise be swallowed due
4128          to invalid designations that don't show up as actual charset
4129          changes in the text. */
4130       if (iso->invalid_switch_dir)
4131         {
4132           /* We already inserted a direction switch literally into the
4133              text.  We assume (#### this may not be right) that the
4134              next direction switch is the one going the other way,
4135              and we need to output that literally as well. */
4136           iso->output_literally = 1;
4137           iso->invalid_switch_dir = 0;
4138         }
4139       else
4140         {
4141           int jj;
4142
4143           /* If we are in the thrall of an invalid designation,
4144            then stick the directionality sequence literally into the
4145            output stream so it ends up in the original text again. */
4146           for (jj = 0; jj < 4; jj++)
4147             if (iso->invalid_designated[jj])
4148               break;
4149           if (jj < 4)
4150             {
4151               iso->output_literally = 1;
4152               iso->invalid_switch_dir = 1;
4153             }
4154           else
4155             /* Indicate that we haven't yet seen a valid designation,
4156                so that if a switch-dir is directly followed by an
4157                invalid designation, both get inserted literally. */
4158             iso->switched_dir_and_no_valid_charset_yet = 1;
4159         }
4160       return 1;
4161
4162
4163       /**** designation ****/
4164
4165     case ISO_ESC_2_4:
4166       if (0x28 <= c && c <= 0x2F)
4167         {
4168           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4169           goto not_done;
4170         }
4171       if (0x40 <= c && c <= 0x42)
4172         {
4173           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4174                                       *flags & CODING_STATE_R2L ?
4175                                       CHARSET_RIGHT_TO_LEFT :
4176                                       CHARSET_LEFT_TO_RIGHT);
4177           reg = 0;
4178           goto designated;
4179         }
4180       return 0;
4181
4182     default:
4183       {
4184         int type =-1;
4185
4186         if (c < '0' || c > '~')
4187           return 0; /* bad final byte */
4188
4189         if (iso->esc >= ISO_ESC_2_8 &&
4190             iso->esc <= ISO_ESC_2_15)
4191           {
4192             type = ((iso->esc >= ISO_ESC_2_12) ?
4193                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4194             reg = (iso->esc - ISO_ESC_2_8) & 3;
4195           }
4196         else if (iso->esc >= ISO_ESC_2_4_8 &&
4197                  iso->esc <= ISO_ESC_2_4_15)
4198           {
4199             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4200                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4201             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4202           }
4203         else
4204           {
4205             /* Can this ever be reached? -slb */
4206             abort();
4207           }
4208
4209         cs = CHARSET_BY_ATTRIBUTES (type, c,
4210                                     *flags & CODING_STATE_R2L ?
4211                                     CHARSET_RIGHT_TO_LEFT :
4212                                     CHARSET_LEFT_TO_RIGHT);
4213         goto designated;
4214       }
4215     }
4216
4217  not_done:
4218   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4219   return -1;
4220
4221  single_shift:
4222   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4223     /* can't invoke something that ain't there. */
4224     return 0;
4225   iso->esc = ISO_ESC_SINGLE_SHIFT;
4226   *flags &= CODING_STATE_ISO2022_LOCK;
4227   if (reg == 2)
4228     *flags |= CODING_STATE_SS2;
4229   else
4230     *flags |= CODING_STATE_SS3;
4231   return 1;
4232
4233  locking_shift:
4234   if (check_invalid_charsets &&
4235       !CHARSETP (iso->charset[reg]))
4236     /* can't invoke something that ain't there. */
4237     return 0;
4238   if (half)
4239     iso->register_right = reg;
4240   else
4241     iso->register_left = reg;
4242   *flags &= CODING_STATE_ISO2022_LOCK;
4243   iso->esc = ISO_ESC_LOCKING_SHIFT;
4244   return 1;
4245
4246  designated:
4247   if (NILP (cs) && check_invalid_charsets)
4248     {
4249       iso->invalid_designated[reg] = 1;
4250       iso->charset[reg] = Vcharset_ascii;
4251       iso->esc = ISO_ESC_DESIGNATE;
4252       *flags &= CODING_STATE_ISO2022_LOCK;
4253       iso->output_literally = 1;
4254       if (iso->switched_dir_and_no_valid_charset_yet)
4255         {
4256           /* We encountered a switch-direction followed by an
4257              invalid designation.  Ensure that the switch-direction
4258              gets outputted; otherwise it will probably get eaten
4259              when the text is written out again. */
4260           iso->switched_dir_and_no_valid_charset_yet = 0;
4261           iso->output_direction_sequence = 1;
4262           /* And make sure that the switch-dir going the other
4263              way gets outputted, as well. */
4264           iso->invalid_switch_dir = 1;
4265         }
4266       return 1;
4267     }
4268   /* This function is called with CODESYS equal to nil when
4269      doing coding-system detection. */
4270   if (!NILP (codesys))
4271     {
4272       charset_conversion_spec_dynarr *dyn =
4273         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4274
4275       if (dyn)
4276         {
4277           int i;
4278
4279           for (i = 0; i < Dynarr_length (dyn); i++)
4280             {
4281               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4282               if (EQ (cs, spec->from_charset))
4283                 cs = spec->to_charset;
4284             }
4285         }
4286     }
4287
4288   iso->charset[reg] = cs;
4289   iso->esc = ISO_ESC_DESIGNATE;
4290   *flags &= CODING_STATE_ISO2022_LOCK;
4291   if (iso->invalid_designated[reg])
4292     {
4293       iso->invalid_designated[reg] = 0;
4294       iso->output_literally = 1;
4295     }
4296   if (iso->switched_dir_and_no_valid_charset_yet)
4297     iso->switched_dir_and_no_valid_charset_yet = 0;
4298   return 1;
4299 }
4300
4301 static int
4302 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4303                        unsigned int n)
4304 {
4305   int mask;
4306
4307   /* #### There are serious deficiencies in the recognition mechanism
4308      here.  This needs to be much smarter if it's going to cut it.
4309      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4310      it should be detected as Latin-1.
4311      All the ISO2022 stuff in this file should be synced up with the
4312      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4313      Perhaps we should wait till R2L works in FSF Emacs? */
4314
4315   if (!st->iso2022.initted)
4316     {
4317       reset_iso2022 (Qnil, &st->iso2022.iso);
4318       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4319                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4320                           CODING_CATEGORY_ISO_8_1_MASK |
4321                           CODING_CATEGORY_ISO_8_2_MASK |
4322                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4323       st->iso2022.flags = 0;
4324       st->iso2022.high_byte_count = 0;
4325       st->iso2022.saw_single_shift = 0;
4326       st->iso2022.initted = 1;
4327     }
4328
4329   mask = st->iso2022.mask;
4330
4331   while (n--)
4332     {
4333       int c = *src++;
4334       if (c >= 0xA0)
4335         {
4336           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4337           st->iso2022.high_byte_count++;
4338         }
4339       else
4340         {
4341           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4342             {
4343               if (st->iso2022.high_byte_count & 1)
4344                 /* odd number of high bytes; assume not iso-8-2 */
4345                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4346             }
4347           st->iso2022.high_byte_count = 0;
4348           st->iso2022.saw_single_shift = 0;
4349           if (c > 0x80)
4350             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4351         }
4352       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4353           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4354         { /* control chars */
4355           switch (c)
4356             {
4357               /* Allow and ignore control characters that you might
4358                  reasonably see in a text file */
4359             case '\r':
4360             case '\n':
4361             case '\t':
4362             case  7: /* bell */
4363             case  8: /* backspace */
4364             case 11: /* vertical tab */
4365             case 12: /* form feed */
4366             case 26: /* MS-DOS C-z junk */
4367             case 31: /* '^_' -- for info */
4368               goto label_continue_loop;
4369
4370             default:
4371               break;
4372             }
4373         }
4374
4375       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4376           || BYTE_C1_P (c))
4377         {
4378           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4379                                  &st->iso2022.flags, 0))
4380             {
4381               switch (st->iso2022.iso.esc)
4382                 {
4383                 case ISO_ESC_DESIGNATE:
4384                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4385                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4386                   break;
4387                 case ISO_ESC_LOCKING_SHIFT:
4388                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4389                   goto ran_out_of_chars;
4390                 case ISO_ESC_SINGLE_SHIFT:
4391                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4392                   st->iso2022.saw_single_shift = 1;
4393                   break;
4394                 default:
4395                   break;
4396                 }
4397             }
4398           else
4399             {
4400               mask = 0;
4401               goto ran_out_of_chars;
4402             }
4403         }
4404     label_continue_loop:;
4405     }
4406
4407  ran_out_of_chars:
4408
4409   return mask;
4410 }
4411
4412 static int
4413 postprocess_iso2022_mask (int mask)
4414 {
4415   /* #### kind of cheesy */
4416   /* If seven-bit ISO is allowed, then assume that the encoding is
4417      entirely seven-bit and turn off the eight-bit ones. */
4418   if (mask & CODING_CATEGORY_ISO_7_MASK)
4419     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4420                CODING_CATEGORY_ISO_8_1_MASK |
4421                CODING_CATEGORY_ISO_8_2_MASK);
4422   return mask;
4423 }
4424
4425 /* If FLAGS is a null pointer or specifies right-to-left motion,
4426    output a switch-dir-to-left-to-right sequence to DST.
4427    Also update FLAGS if it is not a null pointer.
4428    If INTERNAL_P is set, we are outputting in internal format and
4429    need to handle the CSI differently. */
4430
4431 static void
4432 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4433                                  unsigned_char_dynarr *dst,
4434                                  unsigned int *flags,
4435                                  int internal_p)
4436 {
4437   if (!flags || (*flags & CODING_STATE_R2L))
4438     {
4439       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4440         {
4441           Dynarr_add (dst, ISO_CODE_ESC);
4442           Dynarr_add (dst, '[');
4443         }
4444       else if (internal_p)
4445         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4446       else
4447         Dynarr_add (dst, ISO_CODE_CSI);
4448       Dynarr_add (dst, '0');
4449       Dynarr_add (dst, ']');
4450       if (flags)
4451         *flags &= ~CODING_STATE_R2L;
4452     }
4453 }
4454
4455 /* If FLAGS is a null pointer or specifies a direction different from
4456    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4457    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4458    sequence to DST.  Also update FLAGS if it is not a null pointer.
4459    If INTERNAL_P is set, we are outputting in internal format and
4460    need to handle the CSI differently. */
4461
4462 static void
4463 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4464                           unsigned_char_dynarr *dst, unsigned int *flags,
4465                           int internal_p)
4466 {
4467   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4468       direction == CHARSET_LEFT_TO_RIGHT)
4469     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4470   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4471            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4472            direction == CHARSET_RIGHT_TO_LEFT)
4473     {
4474       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4475         {
4476           Dynarr_add (dst, ISO_CODE_ESC);
4477           Dynarr_add (dst, '[');
4478         }
4479       else if (internal_p)
4480         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4481       else
4482         Dynarr_add (dst, ISO_CODE_CSI);
4483       Dynarr_add (dst, '2');
4484       Dynarr_add (dst, ']');
4485       if (flags)
4486         *flags |= CODING_STATE_R2L;
4487     }
4488 }
4489
4490 /* Convert ISO2022-format data to internal format. */
4491
4492 static void
4493 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4494                        unsigned_char_dynarr *dst, unsigned int n)
4495 {
4496   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4497   unsigned int flags  = str->flags;
4498   unsigned int ch     = str->ch;
4499   eol_type_t eol_type = str->eol_type;
4500 #ifdef ENABLE_COMPOSITE_CHARS
4501   unsigned_char_dynarr *real_dst = dst;
4502 #endif
4503   Lisp_Object coding_system;
4504
4505   XSETCODING_SYSTEM (coding_system, str->codesys);
4506
4507 #ifdef ENABLE_COMPOSITE_CHARS
4508   if (flags & CODING_STATE_COMPOSITE)
4509     dst = str->iso2022.composite_chars;
4510 #endif /* ENABLE_COMPOSITE_CHARS */
4511
4512   while (n--)
4513     {
4514       unsigned char c = *src++;
4515       if (flags & CODING_STATE_ESCAPE)
4516         {       /* Within ESC sequence */
4517           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4518                                           c, &flags, 1);
4519
4520           if (retval)
4521             {
4522               switch (str->iso2022.esc)
4523                 {
4524 #ifdef ENABLE_COMPOSITE_CHARS
4525                 case ISO_ESC_START_COMPOSITE:
4526                   if (str->iso2022.composite_chars)
4527                     Dynarr_reset (str->iso2022.composite_chars);
4528                   else
4529                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4530                   dst = str->iso2022.composite_chars;
4531                   break;
4532                 case ISO_ESC_END_COMPOSITE:
4533                   {
4534                     Bufbyte comstr[MAX_EMCHAR_LEN];
4535                     Bytecount len;
4536                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4537                                                          Dynarr_length (dst));
4538                     dst = real_dst;
4539                     len = set_charptr_emchar (comstr, emch);
4540                     Dynarr_add_many (dst, comstr, len);
4541                     break;
4542                   }
4543 #endif /* ENABLE_COMPOSITE_CHARS */
4544
4545                 case ISO_ESC_LITERAL:
4546                   DECODE_ADD_BINARY_CHAR (c, dst);
4547                   break;
4548
4549                 default:
4550                   /* Everything else handled already */
4551                   break;
4552                 }
4553             }
4554
4555           /* Attempted error recovery. */
4556           if (str->iso2022.output_direction_sequence)
4557             ensure_correct_direction (flags & CODING_STATE_R2L ?
4558                                       CHARSET_RIGHT_TO_LEFT :
4559                                       CHARSET_LEFT_TO_RIGHT,
4560                                       str->codesys, dst, 0, 1);
4561           /* More error recovery. */
4562           if (!retval || str->iso2022.output_literally)
4563             {
4564               /* Output the (possibly invalid) sequence */
4565               int i;
4566               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4567                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4568               flags &= CODING_STATE_ISO2022_LOCK;
4569               if (!retval)
4570                 n++, src--;/* Repeat the loop with the same character. */
4571               else
4572                 {
4573                   /* No sense in reprocessing the final byte of the
4574                      escape sequence; it could mess things up anyway.
4575                      Just add it now. */
4576                   DECODE_ADD_BINARY_CHAR (c, dst);
4577                 }
4578             }
4579           ch = 0;
4580         }
4581       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4582         { /* Control characters */
4583
4584           /***** Error-handling *****/
4585
4586           /* If we were in the middle of a character, dump out the
4587              partial character. */
4588           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4589
4590           /* If we just saw a single-shift character, dump it out.
4591              This may dump out the wrong sort of single-shift character,
4592              but least it will give an indication that something went
4593              wrong. */
4594           if (flags & CODING_STATE_SS2)
4595             {
4596               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4597               flags &= ~CODING_STATE_SS2;
4598             }
4599           if (flags & CODING_STATE_SS3)
4600             {
4601               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4602               flags &= ~CODING_STATE_SS3;
4603             }
4604
4605           /***** Now handle the control characters. *****/
4606
4607           /* Handle CR/LF */
4608           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4609
4610           flags &= CODING_STATE_ISO2022_LOCK;
4611
4612           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4613             DECODE_ADD_BINARY_CHAR (c, dst);
4614         }
4615       else
4616         {                       /* Graphic characters */
4617           Lisp_Object charset;
4618           int lb;
4619           int reg;
4620
4621           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4622
4623           /* Now determine the charset. */
4624           reg = ((flags & CODING_STATE_SS2) ? 2
4625                  : (flags & CODING_STATE_SS3) ? 3
4626                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4627                  : str->iso2022.register_left);
4628           charset = str->iso2022.charset[reg];
4629
4630           /* Error checking: */
4631           if (! CHARSETP (charset)
4632               || str->iso2022.invalid_designated[reg]
4633               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4634                   && XCHARSET_CHARS (charset) == 94))
4635             /* Mrmph.  We are trying to invoke a register that has no
4636                or an invalid charset in it, or trying to add a character
4637                outside the range of the charset.  Insert that char literally
4638                to preserve it for the output. */
4639             {
4640               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4641               DECODE_ADD_BINARY_CHAR (c, dst);
4642             }
4643
4644           else
4645             {
4646               /* Things are probably hunky-dorey. */
4647
4648               /* Fetch reverse charset, maybe. */
4649               if (((flags & CODING_STATE_R2L) &&
4650                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4651                   ||
4652                   (!(flags & CODING_STATE_R2L) &&
4653                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4654                 {
4655                   Lisp_Object new_charset =
4656                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4657                   if (!NILP (new_charset))
4658                     charset = new_charset;
4659                 }
4660
4661               lb = XCHARSET_LEADING_BYTE (charset);
4662               switch (XCHARSET_REP_BYTES (charset))
4663                 {
4664                 case 1: /* ASCII */
4665                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4666                   Dynarr_add (dst, c & 0x7F);
4667                   break;
4668
4669                 case 2: /* one-byte official */
4670                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4671                   Dynarr_add (dst, lb);
4672                   Dynarr_add (dst, c | 0x80);
4673                   break;
4674
4675                 case 3: /* one-byte private or two-byte official */
4676                   if (XCHARSET_PRIVATE_P (charset))
4677                     {
4678                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4679                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4680                       Dynarr_add (dst, lb);
4681                       Dynarr_add (dst, c | 0x80);
4682                     }
4683                   else
4684                     {
4685                       if (ch)
4686                         {
4687                           Dynarr_add (dst, lb);
4688                           Dynarr_add (dst, ch | 0x80);
4689                           Dynarr_add (dst, c | 0x80);
4690                           ch = 0;
4691                         }
4692                       else
4693                         ch = c;
4694                     }
4695                   break;
4696
4697                 default:        /* two-byte private */
4698                   if (ch)
4699                     {
4700                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4701                       Dynarr_add (dst, lb);
4702                       Dynarr_add (dst, ch | 0x80);
4703                       Dynarr_add (dst, c | 0x80);
4704                       ch = 0;
4705                     }
4706                   else
4707                     ch = c;
4708                 }
4709             }
4710
4711           if (!ch)
4712             flags &= CODING_STATE_ISO2022_LOCK;
4713         }
4714
4715     label_continue_loop:;
4716     }
4717
4718   if (flags & CODING_STATE_END)
4719     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4720
4721   str->flags = flags;
4722   str->ch    = ch;
4723 }
4724
4725
4726 /***** ISO2022 encoder *****/
4727
4728 /* Designate CHARSET into register REG. */
4729
4730 static void
4731 iso2022_designate (Lisp_Object charset, unsigned char reg,
4732                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4733 {
4734   static CONST char inter94[] = "()*+";
4735   static CONST char inter96[] = ",-./";
4736   unsigned int type;
4737   unsigned char final;
4738   Lisp_Object old_charset = str->iso2022.charset[reg];
4739
4740   str->iso2022.charset[reg] = charset;
4741   if (!CHARSETP (charset))
4742     /* charset might be an initial nil or t. */
4743     return;
4744   type = XCHARSET_TYPE (charset);
4745   final = XCHARSET_FINAL (charset);
4746   if (!str->iso2022.force_charset_on_output[reg] &&
4747       CHARSETP (old_charset) &&
4748       XCHARSET_TYPE (old_charset) == type &&
4749       XCHARSET_FINAL (old_charset) == final)
4750     return;
4751
4752   str->iso2022.force_charset_on_output[reg] = 0;
4753
4754   {
4755     charset_conversion_spec_dynarr *dyn =
4756       str->codesys->iso2022.output_conv;
4757
4758     if (dyn)
4759       {
4760         int i;
4761
4762         for (i = 0; i < Dynarr_length (dyn); i++)
4763           {
4764             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4765             if (EQ (charset, spec->from_charset))
4766                 charset = spec->to_charset;
4767           }
4768       }
4769   }
4770
4771   Dynarr_add (dst, ISO_CODE_ESC);
4772   switch (type)
4773     {
4774     case CHARSET_TYPE_94:
4775       Dynarr_add (dst, inter94[reg]);
4776       break;
4777     case CHARSET_TYPE_96:
4778       Dynarr_add (dst, inter96[reg]);
4779       break;
4780     case CHARSET_TYPE_94X94:
4781       Dynarr_add (dst, '$');
4782       if (reg != 0
4783           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4784           || final < '@'
4785           || final > 'B')
4786         Dynarr_add (dst, inter94[reg]);
4787       break;
4788     case CHARSET_TYPE_96X96:
4789       Dynarr_add (dst, '$');
4790       Dynarr_add (dst, inter96[reg]);
4791       break;
4792     }
4793   Dynarr_add (dst, final);
4794 }
4795
4796 static void
4797 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4798 {
4799   if (str->iso2022.register_left != 0)
4800     {
4801       Dynarr_add (dst, ISO_CODE_SI);
4802       str->iso2022.register_left = 0;
4803     }
4804 }
4805
4806 static void
4807 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4808 {
4809   if (str->iso2022.register_left != 1)
4810     {
4811       Dynarr_add (dst, ISO_CODE_SO);
4812       str->iso2022.register_left = 1;
4813     }
4814 }
4815
4816 /* Convert internally-formatted data to ISO2022 format. */
4817
4818 static void
4819 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4820                        unsigned_char_dynarr *dst, unsigned int n)
4821 {
4822   unsigned char charmask, c;
4823   unsigned char char_boundary;
4824   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4825   unsigned int flags          = str->flags;
4826   unsigned int ch             = str->ch;
4827   Lisp_Coding_System *codesys = str->codesys;
4828   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4829   int i;
4830   Lisp_Object charset;
4831   int half;
4832
4833 #ifdef ENABLE_COMPOSITE_CHARS
4834   /* flags for handling composite chars.  We do a little switcharoo
4835      on the source while we're outputting the composite char. */
4836   unsigned int saved_n = 0;
4837   CONST unsigned char *saved_src = NULL;
4838   int in_composite = 0;
4839 #endif /* ENABLE_COMPOSITE_CHARS */
4840
4841   char_boundary = str->iso2022.current_char_boundary;
4842   charset = str->iso2022.current_charset;
4843   half = str->iso2022.current_half;
4844
4845 #ifdef ENABLE_COMPOSITE_CHARS
4846  back_to_square_n:
4847 #endif
4848   while (n--)
4849     {
4850       c = *src++;
4851
4852       if (BYTE_ASCII_P (c))
4853         {               /* Processing ASCII character */
4854           ch = 0;
4855
4856           restore_left_to_right_direction (codesys, dst, &flags, 0);
4857
4858           /* Make sure G0 contains ASCII */
4859           if ((c > ' ' && c < ISO_CODE_DEL) ||
4860               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4861             {
4862               ensure_normal_shift (str, dst);
4863               iso2022_designate (Vcharset_ascii, 0, str, dst);
4864             }
4865
4866           /* If necessary, restore everything to the default state
4867              at end-of-line */
4868           if (c == '\n' &&
4869               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4870             {
4871               restore_left_to_right_direction (codesys, dst, &flags, 0);
4872
4873               ensure_normal_shift (str, dst);
4874
4875               for (i = 0; i < 4; i++)
4876                 {
4877                   Lisp_Object initial_charset =
4878                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4879                   iso2022_designate (initial_charset, i, str, dst);
4880                 }
4881             }
4882           if (c == '\n')
4883             {
4884               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4885                 Dynarr_add (dst, '\r');
4886               if (eol_type != EOL_CR)
4887                 Dynarr_add (dst, c);
4888             }
4889           else
4890             {
4891               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4892                   && fit_to_be_escape_quoted (c))
4893                 Dynarr_add (dst, ISO_CODE_ESC);
4894               Dynarr_add (dst, c);
4895             }
4896           char_boundary = 1;
4897         }
4898
4899       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4900         { /* Processing Leading Byte */
4901           ch = 0;
4902           charset = CHARSET_BY_LEADING_BYTE (c);
4903           if (LEADING_BYTE_PREFIX_P(c))
4904             ch = c;
4905           else if (!EQ (charset, Vcharset_control_1)
4906 #ifdef ENABLE_COMPOSITE_CHARS
4907                    && !EQ (charset, Vcharset_composite)
4908 #endif
4909                    )
4910             {
4911               int reg;
4912
4913               ensure_correct_direction (XCHARSET_DIRECTION (charset),
4914                                         codesys, dst, &flags, 0);
4915
4916               /* Now determine which register to use. */
4917               reg = -1;
4918               for (i = 0; i < 4; i++)
4919                 {
4920                   if (EQ (charset, str->iso2022.charset[i]) ||
4921                       EQ (charset,
4922                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4923                     {
4924                       reg = i;
4925                       break;
4926                     }
4927                 }
4928
4929               if (reg == -1)
4930                 {
4931                   if (XCHARSET_GRAPHIC (charset) != 0)
4932                     {
4933                       if (!NILP (str->iso2022.charset[1]) &&
4934                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4935                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4936                         reg = 1;
4937                       else if (!NILP (str->iso2022.charset[2]))
4938                         reg = 2;
4939                       else if (!NILP (str->iso2022.charset[3]))
4940                         reg = 3;
4941                       else
4942                         reg = 0;
4943                     }
4944                   else
4945                     reg = 0;
4946                 }
4947
4948               iso2022_designate (charset, reg, str, dst);
4949
4950               /* Now invoke that register. */
4951               switch (reg)
4952                 {
4953                 case 0:
4954                   ensure_normal_shift (str, dst);
4955                   half = 0;
4956                   break;
4957
4958                 case 1:
4959                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4960                     {
4961                       ensure_shift_out (str, dst);
4962                       half = 0;
4963                     }
4964                   else
4965                     half = 1;
4966                   break;
4967
4968                 case 2:
4969                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4970                     {
4971                       Dynarr_add (dst, ISO_CODE_ESC);
4972                       Dynarr_add (dst, 'N');
4973                       half = 0;
4974                     }
4975                   else
4976                     {
4977                       Dynarr_add (dst, ISO_CODE_SS2);
4978                       half = 1;
4979                     }
4980                   break;
4981
4982                 case 3:
4983                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4984                     {
4985                       Dynarr_add (dst, ISO_CODE_ESC);
4986                       Dynarr_add (dst, 'O');
4987                       half = 0;
4988                     }
4989                   else
4990                     {
4991                       Dynarr_add (dst, ISO_CODE_SS3);
4992                       half = 1;
4993                     }
4994                   break;
4995
4996                 default:
4997                   abort ();
4998                 }
4999             }
5000           char_boundary = 0;
5001         }
5002       else
5003         {                       /* Processing Non-ASCII character */
5004           charmask = (half == 0 ? 0x7F : 0xFF);
5005           char_boundary = 1;
5006           if (EQ (charset, Vcharset_control_1))
5007             {
5008               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5009                   && fit_to_be_escape_quoted (c))
5010                 Dynarr_add (dst, ISO_CODE_ESC);
5011               /* you asked for it ... */
5012               Dynarr_add (dst, c - 0x20);
5013             }
5014           else
5015             {
5016               switch (XCHARSET_REP_BYTES (charset))
5017                 {
5018                 case 2:
5019                   Dynarr_add (dst, c & charmask);
5020                   break;
5021                 case 3:
5022                   if (XCHARSET_PRIVATE_P (charset))
5023                     {
5024                       Dynarr_add (dst, c & charmask);
5025                       ch = 0;
5026                     }
5027                   else if (ch)
5028                     {
5029 #ifdef ENABLE_COMPOSITE_CHARS
5030                       if (EQ (charset, Vcharset_composite))
5031                         {
5032                           if (in_composite)
5033                             {
5034                               /* #### Bother! We don't know how to
5035                                  handle this yet. */
5036                               Dynarr_add (dst, '~');
5037                             }
5038                           else
5039                             {
5040                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5041                                                        ch & 0x7F, c & 0x7F);
5042                               Lisp_Object lstr = composite_char_string (emch);
5043                               saved_n = n;
5044                               saved_src = src;
5045                               in_composite = 1;
5046                               src = XSTRING_DATA   (lstr);
5047                               n   = XSTRING_LENGTH (lstr);
5048                               Dynarr_add (dst, ISO_CODE_ESC);
5049                               Dynarr_add (dst, '0'); /* start composing */
5050                             }
5051                         }
5052                       else
5053 #endif /* ENABLE_COMPOSITE_CHARS */
5054                         {
5055                           Dynarr_add (dst, ch & charmask);
5056                           Dynarr_add (dst, c & charmask);
5057                         }
5058                       ch = 0;
5059                     }
5060                   else
5061                     {
5062                       ch = c;
5063                       char_boundary = 0;
5064                     }
5065                   break;
5066                 case 4:
5067                   if (ch)
5068                     {
5069                       Dynarr_add (dst, ch & charmask);
5070                       Dynarr_add (dst, c & charmask);
5071                       ch = 0;
5072                     }
5073                   else
5074                     {
5075                       ch = c;
5076                       char_boundary = 0;
5077                     }
5078                   break;
5079                 default:
5080                   abort ();
5081                 }
5082             }
5083         }
5084     }
5085
5086 #ifdef ENABLE_COMPOSITE_CHARS
5087   if (in_composite)
5088     {
5089       n = saved_n;
5090       src = saved_src;
5091       in_composite = 0;
5092       Dynarr_add (dst, ISO_CODE_ESC);
5093       Dynarr_add (dst, '1'); /* end composing */
5094       goto back_to_square_n; /* Wheeeeeeeee ..... */
5095     }
5096 #endif /* ENABLE_COMPOSITE_CHARS */
5097
5098   if (char_boundary && flags & CODING_STATE_END)
5099     {
5100       restore_left_to_right_direction (codesys, dst, &flags, 0);
5101       ensure_normal_shift (str, dst);
5102       for (i = 0; i < 4; i++)
5103         {
5104           Lisp_Object initial_charset =
5105             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5106           iso2022_designate (initial_charset, i, str, dst);
5107         }
5108     }
5109
5110   str->flags = flags;
5111   str->ch    = ch;
5112   str->iso2022.current_char_boundary = char_boundary;
5113   str->iso2022.current_charset = charset;
5114   str->iso2022.current_half = half;
5115
5116   /* Verbum caro factum est! */
5117 }
5118 #endif /* MULE */
5119 \f
5120 /************************************************************************/
5121 /*                     No-conversion methods                            */
5122 /************************************************************************/
5123
5124 /* This is used when reading in "binary" files -- i.e. files that may
5125    contain all 256 possible byte values and that are not to be
5126    interpreted as being in any particular decoding. */
5127 static void
5128 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5129                              unsigned_char_dynarr *dst, unsigned int n)
5130 {
5131   unsigned char c;
5132   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5133   unsigned int flags  = str->flags;
5134   unsigned int ch     = str->ch;
5135   eol_type_t eol_type = str->eol_type;
5136
5137   while (n--)
5138     {
5139       c = *src++;
5140
5141       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5142       DECODE_ADD_BINARY_CHAR (c, dst);
5143     label_continue_loop:;
5144     }
5145
5146   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5147
5148   str->flags = flags;
5149   str->ch    = ch;
5150 }
5151
5152 static void
5153 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5154                              unsigned_char_dynarr *dst, unsigned int n)
5155 {
5156   unsigned char c;
5157   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5158   unsigned int flags  = str->flags;
5159   unsigned int ch     = str->ch;
5160   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5161
5162   while (n--)
5163     {
5164       c = *src++;
5165       if (c == '\n')
5166         {
5167           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5168             Dynarr_add (dst, '\r');
5169           if (eol_type != EOL_CR)
5170             Dynarr_add (dst, '\n');
5171           ch = 0;
5172         }
5173       else if (BYTE_ASCII_P (c))
5174         {
5175           assert (ch == 0);
5176           Dynarr_add (dst, c);
5177         }
5178       else if (BUFBYTE_LEADING_BYTE_P (c))
5179         {
5180           assert (ch == 0);
5181           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5182               c == LEADING_BYTE_CONTROL_1)
5183             ch = c;
5184           else
5185             Dynarr_add (dst, '~'); /* untranslatable character */
5186         }
5187       else
5188         {
5189           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5190             Dynarr_add (dst, c);
5191           else if (ch == LEADING_BYTE_CONTROL_1)
5192             {
5193               assert (c < 0xC0);
5194               Dynarr_add (dst, c - 0x20);
5195             }
5196           /* else it should be the second or third byte of an
5197              untranslatable character, so ignore it */
5198           ch = 0;
5199         }
5200     }
5201
5202   str->flags = flags;
5203   str->ch    = ch;
5204 }
5205
5206 \f
5207 /************************************************************************/
5208 /*                   Simple internal/external functions                 */
5209 /************************************************************************/
5210
5211 static Extbyte_dynarr *conversion_out_dynarr;
5212 static Bufbyte_dynarr *conversion_in_dynarr;
5213
5214 /* Determine coding system from coding format */
5215
5216 /* #### not correct for all values of `fmt'! */
5217 static Lisp_Object
5218 external_data_format_to_coding_system (enum external_data_format fmt)
5219 {
5220   switch (fmt)
5221     {
5222     case FORMAT_FILENAME:
5223     case FORMAT_TERMINAL:
5224       if (EQ (Vfile_name_coding_system, Qnil) ||
5225           EQ (Vfile_name_coding_system, Qbinary))
5226         return Qnil;
5227       else
5228         return Fget_coding_system (Vfile_name_coding_system);
5229 #ifdef MULE
5230     case FORMAT_CTEXT:
5231       return Fget_coding_system (Qctext);
5232 #endif
5233     default:
5234       return Qnil;
5235     }
5236 }
5237
5238 Extbyte *
5239 convert_to_external_format (CONST Bufbyte *ptr,
5240                             Bytecount len,
5241                             Extcount *len_out,
5242                             enum external_data_format fmt)
5243 {
5244   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5245
5246   if (!conversion_out_dynarr)
5247     conversion_out_dynarr = Dynarr_new (Extbyte);
5248   else
5249     Dynarr_reset (conversion_out_dynarr);
5250
5251   if (NILP (coding_system))
5252     {
5253       CONST Bufbyte *end = ptr + len;
5254
5255       for (; ptr < end;)
5256         {
5257           Bufbyte c =
5258             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5259             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5260             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5261             '~';
5262
5263           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5264           INC_CHARPTR (ptr);
5265         }
5266
5267 #ifdef ERROR_CHECK_BUFPOS
5268       assert (ptr == end);
5269 #endif
5270     }
5271   else
5272     {
5273       Lisp_Object instream, outstream, da_outstream;
5274       Lstream *istr, *ostr;
5275       struct gcpro gcpro1, gcpro2, gcpro3;
5276       char tempbuf[1024]; /* some random amount */
5277
5278       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5279       da_outstream = make_dynarr_output_stream
5280         ((unsigned_char_dynarr *) conversion_out_dynarr);
5281       outstream =
5282         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5283       istr = XLSTREAM (instream);
5284       ostr = XLSTREAM (outstream);
5285       GCPRO3 (instream, outstream, da_outstream);
5286       while (1)
5287         {
5288           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5289           if (!size_in_bytes)
5290             break;
5291           Lstream_write (ostr, tempbuf, size_in_bytes);
5292         }
5293       Lstream_close (istr);
5294       Lstream_close (ostr);
5295       UNGCPRO;
5296       Lstream_delete (istr);
5297       Lstream_delete (ostr);
5298       Lstream_delete (XLSTREAM (da_outstream));
5299     }
5300
5301   *len_out = Dynarr_length (conversion_out_dynarr);
5302   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5303   return Dynarr_atp (conversion_out_dynarr, 0);
5304 }
5305
5306 Bufbyte *
5307 convert_from_external_format (CONST Extbyte *ptr,
5308                               Extcount len,
5309                               Bytecount *len_out,
5310                               enum external_data_format fmt)
5311 {
5312   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5313
5314   if (!conversion_in_dynarr)
5315     conversion_in_dynarr = Dynarr_new (Bufbyte);
5316   else
5317     Dynarr_reset (conversion_in_dynarr);
5318
5319   if (NILP (coding_system))
5320     {
5321       CONST Extbyte *end = ptr + len;
5322       for (; ptr < end; ptr++)
5323         {
5324           Extbyte c = *ptr;
5325           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5326         }
5327     }
5328   else
5329     {
5330       Lisp_Object instream, outstream, da_outstream;
5331       Lstream *istr, *ostr;
5332       struct gcpro gcpro1, gcpro2, gcpro3;
5333       char tempbuf[1024]; /* some random amount */
5334
5335       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5336       da_outstream = make_dynarr_output_stream
5337         ((unsigned_char_dynarr *) conversion_in_dynarr);
5338       outstream =
5339         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5340       istr = XLSTREAM (instream);
5341       ostr = XLSTREAM (outstream);
5342       GCPRO3 (instream, outstream, da_outstream);
5343       while (1)
5344         {
5345           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5346           if (!size_in_bytes)
5347             break;
5348           Lstream_write (ostr, tempbuf, size_in_bytes);
5349         }
5350       Lstream_close (istr);
5351       Lstream_close (ostr);
5352       UNGCPRO;
5353       Lstream_delete (istr);
5354       Lstream_delete (ostr);
5355       Lstream_delete (XLSTREAM (da_outstream));
5356     }
5357
5358   *len_out = Dynarr_length (conversion_in_dynarr);
5359   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5360   return Dynarr_atp (conversion_in_dynarr, 0);
5361 }
5362
5363 \f
5364 /************************************************************************/
5365 /*                             Initialization                           */
5366 /************************************************************************/
5367
5368 void
5369 syms_of_mule_coding (void)
5370 {
5371   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5372   deferror (&Qcoding_system_error, "coding-system-error",
5373             "Coding-system error", Qio_error);
5374
5375   DEFSUBR (Fcoding_system_p);
5376   DEFSUBR (Ffind_coding_system);
5377   DEFSUBR (Fget_coding_system);
5378   DEFSUBR (Fcoding_system_list);
5379   DEFSUBR (Fcoding_system_name);
5380   DEFSUBR (Fmake_coding_system);
5381   DEFSUBR (Fcopy_coding_system);
5382   DEFSUBR (Fsubsidiary_coding_system);
5383
5384   DEFSUBR (Fcoding_system_type);
5385   DEFSUBR (Fcoding_system_doc_string);
5386 #ifdef MULE
5387   DEFSUBR (Fcoding_system_charset);
5388 #endif
5389   DEFSUBR (Fcoding_system_property);
5390
5391   DEFSUBR (Fcoding_category_list);
5392   DEFSUBR (Fset_coding_priority_list);
5393   DEFSUBR (Fcoding_priority_list);
5394   DEFSUBR (Fset_coding_category_system);
5395   DEFSUBR (Fcoding_category_system);
5396
5397   DEFSUBR (Fdetect_coding_region);
5398   DEFSUBR (Fdecode_coding_region);
5399   DEFSUBR (Fencode_coding_region);
5400 #ifdef MULE
5401   DEFSUBR (Fdecode_shift_jis_char);
5402   DEFSUBR (Fencode_shift_jis_char);
5403   DEFSUBR (Fdecode_big5_char);
5404   DEFSUBR (Fencode_big5_char);
5405   DEFSUBR (Fset_ucs_char);
5406   DEFSUBR (Fucs_char);
5407   DEFSUBR (Fset_char_ucs);
5408   DEFSUBR (Fchar_ucs);
5409 #endif /* MULE */
5410   defsymbol (&Qcoding_system_p, "coding-system-p");
5411   defsymbol (&Qno_conversion, "no-conversion");
5412 #ifdef MULE
5413   defsymbol (&Qbig5, "big5");
5414   defsymbol (&Qshift_jis, "shift-jis");
5415   defsymbol (&Qucs4, "ucs-4");
5416   defsymbol (&Qutf8, "utf-8");
5417   defsymbol (&Qccl, "ccl");
5418   defsymbol (&Qiso2022, "iso2022");
5419 #endif /* MULE */
5420   defsymbol (&Qmnemonic, "mnemonic");
5421   defsymbol (&Qeol_type, "eol-type");
5422   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5423   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5424
5425   defsymbol (&Qcr, "cr");
5426   defsymbol (&Qlf, "lf");
5427   defsymbol (&Qcrlf, "crlf");
5428   defsymbol (&Qeol_cr, "eol-cr");
5429   defsymbol (&Qeol_lf, "eol-lf");
5430   defsymbol (&Qeol_crlf, "eol-crlf");
5431 #ifdef MULE
5432   defsymbol (&Qcharset_g0, "charset-g0");
5433   defsymbol (&Qcharset_g1, "charset-g1");
5434   defsymbol (&Qcharset_g2, "charset-g2");
5435   defsymbol (&Qcharset_g3, "charset-g3");
5436   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5437   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5438   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5439   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5440   defsymbol (&Qno_iso6429, "no-iso6429");
5441   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5442   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5443
5444   defsymbol (&Qshort, "short");
5445   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5446   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5447   defsymbol (&Qseven, "seven");
5448   defsymbol (&Qlock_shift, "lock-shift");
5449   defsymbol (&Qescape_quoted, "escape-quoted");
5450 #endif /* MULE */
5451   defsymbol (&Qencode, "encode");
5452   defsymbol (&Qdecode, "decode");
5453
5454 #ifdef MULE
5455   defsymbol (&Qctext, "ctext");
5456   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5457              "shift-jis");
5458   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5459              "big5");
5460   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5461              "ucs-4");
5462   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5463              "utf-8");
5464   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5465              "iso-7");
5466   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5467              "iso-8-designate");
5468   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5469              "iso-8-1");
5470   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5471              "iso-8-2");
5472   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5473              "iso-lock-shift");
5474 #endif /* MULE */
5475   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5476              "no-conversion");
5477 }
5478
5479 void
5480 lstream_type_create_mule_coding (void)
5481 {
5482   LSTREAM_HAS_METHOD (decoding, reader);
5483   LSTREAM_HAS_METHOD (decoding, writer);
5484   LSTREAM_HAS_METHOD (decoding, rewinder);
5485   LSTREAM_HAS_METHOD (decoding, seekable_p);
5486   LSTREAM_HAS_METHOD (decoding, flusher);
5487   LSTREAM_HAS_METHOD (decoding, closer);
5488   LSTREAM_HAS_METHOD (decoding, marker);
5489
5490   LSTREAM_HAS_METHOD (encoding, reader);
5491   LSTREAM_HAS_METHOD (encoding, writer);
5492   LSTREAM_HAS_METHOD (encoding, rewinder);
5493   LSTREAM_HAS_METHOD (encoding, seekable_p);
5494   LSTREAM_HAS_METHOD (encoding, flusher);
5495   LSTREAM_HAS_METHOD (encoding, closer);
5496   LSTREAM_HAS_METHOD (encoding, marker);
5497 }
5498
5499 void
5500 vars_of_mule_coding (void)
5501 {
5502   int i;
5503
5504   /* Initialize to something reasonable ... */
5505   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5506     {
5507       coding_category_system[i] = Qnil;
5508       coding_category_by_priority[i] = i;
5509     }
5510
5511   Fprovide (intern ("file-coding"));
5512
5513   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5514 Coding system used for TTY keyboard input.
5515 Not used under a windowing system.
5516 */ );
5517   Vkeyboard_coding_system = Qnil;
5518
5519   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5520 Coding system used for TTY display output.
5521 Not used under a windowing system.
5522 */ );
5523   Vterminal_coding_system = Qnil;
5524
5525   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5526 Overriding coding system used when writing a file or process.
5527 You should *bind* this, not set it.  If this is non-nil, it specifies
5528 the coding system that will be used when a file or process is read
5529 in, and overrides `buffer-file-coding-system-for-read',
5530 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5531 this one for permanent changes to the environment.
5532 */ );
5533   Vcoding_system_for_read = Qnil;
5534
5535   DEFVAR_LISP ("coding-system-for-write",
5536                &Vcoding_system_for_write /*
5537 Overriding coding system used when writing a file or process.
5538 You should *bind* this, not set it.  If this is non-nil, it specifies
5539 the coding system that will be used when a file or process is wrote
5540 in, and overrides `buffer-file-coding-system',
5541 `write-region-pre-hook', etc.  Use those variables instead of this one
5542 for permanent changes to the environment.
5543 */ );
5544   Vcoding_system_for_write = Qnil;
5545
5546   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5547 Coding system used to convert pathnames when accessing files.
5548 */ );
5549   Vfile_name_coding_system = Qnil;
5550
5551   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5552 Non-nil means the buffer contents are regarded as multi-byte form
5553 of characters, not a binary code.  This affects the display, file I/O,
5554 and behaviors of various editing commands.
5555
5556 Setting this to nil does not do anything.
5557 */ );
5558   enable_multibyte_characters = 1;
5559 }
5560
5561 void
5562 complex_vars_of_mule_coding (void)
5563 {
5564   staticpro (&Vcoding_system_hash_table);
5565   Vcoding_system_hash_table =
5566     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5567
5568   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5569
5570 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5571 {                                               \
5572   struct codesys_prop csp;                      \
5573   csp.sym = (Sym);                              \
5574   csp.prop_type = (Prop_Type);                  \
5575   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5576 } while (0)
5577
5578   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5579   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5580   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5581   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5582   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5583   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5584   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5585 #ifdef MULE
5586   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5587   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5588   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5589   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5590   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5591   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5592   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5593   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5594   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5595   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5596   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5597   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5598   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5599   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5600   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5601   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5602   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5603
5604   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5605   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5606 #endif /* MULE */
5607   /* Need to create this here or we're really screwed. */
5608   Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
5609                        list2 (Qmnemonic, build_string ("Noconv")));
5610
5611   Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
5612                        Qbinary);
5613
5614   /* Need this for bootstrapping */
5615   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5616     Fget_coding_system (Qno_conversion);
5617
5618 #ifdef MULE
5619   {
5620     unsigned int i;
5621
5622     for (i = 0; i < 65536; i++)
5623       ucs_to_mule_table[i] = Qnil;
5624   }
5625   staticpro (&mule_to_ucs_table);
5626   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5627 #endif /* MULE */
5628 }