src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28 #include "buffer.h"
  29 #include "elhash.h"
  30 #include "insdel.h"
  31 #include "lstream.h"
  32 #ifdef MULE
  33 #include "mule-ccl.h"
  34 #include "chartab.h"
  35 #endif
  36 #include "file-coding.h"
  37
  38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  39
  40 Lisp_Object Vkeyboard_coding_system;
  41 Lisp_Object Vterminal_coding_system;
  42 Lisp_Object Vcoding_system_for_read;
  43 Lisp_Object Vcoding_system_for_write;
  44 Lisp_Object Vfile_name_coding_system;
  45
  46 /* Table of symbols identifying each coding category. */
  47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  48
  49 /* Coding system currently associated with each coding category. */
  50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  51
  52 /* Table of all coding categories in decreasing order of priority.
  53    This describes a permutation of the possible coding categories. */
  54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  55
  56 Lisp_Object Qcoding_system_p;
  57
  58 Lisp_Object Qno_conversion, Qccl, Qiso2022;
  59 /* Qinternal in general.c */
  60
  61 Lisp_Object Qmnemonic, Qeol_type;
  62 Lisp_Object Qcr, Qcrlf, Qlf;
  63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  64 Lisp_Object Qpost_read_conversion;
  65 Lisp_Object Qpre_write_conversion;
  66
  67 #ifdef MULE
  68 Lisp_Object Qucs4, Qutf8;
  69 Lisp_Object Qbig5, Qshift_jis;
  70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  73 Lisp_Object Qno_iso6429;
  74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  75 Lisp_Object Qctext, Qescape_quoted;
  76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  77 #endif
  78 Lisp_Object Qencode, Qdecode;
  79
  80 Lisp_Object Vcoding_system_hash_table;
  81
  82 int enable_multibyte_characters;
  83
  84 #ifdef MULE
  85 /* Additional information used by the ISO2022 decoder and detector. */
  86 struct iso2022_decoder
  87 {
  88   /* CHARSET holds the character sets currently assigned to the G0
  89      through G3 variables.  It is initialized from the array
  90      INITIAL_CHARSET in CODESYS. */
  91   Lisp_Object charset[4];
  92
  93   /* Which registers are currently invoked into the left (GL) and
  94      right (GR) halves of the 8-bit encoding space? */
  95   int register_left, register_right;
  96
  97   /* ISO_ESC holds a value indicating part of an escape sequence
  98      that has already been seen. */
  99   enum iso_esc_flag esc;
 100
 101   /* This records the bytes we've seen so far in an escape sequence,
 102      in case the sequence is invalid (we spit out the bytes unchanged). */
 103   unsigned char esc_bytes[8];
 104
 105   /* Index for next byte to store in ISO escape sequence. */
 106   int esc_bytes_index;
 107
 108 #ifdef ENABLE_COMPOSITE_CHARS
 109   /* Stuff seen so far when composing a string. */
 110   unsigned_char_dynarr *composite_chars;
 111 #endif
 112
 113   /* If we saw an invalid designation sequence for a particular
 114      register, we flag it here and switch to ASCII.  The next time we
 115      see a valid designation for this register, we turn off the flag
 116      and do the designation normally, but pretend the sequence was
 117      invalid.  The effect of all this is that (most of the time) the
 118      escape sequences for both the switch to the unknown charset, and
 119      the switch back to the known charset, get inserted literally into
 120      the buffer and saved out as such.  The hope is that we can
 121      preserve the escape sequences so that the resulting written out
 122      file makes sense.  If we don't do any of this, the designation
 123      to the invalid charset will be preserved but that switch back
 124      to the known charset will probably get eaten because it was
 125      the same charset that was already present in the register. */
 126   unsigned char invalid_designated[4];
 127
 128   /* We try to do similar things as above for direction-switching
 129      sequences.  If we encountered a direction switch while an
 130      invalid designation was present, or an invalid designation
 131      just after a direction switch (i.e. no valid designation
 132      encountered yet), we insert the direction-switch escape
 133      sequence literally into the output stream, and later on
 134      insert the corresponding direction-restoring escape sequence
 135      literally also. */
 136   unsigned int switched_dir_and_no_valid_charset_yet :1;
 137   unsigned int invalid_switch_dir :1;
 138
 139   /* Tells the decoder to output the escape sequence literally
 140      even though it was valid.  Used in the games we play to
 141      avoid lossage when we encounter invalid designations. */
 142   unsigned int output_literally :1;
 143   /* We encountered a direction switch followed by an invalid
 144      designation.  We didn't output the direction switch
 145      literally because we didn't know about the invalid designation;
 146      but we have to do so now. */
 147   unsigned int output_direction_sequence :1;
 148 };
 149 #endif /* MULE */
 150 EXFUN (Fcopy_coding_system, 2);
 151 #ifdef MULE
 152 struct detection_state;
 153 static int detect_coding_sjis (struct detection_state *st,
 154                                CONST unsigned char *src,
 155                                unsigned int n);
 156 static void decode_coding_sjis (Lstream *decoding,
 157                                 CONST unsigned char *src,
 158                                 unsigned_char_dynarr *dst,
 159                                 unsigned int n);
 160 static void encode_coding_sjis (Lstream *encoding,
 161                                 CONST unsigned char *src,
 162                                 unsigned_char_dynarr *dst,
 163                                 unsigned int n);
 164 static int detect_coding_big5 (struct detection_state *st,
 165                                CONST unsigned char *src,
 166                                unsigned int n);
 167 static void decode_coding_big5 (Lstream *decoding,
 168                                 CONST unsigned char *src,
 169                                 unsigned_char_dynarr *dst, unsigned int n);
 170 static void encode_coding_big5 (Lstream *encoding,
 171                                 CONST unsigned char *src,
 172                                 unsigned_char_dynarr *dst, unsigned int n);
 173 static int detect_coding_ucs4 (struct detection_state *st,
 174                                CONST unsigned char *src,
 175                                unsigned int n);
 176 static void decode_coding_ucs4 (Lstream *decoding,
 177                                 CONST unsigned char *src,
 178                                 unsigned_char_dynarr *dst, unsigned int n);
 179 static void encode_coding_ucs4 (Lstream *encoding,
 180                                 CONST unsigned char *src,
 181                                 unsigned_char_dynarr *dst, unsigned int n);
 182 static int detect_coding_utf8 (struct detection_state *st,
 183                                CONST unsigned char *src,
 184                                unsigned int n);
 185 static void decode_coding_utf8 (Lstream *decoding,
 186                                 CONST unsigned char *src,
 187                                 unsigned_char_dynarr *dst, unsigned int n);
 188 static void encode_coding_utf8 (Lstream *encoding,
 189                                 CONST unsigned char *src,
 190                                 unsigned_char_dynarr *dst, unsigned int n);
 191 static int postprocess_iso2022_mask (int mask);
 192 static void reset_iso2022 (Lisp_Object coding_system,
 193                            struct iso2022_decoder *iso);
 194 static int detect_coding_iso2022 (struct detection_state *st,
 195                                   CONST unsigned char *src,
 196                                   unsigned int n);
 197 static void decode_coding_iso2022 (Lstream *decoding,
 198                                    CONST unsigned char *src,
 199                                    unsigned_char_dynarr *dst, unsigned int n);
 200 static void encode_coding_iso2022 (Lstream *encoding,
 201                                    CONST unsigned char *src,
 202                                    unsigned_char_dynarr *dst, unsigned int n);
 203 #endif /* MULE */
 204 static void decode_coding_no_conversion (Lstream *decoding,
 205                                          CONST unsigned char *src,
 206                                          unsigned_char_dynarr *dst,
 207                                          unsigned int n);
 208 static void encode_coding_no_conversion (Lstream *encoding,
 209                                          CONST unsigned char *src,
 210                                          unsigned_char_dynarr *dst,
 211                                          unsigned int n);
 212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 213                          unsigned_char_dynarr *dst, unsigned int n);
 214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 215                          unsigned_char_dynarr *dst, unsigned int n);
 216
 217 typedef struct codesys_prop codesys_prop;
 218 struct codesys_prop
 219 {
 220   Lisp_Object sym;
 221   int prop_type;
 222 };
 223
 224 typedef struct
 225 {
 226   Dynarr_declare (codesys_prop);
 227 } codesys_prop_dynarr;
 228
 229 codesys_prop_dynarr *the_codesys_prop_dynarr;
 230
 231 enum codesys_prop_enum
 232 {
 233   CODESYS_PROP_ALL_OK,
 234   CODESYS_PROP_ISO2022,
 235   CODESYS_PROP_CCL
 236 };
 237
 238 \f
 239 /************************************************************************/
 240 /*                       Coding system functions                        */
 241 /************************************************************************/
 242
 243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 245 static void finalize_coding_system (void *header, int for_disksave);
 246
 247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 248                                mark_coding_system, print_coding_system,
 249                                finalize_coding_system,
 250                                0, 0, struct Lisp_Coding_System);
 251
 252 static Lisp_Object
 253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 254 {
 255   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 256
 257   markobj (CODING_SYSTEM_NAME (codesys));
 258   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 259   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 260   markobj (CODING_SYSTEM_EOL_LF (codesys));
 261   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 262   markobj (CODING_SYSTEM_EOL_CR (codesys));
 263
 264   switch (CODING_SYSTEM_TYPE (codesys))
 265     {
 266 #ifdef MULE
 267       int i;
 268     case CODESYS_ISO2022:
 269       for (i = 0; i < 4; i++)
 270         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 271       if (codesys->iso2022.input_conv)
 272         {
 273           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 274             {
 275               struct charset_conversion_spec *ccs =
 276                 Dynarr_atp (codesys->iso2022.input_conv, i);
 277               markobj (ccs->from_charset);
 278               markobj (ccs->to_charset);
 279             }
 280         }
 281       if (codesys->iso2022.output_conv)
 282         {
 283           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 284             {
 285               struct charset_conversion_spec *ccs =
 286                 Dynarr_atp (codesys->iso2022.output_conv, i);
 287               markobj (ccs->from_charset);
 288               markobj (ccs->to_charset);
 289             }
 290         }
 291       break;
 292
 293     case CODESYS_CCL:
 294       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 295       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 296       break;
 297 #endif /* MULE */
 298     default:
 299       break;
 300     }
 301
 302   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 303   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 304 }
 305
 306 static void
 307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 308                      int escapeflag)
 309 {
 310   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 311   if (print_readably)
 312     error ("printing unreadable object #<coding_system 0x%x>",
 313            c->header.uid);
 314
 315   write_c_string ("#<coding_system ", printcharfun);
 316   print_internal (c->name, printcharfun, 1);
 317   write_c_string (">", printcharfun);
 318 }
 319
 320 static void
 321 finalize_coding_system (void *header, int for_disksave)
 322 {
 323   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 324   /* Since coding systems never go away, this function is not
 325      necessary.  But it would be necessary if we changed things
 326      so that coding systems could go away. */
 327   if (!for_disksave) /* see comment in lstream.c */
 328     {
 329       switch (CODING_SYSTEM_TYPE (c))
 330         {
 331 #ifdef MULE
 332         case CODESYS_ISO2022:
 333           if (c->iso2022.input_conv)
 334             {
 335               Dynarr_free (c->iso2022.input_conv);
 336               c->iso2022.input_conv = 0;
 337             }
 338           if (c->iso2022.output_conv)
 339             {
 340               Dynarr_free (c->iso2022.output_conv);
 341               c->iso2022.output_conv = 0;
 342             }
 343           break;
 344 #endif /* MULE */
 345         default:
 346           break;
 347         }
 348     }
 349 }
 350
 351 static enum eol_type
 352 symbol_to_eol_type (Lisp_Object symbol)
 353 {
 354   CHECK_SYMBOL (symbol);
 355   if (NILP (symbol))      return EOL_AUTODETECT;
 356   if (EQ (symbol, Qlf))   return EOL_LF;
 357   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 358   if (EQ (symbol, Qcr))   return EOL_CR;
 359
 360   signal_simple_error ("Unrecognized eol type", symbol);
 361   return EOL_AUTODETECT; /* not reached */
 362 }
 363
 364 static Lisp_Object
 365 eol_type_to_symbol (enum eol_type type)
 366 {
 367   switch (type)
 368     {
 369     default: abort ();
 370     case EOL_LF:         return Qlf;
 371     case EOL_CRLF:       return Qcrlf;
 372     case EOL_CR:         return Qcr;
 373     case EOL_AUTODETECT: return Qnil;
 374     }
 375 }
 376
 377 static void
 378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 379 {
 380   Lisp_Object codesys_obj;
 381   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 382   char *codesys_name = (char *) alloca (len + 7);
 383   int mlen = -1;
 384   char *codesys_mnemonic=0;
 385
 386   Lisp_Object codesys_name_sym, sub_codesys_obj;
 387
 388   /* kludge */
 389
 390   XSETCODING_SYSTEM (codesys_obj, codesys);
 391
 392   memcpy (codesys_name,
 393           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 394
 395   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 396     {
 397       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 398       codesys_mnemonic = (char *) alloca (mlen + 7);
 399       memcpy (codesys_mnemonic,
 400               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 401     }
 402
 403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 404   strcpy (codesys_name + len, "-" op_sys);                                      \
 405   if (mlen != -1)                                                               \
 406     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 407   codesys_name_sym = intern (codesys_name);                                     \
 408   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 409   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 410   if (mlen != -1)                                                               \
 411     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 412       build_string (codesys_mnemonic);                                          \
 413   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 414 } while (0)
 415
 416   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 417   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 418   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 419 }
 420
 421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 422 Return t if OBJECT is a coding system.
 423 A coding system is an object that defines how text containing multiple
 424 character sets is encoded into a stream of (typically 8-bit) bytes.
 425 The coding system is used to decode the stream into a series of
 426 characters (which may be from multiple charsets) when the text is read
 427 from a file or process, and is used to encode the text back into the
 428 same format when it is written out to a file or process.
 429
 430 For example, many ISO2022-compliant coding systems (such as Compound
 431 Text, which is used for inter-client data under the X Window System)
 432 use escape sequences to switch between different charsets -- Japanese
 433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 435 `make-coding-system' for more information.
 436
 437 Coding systems are normally identified using a symbol, and the
 438 symbol is accepted in place of the actual coding system object whenever
 439 a coding system is called for. (This is similar to how faces work.)
 440 */
 441        (object))
 442 {
 443   return CODING_SYSTEMP (object) ? Qt : Qnil;
 444 }
 445
 446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 447 Retrieve the coding system of the given name.
 448
 449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 450 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 451 If there is no such coding system, nil is returned.  Otherwise the
 452 associated coding system object is returned.
 453 */
 454        (coding_system_or_name))
 455 {
 456   if (CODING_SYSTEMP (coding_system_or_name))
 457     return coding_system_or_name;
 458
 459   if (NILP (coding_system_or_name))
 460     coding_system_or_name = Qbinary;
 461   else
 462     CHECK_SYMBOL (coding_system_or_name);
 463
 464   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 465 }
 466
 467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 468 Retrieve the coding system of the given name.
 469 Same as `find-coding-system' except that if there is no such
 470 coding system, an error is signaled instead of returning nil.
 471 */
 472        (name))
 473 {
 474   Lisp_Object coding_system = Ffind_coding_system (name);
 475
 476   if (NILP (coding_system))
 477     signal_simple_error ("No such coding system", name);
 478   return coding_system;
 479 }
 480
 481 /* We store the coding systems in hash tables with the names as the key and the
 482    actual coding system object as the value.  Occasionally we need to use them
 483    in a list format.  These routines provide us with that. */
 484 struct coding_system_list_closure
 485 {
 486   Lisp_Object *coding_system_list;
 487 };
 488
 489 static int
 490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 491                                   void *coding_system_list_closure)
 492 {
 493   /* This function can GC */
 494   struct coding_system_list_closure *cscl =
 495     (struct coding_system_list_closure *) coding_system_list_closure;
 496   Lisp_Object *coding_system_list = cscl->coding_system_list;
 497
 498   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 499                                *coding_system_list);
 500   return 0;
 501 }
 502
 503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 504 Return a list of the names of all defined coding systems.
 505 */
 506        ())
 507 {
 508   Lisp_Object coding_system_list = Qnil;
 509   struct gcpro gcpro1;
 510   struct coding_system_list_closure coding_system_list_closure;
 511
 512   GCPRO1 (coding_system_list);
 513   coding_system_list_closure.coding_system_list = &coding_system_list;
 514   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 515                  &coding_system_list_closure);
 516   UNGCPRO;
 517
 518   return coding_system_list;
 519 }
 520
 521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 522 Return the name of the given coding system.
 523 */
 524        (coding_system))
 525 {
 526   coding_system = Fget_coding_system (coding_system);
 527   return XCODING_SYSTEM_NAME (coding_system);
 528 }
 529
 530 static Lisp_Coding_System *
 531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 532 {
 533   Lisp_Coding_System *codesys =
 534     alloc_lcrecord_type (Lisp_Coding_System, lrecord_coding_system);
 535
 536   zero_lcrecord (codesys);
 537   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 538   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 539   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 540   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 541   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 542   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 543   CODING_SYSTEM_TYPE     (codesys) = type;
 544   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 545 #ifdef MULE
 546   if (type == CODESYS_ISO2022)
 547     {
 548       int i;
 549       for (i = 0; i < 4; i++)
 550         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 551     }
 552   else if (type == CODESYS_CCL)
 553     {
 554       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 555       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 556     }
 557 #endif /* MULE */
 558   CODING_SYSTEM_NAME (codesys) = name;
 559
 560   return codesys;
 561 }
 562
 563 #ifdef MULE
 564 /* Given a list of charset conversion specs as specified in a Lisp
 565    program, parse it into STORE_HERE. */
 566
 567 static void
 568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 569                                 Lisp_Object spec_list)
 570 {
 571   Lisp_Object rest;
 572
 573   EXTERNAL_LIST_LOOP (rest, spec_list)
 574     {
 575       Lisp_Object car = XCAR (rest);
 576       Lisp_Object from, to;
 577       struct charset_conversion_spec spec;
 578
 579       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 580         signal_simple_error ("Invalid charset conversion spec", car);
 581       from = Fget_charset (XCAR (car));
 582       to = Fget_charset (XCAR (XCDR (car)));
 583       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 584         signal_simple_error_2
 585           ("Attempted conversion between different charset types",
 586            from, to);
 587       spec.from_charset = from;
 588       spec.to_charset = to;
 589
 590       Dynarr_add (store_here, spec);
 591     }
 592 }
 593
 594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 595    specs, return the equivalent as the Lisp programmer would see it.
 596
 597    If LOAD_HERE is 0, return Qnil. */
 598
 599 static Lisp_Object
 600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 601 {
 602   int i;
 603   Lisp_Object result;
 604
 605   if (!load_here)
 606     return Qnil;
 607   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 608     {
 609       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 610       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 611     }
 612
 613   return Fnreverse (result);
 614 }
 615
 616 #endif /* MULE */
 617
 618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 619 Register symbol NAME as a coding system.
 620
 621 TYPE describes the conversion method used and should be one of
 622
 623 nil or 'undecided
 624      Automatic conversion.  XEmacs attempts to detect the coding system
 625      used in the file.
 626 'no-conversion
 627      No conversion.  Use this for binary files and such.  On output,
 628      graphic characters that are not in ASCII or Latin-1 will be
 629      replaced by a ?. (For a no-conversion-encoded buffer, these
 630      characters will only be present if you explicitly insert them.)
 631 'shift-jis
 632      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 633 'ucs-4
 634      ISO 10646 UCS-4 encoding.
 635 'utf-8
 636      ISO 10646 UTF-8 encoding.
 637 'iso2022
 638      Any ISO2022-compliant encoding.  Among other things, this includes
 639      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 640      standard Unix encoding for Japanese and other languages), and
 641      Compound Text (the encoding used in X11).  You can specify more
 642      specific information about the conversion with the FLAGS argument.
 643 'big5
 644      Big5 (the encoding commonly used for Taiwanese).
 645 'ccl
 646      The conversion is performed using a user-written pseudo-code
 647      program.  CCL (Code Conversion Language) is the name of this
 648      pseudo-code.
 649 'internal
 650      Write out or read in the raw contents of the memory representing
 651      the buffer's text.  This is primarily useful for debugging
 652      purposes, and is only enabled when XEmacs has been compiled with
 653      DEBUG_XEMACS defined (via the --debug configure option).
 654      WARNING: Reading in a file using 'internal conversion can result
 655      in an internal inconsistency in the memory representing a
 656      buffer's text, which will produce unpredictable results and may
 657      cause XEmacs to crash.  Under normal circumstances you should
 658      never use 'internal conversion.
 659
 660 DOC-STRING is a string describing the coding system.
 661
 662 PROPS is a property list, describing the specific nature of the
 663 character set.  Recognized properties are:
 664
 665 'mnemonic
 666      String to be displayed in the modeline when this coding system is
 667      active.
 668
 669 'eol-type
 670      End-of-line conversion to be used.  It should be one of
 671
 672         nil
 673                 Automatically detect the end-of-line type (LF, CRLF,
 674                 or CR).  Also generate subsidiary coding systems named
 675                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 676                 identical to this coding system but have an EOL-TYPE
 677                 value of 'lf, 'crlf, and 'cr, respectively.
 678         'lf
 679                 The end of a line is marked externally using ASCII LF.
 680                 Since this is also the way that XEmacs represents an
 681                 end-of-line internally, specifying this option results
 682                 in no end-of-line conversion.  This is the standard
 683                 format for Unix text files.
 684         'crlf
 685                 The end of a line is marked externally using ASCII
 686                 CRLF.  This is the standard format for MS-DOS text
 687                 files.
 688         'cr
 689                 The end of a line is marked externally using ASCII CR.
 690                 This is the standard format for Macintosh text files.
 691         t
 692                 Automatically detect the end-of-line type but do not
 693                 generate subsidiary coding systems.  (This value is
 694                 converted to nil when stored internally, and
 695                 `coding-system-property' will return nil.)
 696
 697 'post-read-conversion
 698      Function called after a file has been read in, to perform the
 699      decoding.  Called with two arguments, BEG and END, denoting
 700      a region of the current buffer to be decoded.
 701
 702 'pre-write-conversion
 703      Function called before a file is written out, to perform the
 704      encoding.  Called with two arguments, BEG and END, denoting
 705      a region of the current buffer to be encoded.
 706
 707
 708 The following additional properties are recognized if TYPE is 'iso2022:
 709
 710 'charset-g0
 711 'charset-g1
 712 'charset-g2
 713 'charset-g3
 714      The character set initially designated to the G0 - G3 registers.
 715      The value should be one of
 716
 717           -- A charset object (designate that character set)
 718           -- nil (do not ever use this register)
 719           -- t (no character set is initially designated to
 720                 the register, but may be later on; this automatically
 721                 sets the corresponding `force-g*-on-output' property)
 722
 723 'force-g0-on-output
 724 'force-g1-on-output
 725 'force-g2-on-output
 726 'force-g2-on-output
 727      If non-nil, send an explicit designation sequence on output before
 728      using the specified register.
 729
 730 'short
 731      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 732      "ESC $ B" on output in place of the full designation sequences
 733      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 734
 735 'no-ascii-eol
 736      If non-nil, don't designate ASCII to G0 at each end of line on output.
 737      Setting this to non-nil also suppresses other state-resetting that
 738      normally happens at the end of a line.
 739
 740 'no-ascii-cntl
 741      If non-nil, don't designate ASCII to G0 before control chars on output.
 742
 743 'seven
 744      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 745      environment.
 746
 747 'lock-shift
 748      If non-nil, use locking-shift (SO/SI) instead of single-shift
 749      or designation by escape sequence.
 750
 751 'no-iso6429
 752      If non-nil, don't use ISO6429's direction specification.
 753
 754 'escape-quoted
 755      If non-nil, literal control characters that are the same as
 756      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 757      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 758      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 759      so that they can be properly distinguished from an escape sequence.
 760      (Note that doing this results in a non-portable encoding.) This
 761      encoding flag is used for byte-compiled files.  Note that ESC
 762      is a good choice for a quoting character because there are no
 763      escape sequences whose second byte is a character from the Control-0
 764      or Control-1 character sets; this is explicitly disallowed by the
 765      ISO2022 standard.
 766
 767 'input-charset-conversion
 768      A list of conversion specifications, specifying conversion of
 769      characters in one charset to another when decoding is performed.
 770      Each specification is a list of two elements: the source charset,
 771      and the destination charset.
 772
 773 'output-charset-conversion
 774      A list of conversion specifications, specifying conversion of
 775      characters in one charset to another when encoding is performed.
 776      The form of each specification is the same as for
 777      'input-charset-conversion.
 778
 779
 780 The following additional properties are recognized (and required)
 781 if TYPE is 'ccl:
 782
 783 'decode
 784      CCL program used for decoding (converting to internal format).
 785
 786 'encode
 787      CCL program used for encoding (converting to external format).
 788 */
 789        (name, type, doc_string, props))
 790 {
 791   Lisp_Coding_System *codesys;
 792   Lisp_Object rest, key, value;
 793   enum coding_system_type ty;
 794   int need_to_setup_eol_systems = 1;
 795
 796   /* Convert type to constant */
 797   if (NILP (type) || EQ (type, Qundecided))
 798                                       { ty = CODESYS_AUTODETECT; }
 799 #ifdef MULE
 800   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 801   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 802   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 803   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 804   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 805   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 806 #endif
 807   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 808 #ifdef DEBUG_XEMACS
 809   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 810 #endif
 811   else
 812     signal_simple_error ("Invalid coding system type", type);
 813
 814   CHECK_SYMBOL (name);
 815
 816   codesys = allocate_coding_system (ty, name);
 817
 818   if (NILP (doc_string))
 819     doc_string = build_string ("");
 820   else
 821     CHECK_STRING (doc_string);
 822   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 823
 824   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 825     {
 826       if (EQ (key, Qmnemonic))
 827         {
 828           if (!NILP (value))
 829             CHECK_STRING (value);
 830           CODING_SYSTEM_MNEMONIC (codesys) = value;
 831         }
 832
 833       else if (EQ (key, Qeol_type))
 834         {
 835           need_to_setup_eol_systems = NILP (value);
 836           if (EQ (value, Qt))
 837             value = Qnil;
 838           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 839         }
 840
 841       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 842       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 843 #ifdef MULE
 844       else if (ty == CODESYS_ISO2022)
 845         {
 846 #define FROB_INITIAL_CHARSET(charset_num) \
 847   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 848     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 849
 850           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 851           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 852           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 853           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 854
 855 #define FROB_FORCE_CHARSET(charset_num) \
 856   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 857
 858           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 859           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 860           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 861           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 862
 863 #define FROB_BOOLEAN_PROPERTY(prop) \
 864   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 865
 866           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 867           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 868           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 869           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 870           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 871           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 872           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 873
 874           else if (EQ (key, Qinput_charset_conversion))
 875             {
 876               codesys->iso2022.input_conv =
 877                 Dynarr_new (charset_conversion_spec);
 878               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 879                                               value);
 880             }
 881           else if (EQ (key, Qoutput_charset_conversion))
 882             {
 883               codesys->iso2022.output_conv =
 884                 Dynarr_new (charset_conversion_spec);
 885               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 886                                               value);
 887             }
 888           else
 889             signal_simple_error ("Unrecognized property", key);
 890         }
 891       else if (EQ (type, Qccl))
 892         {
 893           if (EQ (key, Qdecode))
 894             {
 895               CHECK_VECTOR (value);
 896               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 897             }
 898           else if (EQ (key, Qencode))
 899             {
 900               CHECK_VECTOR (value);
 901               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 902             }
 903           else
 904             signal_simple_error ("Unrecognized property", key);
 905         }
 906 #endif /* MULE */
 907       else
 908         signal_simple_error ("Unrecognized property", key);
 909     }
 910
 911   if (need_to_setup_eol_systems)
 912     setup_eol_coding_systems (codesys);
 913
 914   {
 915     Lisp_Object codesys_obj;
 916     XSETCODING_SYSTEM (codesys_obj, codesys);
 917     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 918     return codesys_obj;
 919   }
 920 }
 921
 922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 923 Copy OLD-CODING-SYSTEM to NEW-NAME.
 924 If NEW-NAME does not name an existing coding system, a new one will
 925 be created.
 926 */
 927        (old_coding_system, new_name))
 928 {
 929   Lisp_Object new_coding_system;
 930   old_coding_system = Fget_coding_system (old_coding_system);
 931   new_coding_system = Ffind_coding_system (new_name);
 932   if (NILP (new_coding_system))
 933     {
 934       XSETCODING_SYSTEM (new_coding_system,
 935                          allocate_coding_system
 936                          (XCODING_SYSTEM_TYPE (old_coding_system),
 937                           new_name));
 938       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 939     }
 940
 941   {
 942     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 943     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 944     memcpy (((char *) to  ) + sizeof (to->header),
 945             ((char *) from) + sizeof (from->header),
 946             sizeof (*from) - sizeof (from->header));
 947     to->name = new_name;
 948   }
 949   return new_coding_system;
 950 }
 951
 952 static Lisp_Object
 953 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
 954 {
 955   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
 956   Lisp_Object new_coding_system;
 957
 958   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
 959     return coding_system;
 960
 961   switch (type)
 962     {
 963     case EOL_AUTODETECT: return coding_system;
 964     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
 965     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
 966     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
 967     default:       abort ();
 968     }
 969
 970   return NILP (new_coding_system) ? coding_system : new_coding_system;
 971 }
 972
 973 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
 974 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
 975 */
 976        (coding_system, eol_type))
 977 {
 978   coding_system = Fget_coding_system (coding_system);
 979
 980   return subsidiary_coding_system (coding_system,
 981                                    symbol_to_eol_type (eol_type));
 982 }
 983
 984 \f
 985 /************************************************************************/
 986 /*                         Coding system accessors                      */
 987 /************************************************************************/
 988
 989 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
 990 Return the doc string for CODING-SYSTEM.
 991 */
 992        (coding_system))
 993 {
 994   coding_system = Fget_coding_system (coding_system);
 995   return XCODING_SYSTEM_DOC_STRING (coding_system);
 996 }
 997
 998 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
 999 Return the type of CODING-SYSTEM.
1000 */
1001        (coding_system))
1002 {
1003   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1004     {
1005     default: abort ();
1006     case CODESYS_AUTODETECT:    return Qundecided;
1007 #ifdef MULE
1008     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1009     case CODESYS_ISO2022:       return Qiso2022;
1010     case CODESYS_BIG5:          return Qbig5;
1011     case CODESYS_UCS4:          return Qucs4;
1012     case CODESYS_UTF8:          return Qutf8;
1013     case CODESYS_CCL:           return Qccl;
1014 #endif
1015     case CODESYS_NO_CONVERSION: return Qno_conversion;
1016 #ifdef DEBUG_XEMACS
1017     case CODESYS_INTERNAL:      return Qinternal;
1018 #endif
1019     }
1020 }
1021
1022 #ifdef MULE
1023 static
1024 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1025 {
1026   Lisp_Object cs
1027     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1028
1029   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1030 }
1031
1032 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1033 Return initial charset of CODING-SYSTEM designated to GNUM.
1034 GNUM allows 0 .. 3.
1035 */
1036        (coding_system, gnum))
1037 {
1038   coding_system = Fget_coding_system (coding_system);
1039   CHECK_INT (gnum);
1040
1041   return coding_system_charset (coding_system, XINT (gnum));
1042 }
1043 #endif /* MULE */
1044
1045 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1046 Return the PROP property of CODING-SYSTEM.
1047 */
1048        (coding_system, prop))
1049 {
1050   int i, ok = 0;
1051   enum coding_system_type type;
1052
1053   coding_system = Fget_coding_system (coding_system);
1054   CHECK_SYMBOL (prop);
1055   type = XCODING_SYSTEM_TYPE (coding_system);
1056
1057   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1058     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1059       {
1060         ok = 1;
1061         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1062           {
1063           case CODESYS_PROP_ALL_OK:
1064             break;
1065 #ifdef MULE
1066           case CODESYS_PROP_ISO2022:
1067             if (type != CODESYS_ISO2022)
1068               signal_simple_error
1069                 ("Property only valid in ISO2022 coding systems",
1070                  prop);
1071             break;
1072
1073           case CODESYS_PROP_CCL:
1074             if (type != CODESYS_CCL)
1075               signal_simple_error
1076                 ("Property only valid in CCL coding systems",
1077                  prop);
1078             break;
1079 #endif /* MULE */
1080           default:
1081             abort ();
1082           }
1083       }
1084
1085   if (!ok)
1086     signal_simple_error ("Unrecognized property", prop);
1087
1088   if (EQ (prop, Qname))
1089     return XCODING_SYSTEM_NAME (coding_system);
1090   else if (EQ (prop, Qtype))
1091     return Fcoding_system_type (coding_system);
1092   else if (EQ (prop, Qdoc_string))
1093     return XCODING_SYSTEM_DOC_STRING (coding_system);
1094   else if (EQ (prop, Qmnemonic))
1095     return XCODING_SYSTEM_MNEMONIC (coding_system);
1096   else if (EQ (prop, Qeol_type))
1097     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1098   else if (EQ (prop, Qeol_lf))
1099     return XCODING_SYSTEM_EOL_LF (coding_system);
1100   else if (EQ (prop, Qeol_crlf))
1101     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1102   else if (EQ (prop, Qeol_cr))
1103     return XCODING_SYSTEM_EOL_CR (coding_system);
1104   else if (EQ (prop, Qpost_read_conversion))
1105     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1106   else if (EQ (prop, Qpre_write_conversion))
1107     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1108 #ifdef MULE
1109   else if (type == CODESYS_ISO2022)
1110     {
1111       if (EQ (prop, Qcharset_g0))
1112         return coding_system_charset (coding_system, 0);
1113       else if (EQ (prop, Qcharset_g1))
1114         return coding_system_charset (coding_system, 1);
1115       else if (EQ (prop, Qcharset_g2))
1116         return coding_system_charset (coding_system, 2);
1117       else if (EQ (prop, Qcharset_g3))
1118         return coding_system_charset (coding_system, 3);
1119
1120 #define FORCE_CHARSET(charset_num) \
1121   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1122    (coding_system, charset_num) ? Qt : Qnil)
1123
1124       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1125       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1126       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1127       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1128
1129 #define LISP_BOOLEAN(prop) \
1130   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1131
1132       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1133       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1134       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1135       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1136       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1137       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1138       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1139
1140       else if (EQ (prop, Qinput_charset_conversion))
1141         return
1142           unparse_charset_conversion_specs
1143             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1144       else if (EQ (prop, Qoutput_charset_conversion))
1145         return
1146           unparse_charset_conversion_specs
1147             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1148       else
1149         abort ();
1150     }
1151   else if (type == CODESYS_CCL)
1152     {
1153       if (EQ (prop, Qdecode))
1154         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1155       else if (EQ (prop, Qencode))
1156         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1157       else
1158         abort ();
1159     }
1160 #endif /* MULE */
1161   else
1162     abort ();
1163
1164   return Qnil; /* not reached */
1165 }
1166
1167 \f
1168 /************************************************************************/
1169 /*                       Coding category functions                      */
1170 /************************************************************************/
1171
1172 static int
1173 decode_coding_category (Lisp_Object symbol)
1174 {
1175   int i;
1176
1177   CHECK_SYMBOL (symbol);
1178   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1179     if (EQ (coding_category_symbol[i], symbol))
1180       return i;
1181
1182   signal_simple_error ("Unrecognized coding category", symbol);
1183   return 0; /* not reached */
1184 }
1185
1186 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1187 Return a list of all recognized coding categories.
1188 */
1189        ())
1190 {
1191   int i;
1192   Lisp_Object list = Qnil;
1193
1194   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1195     list = Fcons (coding_category_symbol[i], list);
1196   return list;
1197 }
1198
1199 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1200 Change the priority order of the coding categories.
1201 LIST should be list of coding categories, in descending order of
1202 priority.  Unspecified coding categories will be lower in priority
1203 than all specified ones, in the same relative order they were in
1204 previously.
1205 */
1206        (list))
1207 {
1208   int category_to_priority[CODING_CATEGORY_LAST + 1];
1209   int i, j;
1210   Lisp_Object rest;
1211
1212   /* First generate a list that maps coding categories to priorities. */
1213
1214   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1215     category_to_priority[i] = -1;
1216
1217   /* Highest priority comes from the specified list. */
1218   i = 0;
1219   EXTERNAL_LIST_LOOP (rest, list)
1220     {
1221       int cat = decode_coding_category (XCAR (rest));
1222
1223       if (category_to_priority[cat] >= 0)
1224         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1225       category_to_priority[cat] = i++;
1226     }
1227
1228   /* Now go through the existing categories by priority to retrieve
1229      the categories not yet specified and preserve their priority
1230      order. */
1231   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1232     {
1233       int cat = coding_category_by_priority[j];
1234       if (category_to_priority[cat] < 0)
1235         category_to_priority[cat] = i++;
1236     }
1237
1238   /* Now we need to construct the inverse of the mapping we just
1239      constructed. */
1240
1241   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1242     coding_category_by_priority[category_to_priority[i]] = i;
1243
1244   /* Phew!  That was confusing. */
1245   return Qnil;
1246 }
1247
1248 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1249 Return a list of coding categories in descending order of priority.
1250 */
1251        ())
1252 {
1253   int i;
1254   Lisp_Object list = Qnil;
1255
1256   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1257     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1258                   list);
1259   return list;
1260 }
1261
1262 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1263 Change the coding system associated with a coding category.
1264 */
1265        (coding_category, coding_system))
1266 {
1267   int cat = decode_coding_category (coding_category);
1268
1269   coding_system = Fget_coding_system (coding_system);
1270   coding_category_system[cat] = coding_system;
1271   return Qnil;
1272 }
1273
1274 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1275 Return the coding system associated with a coding category.
1276 */
1277        (coding_category))
1278 {
1279   int cat = decode_coding_category (coding_category);
1280   Lisp_Object sys = coding_category_system[cat];
1281
1282   if (!NILP (sys))
1283     return XCODING_SYSTEM_NAME (sys);
1284   return Qnil;
1285 }
1286
1287 \f
1288 /************************************************************************/
1289 /*                     Detecting the encoding of data                   */
1290 /************************************************************************/
1291
1292 struct detection_state
1293 {
1294   enum eol_type eol_type;
1295   int seen_non_ascii;
1296   int mask;
1297 #ifdef MULE
1298   struct
1299     {
1300       int mask;
1301       int in_second_byte;
1302     }
1303   big5;
1304
1305   struct
1306     {
1307       int mask;
1308       int in_second_byte;
1309     }
1310   shift_jis;
1311
1312   struct
1313     {
1314       int mask;
1315       int in_byte;
1316   }
1317   ucs4;
1318
1319   struct
1320     {
1321       int mask;
1322       int in_byte;
1323     }
1324   utf8;
1325
1326   struct
1327     {
1328       int mask;
1329       int initted;
1330       struct iso2022_decoder iso;
1331       unsigned int flags;
1332       int high_byte_count;
1333       unsigned int saw_single_shift:1;
1334     }
1335   iso2022;
1336 #endif
1337   struct
1338     {
1339       int seen_anything;
1340       int just_saw_cr;
1341     }
1342   eol;
1343 };
1344
1345 static int
1346 acceptable_control_char_p (int c)
1347 {
1348   switch (c)
1349     {
1350       /* Allow and ignore control characters that you might
1351          reasonably see in a text file */
1352     case '\r':
1353     case '\n':
1354     case '\t':
1355     case  7: /* bell */
1356     case  8: /* backspace */
1357     case 11: /* vertical tab */
1358     case 12: /* form feed */
1359     case 26: /* MS-DOS C-z junk */
1360     case 31: /* '^_' -- for info */
1361       return 1;
1362     default:
1363       return 0;
1364     }
1365 }
1366
1367 static int
1368 mask_has_at_most_one_bit_p (int mask)
1369 {
1370   /* Perhaps the only thing useful you learn from intensive Microsoft
1371      technical interviews */
1372   return (mask & (mask - 1)) == 0;
1373 }
1374
1375 static enum eol_type
1376 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1377                  unsigned int n)
1378 {
1379   int c;
1380
1381   while (n--)
1382     {
1383       c = *src++;
1384       if (c == '\r')
1385         st->eol.just_saw_cr = 1;
1386       else
1387         {
1388           if (c == '\n')
1389             {
1390               if (st->eol.just_saw_cr)
1391                 return EOL_CRLF;
1392               else if (st->eol.seen_anything)
1393                 return EOL_LF;
1394             }
1395           else if (st->eol.just_saw_cr)
1396             return EOL_CR;
1397           st->eol.just_saw_cr = 0;
1398         }
1399       st->eol.seen_anything = 1;
1400     }
1401
1402   return EOL_AUTODETECT;
1403 }
1404
1405 /* Attempt to determine the encoding and EOL type of the given text.
1406    Before calling this function for the first type, you must initialize
1407    st->eol_type as appropriate and initialize st->mask to ~0.
1408
1409    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1410    not yet known.
1411
1412    st->mask holds the determined coding category mask, or ~0 if only
1413    ASCII has been seen so far.
1414
1415    Returns:
1416
1417    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1418         is present in st->mask
1419    1 == definitive answers are here for both st->eol_type and st->mask
1420 */
1421
1422 static int
1423 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1424                     unsigned int n, int just_do_eol)
1425 {
1426   int c;
1427
1428   if (st->eol_type == EOL_AUTODETECT)
1429     st->eol_type = detect_eol_type (st, src, n);
1430
1431   if (just_do_eol)
1432     return st->eol_type != EOL_AUTODETECT;
1433
1434   if (!st->seen_non_ascii)
1435     {
1436       for (; n; n--, src++)
1437         {
1438           c = *src;
1439           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1440             {
1441               st->seen_non_ascii = 1;
1442 #ifdef MULE
1443               st->shift_jis.mask = ~0;
1444               st->big5.mask = ~0;
1445               st->ucs4.mask = ~0;
1446               st->utf8.mask = ~0;
1447               st->iso2022.mask = ~0;
1448 #endif
1449               break;
1450             }
1451         }
1452     }
1453
1454   if (!n)
1455     return 0;
1456 #ifdef MULE
1457   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1458     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1459   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1460     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1461   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1462     st->big5.mask = detect_coding_big5 (st, src, n);
1463   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1464     st->utf8.mask = detect_coding_utf8 (st, src, n);
1465   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1466     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1467
1468   st->mask
1469     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1470     | st->utf8.mask | st->ucs4.mask;
1471 #endif
1472   {
1473     int retval = mask_has_at_most_one_bit_p (st->mask);
1474     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1475     return retval && st->eol_type != EOL_AUTODETECT;
1476   }
1477 }
1478
1479 static Lisp_Object
1480 coding_system_from_mask (int mask)
1481 {
1482   if (mask == ~0)
1483     {
1484       /* If the file was entirely or basically ASCII, use the
1485          default value of `buffer-file-coding-system'. */
1486       Lisp_Object retval =
1487         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1488       if (!NILP (retval))
1489         {
1490           retval = Ffind_coding_system (retval);
1491           if (NILP (retval))
1492             {
1493               warn_when_safe
1494                 (Qbad_variable, Qwarning,
1495                  "Invalid `default-buffer-file-coding-system', set to nil");
1496               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1497             }
1498         }
1499       if (NILP (retval))
1500         retval = Fget_coding_system (Qno_conversion);
1501       return retval;
1502     }
1503   else
1504     {
1505       int i;
1506       int cat = -1;
1507 #ifdef MULE
1508       mask = postprocess_iso2022_mask (mask);
1509 #endif
1510       /* Look through the coding categories by priority and find
1511          the first one that is allowed. */
1512       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1513         {
1514           cat = coding_category_by_priority[i];
1515           if ((mask & (1 << cat)) &&
1516               !NILP (coding_category_system[cat]))
1517             break;
1518         }
1519       if (cat >= 0)
1520         return coding_category_system[cat];
1521       else
1522         return Fget_coding_system (Qno_conversion);
1523     }
1524 }
1525
1526 /* Given a seekable read stream and potential coding system and EOL type
1527    as specified, do any autodetection that is called for.  If the
1528    coding system and/or EOL type are not autodetect, they will be left
1529    alone; but this function will never return an autodetect coding system
1530    or EOL type.
1531
1532    This function does not automatically fetch subsidiary coding systems;
1533    that should be unnecessary with the explicit eol-type argument. */
1534
1535 void
1536 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1537                               enum eol_type *eol_type_in_out)
1538 {
1539   struct detection_state decst;
1540
1541   if (*eol_type_in_out == EOL_AUTODETECT)
1542     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1543
1544   xzero (decst);
1545   decst.eol_type = *eol_type_in_out;
1546   decst.mask = ~0;
1547
1548   /* If autodetection is called for, do it now. */
1549   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1550       *eol_type_in_out == EOL_AUTODETECT)
1551     {
1552
1553       while (1)
1554         {
1555           unsigned char random_buffer[4096];
1556           int nread;
1557
1558           nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1559           if (!nread)
1560             break;
1561           if (detect_coding_type (&decst, random_buffer, nread,
1562                                   XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1563                                   CODESYS_AUTODETECT))
1564             break;
1565         }
1566
1567       *eol_type_in_out = decst.eol_type;
1568       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1569         *codesys_in_out = coding_system_from_mask (decst.mask);
1570     }
1571
1572   /* If we absolutely can't determine the EOL type, just assume LF. */
1573   if (*eol_type_in_out == EOL_AUTODETECT)
1574     *eol_type_in_out = EOL_LF;
1575
1576   Lstream_rewind (stream);
1577 }
1578
1579 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1580 Detect coding system of the text in the region between START and END.
1581 Returned a list of possible coding systems ordered by priority.
1582 If only ASCII characters are found, it returns 'undecided or one of
1583 its subsidiary coding systems according to a detected end-of-line
1584 type.  Optional arg BUFFER defaults to the current buffer.
1585 */
1586        (start, end, buffer))
1587 {
1588   Lisp_Object val = Qnil;
1589   struct buffer *buf = decode_buffer (buffer, 0);
1590   Bufpos b, e;
1591   Lisp_Object instream, lb_instream;
1592   Lstream *istr, *lb_istr;
1593   struct detection_state decst;
1594   struct gcpro gcpro1, gcpro2;
1595
1596   get_buffer_range_char (buf, start, end, &b, &e, 0);
1597   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1598   lb_istr = XLSTREAM (lb_instream);
1599   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1600   istr = XLSTREAM (instream);
1601   GCPRO2 (instream, lb_instream);
1602   xzero (decst);
1603   decst.eol_type = EOL_AUTODETECT;
1604   decst.mask = ~0;
1605   while (1)
1606     {
1607       unsigned char random_buffer[4096];
1608       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1609
1610       if (!nread)
1611         break;
1612       if (detect_coding_type (&decst, random_buffer, nread, 0))
1613         break;
1614     }
1615
1616   if (decst.mask == ~0)
1617     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1618                                     decst.eol_type);
1619   else
1620     {
1621       int i;
1622
1623       val = Qnil;
1624 #ifdef MULE
1625       decst.mask = postprocess_iso2022_mask (decst.mask);
1626 #endif
1627       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1628         {
1629           int sys = coding_category_by_priority[i];
1630           if (decst.mask & (1 << sys))
1631             {
1632               Lisp_Object codesys = coding_category_system[sys];
1633               if (!NILP (codesys))
1634                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1635               val = Fcons (codesys, val);
1636             }
1637         }
1638     }
1639   Lstream_close (istr);
1640   UNGCPRO;
1641   Lstream_delete (istr);
1642   Lstream_delete (lb_istr);
1643   return val;
1644 }
1645
1646 \f
1647 /************************************************************************/
1648 /*           Converting to internal Mule format ("decoding")            */
1649 /************************************************************************/
1650
1651 /* A decoding stream is a stream used for decoding text (i.e.
1652    converting from some external format to internal format).
1653    The decoding-stream object keeps track of the actual coding
1654    stream, the stream that is at the other end, and data that
1655    needs to be persistent across the lifetime of the stream. */
1656
1657 /* Handle the EOL stuff related to just-read-in character C.
1658    EOL_TYPE is the EOL type of the coding stream.
1659    FLAGS is the current value of FLAGS in the coding stream, and may
1660    be modified by this macro.  (The macro only looks at the
1661    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1662    bytes are to be written.  You need to also define a local goto
1663    label "label_continue_loop" that is at the end of the main
1664    character-reading loop.
1665
1666    If C is a CR character, then this macro handles it entirely and
1667    jumps to label_continue_loop.  Otherwise, this macro does not add
1668    anything to DST, and continues normally.  You should continue
1669    processing C normally after this macro. */
1670
1671 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1672 do {                                                            \
1673   if (c == '\r')                                                \
1674     {                                                           \
1675       if (eol_type == EOL_CR)                                   \
1676         Dynarr_add (dst, '\n');                                 \
1677       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1678         Dynarr_add (dst, c);                                    \
1679       else                                                      \
1680         flags |= CODING_STATE_CR;                               \
1681       goto label_continue_loop;                                 \
1682     }                                                           \
1683   else if (flags & CODING_STATE_CR)                             \
1684     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1685       if (c != '\n')                                            \
1686         Dynarr_add (dst, '\r');                                 \
1687       flags &= ~CODING_STATE_CR;                                \
1688     }                                                           \
1689 } while (0)
1690
1691 /* C should be a binary character in the range 0 - 255; convert
1692    to internal format and add to Dynarr DST. */
1693
1694 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1695 do {                                            \
1696   if (BYTE_ASCII_P (c))                         \
1697     Dynarr_add (dst, c);                        \
1698   else if (BYTE_C1_P (c))                       \
1699     {                                           \
1700       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1701       Dynarr_add (dst, c + 0x20);               \
1702     }                                           \
1703   else                                          \
1704     {                                           \
1705       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1706       Dynarr_add (dst, c);                      \
1707     }                                           \
1708 } while (0)
1709
1710 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1711 do {                                    \
1712   if (ch)                               \
1713     {                                   \
1714       DECODE_ADD_BINARY_CHAR (ch, dst); \
1715       ch = 0;                           \
1716     }                                   \
1717 } while (0)
1718
1719 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1720 do {                                    \
1721   if (flags & CODING_STATE_END)         \
1722     {                                   \
1723       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1724       if (flags & CODING_STATE_CR)      \
1725         Dynarr_add (dst, '\r');         \
1726     }                                   \
1727 } while (0)
1728
1729 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1730
1731 struct decoding_stream
1732 {
1733   /* Coding system that governs the conversion. */
1734   Lisp_Coding_System *codesys;
1735
1736   /* Stream that we read the encoded data from or
1737      write the decoded data to. */
1738   Lstream *other_end;
1739
1740   /* If we are reading, then we can return only a fixed amount of
1741      data, so if the conversion resulted in too much data, we store it
1742      here for retrieval the next time around. */
1743   unsigned_char_dynarr *runoff;
1744
1745   /* FLAGS holds flags indicating the current state of the decoding.
1746      Some of these flags are dependent on the coding system. */
1747   unsigned int flags;
1748
1749   /* CH holds a partially built-up character.  Since we only deal
1750      with one- and two-byte characters at the moment, we only use
1751      this to store the first byte of a two-byte character. */
1752   unsigned int ch;
1753
1754   /* EOL_TYPE specifies the type of end-of-line conversion that
1755      currently applies.  We need to keep this separate from the
1756      EOL type stored in CODESYS because the latter might indicate
1757      automatic EOL-type detection while the former will always
1758      indicate a particular EOL type. */
1759   enum eol_type eol_type;
1760 #ifdef MULE
1761   /* Additional ISO2022 information.  We define the structure above
1762      because it's also needed by the detection routines. */
1763   struct iso2022_decoder iso2022;
1764
1765   /* Additional information (the state of the running CCL program)
1766      used by the CCL decoder. */
1767   struct ccl_program ccl;
1768 #endif
1769   struct detection_state decst;
1770 };
1771
1772 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1773 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1774 static int decoding_rewinder   (Lstream *stream);
1775 static int decoding_seekable_p (Lstream *stream);
1776 static int decoding_flusher    (Lstream *stream);
1777 static int decoding_closer     (Lstream *stream);
1778
1779 static Lisp_Object decoding_marker (Lisp_Object stream,
1780                                     void (*markobj) (Lisp_Object));
1781
1782 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1783                                sizeof (struct decoding_stream));
1784
1785 static Lisp_Object
1786 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1787 {
1788   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1789   Lisp_Object str_obj;
1790
1791   /* We do not need to mark the coding systems or charsets stored
1792      within the stream because they are stored in a global list
1793      and automatically marked. */
1794
1795   XSETLSTREAM (str_obj, str);
1796   markobj (str_obj);
1797   if (str->imp->marker)
1798     return (str->imp->marker) (str_obj, markobj);
1799   else
1800     return Qnil;
1801 }
1802
1803 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1804    so we read data from the other end, decode it, and store it into DATA. */
1805
1806 static int
1807 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1808 {
1809   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1810   unsigned char *orig_data = data;
1811   int read_size;
1812   int error_occurred = 0;
1813
1814   /* We need to interface to mule_decode(), which expects to take some
1815      amount of data and store the result into a Dynarr.  We have
1816      mule_decode() store into str->runoff, and take data from there
1817      as necessary. */
1818
1819   /* We loop until we have enough data, reading chunks from the other
1820      end and decoding it. */
1821   while (1)
1822     {
1823       /* Take data from the runoff if we can.  Make sure to take at
1824          most SIZE bytes, and delete the data from the runoff. */
1825       if (Dynarr_length (str->runoff) > 0)
1826         {
1827           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1828           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1829           Dynarr_delete_many (str->runoff, 0, chunk);
1830           data += chunk;
1831           size -= chunk;
1832         }
1833
1834       if (size == 0)
1835         break; /* No more room for data */
1836
1837       if (str->flags & CODING_STATE_END)
1838         /* This means that on the previous iteration, we hit the EOF on
1839            the other end.  We loop once more so that mule_decode() can
1840            output any final stuff it may be holding, or any "go back
1841            to a sane state" escape sequences. (This latter makes sense
1842            during encoding.) */
1843         break;
1844
1845       /* Exhausted the runoff, so get some more.  DATA has at least
1846          SIZE bytes left of storage in it, so it's OK to read directly
1847          into it.  (We'll be overwriting above, after we've decoded it
1848          into the runoff.) */
1849       read_size = Lstream_read (str->other_end, data, size);
1850       if (read_size < 0)
1851         {
1852           error_occurred = 1;
1853           break;
1854         }
1855       if (read_size == 0)
1856         /* There might be some more end data produced in the translation.
1857            See the comment above. */
1858         str->flags |= CODING_STATE_END;
1859       mule_decode (stream, data, str->runoff, read_size);
1860     }
1861
1862   if (data - orig_data == 0)
1863     return error_occurred ? -1 : 0;
1864   else
1865     return data - orig_data;
1866 }
1867
1868 static int
1869 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1870 {
1871   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1872   int retval;
1873
1874   /* Decode all our data into the runoff, and then attempt to write
1875      it all out to the other end.  Remove whatever chunk we succeeded
1876      in writing. */
1877   mule_decode (stream, data, str->runoff, size);
1878   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1879                           Dynarr_length (str->runoff));
1880   if (retval > 0)
1881     Dynarr_delete_many (str->runoff, 0, retval);
1882   /* Do NOT return retval.  The return value indicates how much
1883      of the incoming data was written, not how many bytes were
1884      written. */
1885   return size;
1886 }
1887
1888 static void
1889 reset_decoding_stream (struct decoding_stream *str)
1890 {
1891 #ifdef MULE
1892   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1893     {
1894       Lisp_Object coding_system;
1895       XSETCODING_SYSTEM (coding_system, str->codesys);
1896       reset_iso2022 (coding_system, &str->iso2022);
1897     }
1898   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1899     {
1900       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1901     }
1902 #endif /* MULE */
1903   str->flags = str->ch = 0;
1904 }
1905
1906 static int
1907 decoding_rewinder (Lstream *stream)
1908 {
1909   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1910   reset_decoding_stream (str);
1911   Dynarr_reset (str->runoff);
1912   return Lstream_rewind (str->other_end);
1913 }
1914
1915 static int
1916 decoding_seekable_p (Lstream *stream)
1917 {
1918   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1919   return Lstream_seekable_p (str->other_end);
1920 }
1921
1922 static int
1923 decoding_flusher (Lstream *stream)
1924 {
1925   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1926   return Lstream_flush (str->other_end);
1927 }
1928
1929 static int
1930 decoding_closer (Lstream *stream)
1931 {
1932   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1933   if (stream->flags & LSTREAM_FL_WRITE)
1934     {
1935       str->flags |= CODING_STATE_END;
1936       decoding_writer (stream, 0, 0);
1937     }
1938   Dynarr_free (str->runoff);
1939 #ifdef MULE
1940 #ifdef ENABLE_COMPOSITE_CHARS
1941   if (str->iso2022.composite_chars)
1942     Dynarr_free (str->iso2022.composite_chars);
1943 #endif
1944 #endif
1945   return Lstream_close (str->other_end);
1946 }
1947
1948 Lisp_Object
1949 decoding_stream_coding_system (Lstream *stream)
1950 {
1951   Lisp_Object coding_system;
1952   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1953
1954   XSETCODING_SYSTEM (coding_system, str->codesys);
1955   return subsidiary_coding_system (coding_system, str->eol_type);
1956 }
1957
1958 void
1959 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1960 {
1961   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1962   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1963   str->codesys = cs;
1964   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1965     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1966   reset_decoding_stream (str);
1967 }
1968
1969 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
1970    stream for writing, no automatic code detection will be performed.
1971    The reason for this is that automatic code detection requires a
1972    seekable input.  Things will also fail if you open a decoding
1973    stream for reading using a non-fully-specified coding system and
1974    a non-seekable input stream. */
1975
1976 static Lisp_Object
1977 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1978                         CONST char *mode)
1979 {
1980   Lstream *lstr = Lstream_new (lstream_decoding, mode);
1981   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1982   Lisp_Object obj;
1983
1984   xzero (*str);
1985   str->other_end = stream;
1986   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1987   str->eol_type = EOL_AUTODETECT;
1988   if (!strcmp (mode, "r")
1989       && Lstream_seekable_p (stream))
1990     /* We can determine the coding system now. */
1991     determine_real_coding_system (stream, &codesys, &str->eol_type);
1992   set_decoding_stream_coding_system (lstr, codesys);
1993   str->decst.eol_type = str->eol_type;
1994   str->decst.mask = ~0;
1995   XSETLSTREAM (obj, lstr);
1996   return obj;
1997 }
1998
1999 Lisp_Object
2000 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2001 {
2002   return make_decoding_stream_1 (stream, codesys, "r");
2003 }
2004
2005 Lisp_Object
2006 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2007 {
2008   return make_decoding_stream_1 (stream, codesys, "w");
2009 }
2010
2011 /* Note: the decode_coding_* functions all take the same
2012    arguments as mule_decode(), which is to say some SRC data of
2013    size N, which is to be stored into dynamic array DST.
2014    DECODING is the stream within which the decoding is
2015    taking place, but no data is actually read from or
2016    written to that stream; that is handled in decoding_reader()
2017    or decoding_writer().  This allows the same functions to
2018    be used for both reading and writing. */
2019
2020 static void
2021 mule_decode (Lstream *decoding, CONST unsigned char *src,
2022              unsigned_char_dynarr *dst, unsigned int n)
2023 {
2024   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2025
2026   /* If necessary, do encoding-detection now.  We do this when
2027      we're a writing stream or a non-seekable reading stream,
2028      meaning that we can't just process the whole input,
2029      rewind, and start over. */
2030
2031   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2032       str->eol_type == EOL_AUTODETECT)
2033     {
2034       Lisp_Object codesys;
2035
2036       XSETCODING_SYSTEM (codesys, str->codesys);
2037       detect_coding_type (&str->decst, src, n,
2038                           CODING_SYSTEM_TYPE (str->codesys) !=
2039                           CODESYS_AUTODETECT);
2040       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2041           str->decst.mask != ~0)
2042         /* #### This is cheesy.  What we really ought to do is
2043            buffer up a certain amount of data so as to get a
2044            less random result. */
2045         codesys = coding_system_from_mask (str->decst.mask);
2046       str->eol_type = str->decst.eol_type;
2047       if (XCODING_SYSTEM (codesys) != str->codesys)
2048         {
2049           /* Preserve the CODING_STATE_END flag in case it was set.
2050              If we erase it, bad things might happen. */
2051           int was_end = str->flags & CODING_STATE_END;
2052           set_decoding_stream_coding_system (decoding, codesys);
2053           if (was_end)
2054             str->flags |= CODING_STATE_END;
2055         }
2056     }
2057
2058   switch (CODING_SYSTEM_TYPE (str->codesys))
2059     {
2060 #ifdef DEBUG_XEMACS
2061     case CODESYS_INTERNAL:
2062       Dynarr_add_many (dst, src, n);
2063       break;
2064 #endif
2065     case CODESYS_AUTODETECT:
2066       /* If we got this far and still haven't decided on the coding
2067          system, then do no conversion. */
2068     case CODESYS_NO_CONVERSION:
2069       decode_coding_no_conversion (decoding, src, dst, n);
2070       break;
2071 #ifdef MULE
2072     case CODESYS_SHIFT_JIS:
2073       decode_coding_sjis (decoding, src, dst, n);
2074       break;
2075     case CODESYS_BIG5:
2076       decode_coding_big5 (decoding, src, dst, n);
2077       break;
2078     case CODESYS_UCS4:
2079       decode_coding_ucs4 (decoding, src, dst, n);
2080       break;
2081     case CODESYS_UTF8:
2082       decode_coding_utf8 (decoding, src, dst, n);
2083       break;
2084     case CODESYS_CCL:
2085       ccl_driver (&str->ccl, src, dst, n, 0);
2086       break;
2087     case CODESYS_ISO2022:
2088       decode_coding_iso2022 (decoding, src, dst, n);
2089       break;
2090 #endif /* MULE */
2091     default:
2092       abort ();
2093     }
2094 }
2095
2096 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2097 Decode the text between START and END which is encoded in CODING-SYSTEM.
2098 This is useful if you've read in encoded text from a file without decoding
2099 it (e.g. you read in a JIS-formatted file but used the `binary' or
2100 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2101 Return length of decoded text.
2102 BUFFER defaults to the current buffer if unspecified.
2103 */
2104        (start, end, coding_system, buffer))
2105 {
2106   Bufpos b, e;
2107   struct buffer *buf = decode_buffer (buffer, 0);
2108   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2109   Lstream *istr, *ostr;
2110   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2111
2112   get_buffer_range_char (buf, start, end, &b, &e, 0);
2113
2114   barf_if_buffer_read_only (buf, b, e);
2115
2116   coding_system = Fget_coding_system (coding_system);
2117   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2118   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2119   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2120                                               coding_system);
2121   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2122                                            Fget_coding_system (Qbinary));
2123   istr = XLSTREAM (instream);
2124   ostr = XLSTREAM (outstream);
2125   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2126
2127   /* The chain of streams looks like this:
2128
2129      [BUFFER] <----- send through
2130                      ------> [ENCODE AS BINARY]
2131                              ------> [DECODE AS SPECIFIED]
2132                                      ------> [BUFFER]
2133    */
2134
2135   while (1)
2136     {
2137       char tempbuf[1024]; /* some random amount */
2138       Bufpos newpos, even_newer_pos;
2139       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2140       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2141
2142       if (!size_in_bytes)
2143         break;
2144       newpos = lisp_buffer_stream_startpos (istr);
2145       Lstream_write (ostr, tempbuf, size_in_bytes);
2146       even_newer_pos = lisp_buffer_stream_startpos (istr);
2147       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2148                            even_newer_pos, 0);
2149     }
2150   Lstream_close (istr);
2151   Lstream_close (ostr);
2152   UNGCPRO;
2153   Lstream_delete (istr);
2154   Lstream_delete (ostr);
2155   Lstream_delete (XLSTREAM (de_outstream));
2156   Lstream_delete (XLSTREAM (lb_outstream));
2157   return Qnil;
2158 }
2159
2160 \f
2161 /************************************************************************/
2162 /*           Converting to an external encoding ("encoding")            */
2163 /************************************************************************/
2164
2165 /* An encoding stream is an output stream.  When you create the
2166    stream, you specify the coding system that governs the encoding
2167    and another stream that the resulting encoded data is to be
2168    sent to, and then start sending data to it. */
2169
2170 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2171
2172 struct encoding_stream
2173 {
2174   /* Coding system that governs the conversion. */
2175   Lisp_Coding_System *codesys;
2176
2177   /* Stream that we read the encoded data from or
2178      write the decoded data to. */
2179   Lstream *other_end;
2180
2181   /* If we are reading, then we can return only a fixed amount of
2182      data, so if the conversion resulted in too much data, we store it
2183      here for retrieval the next time around. */
2184   unsigned_char_dynarr *runoff;
2185
2186   /* FLAGS holds flags indicating the current state of the encoding.
2187      Some of these flags are dependent on the coding system. */
2188   unsigned int flags;
2189
2190   /* CH holds a partially built-up character.  Since we only deal
2191      with one- and two-byte characters at the moment, we only use
2192      this to store the first byte of a two-byte character. */
2193   unsigned int ch;
2194 #ifdef MULE
2195   /* Additional information used by the ISO2022 encoder. */
2196   struct
2197     {
2198       /* CHARSET holds the character sets currently assigned to the G0
2199          through G3 registers.  It is initialized from the array
2200          INITIAL_CHARSET in CODESYS. */
2201       Lisp_Object charset[4];
2202
2203       /* Which registers are currently invoked into the left (GL) and
2204          right (GR) halves of the 8-bit encoding space? */
2205       int register_left, register_right;
2206
2207       /* Whether we need to explicitly designate the charset in the
2208          G? register before using it.  It is initialized from the
2209          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2210       unsigned char force_charset_on_output[4];
2211
2212       /* Other state variables that need to be preserved across
2213          invocations. */
2214       Lisp_Object current_charset;
2215       int current_half;
2216       int current_char_boundary;
2217     } iso2022;
2218
2219   /* Additional information (the state of the running CCL program)
2220      used by the CCL encoder. */
2221   struct ccl_program ccl;
2222 #endif /* MULE */
2223 };
2224
2225 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2226 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2227                             size_t size);
2228 static int encoding_rewinder   (Lstream *stream);
2229 static int encoding_seekable_p (Lstream *stream);
2230 static int encoding_flusher    (Lstream *stream);
2231 static int encoding_closer     (Lstream *stream);
2232
2233 static Lisp_Object encoding_marker (Lisp_Object stream,
2234                                     void (*markobj) (Lisp_Object));
2235
2236 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2237                                sizeof (struct encoding_stream));
2238
2239 static Lisp_Object
2240 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2241 {
2242   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2243   Lisp_Object str_obj;
2244
2245   /* We do not need to mark the coding systems or charsets stored
2246      within the stream because they are stored in a global list
2247      and automatically marked. */
2248
2249   XSETLSTREAM (str_obj, str);
2250   markobj (str_obj);
2251   if (str->imp->marker)
2252     return (str->imp->marker) (str_obj, markobj);
2253   else
2254     return Qnil;
2255 }
2256
2257 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2258    so we read data from the other end, encode it, and store it into DATA. */
2259
2260 static int
2261 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2262 {
2263   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2264   unsigned char *orig_data = data;
2265   int read_size;
2266   int error_occurred = 0;
2267
2268   /* We need to interface to mule_encode(), which expects to take some
2269      amount of data and store the result into a Dynarr.  We have
2270      mule_encode() store into str->runoff, and take data from there
2271      as necessary. */
2272
2273   /* We loop until we have enough data, reading chunks from the other
2274      end and encoding it. */
2275   while (1)
2276     {
2277       /* Take data from the runoff if we can.  Make sure to take at
2278          most SIZE bytes, and delete the data from the runoff. */
2279       if (Dynarr_length (str->runoff) > 0)
2280         {
2281           int chunk = min ((int) size, Dynarr_length (str->runoff));
2282           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2283           Dynarr_delete_many (str->runoff, 0, chunk);
2284           data += chunk;
2285           size -= chunk;
2286         }
2287
2288       if (size == 0)
2289         break; /* No more room for data */
2290
2291       if (str->flags & CODING_STATE_END)
2292         /* This means that on the previous iteration, we hit the EOF on
2293            the other end.  We loop once more so that mule_encode() can
2294            output any final stuff it may be holding, or any "go back
2295            to a sane state" escape sequences. (This latter makes sense
2296            during encoding.) */
2297         break;
2298
2299       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2300          left of storage in it, so it's OK to read directly into it.
2301          (We'll be overwriting above, after we've encoded it into the
2302          runoff.) */
2303       read_size = Lstream_read (str->other_end, data, size);
2304       if (read_size < 0)
2305         {
2306           error_occurred = 1;
2307           break;
2308         }
2309       if (read_size == 0)
2310         /* There might be some more end data produced in the translation.
2311            See the comment above. */
2312         str->flags |= CODING_STATE_END;
2313       mule_encode (stream, data, str->runoff, read_size);
2314     }
2315
2316   if (data == orig_data)
2317     return error_occurred ? -1 : 0;
2318   else
2319     return data - orig_data;
2320 }
2321
2322 static int
2323 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2324 {
2325   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2326   int retval;
2327
2328   /* Encode all our data into the runoff, and then attempt to write
2329      it all out to the other end.  Remove whatever chunk we succeeded
2330      in writing. */
2331   mule_encode (stream, data, str->runoff, size);
2332   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2333                           Dynarr_length (str->runoff));
2334   if (retval > 0)
2335     Dynarr_delete_many (str->runoff, 0, retval);
2336   /* Do NOT return retval.  The return value indicates how much
2337      of the incoming data was written, not how many bytes were
2338      written. */
2339   return size;
2340 }
2341
2342 static void
2343 reset_encoding_stream (struct encoding_stream *str)
2344 {
2345 #ifdef MULE
2346   switch (CODING_SYSTEM_TYPE (str->codesys))
2347     {
2348     case CODESYS_ISO2022:
2349       {
2350         int i;
2351
2352         for (i = 0; i < 4; i++)
2353           {
2354             str->iso2022.charset[i] =
2355               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2356             str->iso2022.force_charset_on_output[i] =
2357               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2358           }
2359         str->iso2022.register_left = 0;
2360         str->iso2022.register_right = 1;
2361         str->iso2022.current_charset = Qnil;
2362         str->iso2022.current_half = 0;
2363         str->iso2022.current_char_boundary = 1;
2364         break;
2365       }
2366     case CODESYS_CCL:
2367       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2368       break;
2369     default:
2370       break;
2371     }
2372 #endif /* MULE */
2373
2374   str->flags = str->ch = 0;
2375 }
2376
2377 static int
2378 encoding_rewinder (Lstream *stream)
2379 {
2380   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2381   reset_encoding_stream (str);
2382   Dynarr_reset (str->runoff);
2383   return Lstream_rewind (str->other_end);
2384 }
2385
2386 static int
2387 encoding_seekable_p (Lstream *stream)
2388 {
2389   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2390   return Lstream_seekable_p (str->other_end);
2391 }
2392
2393 static int
2394 encoding_flusher (Lstream *stream)
2395 {
2396   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2397   return Lstream_flush (str->other_end);
2398 }
2399
2400 static int
2401 encoding_closer (Lstream *stream)
2402 {
2403   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2404   if (stream->flags & LSTREAM_FL_WRITE)
2405     {
2406       str->flags |= CODING_STATE_END;
2407       encoding_writer (stream, 0, 0);
2408     }
2409   Dynarr_free (str->runoff);
2410   return Lstream_close (str->other_end);
2411 }
2412
2413 Lisp_Object
2414 encoding_stream_coding_system (Lstream *stream)
2415 {
2416   Lisp_Object coding_system;
2417   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2418
2419   XSETCODING_SYSTEM (coding_system, str->codesys);
2420   return coding_system;
2421 }
2422
2423 void
2424 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2425 {
2426   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2427   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2428   str->codesys = cs;
2429   reset_encoding_stream (str);
2430 }
2431
2432 static Lisp_Object
2433 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2434                         CONST char *mode)
2435 {
2436   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2437   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2438   Lisp_Object obj;
2439
2440   xzero (*str);
2441   str->runoff = Dynarr_new (unsigned_char);
2442   str->other_end = stream;
2443   set_encoding_stream_coding_system (lstr, codesys);
2444   XSETLSTREAM (obj, lstr);
2445   return obj;
2446 }
2447
2448 Lisp_Object
2449 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2450 {
2451   return make_encoding_stream_1 (stream, codesys, "r");
2452 }
2453
2454 Lisp_Object
2455 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2456 {
2457   return make_encoding_stream_1 (stream, codesys, "w");
2458 }
2459
2460 /* Convert N bytes of internally-formatted data stored in SRC to an
2461    external format, according to the encoding stream ENCODING.
2462    Store the encoded data into DST. */
2463
2464 static void
2465 mule_encode (Lstream *encoding, CONST unsigned char *src,
2466              unsigned_char_dynarr *dst, unsigned int n)
2467 {
2468   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2469
2470   switch (CODING_SYSTEM_TYPE (str->codesys))
2471     {
2472 #ifdef DEBUG_XEMACS
2473     case CODESYS_INTERNAL:
2474       Dynarr_add_many (dst, src, n);
2475       break;
2476 #endif
2477     case CODESYS_AUTODETECT:
2478       /* If we got this far and still haven't decided on the coding
2479          system, then do no conversion. */
2480     case CODESYS_NO_CONVERSION:
2481       encode_coding_no_conversion (encoding, src, dst, n);
2482       break;
2483 #ifdef MULE
2484     case CODESYS_SHIFT_JIS:
2485       encode_coding_sjis (encoding, src, dst, n);
2486       break;
2487     case CODESYS_BIG5:
2488       encode_coding_big5 (encoding, src, dst, n);
2489       break;
2490     case CODESYS_UCS4:
2491       encode_coding_ucs4 (encoding, src, dst, n);
2492       break;
2493     case CODESYS_UTF8:
2494       encode_coding_utf8 (encoding, src, dst, n);
2495       break;
2496     case CODESYS_CCL:
2497       ccl_driver (&str->ccl, src, dst, n, 0);
2498       break;
2499     case CODESYS_ISO2022:
2500       encode_coding_iso2022 (encoding, src, dst, n);
2501       break;
2502 #endif /* MULE */
2503     default:
2504       abort ();
2505     }
2506 }
2507
2508 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2509 Encode the text between START and END using CODING-SYSTEM.
2510 This will, for example, convert Japanese characters into stuff such as
2511 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2512 text.  BUFFER defaults to the current buffer if unspecified.
2513 */
2514        (start, end, coding_system, buffer))
2515 {
2516   Bufpos b, e;
2517   struct buffer *buf = decode_buffer (buffer, 0);
2518   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2519   Lstream *istr, *ostr;
2520   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2521
2522   get_buffer_range_char (buf, start, end, &b, &e, 0);
2523
2524   barf_if_buffer_read_only (buf, b, e);
2525
2526   coding_system = Fget_coding_system (coding_system);
2527   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2528   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2529   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2530                                               Fget_coding_system (Qbinary));
2531   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2532                                            coding_system);
2533   istr = XLSTREAM (instream);
2534   ostr = XLSTREAM (outstream);
2535   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2536   /* The chain of streams looks like this:
2537
2538      [BUFFER] <----- send through
2539                      ------> [ENCODE AS SPECIFIED]
2540                              ------> [DECODE AS BINARY]
2541                                      ------> [BUFFER]
2542    */
2543   while (1)
2544     {
2545       char tempbuf[1024]; /* some random amount */
2546       Bufpos newpos, even_newer_pos;
2547       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2548       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2549
2550       if (!size_in_bytes)
2551         break;
2552       newpos = lisp_buffer_stream_startpos (istr);
2553       Lstream_write (ostr, tempbuf, size_in_bytes);
2554       even_newer_pos = lisp_buffer_stream_startpos (istr);
2555       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2556                            even_newer_pos, 0);
2557     }
2558
2559   {
2560     Charcount retlen =
2561       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2562     Lstream_close (istr);
2563     Lstream_close (ostr);
2564     UNGCPRO;
2565     Lstream_delete (istr);
2566     Lstream_delete (ostr);
2567     Lstream_delete (XLSTREAM (de_outstream));
2568     Lstream_delete (XLSTREAM (lb_outstream));
2569     return make_int (retlen);
2570   }
2571 }
2572
2573 #ifdef MULE
2574 \f
2575 /************************************************************************/
2576 /*                          Shift-JIS methods                           */
2577 /************************************************************************/
2578
2579 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2580    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2581    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2582    encoded by "position-code + 0x80".  A character of JISX0208
2583    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2584    position-codes are divided and shifted so that it fit in the range
2585    below.
2586
2587    --- CODE RANGE of Shift-JIS ---
2588    (character set)      (range)
2589    ASCII                0x00 .. 0x7F
2590    JISX0201-Kana        0xA0 .. 0xDF
2591    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2592             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2593    -------------------------------
2594
2595 */
2596
2597 /* Is this the first byte of a Shift-JIS two-byte char? */
2598
2599 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2600   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2601
2602 /* Is this the second byte of a Shift-JIS two-byte char? */
2603
2604 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2605   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2606
2607 #define BYTE_SJIS_KATAKANA_P(c) \
2608   ((c) >= 0xA1 && (c) <= 0xDF)
2609
2610 static int
2611 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2612                     unsigned int n)
2613 {
2614   int c;
2615
2616   while (n--)
2617     {
2618       c = *src++;
2619       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2620         return 0;
2621       if (st->shift_jis.in_second_byte)
2622         {
2623           st->shift_jis.in_second_byte = 0;
2624           if (c < 0x40)
2625             return 0;
2626         }
2627       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2628         st->shift_jis.in_second_byte = 1;
2629     }
2630   return CODING_CATEGORY_SHIFT_JIS_MASK;
2631 }
2632
2633 /* Convert Shift-JIS data to internal format. */
2634
2635 static void
2636 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2637                     unsigned_char_dynarr *dst, unsigned int n)
2638 {
2639   unsigned char c;
2640   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2641   unsigned int flags  = str->flags;
2642   unsigned int ch     = str->ch;
2643   eol_type_t eol_type = str->eol_type;
2644
2645   while (n--)
2646     {
2647       c = *src++;
2648
2649       if (ch)
2650         {
2651           /* Previous character was first byte of Shift-JIS Kanji char. */
2652           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2653             {
2654               unsigned char e1, e2;
2655
2656               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2657               DECODE_SJIS (ch, c, e1, e2);
2658               Dynarr_add (dst, e1);
2659               Dynarr_add (dst, e2);
2660             }
2661           else
2662             {
2663               DECODE_ADD_BINARY_CHAR (ch, dst);
2664               DECODE_ADD_BINARY_CHAR (c, dst);
2665             }
2666           ch = 0;
2667         }
2668       else
2669         {
2670           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2671           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2672             ch = c;
2673           else if (BYTE_SJIS_KATAKANA_P (c))
2674             {
2675               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2676               Dynarr_add (dst, c);
2677             }
2678           else
2679             DECODE_ADD_BINARY_CHAR (c, dst);
2680         }
2681     label_continue_loop:;
2682     }
2683
2684   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2685
2686   str->flags = flags;
2687   str->ch    = ch;
2688 }
2689
2690 /* Convert internally-formatted data to Shift-JIS. */
2691
2692 static void
2693 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2694                     unsigned_char_dynarr *dst, unsigned int n)
2695 {
2696   unsigned char c;
2697   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2698   unsigned int flags  = str->flags;
2699   unsigned int ch     = str->ch;
2700   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2701
2702   while (n--)
2703     {
2704       c = *src++;
2705       if (c == '\n')
2706         {
2707           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2708             Dynarr_add (dst, '\r');
2709           if (eol_type != EOL_CR)
2710             Dynarr_add (dst, '\n');
2711           ch = 0;
2712         }
2713       else if (BYTE_ASCII_P (c))
2714         {
2715           Dynarr_add (dst, c);
2716           ch = 0;
2717         }
2718       else if (BUFBYTE_LEADING_BYTE_P (c))
2719         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2720               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2721               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2722       else if (ch)
2723         {
2724           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2725             {
2726               Dynarr_add (dst, c);
2727               ch = 0;
2728             }
2729           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2730                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2731             ch = c;
2732           else
2733             {
2734               unsigned char j1, j2;
2735               ENCODE_SJIS (ch, c, j1, j2);
2736               Dynarr_add (dst, j1);
2737               Dynarr_add (dst, j2);
2738               ch = 0;
2739             }
2740         }
2741     }
2742
2743   str->flags = flags;
2744   str->ch    = ch;
2745 }
2746
2747 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2748 Decode a JISX0208 character of Shift-JIS coding-system.
2749 CODE is the character code in Shift-JIS as a cons of type bytes.
2750 Return the corresponding character.
2751 */
2752        (code))
2753 {
2754   unsigned char c1, c2, s1, s2;
2755
2756   CHECK_CONS (code);
2757   CHECK_INT (XCAR (code));
2758   CHECK_INT (XCDR (code));
2759   s1 = XINT (XCAR (code));
2760   s2 = XINT (XCDR (code));
2761   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2762       BYTE_SJIS_TWO_BYTE_2_P (s2))
2763     {
2764       DECODE_SJIS (s1, s2, c1, c2);
2765       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2766                                    c1 & 0x7F, c2 & 0x7F));
2767     }
2768   else
2769     return Qnil;
2770 }
2771
2772 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2773 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2774 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2775 */
2776        (ch))
2777 {
2778   Lisp_Object charset;
2779   int c1, c2, s1, s2;
2780
2781   CHECK_CHAR_COERCE_INT (ch);
2782   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2783   if (EQ (charset, Vcharset_japanese_jisx0208))
2784     {
2785       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2786       return Fcons (make_int (s1), make_int (s2));
2787     }
2788   else
2789     return Qnil;
2790 }
2791
2792 \f
2793 /************************************************************************/
2794 /*                            Big5 methods                              */
2795 /************************************************************************/
2796
2797 /* BIG5 is a coding system encoding two character sets: ASCII and
2798    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2799    character set and is encoded in two-byte.
2800
2801    --- CODE RANGE of BIG5 ---
2802    (character set)      (range)
2803    ASCII                0x00 .. 0x7F
2804    Big5 (1st byte)      0xA1 .. 0xFE
2805         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2806    --------------------------
2807
2808    Since the number of characters in Big5 is larger than maximum
2809    characters in Emacs' charset (96x96), it can't be handled as one
2810    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2811    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
2812    contains frequently used characters and the latter contains less
2813    frequently used characters.  */
2814
2815 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2816   ((c) >= 0xA1 && (c) <= 0xFE)
2817
2818 /* Is this the second byte of a Shift-JIS two-byte char? */
2819
2820 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2821   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2822
2823 /* Number of Big5 characters which have the same code in 1st byte.  */
2824
2825 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2826
2827 /* Code conversion macros.  These are macros because they are used in
2828    inner loops during code conversion.
2829
2830    Note that temporary variables in macros introduce the classic
2831    dynamic-scoping problems with variable names.  We use capital-
2832    lettered variables in the assumption that XEmacs does not use
2833    capital letters in variables except in a very formalized way
2834    (e.g. Qstring). */
2835
2836 /* Convert Big5 code (b1, b2) into its internal string representation
2837    (lb, c1, c2). */
2838
2839 /* There is a much simpler way to split the Big5 charset into two.
2840    For the moment I'm going to leave the algorithm as-is because it
2841    claims to separate out the most-used characters into a single
2842    charset, which perhaps will lead to optimizations in various
2843    places.
2844
2845    The way the algorithm works is something like this:
2846
2847    Big5 can be viewed as a 94x157 charset, where the row is
2848    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2849    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
2850    the split between low and high column numbers is apparently
2851    meaningless; ascending rows produce less and less frequent chars.
2852    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2853    the first charset, and the upper half (0xC9 .. 0xFE) to the
2854    second.  To do the conversion, we convert the character into
2855    a single number where 0 .. 156 is the first row, 157 .. 313
2856    is the second, etc.  That way, the characters are ordered by
2857    decreasing frequency.  Then we just chop the space in two
2858    and coerce the result into a 94x94 space.
2859    */
2860
2861 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
2862 {                                                                       \
2863   int B1 = b1, B2 = b2;                                                 \
2864   unsigned int I                                                        \
2865     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
2866                                                                         \
2867   if (B1 < 0xC9)                                                        \
2868     {                                                                   \
2869       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
2870     }                                                                   \
2871   else                                                                  \
2872     {                                                                   \
2873       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
2874       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
2875     }                                                                   \
2876   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
2877   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
2878 } while (0)
2879
2880 /* Convert the internal string representation of a Big5 character
2881    (lb, c1, c2) into Big5 code (b1, b2). */
2882
2883 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
2884 {                                                                       \
2885   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
2886                                                                         \
2887   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
2888     {                                                                   \
2889       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
2890     }                                                                   \
2891   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
2892   b2 = I % BIG5_SAME_ROW;                                               \
2893   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
2894 } while (0)
2895
2896 static int
2897 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2898                     unsigned int n)
2899 {
2900   int c;
2901
2902   while (n--)
2903     {
2904       c = *src++;
2905       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2906           (c >= 0x80 && c <= 0xA0))
2907         return 0;
2908       if (st->big5.in_second_byte)
2909         {
2910           st->big5.in_second_byte = 0;
2911           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2912             return 0;
2913         }
2914       else if (c >= 0xA1)
2915         st->big5.in_second_byte = 1;
2916     }
2917   return CODING_CATEGORY_BIG5_MASK;
2918 }
2919
2920 /* Convert Big5 data to internal format. */
2921
2922 static void
2923 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2924                     unsigned_char_dynarr *dst, unsigned int n)
2925 {
2926   unsigned char c;
2927   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2928   unsigned int flags  = str->flags;
2929   unsigned int ch     = str->ch;
2930   eol_type_t eol_type = str->eol_type;
2931
2932   while (n--)
2933     {
2934       c = *src++;
2935       if (ch)
2936         {
2937           /* Previous character was first byte of Big5 char. */
2938           if (BYTE_BIG5_TWO_BYTE_2_P (c))
2939             {
2940               unsigned char b1, b2, b3;
2941               DECODE_BIG5 (ch, c, b1, b2, b3);
2942               Dynarr_add (dst, b1);
2943               Dynarr_add (dst, b2);
2944               Dynarr_add (dst, b3);
2945             }
2946           else
2947             {
2948               DECODE_ADD_BINARY_CHAR (ch, dst);
2949               DECODE_ADD_BINARY_CHAR (c, dst);
2950             }
2951           ch = 0;
2952         }
2953       else
2954         {
2955           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2956           if (BYTE_BIG5_TWO_BYTE_1_P (c))
2957             ch = c;
2958           else
2959             DECODE_ADD_BINARY_CHAR (c, dst);
2960         }
2961     label_continue_loop:;
2962     }
2963
2964   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2965
2966   str->flags = flags;
2967   str->ch    = ch;
2968 }
2969
2970 /* Convert internally-formatted data to Big5. */
2971
2972 static void
2973 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2974                     unsigned_char_dynarr *dst, unsigned int n)
2975 {
2976   unsigned char c;
2977   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2978   unsigned int flags  = str->flags;
2979   unsigned int ch     = str->ch;
2980   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2981
2982   while (n--)
2983     {
2984       c = *src++;
2985       if (c == '\n')
2986         {
2987           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2988             Dynarr_add (dst, '\r');
2989           if (eol_type != EOL_CR)
2990             Dynarr_add (dst, '\n');
2991         }
2992       else if (BYTE_ASCII_P (c))
2993         {
2994           /* ASCII. */
2995           Dynarr_add (dst, c);
2996         }
2997       else if (BUFBYTE_LEADING_BYTE_P (c))
2998         {
2999           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3000               c == LEADING_BYTE_CHINESE_BIG5_2)
3001             {
3002               /* A recognized leading byte. */
3003               ch = c;
3004               continue; /* not done with this character. */
3005             }
3006           /* otherwise just ignore this character. */
3007         }
3008       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3009                ch == LEADING_BYTE_CHINESE_BIG5_2)
3010         {
3011           /* Previous char was a recognized leading byte. */
3012           ch = (ch << 8) | c;
3013           continue; /* not done with this character. */
3014         }
3015       else if (ch)
3016         {
3017           /* Encountering second byte of a Big5 character. */
3018           unsigned char b1, b2;
3019
3020           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3021           Dynarr_add (dst, b1);
3022           Dynarr_add (dst, b2);
3023         }
3024
3025       ch = 0;
3026     }
3027
3028   str->flags = flags;
3029   str->ch    = ch;
3030 }
3031
3032
3033 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3034 Decode a Big5 character CODE of BIG5 coding-system.
3035 CODE is the character code in BIG5, a cons of two integers.
3036 Return the corresponding character.
3037 */
3038        (code))
3039 {
3040   unsigned char c1, c2, b1, b2;
3041
3042   CHECK_CONS (code);
3043   CHECK_INT (XCAR (code));
3044   CHECK_INT (XCDR (code));
3045   b1 = XINT (XCAR (code));
3046   b2 = XINT (XCDR (code));
3047   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3048       BYTE_BIG5_TWO_BYTE_2_P (b2))
3049     {
3050       int leading_byte;
3051       Lisp_Object charset;
3052       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3053       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3054       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3055     }
3056   else
3057     return Qnil;
3058 }
3059
3060 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3061 Encode the Big5 character CH to BIG5 coding-system.
3062 Return the corresponding character code in Big5.
3063 */
3064        (ch))
3065 {
3066   Lisp_Object charset;
3067   int c1, c2, b1, b2;
3068
3069   CHECK_CHAR_COERCE_INT (ch);
3070   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3071   if (EQ (charset, Vcharset_chinese_big5_1) ||
3072       EQ (charset, Vcharset_chinese_big5_2))
3073     {
3074       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3075                    b1, b2);
3076       return Fcons (make_int (b1), make_int (b2));
3077     }
3078   else
3079     return Qnil;
3080 }
3081
3082 \f
3083 /************************************************************************/
3084 /*                           UCS-4 methods                              */
3085 /*                                                                      */
3086 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3087 /*                                                                      */
3088 /************************************************************************/
3089
3090 Lisp_Object ucs_to_mule_table[65536];
3091 Lisp_Object mule_to_ucs_table;
3092
3093 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3094 Map UCS-4 code CODE to Mule character CHARACTER.
3095
3096 Return T on success, NIL on failure.
3097 */
3098        (code, character))
3099 {
3100   unsigned int c;
3101
3102   CHECK_CHAR (character);
3103   CHECK_INT (code);
3104   c = XINT (code);
3105
3106   if (c < sizeof (ucs_to_mule_table))
3107     {
3108       ucs_to_mule_table[c] = character;
3109       return Qt;
3110     }
3111   else
3112     return Qnil;
3113 }
3114
3115 static Lisp_Object
3116 ucs_to_char (unsigned long code)
3117 {
3118   if (code < sizeof (ucs_to_mule_table))
3119     {
3120       return ucs_to_mule_table[code];
3121     }
3122   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3123     {
3124       unsigned int c;
3125
3126       code -= 0xe00000;
3127       c = code % (94 * 94);
3128       return make_char
3129         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3130                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3131                      CHARSET_LEFT_TO_RIGHT),
3132                     c / 94 + 33, c % 94 + 33));
3133     }
3134   else
3135     return Qnil;
3136 }
3137
3138 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3139 Return Mule character corresponding to UCS code CODE (a positive integer).
3140 */
3141        (code))
3142 {
3143   CHECK_NATNUM (code);
3144   return ucs_to_char (XINT (code));
3145 }
3146
3147 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3148 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3149 */
3150        (character, code))
3151 {
3152   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3153           Fset_char_ucs is more restrictive on index arg, but should
3154           check code arg in a char_table method. */
3155   CHECK_CHAR (character);
3156   CHECK_NATNUM (code);
3157   return Fput_char_table (character, code, mule_to_ucs_table);
3158 }
3159
3160 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3161 Return the UCS code (a positive integer) corresponding to CHARACTER.
3162 */
3163        (character))
3164 {
3165   return Fget_char_table (character, mule_to_ucs_table);
3166 }
3167
3168 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3169    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3170    is not found, instead.
3171    #### do something more appropriate (use blob?)
3172         Danger, Will Robinson!  Data loss.  Should we signal user? */
3173 static void
3174 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3175 {
3176   Lisp_Object chr = ucs_to_char (ch);
3177
3178   if (! NILP (chr))
3179     {
3180       Bufbyte work[MAX_EMCHAR_LEN];
3181       int len;
3182
3183       ch = XCHAR (chr);
3184       len = (ch < 128) ?
3185         simple_set_charptr_emchar (work, ch) :
3186         non_ascii_set_charptr_emchar (work, ch);
3187       Dynarr_add_many (dst, work, len);
3188     }
3189   else
3190     {
3191       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3192       Dynarr_add (dst, 34 + 128);
3193       Dynarr_add (dst, 46 + 128);
3194     }
3195 }
3196
3197 static unsigned long
3198 mule_char_to_ucs4 (Lisp_Object charset,
3199                    unsigned char h, unsigned char l)
3200 {
3201   Lisp_Object code
3202     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3203                        mule_to_ucs_table);
3204
3205   if (INTP (code))
3206     {
3207       return XINT (code);
3208     }
3209   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3210             (XCHARSET_CHARS (charset) == 94) )
3211     {
3212       unsigned char final = XCHARSET_FINAL (charset);
3213
3214       if ( ('@' <= final) && (final < 0x7f) )
3215         {
3216           return 0xe00000 + (final - '@') * 94 * 94
3217             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3218         }
3219       else
3220         {
3221           return '?';
3222         }
3223     }
3224   else
3225     {
3226       return '?';
3227     }
3228 }
3229
3230 static void
3231 encode_ucs4 (Lisp_Object charset,
3232              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3233 {
3234   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3235   Dynarr_add (dst,  code >> 24);
3236   Dynarr_add (dst, (code >> 16) & 255);
3237   Dynarr_add (dst, (code >>  8) & 255);
3238   Dynarr_add (dst,  code        & 255);
3239 }
3240
3241 static int
3242 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3243                     unsigned int n)
3244 {
3245   while (n--)
3246     {
3247       int c = *src++;
3248       switch (st->ucs4.in_byte)
3249         {
3250         case 0:
3251           if (c >= 128)
3252             return 0;
3253           else
3254             st->ucs4.in_byte++;
3255           break;
3256         case 3:
3257           st->ucs4.in_byte = 0;
3258           break;
3259         default:
3260           st->ucs4.in_byte++;
3261         }
3262     }
3263   return CODING_CATEGORY_UCS4_MASK;
3264 }
3265
3266 static void
3267 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3268                     unsigned_char_dynarr *dst, unsigned int n)
3269 {
3270   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3271   unsigned int flags = str->flags;
3272   unsigned int ch    = str->ch;
3273
3274   while (n--)
3275     {
3276       unsigned char c = *src++;
3277       switch (flags)
3278         {
3279         case 0:
3280           ch = c;
3281           flags = 3;
3282           break;
3283         case 1:
3284           decode_ucs4 ( ( ch << 8 ) | c, dst);
3285           ch = 0;
3286           flags = 0;
3287           break;
3288         default:
3289           ch = ( ch << 8 ) | c;
3290           flags--;
3291         }
3292     }
3293   if (flags & CODING_STATE_END)
3294     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3295
3296   str->flags = flags;
3297   str->ch    = ch;
3298 }
3299
3300 static void
3301 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3302                     unsigned_char_dynarr *dst, unsigned int n)
3303 {
3304   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3305   unsigned int flags = str->flags;
3306   unsigned int ch = str->ch;
3307   unsigned char char_boundary = str->iso2022.current_char_boundary;
3308   Lisp_Object charset = str->iso2022.current_charset;
3309
3310 #ifdef ENABLE_COMPOSITE_CHARS
3311   /* flags for handling composite chars.  We do a little switcharoo
3312      on the source while we're outputting the composite char. */
3313   unsigned int saved_n = 0;
3314   CONST unsigned char *saved_src = NULL;
3315   int in_composite = 0;
3316
3317  back_to_square_n:
3318 #endif
3319
3320   while (n--)
3321     {
3322       unsigned char c = *src++;
3323
3324       if (BYTE_ASCII_P (c))
3325         {               /* Processing ASCII character */
3326           ch = 0;
3327           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3328           char_boundary = 1;
3329         }
3330       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3331         { /* Processing Leading Byte */
3332           ch = 0;
3333           charset = CHARSET_BY_LEADING_BYTE (c);
3334           if (LEADING_BYTE_PREFIX_P(c))
3335             ch = c;
3336           char_boundary = 0;
3337         }
3338       else
3339         {                       /* Processing Non-ASCII character */
3340           char_boundary = 1;
3341           if (EQ (charset, Vcharset_control_1))
3342             {
3343               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3344             }
3345           else
3346             {
3347               switch (XCHARSET_REP_BYTES (charset))
3348                 {
3349                 case 2:
3350                   encode_ucs4 (charset, c, 0, dst);
3351                   break;
3352                 case 3:
3353                   if (XCHARSET_PRIVATE_P (charset))
3354                     {
3355                       encode_ucs4 (charset, c, 0, dst);
3356                       ch = 0;
3357                     }
3358                   else if (ch)
3359                     {
3360 #ifdef ENABLE_COMPOSITE_CHARS
3361                       if (EQ (charset, Vcharset_composite))
3362                         {
3363                           if (in_composite)
3364                             {
3365                               /* #### Bother! We don't know how to
3366                                  handle this yet. */
3367                               Dynarr_add (dst, 0);
3368                               Dynarr_add (dst, 0);
3369                               Dynarr_add (dst, 0);
3370                               Dynarr_add (dst, '~');
3371                             }
3372                           else
3373                             {
3374                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3375                                                        ch & 0x7F, c & 0x7F);
3376                               Lisp_Object lstr = composite_char_string (emch);
3377                               saved_n = n;
3378                               saved_src = src;
3379                               in_composite = 1;
3380                               src = XSTRING_DATA   (lstr);
3381                               n   = XSTRING_LENGTH (lstr);
3382                             }
3383                         }
3384                       else
3385 #endif /* ENABLE_COMPOSITE_CHARS */
3386                         {
3387                           encode_ucs4(charset, ch, c, dst);
3388                         }
3389                       ch = 0;
3390                     }
3391                   else
3392                     {
3393                       ch = c;
3394                       char_boundary = 0;
3395                     }
3396                   break;
3397                 case 4:
3398                   if (ch)
3399                     {
3400                       encode_ucs4 (charset, ch, c, dst);
3401                       ch = 0;
3402                     }
3403                   else
3404                     {
3405                       ch = c;
3406                       char_boundary = 0;
3407                     }
3408                   break;
3409                 default:
3410                   abort ();
3411                 }
3412             }
3413         }
3414     }
3415
3416 #ifdef ENABLE_COMPOSITE_CHARS
3417   if (in_composite)
3418     {
3419       n = saved_n;
3420       src = saved_src;
3421       in_composite = 0;
3422       goto back_to_square_n; /* Wheeeeeeeee ..... */
3423     }
3424 #endif /* ENABLE_COMPOSITE_CHARS */
3425
3426   str->flags = flags;
3427   str->ch = ch;
3428   str->iso2022.current_char_boundary = char_boundary;
3429   str->iso2022.current_charset = charset;
3430
3431   /* Verbum caro factum est! */
3432 }
3433
3434 \f
3435 /************************************************************************/
3436 /*                           UTF-8 methods                              */
3437 /************************************************************************/
3438
3439 static int
3440 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3441                     unsigned int n)
3442 {
3443   while (n--)
3444     {
3445       unsigned char c = *src++;
3446       switch (st->utf8.in_byte)
3447         {
3448         case 0:
3449           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3450             return 0;
3451           else if (c >= 0xfc)
3452             st->utf8.in_byte = 5;
3453           else if (c >= 0xf8)
3454             st->utf8.in_byte = 4;
3455           else if (c >= 0xf0)
3456             st->utf8.in_byte = 3;
3457           else if (c >= 0xe0)
3458             st->utf8.in_byte = 2;
3459           else if (c >= 0xc0)
3460             st->utf8.in_byte = 1;
3461           else if (c >= 0x80)
3462             return 0;
3463           break;
3464         default:
3465           if ((c & 0xc0) != 0x80)
3466             return 0;
3467           else
3468             st->utf8.in_byte--;
3469         }
3470     }
3471   return CODING_CATEGORY_UTF8_MASK;
3472 }
3473
3474 static void
3475 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3476                     unsigned_char_dynarr *dst, unsigned int n)
3477 {
3478   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3479   unsigned int flags  = str->flags;
3480   unsigned int ch     = str->ch;
3481   eol_type_t eol_type = str->eol_type;
3482
3483   while (n--)
3484     {
3485       unsigned char c = *src++;
3486       switch (flags)
3487         {
3488         case 0:
3489           if ( c >= 0xfc )
3490             {
3491               ch = c & 0x01;
3492               flags = 5;
3493             }
3494           else if ( c >= 0xf8 )
3495             {
3496               ch = c & 0x03;
3497               flags = 4;
3498             }
3499           else if ( c >= 0xf0 )
3500             {
3501               ch = c & 0x07;
3502               flags = 3;
3503             }
3504           else if ( c >= 0xe0 )
3505             {
3506               ch = c & 0x0f;
3507               flags = 2;
3508             }
3509           else if ( c >= 0xc0 )
3510             {
3511               ch = c & 0x1f;
3512               flags = 1;
3513             }
3514           else
3515             {
3516               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3517               decode_ucs4 (c, dst);
3518             }
3519           break;
3520         case 1:
3521           ch = ( ch << 6 ) | ( c & 0x3f );
3522           decode_ucs4 (ch, dst);
3523           ch = 0;
3524           flags = 0;
3525           break;
3526         default:
3527           ch = ( ch << 6 ) | ( c & 0x3f );
3528           flags--;
3529         }
3530     label_continue_loop:;
3531     }
3532
3533   if (flags & CODING_STATE_END)
3534     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3535
3536   str->flags = flags;
3537   str->ch    = ch;
3538 }
3539
3540 static void
3541 encode_utf8 (Lisp_Object charset,
3542              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3543 {
3544   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3545   if ( code <= 0x7f )
3546     {
3547       Dynarr_add (dst, code);
3548     }
3549   else if ( code <= 0x7ff )
3550     {
3551       Dynarr_add (dst, (code >> 6) | 0xc0);
3552       Dynarr_add (dst, (code & 0x3f) | 0x80);
3553     }
3554   else if ( code <= 0xffff )
3555     {
3556       Dynarr_add (dst,  (code >> 12) | 0xe0);
3557       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3558       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3559     }
3560   else if ( code <= 0x1fffff )
3561     {
3562       Dynarr_add (dst,  (code >> 18) | 0xf0);
3563       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3564       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3565       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3566     }
3567   else if ( code <= 0x3ffffff )
3568     {
3569       Dynarr_add (dst,  (code >> 24) | 0xf8);
3570       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3571       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3572       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3573       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3574     }
3575   else
3576     {
3577       Dynarr_add (dst,  (code >> 30) | 0xfc);
3578       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3579       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3580       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3581       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3582       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3583     }
3584 }
3585
3586 static void
3587 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3588                     unsigned_char_dynarr *dst, unsigned int n)
3589 {
3590   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3591   unsigned int flags  = str->flags;
3592   unsigned int ch     = str->ch;
3593   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3594   unsigned char char_boundary = str->iso2022.current_char_boundary;
3595   Lisp_Object charset = str->iso2022.current_charset;
3596
3597 #ifdef ENABLE_COMPOSITE_CHARS
3598   /* flags for handling composite chars.  We do a little switcharoo
3599      on the source while we're outputting the composite char. */
3600   unsigned int saved_n = 0;
3601   CONST unsigned char *saved_src = NULL;
3602   int in_composite = 0;
3603
3604  back_to_square_n:
3605 #endif /* ENABLE_COMPOSITE_CHARS */
3606
3607   while (n--)
3608     {
3609       unsigned char c = *src++;
3610
3611       if (BYTE_ASCII_P (c))
3612         {               /* Processing ASCII character */
3613           ch = 0;
3614           if (c == '\n')
3615             {
3616               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3617                 Dynarr_add (dst, '\r');
3618               if (eol_type != EOL_CR)
3619                 Dynarr_add (dst, c);
3620             }
3621           else
3622             encode_utf8 (Vcharset_ascii, c, 0, dst);
3623           char_boundary = 1;
3624         }
3625       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3626         { /* Processing Leading Byte */
3627           ch = 0;
3628           charset = CHARSET_BY_LEADING_BYTE (c);
3629           if (LEADING_BYTE_PREFIX_P(c))
3630             ch = c;
3631           char_boundary = 0;
3632         }
3633       else
3634         {                       /* Processing Non-ASCII character */
3635           char_boundary = 1;
3636           if (EQ (charset, Vcharset_control_1))
3637             {
3638               encode_utf8 (Vcharset_control_1, c, 0, dst);
3639             }
3640           else
3641             {
3642               switch (XCHARSET_REP_BYTES (charset))
3643                 {
3644                 case 2:
3645                   encode_utf8 (charset, c, 0, dst);
3646                   break;
3647                 case 3:
3648                   if (XCHARSET_PRIVATE_P (charset))
3649                     {
3650                       encode_utf8 (charset, c, 0, dst);
3651                       ch = 0;
3652                     }
3653                   else if (ch)
3654                     {
3655 #ifdef ENABLE_COMPOSITE_CHARS
3656                       if (EQ (charset, Vcharset_composite))
3657                         {
3658                           if (in_composite)
3659                             {
3660                               /* #### Bother! We don't know how to
3661                                  handle this yet. */
3662                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
3663                             }
3664                           else
3665                             {
3666                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3667                                                        ch & 0x7F, c & 0x7F);
3668                               Lisp_Object lstr = composite_char_string (emch);
3669                               saved_n = n;
3670                               saved_src = src;
3671                               in_composite = 1;
3672                               src = XSTRING_DATA   (lstr);
3673                               n   = XSTRING_LENGTH (lstr);
3674                             }
3675                         }
3676                       else
3677 #endif /* ENABLE_COMPOSITE_CHARS */
3678                         {
3679                           encode_utf8 (charset, ch, c, dst);
3680                         }
3681                       ch = 0;
3682                     }
3683                   else
3684                     {
3685                       ch = c;
3686                       char_boundary = 0;
3687                     }
3688                   break;
3689                 case 4:
3690                   if (ch)
3691                     {
3692                       encode_utf8 (charset, ch, c, dst);
3693                       ch = 0;
3694                     }
3695                   else
3696                     {
3697                       ch = c;
3698                       char_boundary = 0;
3699                     }
3700                   break;
3701                 default:
3702                   abort ();
3703                 }
3704             }
3705         }
3706     }
3707
3708 #ifdef ENABLE_COMPOSITE_CHARS
3709   if (in_composite)
3710     {
3711       n = saved_n;
3712       src = saved_src;
3713       in_composite = 0;
3714       goto back_to_square_n; /* Wheeeeeeeee ..... */
3715     }
3716 #endif
3717
3718   str->flags = flags;
3719   str->ch    = ch;
3720   str->iso2022.current_char_boundary = char_boundary;
3721   str->iso2022.current_charset = charset;
3722
3723   /* Verbum caro factum est! */
3724 }
3725
3726 \f
3727 /************************************************************************/
3728 /*                           ISO2022 methods                            */
3729 /************************************************************************/
3730
3731 /* The following note describes the coding system ISO2022 briefly.
3732    Since the intention of this note is to help understand the
3733    functions in this file, some parts are NOT ACCURATE or OVERLY
3734    SIMPLIFIED.  For thorough understanding, please refer to the
3735    original document of ISO2022.
3736
3737    ISO2022 provides many mechanisms to encode several character sets
3738    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3739    is encoded using bytes less than 128.  This may make the encoded
3740    text a little bit longer, but the text passes more easily through
3741    several gateways, some of which strip off MSB (Most Signigant Bit).
3742
3743    There are two kinds of character sets: control character set and
3744    graphic character set.  The former contains control characters such
3745    as `newline' and `escape' to provide control functions (control
3746    functions are also provided by escape sequences).  The latter
3747    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3748    two control character sets and many graphic character sets.
3749
3750    Graphic character sets are classified into one of the following
3751    four classes, according to the number of bytes (DIMENSION) and
3752    number of characters in one dimension (CHARS) of the set:
3753    - DIMENSION1_CHARS94
3754    - DIMENSION1_CHARS96
3755    - DIMENSION2_CHARS94
3756    - DIMENSION2_CHARS96
3757
3758    In addition, each character set is assigned an identification tag,
3759    unique for each set, called "final character" (denoted as <F>
3760    hereafter).  The <F> of each character set is decided by ECMA(*)
3761    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3762    (0x30..0x3F are for private use only).
3763
3764    Note (*): ECMA = European Computer Manufacturers Association
3765
3766    Here are examples of graphic character set [NAME(<F>)]:
3767         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3768         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3769         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3770         o DIMENSION2_CHARS96 -- none for the moment
3771
3772    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3773         C0 [0x00..0x1F] -- control character plane 0
3774         GL [0x20..0x7F] -- graphic character plane 0
3775         C1 [0x80..0x9F] -- control character plane 1
3776         GR [0xA0..0xFF] -- graphic character plane 1
3777
3778    A control character set is directly designated and invoked to C0 or
3779    C1 by an escape sequence.  The most common case is that:
3780    - ISO646's  control character set is designated/invoked to C0, and
3781    - ISO6429's control character set is designated/invoked to C1,
3782    and usually these designations/invocations are omitted in encoded
3783    text.  In a 7-bit environment, only C0 can be used, and a control
3784    character for C1 is encoded by an appropriate escape sequence to
3785    fit into the environment.  All control characters for C1 are
3786    defined to have corresponding escape sequences.
3787
3788    A graphic character set is at first designated to one of four
3789    graphic registers (G0 through G3), then these graphic registers are
3790    invoked to GL or GR.  These designations and invocations can be
3791    done independently.  The most common case is that G0 is invoked to
3792    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3793    these invocations and designations are omitted in encoded text.
3794    In a 7-bit environment, only GL can be used.
3795
3796    When a graphic character set of CHARS94 is invoked to GL, codes
3797    0x20 and 0x7F of the GL area work as control characters SPACE and
3798    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3799    be used.
3800
3801    There are two ways of invocation: locking-shift and single-shift.
3802    With locking-shift, the invocation lasts until the next different
3803    invocation, whereas with single-shift, the invocation affects the
3804    following character only and doesn't affect the locking-shift
3805    state.  Invocations are done by the following control characters or
3806    escape sequences:
3807
3808    ----------------------------------------------------------------------
3809    abbrev  function                  cntrl escape seq   description
3810    ----------------------------------------------------------------------
3811    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3812    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3813    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3814    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3815    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3816    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3817    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3818    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3819    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3820    ----------------------------------------------------------------------
3821    (*) These are not used by any known coding system.
3822
3823    Control characters for these functions are defined by macros
3824    ISO_CODE_XXX in `coding.h'.
3825
3826    Designations are done by the following escape sequences:
3827    ----------------------------------------------------------------------
3828    escape sequence      description
3829    ----------------------------------------------------------------------
3830    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3831    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3832    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3833    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3834    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3835    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3836    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3837    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3838    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3839    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3840    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3841    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3842    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3843    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3844    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3845    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3846    ----------------------------------------------------------------------
3847
3848    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3849    of dimension 1, chars 94, and final character <F>, etc...
3850
3851    Note (*): Although these designations are not allowed in ISO2022,
3852    Emacs accepts them on decoding, and produces them on encoding
3853    CHARS96 character sets in a coding system which is characterized as
3854    7-bit environment, non-locking-shift, and non-single-shift.
3855
3856    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3857    '(' can be omitted.  We refer to this as "short-form" hereafter.
3858
3859    Now you may notice that there are a lot of ways for encoding the
3860    same multilingual text in ISO2022.  Actually, there exist many
3861    coding systems such as Compound Text (used in X11's inter client
3862    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3863    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3864    localized platforms), and all of these are variants of ISO2022.
3865
3866    In addition to the above, Emacs handles two more kinds of escape
3867    sequences: ISO6429's direction specification and Emacs' private
3868    sequence for specifying character composition.
3869
3870    ISO6429's direction specification takes the following form:
3871         o CSI ']'      -- end of the current direction
3872         o CSI '0' ']'  -- end of the current direction
3873         o CSI '1' ']'  -- start of left-to-right text
3874         o CSI '2' ']'  -- start of right-to-left text
3875    The control character CSI (0x9B: control sequence introducer) is
3876    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3877
3878    Character composition specification takes the following form:
3879         o ESC '0' -- start character composition
3880         o ESC '1' -- end character composition
3881    Since these are not standard escape sequences of any ISO standard,
3882    their use with these meanings is restricted to Emacs only.  */
3883
3884 static void
3885 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3886 {
3887   int i;
3888
3889   for (i = 0; i < 4; i++)
3890     {
3891       if (!NILP (coding_system))
3892         iso->charset[i] =
3893           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3894       else
3895         iso->charset[i] = Qt;
3896       iso->invalid_designated[i] = 0;
3897     }
3898   iso->esc = ISO_ESC_NOTHING;
3899   iso->esc_bytes_index = 0;
3900   iso->register_left = 0;
3901   iso->register_right = 1;
3902   iso->switched_dir_and_no_valid_charset_yet = 0;
3903   iso->invalid_switch_dir = 0;
3904   iso->output_direction_sequence = 0;
3905   iso->output_literally = 0;
3906 #ifdef ENABLE_COMPOSITE_CHARS
3907   if (iso->composite_chars)
3908     Dynarr_reset (iso->composite_chars);
3909 #endif
3910 }
3911
3912 static int
3913 fit_to_be_escape_quoted (unsigned char c)
3914 {
3915   switch (c)
3916     {
3917     case ISO_CODE_ESC:
3918     case ISO_CODE_CSI:
3919     case ISO_CODE_SS2:
3920     case ISO_CODE_SS3:
3921     case ISO_CODE_SO:
3922     case ISO_CODE_SI:
3923       return 1;
3924
3925     default:
3926       return 0;
3927     }
3928 }
3929
3930 /* Parse one byte of an ISO2022 escape sequence.
3931    If the result is an invalid escape sequence, return 0 and
3932    do not change anything in STR.  Otherwise, if the result is
3933    an incomplete escape sequence, update ISO2022.ESC and
3934    ISO2022.ESC_BYTES and return -1.  Otherwise, update
3935    all the state variables (but not ISO2022.ESC_BYTES) and
3936    return 1.
3937
3938    If CHECK_INVALID_CHARSETS is non-zero, check for designation
3939    or invocation of an invalid character set and treat that as
3940    an unrecognized escape sequence. */
3941
3942 static int
3943 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3944                    unsigned char c, unsigned int *flags,
3945                    int check_invalid_charsets)
3946 {
3947   /* (1) If we're at the end of a designation sequence, CS is the
3948      charset being designated and REG is the register to designate
3949      it to.
3950
3951      (2) If we're at the end of a locking-shift sequence, REG is
3952      the register to invoke and HALF (0 == left, 1 == right) is
3953      the half to invoke it into.
3954
3955      (3) If we're at the end of a single-shift sequence, REG is
3956      the register to invoke. */
3957   Lisp_Object cs = Qnil;
3958   int reg, half;
3959
3960   /* NOTE: This code does goto's all over the fucking place.
3961      The reason for this is that we're basically implementing
3962      a state machine here, and hierarchical languages like C
3963      don't really provide a clean way of doing this. */
3964
3965   if (! (*flags & CODING_STATE_ESCAPE))
3966     /* At beginning of escape sequence; we need to reset our
3967        escape-state variables. */
3968     iso->esc = ISO_ESC_NOTHING;
3969
3970   iso->output_literally = 0;
3971   iso->output_direction_sequence = 0;
3972
3973   switch (iso->esc)
3974     {
3975     case ISO_ESC_NOTHING:
3976       iso->esc_bytes_index = 0;
3977       switch (c)
3978         {
3979         case ISO_CODE_ESC:      /* Start escape sequence */
3980           *flags |= CODING_STATE_ESCAPE;
3981           iso->esc = ISO_ESC;
3982           goto not_done;
3983
3984         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
3985           *flags |= CODING_STATE_ESCAPE;
3986           iso->esc = ISO_ESC_5_11;
3987           goto not_done;
3988
3989         case ISO_CODE_SO:       /* locking shift 1 */
3990           reg = 1; half = 0;
3991           goto locking_shift;
3992         case ISO_CODE_SI:       /* locking shift 0 */
3993           reg = 0; half = 0;
3994           goto locking_shift;
3995
3996         case ISO_CODE_SS2:      /* single shift */
3997           reg = 2;
3998           goto single_shift;
3999         case ISO_CODE_SS3:      /* single shift */
4000           reg = 3;
4001           goto single_shift;
4002
4003         default:                        /* Other control characters */
4004           return 0;
4005         }
4006
4007     case ISO_ESC:
4008       switch (c)
4009         {
4010           /**** single shift ****/
4011
4012         case 'N':       /* single shift 2 */
4013           reg = 2;
4014           goto single_shift;
4015         case 'O':       /* single shift 3 */
4016           reg = 3;
4017           goto single_shift;
4018
4019           /**** locking shift ****/
4020
4021         case '~':       /* locking shift 1 right */
4022           reg = 1; half = 1;
4023           goto locking_shift;
4024         case 'n':       /* locking shift 2 */
4025           reg = 2; half = 0;
4026           goto locking_shift;
4027         case '}':       /* locking shift 2 right */
4028           reg = 2; half = 1;
4029           goto locking_shift;
4030         case 'o':       /* locking shift 3 */
4031           reg = 3; half = 0;
4032           goto locking_shift;
4033         case '|':       /* locking shift 3 right */
4034           reg = 3; half = 1;
4035           goto locking_shift;
4036
4037 #ifdef ENABLE_COMPOSITE_CHARS
4038           /**** composite ****/
4039
4040         case '0':
4041           iso->esc = ISO_ESC_START_COMPOSITE;
4042           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4043             CODING_STATE_COMPOSITE;
4044           return 1;
4045
4046         case '1':
4047           iso->esc = ISO_ESC_END_COMPOSITE;
4048           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4049             ~CODING_STATE_COMPOSITE;
4050           return 1;
4051 #endif /* ENABLE_COMPOSITE_CHARS */
4052
4053           /**** directionality ****/
4054
4055         case '[':
4056           iso->esc = ISO_ESC_5_11;
4057           goto not_done;
4058
4059           /**** designation ****/
4060
4061         case '$':       /* multibyte charset prefix */
4062           iso->esc = ISO_ESC_2_4;
4063           goto not_done;
4064
4065         default:
4066           if (0x28 <= c && c <= 0x2F)
4067             {
4068               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4069               goto not_done;
4070             }
4071
4072           /* This function is called with CODESYS equal to nil when
4073              doing coding-system detection. */
4074           if (!NILP (codesys)
4075               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4076               && fit_to_be_escape_quoted (c))
4077             {
4078               iso->esc = ISO_ESC_LITERAL;
4079               *flags &= CODING_STATE_ISO2022_LOCK;
4080               return 1;
4081             }
4082
4083           /* bzzzt! */
4084           return 0;
4085         }
4086
4087
4088
4089       /**** directionality ****/
4090
4091     case ISO_ESC_5_11:          /* ISO6429 direction control */
4092       if (c == ']')
4093         {
4094           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4095           goto directionality;
4096         }
4097       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4098       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4099       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4100       else               return 0;
4101       goto not_done;
4102
4103     case ISO_ESC_5_11_0:
4104       if (c == ']')
4105         {
4106           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4107           goto directionality;
4108         }
4109       return 0;
4110
4111     case ISO_ESC_5_11_1:
4112       if (c == ']')
4113         {
4114           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4115           goto directionality;
4116         }
4117       return 0;
4118
4119     case ISO_ESC_5_11_2:
4120       if (c == ']')
4121         {
4122           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4123           goto directionality;
4124         }
4125       return 0;
4126
4127     directionality:
4128       iso->esc = ISO_ESC_DIRECTIONALITY;
4129       /* Various junk here to attempt to preserve the direction sequences
4130          literally in the text if they would otherwise be swallowed due
4131          to invalid designations that don't show up as actual charset
4132          changes in the text. */
4133       if (iso->invalid_switch_dir)
4134         {
4135           /* We already inserted a direction switch literally into the
4136              text.  We assume (#### this may not be right) that the
4137              next direction switch is the one going the other way,
4138              and we need to output that literally as well. */
4139           iso->output_literally = 1;
4140           iso->invalid_switch_dir = 0;
4141         }
4142       else
4143         {
4144           int jj;
4145
4146           /* If we are in the thrall of an invalid designation,
4147            then stick the directionality sequence literally into the
4148            output stream so it ends up in the original text again. */
4149           for (jj = 0; jj < 4; jj++)
4150             if (iso->invalid_designated[jj])
4151               break;
4152           if (jj < 4)
4153             {
4154               iso->output_literally = 1;
4155               iso->invalid_switch_dir = 1;
4156             }
4157           else
4158             /* Indicate that we haven't yet seen a valid designation,
4159                so that if a switch-dir is directly followed by an
4160                invalid designation, both get inserted literally. */
4161             iso->switched_dir_and_no_valid_charset_yet = 1;
4162         }
4163       return 1;
4164
4165
4166       /**** designation ****/
4167
4168     case ISO_ESC_2_4:
4169       if (0x28 <= c && c <= 0x2F)
4170         {
4171           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4172           goto not_done;
4173         }
4174       if (0x40 <= c && c <= 0x42)
4175         {
4176           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4177                                       *flags & CODING_STATE_R2L ?
4178                                       CHARSET_RIGHT_TO_LEFT :
4179                                       CHARSET_LEFT_TO_RIGHT);
4180           reg = 0;
4181           goto designated;
4182         }
4183       return 0;
4184
4185     default:
4186       {
4187         int type =-1;
4188
4189         if (c < '0' || c > '~')
4190           return 0; /* bad final byte */
4191
4192         if (iso->esc >= ISO_ESC_2_8 &&
4193             iso->esc <= ISO_ESC_2_15)
4194           {
4195             type = ((iso->esc >= ISO_ESC_2_12) ?
4196                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4197             reg = (iso->esc - ISO_ESC_2_8) & 3;
4198           }
4199         else if (iso->esc >= ISO_ESC_2_4_8 &&
4200                  iso->esc <= ISO_ESC_2_4_15)
4201           {
4202             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4203                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4204             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4205           }
4206         else
4207           {
4208             /* Can this ever be reached? -slb */
4209             abort();
4210           }
4211
4212         cs = CHARSET_BY_ATTRIBUTES (type, c,
4213                                     *flags & CODING_STATE_R2L ?
4214                                     CHARSET_RIGHT_TO_LEFT :
4215                                     CHARSET_LEFT_TO_RIGHT);
4216         goto designated;
4217       }
4218     }
4219
4220  not_done:
4221   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4222   return -1;
4223
4224  single_shift:
4225   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4226     /* can't invoke something that ain't there. */
4227     return 0;
4228   iso->esc = ISO_ESC_SINGLE_SHIFT;
4229   *flags &= CODING_STATE_ISO2022_LOCK;
4230   if (reg == 2)
4231     *flags |= CODING_STATE_SS2;
4232   else
4233     *flags |= CODING_STATE_SS3;
4234   return 1;
4235
4236  locking_shift:
4237   if (check_invalid_charsets &&
4238       !CHARSETP (iso->charset[reg]))
4239     /* can't invoke something that ain't there. */
4240     return 0;
4241   if (half)
4242     iso->register_right = reg;
4243   else
4244     iso->register_left = reg;
4245   *flags &= CODING_STATE_ISO2022_LOCK;
4246   iso->esc = ISO_ESC_LOCKING_SHIFT;
4247   return 1;
4248
4249  designated:
4250   if (NILP (cs) && check_invalid_charsets)
4251     {
4252       iso->invalid_designated[reg] = 1;
4253       iso->charset[reg] = Vcharset_ascii;
4254       iso->esc = ISO_ESC_DESIGNATE;
4255       *flags &= CODING_STATE_ISO2022_LOCK;
4256       iso->output_literally = 1;
4257       if (iso->switched_dir_and_no_valid_charset_yet)
4258         {
4259           /* We encountered a switch-direction followed by an
4260              invalid designation.  Ensure that the switch-direction
4261              gets outputted; otherwise it will probably get eaten
4262              when the text is written out again. */
4263           iso->switched_dir_and_no_valid_charset_yet = 0;
4264           iso->output_direction_sequence = 1;
4265           /* And make sure that the switch-dir going the other
4266              way gets outputted, as well. */
4267           iso->invalid_switch_dir = 1;
4268         }
4269       return 1;
4270     }
4271   /* This function is called with CODESYS equal to nil when
4272      doing coding-system detection. */
4273   if (!NILP (codesys))
4274     {
4275       charset_conversion_spec_dynarr *dyn =
4276         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4277
4278       if (dyn)
4279         {
4280           int i;
4281
4282           for (i = 0; i < Dynarr_length (dyn); i++)
4283             {
4284               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4285               if (EQ (cs, spec->from_charset))
4286                 cs = spec->to_charset;
4287             }
4288         }
4289     }
4290
4291   iso->charset[reg] = cs;
4292   iso->esc = ISO_ESC_DESIGNATE;
4293   *flags &= CODING_STATE_ISO2022_LOCK;
4294   if (iso->invalid_designated[reg])
4295     {
4296       iso->invalid_designated[reg] = 0;
4297       iso->output_literally = 1;
4298     }
4299   if (iso->switched_dir_and_no_valid_charset_yet)
4300     iso->switched_dir_and_no_valid_charset_yet = 0;
4301   return 1;
4302 }
4303
4304 static int
4305 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4306                        unsigned int n)
4307 {
4308   int mask;
4309
4310   /* #### There are serious deficiencies in the recognition mechanism
4311      here.  This needs to be much smarter if it's going to cut it.
4312      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4313      it should be detected as Latin-1.
4314      All the ISO2022 stuff in this file should be synced up with the
4315      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4316      Perhaps we should wait till R2L works in FSF Emacs? */
4317
4318   if (!st->iso2022.initted)
4319     {
4320       reset_iso2022 (Qnil, &st->iso2022.iso);
4321       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4322                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4323                           CODING_CATEGORY_ISO_8_1_MASK |
4324                           CODING_CATEGORY_ISO_8_2_MASK |
4325                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4326       st->iso2022.flags = 0;
4327       st->iso2022.high_byte_count = 0;
4328       st->iso2022.saw_single_shift = 0;
4329       st->iso2022.initted = 1;
4330     }
4331
4332   mask = st->iso2022.mask;
4333
4334   while (n--)
4335     {
4336       int c = *src++;
4337       if (c >= 0xA0)
4338         {
4339           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4340           st->iso2022.high_byte_count++;
4341         }
4342       else
4343         {
4344           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4345             {
4346               if (st->iso2022.high_byte_count & 1)
4347                 /* odd number of high bytes; assume not iso-8-2 */
4348                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4349             }
4350           st->iso2022.high_byte_count = 0;
4351           st->iso2022.saw_single_shift = 0;
4352           if (c > 0x80)
4353             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4354         }
4355       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4356           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4357         { /* control chars */
4358           switch (c)
4359             {
4360               /* Allow and ignore control characters that you might
4361                  reasonably see in a text file */
4362             case '\r':
4363             case '\n':
4364             case '\t':
4365             case  7: /* bell */
4366             case  8: /* backspace */
4367             case 11: /* vertical tab */
4368             case 12: /* form feed */
4369             case 26: /* MS-DOS C-z junk */
4370             case 31: /* '^_' -- for info */
4371               goto label_continue_loop;
4372
4373             default:
4374               break;
4375             }
4376         }
4377
4378       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4379           || BYTE_C1_P (c))
4380         {
4381           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4382                                  &st->iso2022.flags, 0))
4383             {
4384               switch (st->iso2022.iso.esc)
4385                 {
4386                 case ISO_ESC_DESIGNATE:
4387                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4388                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4389                   break;
4390                 case ISO_ESC_LOCKING_SHIFT:
4391                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4392                   goto ran_out_of_chars;
4393                 case ISO_ESC_SINGLE_SHIFT:
4394                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4395                   st->iso2022.saw_single_shift = 1;
4396                   break;
4397                 default:
4398                   break;
4399                 }
4400             }
4401           else
4402             {
4403               mask = 0;
4404               goto ran_out_of_chars;
4405             }
4406         }
4407     label_continue_loop:;
4408     }
4409
4410  ran_out_of_chars:
4411
4412   return mask;
4413 }
4414
4415 static int
4416 postprocess_iso2022_mask (int mask)
4417 {
4418   /* #### kind of cheesy */
4419   /* If seven-bit ISO is allowed, then assume that the encoding is
4420      entirely seven-bit and turn off the eight-bit ones. */
4421   if (mask & CODING_CATEGORY_ISO_7_MASK)
4422     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4423                CODING_CATEGORY_ISO_8_1_MASK |
4424                CODING_CATEGORY_ISO_8_2_MASK);
4425   return mask;
4426 }
4427
4428 /* If FLAGS is a null pointer or specifies right-to-left motion,
4429    output a switch-dir-to-left-to-right sequence to DST.
4430    Also update FLAGS if it is not a null pointer.
4431    If INTERNAL_P is set, we are outputting in internal format and
4432    need to handle the CSI differently. */
4433
4434 static void
4435 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4436                                  unsigned_char_dynarr *dst,
4437                                  unsigned int *flags,
4438                                  int internal_p)
4439 {
4440   if (!flags || (*flags & CODING_STATE_R2L))
4441     {
4442       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4443         {
4444           Dynarr_add (dst, ISO_CODE_ESC);
4445           Dynarr_add (dst, '[');
4446         }
4447       else if (internal_p)
4448         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4449       else
4450         Dynarr_add (dst, ISO_CODE_CSI);
4451       Dynarr_add (dst, '0');
4452       Dynarr_add (dst, ']');
4453       if (flags)
4454         *flags &= ~CODING_STATE_R2L;
4455     }
4456 }
4457
4458 /* If FLAGS is a null pointer or specifies a direction different from
4459    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4460    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4461    sequence to DST.  Also update FLAGS if it is not a null pointer.
4462    If INTERNAL_P is set, we are outputting in internal format and
4463    need to handle the CSI differently. */
4464
4465 static void
4466 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4467                           unsigned_char_dynarr *dst, unsigned int *flags,
4468                           int internal_p)
4469 {
4470   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4471       direction == CHARSET_LEFT_TO_RIGHT)
4472     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4473   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4474            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4475            direction == CHARSET_RIGHT_TO_LEFT)
4476     {
4477       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4478         {
4479           Dynarr_add (dst, ISO_CODE_ESC);
4480           Dynarr_add (dst, '[');
4481         }
4482       else if (internal_p)
4483         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4484       else
4485         Dynarr_add (dst, ISO_CODE_CSI);
4486       Dynarr_add (dst, '2');
4487       Dynarr_add (dst, ']');
4488       if (flags)
4489         *flags |= CODING_STATE_R2L;
4490     }
4491 }
4492
4493 /* Convert ISO2022-format data to internal format. */
4494
4495 static void
4496 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4497                        unsigned_char_dynarr *dst, unsigned int n)
4498 {
4499   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4500   unsigned int flags  = str->flags;
4501   unsigned int ch     = str->ch;
4502   eol_type_t eol_type = str->eol_type;
4503 #ifdef ENABLE_COMPOSITE_CHARS
4504   unsigned_char_dynarr *real_dst = dst;
4505 #endif
4506   Lisp_Object coding_system;
4507
4508   XSETCODING_SYSTEM (coding_system, str->codesys);
4509
4510 #ifdef ENABLE_COMPOSITE_CHARS
4511   if (flags & CODING_STATE_COMPOSITE)
4512     dst = str->iso2022.composite_chars;
4513 #endif /* ENABLE_COMPOSITE_CHARS */
4514
4515   while (n--)
4516     {
4517       unsigned char c = *src++;
4518       if (flags & CODING_STATE_ESCAPE)
4519         {       /* Within ESC sequence */
4520           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4521                                           c, &flags, 1);
4522
4523           if (retval)
4524             {
4525               switch (str->iso2022.esc)
4526                 {
4527 #ifdef ENABLE_COMPOSITE_CHARS
4528                 case ISO_ESC_START_COMPOSITE:
4529                   if (str->iso2022.composite_chars)
4530                     Dynarr_reset (str->iso2022.composite_chars);
4531                   else
4532                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4533                   dst = str->iso2022.composite_chars;
4534                   break;
4535                 case ISO_ESC_END_COMPOSITE:
4536                   {
4537                     Bufbyte comstr[MAX_EMCHAR_LEN];
4538                     Bytecount len;
4539                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4540                                                          Dynarr_length (dst));
4541                     dst = real_dst;
4542                     len = set_charptr_emchar (comstr, emch);
4543                     Dynarr_add_many (dst, comstr, len);
4544                     break;
4545                   }
4546 #endif /* ENABLE_COMPOSITE_CHARS */
4547
4548                 case ISO_ESC_LITERAL:
4549                   DECODE_ADD_BINARY_CHAR (c, dst);
4550                   break;
4551
4552                 default:
4553                   /* Everything else handled already */
4554                   break;
4555                 }
4556             }
4557
4558           /* Attempted error recovery. */
4559           if (str->iso2022.output_direction_sequence)
4560             ensure_correct_direction (flags & CODING_STATE_R2L ?
4561                                       CHARSET_RIGHT_TO_LEFT :
4562                                       CHARSET_LEFT_TO_RIGHT,
4563                                       str->codesys, dst, 0, 1);
4564           /* More error recovery. */
4565           if (!retval || str->iso2022.output_literally)
4566             {
4567               /* Output the (possibly invalid) sequence */
4568               int i;
4569               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4570                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4571               flags &= CODING_STATE_ISO2022_LOCK;
4572               if (!retval)
4573                 n++, src--;/* Repeat the loop with the same character. */
4574               else
4575                 {
4576                   /* No sense in reprocessing the final byte of the
4577                      escape sequence; it could mess things up anyway.
4578                      Just add it now. */
4579                   DECODE_ADD_BINARY_CHAR (c, dst);
4580                 }
4581             }
4582           ch = 0;
4583         }
4584       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4585         { /* Control characters */
4586
4587           /***** Error-handling *****/
4588
4589           /* If we were in the middle of a character, dump out the
4590              partial character. */
4591           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4592
4593           /* If we just saw a single-shift character, dump it out.
4594              This may dump out the wrong sort of single-shift character,
4595              but least it will give an indication that something went
4596              wrong. */
4597           if (flags & CODING_STATE_SS2)
4598             {
4599               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4600               flags &= ~CODING_STATE_SS2;
4601             }
4602           if (flags & CODING_STATE_SS3)
4603             {
4604               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4605               flags &= ~CODING_STATE_SS3;
4606             }
4607
4608           /***** Now handle the control characters. *****/
4609
4610           /* Handle CR/LF */
4611           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4612
4613           flags &= CODING_STATE_ISO2022_LOCK;
4614
4615           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4616             DECODE_ADD_BINARY_CHAR (c, dst);
4617         }
4618       else
4619         {                       /* Graphic characters */
4620           Lisp_Object charset;
4621           int lb;
4622           int reg;
4623
4624           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4625
4626           /* Now determine the charset. */
4627           reg = ((flags & CODING_STATE_SS2) ? 2
4628                  : (flags & CODING_STATE_SS3) ? 3
4629                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4630                  : str->iso2022.register_left);
4631           charset = str->iso2022.charset[reg];
4632
4633           /* Error checking: */
4634           if (! CHARSETP (charset)
4635               || str->iso2022.invalid_designated[reg]
4636               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4637                   && XCHARSET_CHARS (charset) == 94))
4638             /* Mrmph.  We are trying to invoke a register that has no
4639                or an invalid charset in it, or trying to add a character
4640                outside the range of the charset.  Insert that char literally
4641                to preserve it for the output. */
4642             {
4643               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4644               DECODE_ADD_BINARY_CHAR (c, dst);
4645             }
4646
4647           else
4648             {
4649               /* Things are probably hunky-dorey. */
4650
4651               /* Fetch reverse charset, maybe. */
4652               if (((flags & CODING_STATE_R2L) &&
4653                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4654                   ||
4655                   (!(flags & CODING_STATE_R2L) &&
4656                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4657                 {
4658                   Lisp_Object new_charset =
4659                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4660                   if (!NILP (new_charset))
4661                     charset = new_charset;
4662                 }
4663
4664               lb = XCHARSET_LEADING_BYTE (charset);
4665               switch (XCHARSET_REP_BYTES (charset))
4666                 {
4667                 case 1: /* ASCII */
4668                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4669                   Dynarr_add (dst, c & 0x7F);
4670                   break;
4671
4672                 case 2: /* one-byte official */
4673                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4674                   Dynarr_add (dst, lb);
4675                   Dynarr_add (dst, c | 0x80);
4676                   break;
4677
4678                 case 3: /* one-byte private or two-byte official */
4679                   if (XCHARSET_PRIVATE_P (charset))
4680                     {
4681                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4682                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4683                       Dynarr_add (dst, lb);
4684                       Dynarr_add (dst, c | 0x80);
4685                     }
4686                   else
4687                     {
4688                       if (ch)
4689                         {
4690                           Dynarr_add (dst, lb);
4691                           Dynarr_add (dst, ch | 0x80);
4692                           Dynarr_add (dst, c | 0x80);
4693                           ch = 0;
4694                         }
4695                       else
4696                         ch = c;
4697                     }
4698                   break;
4699
4700                 default:        /* two-byte private */
4701                   if (ch)
4702                     {
4703                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4704                       Dynarr_add (dst, lb);
4705                       Dynarr_add (dst, ch | 0x80);
4706                       Dynarr_add (dst, c | 0x80);
4707                       ch = 0;
4708                     }
4709                   else
4710                     ch = c;
4711                 }
4712             }
4713
4714           if (!ch)
4715             flags &= CODING_STATE_ISO2022_LOCK;
4716         }
4717
4718     label_continue_loop:;
4719     }
4720
4721   if (flags & CODING_STATE_END)
4722     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4723
4724   str->flags = flags;
4725   str->ch    = ch;
4726 }
4727
4728
4729 /***** ISO2022 encoder *****/
4730
4731 /* Designate CHARSET into register REG. */
4732
4733 static void
4734 iso2022_designate (Lisp_Object charset, unsigned char reg,
4735                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4736 {
4737   static CONST char inter94[] = "()*+";
4738   static CONST char inter96[] = ",-./";
4739   unsigned int type;
4740   unsigned char final;
4741   Lisp_Object old_charset = str->iso2022.charset[reg];
4742
4743   str->iso2022.charset[reg] = charset;
4744   if (!CHARSETP (charset))
4745     /* charset might be an initial nil or t. */
4746     return;
4747   type = XCHARSET_TYPE (charset);
4748   final = XCHARSET_FINAL (charset);
4749   if (!str->iso2022.force_charset_on_output[reg] &&
4750       CHARSETP (old_charset) &&
4751       XCHARSET_TYPE (old_charset) == type &&
4752       XCHARSET_FINAL (old_charset) == final)
4753     return;
4754
4755   str->iso2022.force_charset_on_output[reg] = 0;
4756
4757   {
4758     charset_conversion_spec_dynarr *dyn =
4759       str->codesys->iso2022.output_conv;
4760
4761     if (dyn)
4762       {
4763         int i;
4764
4765         for (i = 0; i < Dynarr_length (dyn); i++)
4766           {
4767             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4768             if (EQ (charset, spec->from_charset))
4769                 charset = spec->to_charset;
4770           }
4771       }
4772   }
4773
4774   Dynarr_add (dst, ISO_CODE_ESC);
4775   switch (type)
4776     {
4777     case CHARSET_TYPE_94:
4778       Dynarr_add (dst, inter94[reg]);
4779       break;
4780     case CHARSET_TYPE_96:
4781       Dynarr_add (dst, inter96[reg]);
4782       break;
4783     case CHARSET_TYPE_94X94:
4784       Dynarr_add (dst, '$');
4785       if (reg != 0
4786           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4787           || final < '@'
4788           || final > 'B')
4789         Dynarr_add (dst, inter94[reg]);
4790       break;
4791     case CHARSET_TYPE_96X96:
4792       Dynarr_add (dst, '$');
4793       Dynarr_add (dst, inter96[reg]);
4794       break;
4795     }
4796   Dynarr_add (dst, final);
4797 }
4798
4799 static void
4800 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4801 {
4802   if (str->iso2022.register_left != 0)
4803     {
4804       Dynarr_add (dst, ISO_CODE_SI);
4805       str->iso2022.register_left = 0;
4806     }
4807 }
4808
4809 static void
4810 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4811 {
4812   if (str->iso2022.register_left != 1)
4813     {
4814       Dynarr_add (dst, ISO_CODE_SO);
4815       str->iso2022.register_left = 1;
4816     }
4817 }
4818
4819 /* Convert internally-formatted data to ISO2022 format. */
4820
4821 static void
4822 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4823                        unsigned_char_dynarr *dst, unsigned int n)
4824 {
4825   unsigned char charmask, c;
4826   unsigned char char_boundary;
4827   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4828   unsigned int flags          = str->flags;
4829   unsigned int ch             = str->ch;
4830   Lisp_Coding_System *codesys = str->codesys;
4831   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4832   int i;
4833   Lisp_Object charset;
4834   int half;
4835
4836 #ifdef ENABLE_COMPOSITE_CHARS
4837   /* flags for handling composite chars.  We do a little switcharoo
4838      on the source while we're outputting the composite char. */
4839   unsigned int saved_n = 0;
4840   CONST unsigned char *saved_src = NULL;
4841   int in_composite = 0;
4842 #endif /* ENABLE_COMPOSITE_CHARS */
4843
4844   char_boundary = str->iso2022.current_char_boundary;
4845   charset = str->iso2022.current_charset;
4846   half = str->iso2022.current_half;
4847
4848 #ifdef ENABLE_COMPOSITE_CHARS
4849  back_to_square_n:
4850 #endif
4851   while (n--)
4852     {
4853       c = *src++;
4854
4855       if (BYTE_ASCII_P (c))
4856         {               /* Processing ASCII character */
4857           ch = 0;
4858
4859           restore_left_to_right_direction (codesys, dst, &flags, 0);
4860
4861           /* Make sure G0 contains ASCII */
4862           if ((c > ' ' && c < ISO_CODE_DEL) ||
4863               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4864             {
4865               ensure_normal_shift (str, dst);
4866               iso2022_designate (Vcharset_ascii, 0, str, dst);
4867             }
4868
4869           /* If necessary, restore everything to the default state
4870              at end-of-line */
4871           if (c == '\n' &&
4872               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4873             {
4874               restore_left_to_right_direction (codesys, dst, &flags, 0);
4875
4876               ensure_normal_shift (str, dst);
4877
4878               for (i = 0; i < 4; i++)
4879                 {
4880                   Lisp_Object initial_charset =
4881                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4882                   iso2022_designate (initial_charset, i, str, dst);
4883                 }
4884             }
4885           if (c == '\n')
4886             {
4887               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4888                 Dynarr_add (dst, '\r');
4889               if (eol_type != EOL_CR)
4890                 Dynarr_add (dst, c);
4891             }
4892           else
4893             {
4894               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4895                   && fit_to_be_escape_quoted (c))
4896                 Dynarr_add (dst, ISO_CODE_ESC);
4897               Dynarr_add (dst, c);
4898             }
4899           char_boundary = 1;
4900         }
4901
4902       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4903         { /* Processing Leading Byte */
4904           ch = 0;
4905           charset = CHARSET_BY_LEADING_BYTE (c);
4906           if (LEADING_BYTE_PREFIX_P(c))
4907             ch = c;
4908           else if (!EQ (charset, Vcharset_control_1)
4909 #ifdef ENABLE_COMPOSITE_CHARS
4910                    && !EQ (charset, Vcharset_composite)
4911 #endif
4912                    )
4913             {
4914               int reg;
4915
4916               ensure_correct_direction (XCHARSET_DIRECTION (charset),
4917                                         codesys, dst, &flags, 0);
4918
4919               /* Now determine which register to use. */
4920               reg = -1;
4921               for (i = 0; i < 4; i++)
4922                 {
4923                   if (EQ (charset, str->iso2022.charset[i]) ||
4924                       EQ (charset,
4925                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4926                     {
4927                       reg = i;
4928                       break;
4929                     }
4930                 }
4931
4932               if (reg == -1)
4933                 {
4934                   if (XCHARSET_GRAPHIC (charset) != 0)
4935                     {
4936                       if (!NILP (str->iso2022.charset[1]) &&
4937                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4938                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4939                         reg = 1;
4940                       else if (!NILP (str->iso2022.charset[2]))
4941                         reg = 2;
4942                       else if (!NILP (str->iso2022.charset[3]))
4943                         reg = 3;
4944                       else
4945                         reg = 0;
4946                     }
4947                   else
4948                     reg = 0;
4949                 }
4950
4951               iso2022_designate (charset, reg, str, dst);
4952
4953               /* Now invoke that register. */
4954               switch (reg)
4955                 {
4956                 case 0:
4957                   ensure_normal_shift (str, dst);
4958                   half = 0;
4959                   break;
4960
4961                 case 1:
4962                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4963                     {
4964                       ensure_shift_out (str, dst);
4965                       half = 0;
4966                     }
4967                   else
4968                     half = 1;
4969                   break;
4970
4971                 case 2:
4972                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4973                     {
4974                       Dynarr_add (dst, ISO_CODE_ESC);
4975                       Dynarr_add (dst, 'N');
4976                       half = 0;
4977                     }
4978                   else
4979                     {
4980                       Dynarr_add (dst, ISO_CODE_SS2);
4981                       half = 1;
4982                     }
4983                   break;
4984
4985                 case 3:
4986                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4987                     {
4988                       Dynarr_add (dst, ISO_CODE_ESC);
4989                       Dynarr_add (dst, 'O');
4990                       half = 0;
4991                     }
4992                   else
4993                     {
4994                       Dynarr_add (dst, ISO_CODE_SS3);
4995                       half = 1;
4996                     }
4997                   break;
4998
4999                 default:
5000                   abort ();
5001                 }
5002             }
5003           char_boundary = 0;
5004         }
5005       else
5006         {                       /* Processing Non-ASCII character */
5007           charmask = (half == 0 ? 0x7F : 0xFF);
5008           char_boundary = 1;
5009           if (EQ (charset, Vcharset_control_1))
5010             {
5011               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5012                   && fit_to_be_escape_quoted (c))
5013                 Dynarr_add (dst, ISO_CODE_ESC);
5014               /* you asked for it ... */
5015               Dynarr_add (dst, c - 0x20);
5016             }
5017           else
5018             {
5019               switch (XCHARSET_REP_BYTES (charset))
5020                 {
5021                 case 2:
5022                   Dynarr_add (dst, c & charmask);
5023                   break;
5024                 case 3:
5025                   if (XCHARSET_PRIVATE_P (charset))
5026                     {
5027                       Dynarr_add (dst, c & charmask);
5028                       ch = 0;
5029                     }
5030                   else if (ch)
5031                     {
5032 #ifdef ENABLE_COMPOSITE_CHARS
5033                       if (EQ (charset, Vcharset_composite))
5034                         {
5035                           if (in_composite)
5036                             {
5037                               /* #### Bother! We don't know how to
5038                                  handle this yet. */
5039                               Dynarr_add (dst, '~');
5040                             }
5041                           else
5042                             {
5043                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5044                                                        ch & 0x7F, c & 0x7F);
5045                               Lisp_Object lstr = composite_char_string (emch);
5046                               saved_n = n;
5047                               saved_src = src;
5048                               in_composite = 1;
5049                               src = XSTRING_DATA   (lstr);
5050                               n   = XSTRING_LENGTH (lstr);
5051                               Dynarr_add (dst, ISO_CODE_ESC);
5052                               Dynarr_add (dst, '0'); /* start composing */
5053                             }
5054                         }
5055                       else
5056 #endif /* ENABLE_COMPOSITE_CHARS */
5057                         {
5058                           Dynarr_add (dst, ch & charmask);
5059                           Dynarr_add (dst, c & charmask);
5060                         }
5061                       ch = 0;
5062                     }
5063                   else
5064                     {
5065                       ch = c;
5066                       char_boundary = 0;
5067                     }
5068                   break;
5069                 case 4:
5070                   if (ch)
5071                     {
5072                       Dynarr_add (dst, ch & charmask);
5073                       Dynarr_add (dst, c & charmask);
5074                       ch = 0;
5075                     }
5076                   else
5077                     {
5078                       ch = c;
5079                       char_boundary = 0;
5080                     }
5081                   break;
5082                 default:
5083                   abort ();
5084                 }
5085             }
5086         }
5087     }
5088
5089 #ifdef ENABLE_COMPOSITE_CHARS
5090   if (in_composite)
5091     {
5092       n = saved_n;
5093       src = saved_src;
5094       in_composite = 0;
5095       Dynarr_add (dst, ISO_CODE_ESC);
5096       Dynarr_add (dst, '1'); /* end composing */
5097       goto back_to_square_n; /* Wheeeeeeeee ..... */
5098     }
5099 #endif /* ENABLE_COMPOSITE_CHARS */
5100
5101   if (char_boundary && flags & CODING_STATE_END)
5102     {
5103       restore_left_to_right_direction (codesys, dst, &flags, 0);
5104       ensure_normal_shift (str, dst);
5105       for (i = 0; i < 4; i++)
5106         {
5107           Lisp_Object initial_charset =
5108             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5109           iso2022_designate (initial_charset, i, str, dst);
5110         }
5111     }
5112
5113   str->flags = flags;
5114   str->ch    = ch;
5115   str->iso2022.current_char_boundary = char_boundary;
5116   str->iso2022.current_charset = charset;
5117   str->iso2022.current_half = half;
5118
5119   /* Verbum caro factum est! */
5120 }
5121 #endif /* MULE */
5122 \f
5123 /************************************************************************/
5124 /*                     No-conversion methods                            */
5125 /************************************************************************/
5126
5127 /* This is used when reading in "binary" files -- i.e. files that may
5128    contain all 256 possible byte values and that are not to be
5129    interpreted as being in any particular decoding. */
5130 static void
5131 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5132                              unsigned_char_dynarr *dst, unsigned int n)
5133 {
5134   unsigned char c;
5135   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5136   unsigned int flags  = str->flags;
5137   unsigned int ch     = str->ch;
5138   eol_type_t eol_type = str->eol_type;
5139
5140   while (n--)
5141     {
5142       c = *src++;
5143
5144       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5145       DECODE_ADD_BINARY_CHAR (c, dst);
5146     label_continue_loop:;
5147     }
5148
5149   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5150
5151   str->flags = flags;
5152   str->ch    = ch;
5153 }
5154
5155 static void
5156 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5157                              unsigned_char_dynarr *dst, unsigned int n)
5158 {
5159   unsigned char c;
5160   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5161   unsigned int flags  = str->flags;
5162   unsigned int ch     = str->ch;
5163   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5164
5165   while (n--)
5166     {
5167       c = *src++;
5168       if (c == '\n')
5169         {
5170           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5171             Dynarr_add (dst, '\r');
5172           if (eol_type != EOL_CR)
5173             Dynarr_add (dst, '\n');
5174           ch = 0;
5175         }
5176       else if (BYTE_ASCII_P (c))
5177         {
5178           assert (ch == 0);
5179           Dynarr_add (dst, c);
5180         }
5181       else if (BUFBYTE_LEADING_BYTE_P (c))
5182         {
5183           assert (ch == 0);
5184           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5185               c == LEADING_BYTE_CONTROL_1)
5186             ch = c;
5187           else
5188             Dynarr_add (dst, '~'); /* untranslatable character */
5189         }
5190       else
5191         {
5192           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5193             Dynarr_add (dst, c);
5194           else if (ch == LEADING_BYTE_CONTROL_1)
5195             {
5196               assert (c < 0xC0);
5197               Dynarr_add (dst, c - 0x20);
5198             }
5199           /* else it should be the second or third byte of an
5200              untranslatable character, so ignore it */
5201           ch = 0;
5202         }
5203     }
5204
5205   str->flags = flags;
5206   str->ch    = ch;
5207 }
5208
5209 \f
5210 /************************************************************************/
5211 /*                   Simple internal/external functions                 */
5212 /************************************************************************/
5213
5214 static Extbyte_dynarr *conversion_out_dynarr;
5215 static Bufbyte_dynarr *conversion_in_dynarr;
5216
5217 /* Determine coding system from coding format */
5218
5219 /* #### not correct for all values of `fmt'! */
5220 static Lisp_Object
5221 external_data_format_to_coding_system (enum external_data_format fmt)
5222 {
5223   switch (fmt)
5224     {
5225     case FORMAT_FILENAME:
5226     case FORMAT_TERMINAL:
5227       if (EQ (Vfile_name_coding_system, Qnil) ||
5228           EQ (Vfile_name_coding_system, Qbinary))
5229         return Qnil;
5230       else
5231         return Fget_coding_system (Vfile_name_coding_system);
5232 #ifdef MULE
5233     case FORMAT_CTEXT:
5234       return Fget_coding_system (Qctext);
5235 #endif
5236     default:
5237       return Qnil;
5238     }
5239 }
5240
5241 Extbyte *
5242 convert_to_external_format (CONST Bufbyte *ptr,
5243                             Bytecount len,
5244                             Extcount *len_out,
5245                             enum external_data_format fmt)
5246 {
5247   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5248
5249   if (!conversion_out_dynarr)
5250     conversion_out_dynarr = Dynarr_new (Extbyte);
5251   else
5252     Dynarr_reset (conversion_out_dynarr);
5253
5254   if (NILP (coding_system))
5255     {
5256       CONST Bufbyte *end = ptr + len;
5257
5258       for (; ptr < end;)
5259         {
5260           Bufbyte c =
5261             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5262             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5263             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5264             '~';
5265
5266           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5267           INC_CHARPTR (ptr);
5268         }
5269
5270 #ifdef ERROR_CHECK_BUFPOS
5271       assert (ptr == end);
5272 #endif
5273     }
5274   else
5275     {
5276       Lisp_Object instream, outstream, da_outstream;
5277       Lstream *istr, *ostr;
5278       struct gcpro gcpro1, gcpro2, gcpro3;
5279       char tempbuf[1024]; /* some random amount */
5280
5281       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5282       da_outstream = make_dynarr_output_stream
5283         ((unsigned_char_dynarr *) conversion_out_dynarr);
5284       outstream =
5285         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5286       istr = XLSTREAM (instream);
5287       ostr = XLSTREAM (outstream);
5288       GCPRO3 (instream, outstream, da_outstream);
5289       while (1)
5290         {
5291           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5292           if (!size_in_bytes)
5293             break;
5294           Lstream_write (ostr, tempbuf, size_in_bytes);
5295         }
5296       Lstream_close (istr);
5297       Lstream_close (ostr);
5298       UNGCPRO;
5299       Lstream_delete (istr);
5300       Lstream_delete (ostr);
5301       Lstream_delete (XLSTREAM (da_outstream));
5302     }
5303
5304   *len_out = Dynarr_length (conversion_out_dynarr);
5305   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5306   return Dynarr_atp (conversion_out_dynarr, 0);
5307 }
5308
5309 Bufbyte *
5310 convert_from_external_format (CONST Extbyte *ptr,
5311                               Extcount len,
5312                               Bytecount *len_out,
5313                               enum external_data_format fmt)
5314 {
5315   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5316
5317   if (!conversion_in_dynarr)
5318     conversion_in_dynarr = Dynarr_new (Bufbyte);
5319   else
5320     Dynarr_reset (conversion_in_dynarr);
5321
5322   if (NILP (coding_system))
5323     {
5324       CONST Extbyte *end = ptr + len;
5325       for (; ptr < end; ptr++)
5326         {
5327           Extbyte c = *ptr;
5328           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5329         }
5330     }
5331   else
5332     {
5333       Lisp_Object instream, outstream, da_outstream;
5334       Lstream *istr, *ostr;
5335       struct gcpro gcpro1, gcpro2, gcpro3;
5336       char tempbuf[1024]; /* some random amount */
5337
5338       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5339       da_outstream = make_dynarr_output_stream
5340         ((unsigned_char_dynarr *) conversion_in_dynarr);
5341       outstream =
5342         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5343       istr = XLSTREAM (instream);
5344       ostr = XLSTREAM (outstream);
5345       GCPRO3 (instream, outstream, da_outstream);
5346       while (1)
5347         {
5348           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5349           if (!size_in_bytes)
5350             break;
5351           Lstream_write (ostr, tempbuf, size_in_bytes);
5352         }
5353       Lstream_close (istr);
5354       Lstream_close (ostr);
5355       UNGCPRO;
5356       Lstream_delete (istr);
5357       Lstream_delete (ostr);
5358       Lstream_delete (XLSTREAM (da_outstream));
5359     }
5360
5361   *len_out = Dynarr_length (conversion_in_dynarr);
5362   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5363   return Dynarr_atp (conversion_in_dynarr, 0);
5364 }
5365
5366 \f
5367 /************************************************************************/
5368 /*                             Initialization                           */
5369 /************************************************************************/
5370
5371 void
5372 syms_of_mule_coding (void)
5373 {
5374   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5375   deferror (&Qcoding_system_error, "coding-system-error",
5376             "Coding-system error", Qio_error);
5377
5378   DEFSUBR (Fcoding_system_p);
5379   DEFSUBR (Ffind_coding_system);
5380   DEFSUBR (Fget_coding_system);
5381   DEFSUBR (Fcoding_system_list);
5382   DEFSUBR (Fcoding_system_name);
5383   DEFSUBR (Fmake_coding_system);
5384   DEFSUBR (Fcopy_coding_system);
5385   DEFSUBR (Fsubsidiary_coding_system);
5386
5387   DEFSUBR (Fcoding_system_type);
5388   DEFSUBR (Fcoding_system_doc_string);
5389 #ifdef MULE
5390   DEFSUBR (Fcoding_system_charset);
5391 #endif
5392   DEFSUBR (Fcoding_system_property);
5393
5394   DEFSUBR (Fcoding_category_list);
5395   DEFSUBR (Fset_coding_priority_list);
5396   DEFSUBR (Fcoding_priority_list);
5397   DEFSUBR (Fset_coding_category_system);
5398   DEFSUBR (Fcoding_category_system);
5399
5400   DEFSUBR (Fdetect_coding_region);
5401   DEFSUBR (Fdecode_coding_region);
5402   DEFSUBR (Fencode_coding_region);
5403 #ifdef MULE
5404   DEFSUBR (Fdecode_shift_jis_char);
5405   DEFSUBR (Fencode_shift_jis_char);
5406   DEFSUBR (Fdecode_big5_char);
5407   DEFSUBR (Fencode_big5_char);
5408   DEFSUBR (Fset_ucs_char);
5409   DEFSUBR (Fucs_char);
5410   DEFSUBR (Fset_char_ucs);
5411   DEFSUBR (Fchar_ucs);
5412 #endif /* MULE */
5413   defsymbol (&Qcoding_system_p, "coding-system-p");
5414   defsymbol (&Qno_conversion, "no-conversion");
5415 #ifdef MULE
5416   defsymbol (&Qbig5, "big5");
5417   defsymbol (&Qshift_jis, "shift-jis");
5418   defsymbol (&Qucs4, "ucs-4");
5419   defsymbol (&Qutf8, "utf-8");
5420   defsymbol (&Qccl, "ccl");
5421   defsymbol (&Qiso2022, "iso2022");
5422 #endif /* MULE */
5423   defsymbol (&Qmnemonic, "mnemonic");
5424   defsymbol (&Qeol_type, "eol-type");
5425   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5426   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5427
5428   defsymbol (&Qcr, "cr");
5429   defsymbol (&Qlf, "lf");
5430   defsymbol (&Qcrlf, "crlf");
5431   defsymbol (&Qeol_cr, "eol-cr");
5432   defsymbol (&Qeol_lf, "eol-lf");
5433   defsymbol (&Qeol_crlf, "eol-crlf");
5434 #ifdef MULE
5435   defsymbol (&Qcharset_g0, "charset-g0");
5436   defsymbol (&Qcharset_g1, "charset-g1");
5437   defsymbol (&Qcharset_g2, "charset-g2");
5438   defsymbol (&Qcharset_g3, "charset-g3");
5439   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5440   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5441   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5442   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5443   defsymbol (&Qno_iso6429, "no-iso6429");
5444   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5445   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5446
5447   defsymbol (&Qshort, "short");
5448   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5449   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5450   defsymbol (&Qseven, "seven");
5451   defsymbol (&Qlock_shift, "lock-shift");
5452   defsymbol (&Qescape_quoted, "escape-quoted");
5453 #endif /* MULE */
5454   defsymbol (&Qencode, "encode");
5455   defsymbol (&Qdecode, "decode");
5456
5457 #ifdef MULE
5458   defsymbol (&Qctext, "ctext");
5459   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5460              "shift-jis");
5461   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5462              "big5");
5463   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5464              "ucs-4");
5465   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5466              "utf-8");
5467   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5468              "iso-7");
5469   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5470              "iso-8-designate");
5471   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5472              "iso-8-1");
5473   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5474              "iso-8-2");
5475   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5476              "iso-lock-shift");
5477 #endif /* MULE */
5478   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5479              "no-conversion");
5480 }
5481
5482 void
5483 lstream_type_create_mule_coding (void)
5484 {
5485   LSTREAM_HAS_METHOD (decoding, reader);
5486   LSTREAM_HAS_METHOD (decoding, writer);
5487   LSTREAM_HAS_METHOD (decoding, rewinder);
5488   LSTREAM_HAS_METHOD (decoding, seekable_p);
5489   LSTREAM_HAS_METHOD (decoding, flusher);
5490   LSTREAM_HAS_METHOD (decoding, closer);
5491   LSTREAM_HAS_METHOD (decoding, marker);
5492
5493   LSTREAM_HAS_METHOD (encoding, reader);
5494   LSTREAM_HAS_METHOD (encoding, writer);
5495   LSTREAM_HAS_METHOD (encoding, rewinder);
5496   LSTREAM_HAS_METHOD (encoding, seekable_p);
5497   LSTREAM_HAS_METHOD (encoding, flusher);
5498   LSTREAM_HAS_METHOD (encoding, closer);
5499   LSTREAM_HAS_METHOD (encoding, marker);
5500 }
5501
5502 void
5503 vars_of_mule_coding (void)
5504 {
5505   int i;
5506
5507   /* Initialize to something reasonable ... */
5508   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5509     {
5510       coding_category_system[i] = Qnil;
5511       coding_category_by_priority[i] = i;
5512     }
5513
5514   Fprovide (intern ("file-coding"));
5515
5516   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5517 Coding system used for TTY keyboard input.
5518 Not used under a windowing system.
5519 */ );
5520   Vkeyboard_coding_system = Qnil;
5521
5522   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5523 Coding system used for TTY display output.
5524 Not used under a windowing system.
5525 */ );
5526   Vterminal_coding_system = Qnil;
5527
5528   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5529 Overriding coding system used when writing a file or process.
5530 You should *bind* this, not set it.  If this is non-nil, it specifies
5531 the coding system that will be used when a file or process is read
5532 in, and overrides `buffer-file-coding-system-for-read',
5533 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5534 this one for permanent changes to the environment.
5535 */ );
5536   Vcoding_system_for_read = Qnil;
5537
5538   DEFVAR_LISP ("coding-system-for-write",
5539                &Vcoding_system_for_write /*
5540 Overriding coding system used when writing a file or process.
5541 You should *bind* this, not set it.  If this is non-nil, it specifies
5542 the coding system that will be used when a file or process is wrote
5543 in, and overrides `buffer-file-coding-system',
5544 `write-region-pre-hook', etc.  Use those variables instead of this one
5545 for permanent changes to the environment.
5546 */ );
5547   Vcoding_system_for_write = Qnil;
5548
5549   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5550 Coding system used to convert pathnames when accessing files.
5551 */ );
5552   Vfile_name_coding_system = Qnil;
5553
5554   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5555 Non-nil means the buffer contents are regarded as multi-byte form
5556 of characters, not a binary code.  This affects the display, file I/O,
5557 and behaviors of various editing commands.
5558
5559 Setting this to nil does not do anything.
5560 */ );
5561   enable_multibyte_characters = 1;
5562 }
5563
5564 void
5565 complex_vars_of_mule_coding (void)
5566 {
5567   staticpro (&Vcoding_system_hash_table);
5568   Vcoding_system_hash_table =
5569     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5570
5571   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5572
5573 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5574 {                                               \
5575   struct codesys_prop csp;                      \
5576   csp.sym = (Sym);                              \
5577   csp.prop_type = (Prop_Type);                  \
5578   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5579 } while (0)
5580
5581   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5582   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5583   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5584   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5585   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5586   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5587   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5588 #ifdef MULE
5589   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5590   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5591   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5592   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5593   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5594   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5595   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5596   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5597   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5598   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5599   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5600   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5601   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5602   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5603   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5604   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5605   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5606
5607   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5608   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5609 #endif /* MULE */
5610   /* Need to create this here or we're really screwed. */
5611   Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
5612                        list2 (Qmnemonic, build_string ("Noconv")));
5613
5614   Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
5615                        Qbinary);
5616
5617   /* Need this for bootstrapping */
5618   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5619     Fget_coding_system (Qno_conversion);
5620
5621 #ifdef MULE
5622   {
5623     unsigned int i;
5624
5625     for (i = 0; i < 65536; i++)
5626       ucs_to_mule_table[i] = Qnil;
5627   }
5628   staticpro (&mule_to_ucs_table);
5629   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5630 #endif /* MULE */
5631 }