git.chise.org Git - chise/xemacs-chise.git.1/blob - src/mule-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #if 0                           /* while file-coding not split up */
  27
  28 #include <config.h>
  29 #include "lisp.h"
  30
  31 #include "buffer.h"
  32 #include "elhash.h"
  33 #include "insdel.h"
  34 #include "lstream.h"
  35 #include "mule-ccl.h"
  36 #include "mule-coding.h"
  37
  38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  39
  40 Lisp_Object Vkeyboard_coding_system;
  41 Lisp_Object Vterminal_coding_system;
  42 Lisp_Object Vcoding_system_for_read;
  43 Lisp_Object Vcoding_system_for_write;
  44 Lisp_Object Vfile_name_coding_system;
  45
  46 /* Table of symbols identifying each coding category. */
  47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  48
  49 /* Coding system currently associated with each coding category. */
  50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  51
  52 /* Table of all coding categories in decreasing order of priority.
  53    This describes a permutation of the possible coding categories. */
  54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  55
  56 Lisp_Object Qcoding_system_p;
  57
  58 Lisp_Object Qbig5, Qshift_jis, Qno_conversion, Qccl, Qiso2022;
  59 /* Qinternal in general.c */
  60
  61 Lisp_Object Qmnemonic, Qeol_type;
  62 Lisp_Object Qcr, Qcrlf, Qlf;
  63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  64 Lisp_Object Qpost_read_conversion;
  65 Lisp_Object Qpre_write_conversion;
  66
  67 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  68 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  69 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  70 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  71 Lisp_Object Qno_iso6429, Qescape_quoted;
  72 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  73
  74 Lisp_Object Qencode, Qdecode;
  75
  76 Lisp_Object Qctext;
  77
  78 Lisp_Object Vcoding_system_hashtable;
  79
  80 int enable_multibyte_characters;
  81
  82 /* Additional information used by the ISO2022 decoder and detector. */
  83 struct iso2022_decoder
  84 {
  85   /* CHARSET holds the character sets currently assigned to the G0
  86      through G3 variables.  It is initialized from the array
  87      INITIAL_CHARSET in CODESYS. */
  88   Lisp_Object charset[4];
  89
  90   /* Which registers are currently invoked into the left (GL) and
  91      right (GR) halves of the 8-bit encoding space? */
  92   int register_left, register_right;
  93
  94   /* ISO_ESC holds a value indicating part of an escape sequence
  95      that has already been seen. */
  96   enum iso_esc_flag esc;
  97
  98   /* This records the bytes we've seen so far in an escape sequence,
  99      in case the sequence is invalid (we spit out the bytes unchanged). */
 100   unsigned char esc_bytes[8];
 101
 102   /* Index for next byte to store in ISO escape sequence. */
 103   int esc_bytes_index;
 104
 105   /* Stuff seen so far when composing a string. */
 106   unsigned_char_dynarr *composite_chars;
 107
 108   /* If we saw an invalid designation sequence for a particular
 109      register, we flag it here and switch to ASCII.  The next time we
 110      see a valid designation for this register, we turn off the flag
 111      and do the designation normally, but pretend the sequence was
 112      invalid.  The effect of all this is that (most of the time) the
 113      escape sequences for both the switch to the unknown charset, and
 114      the switch back to the known charset, get inserted literally into
 115      the buffer and saved out as such.  The hope is that we can
 116      preserve the escape sequences so that the resulting written out
 117      file makes sense.  If we don't do any of this, the designation
 118      to the invalid charset will be preserved but that switch back
 119      to the known charset will probably get eaten because it was
 120      the same charset that was already present in the register. */
 121   unsigned char invalid_designated[4];
 122
 123   /* We try to do similar things as above for direction-switching
 124      sequences.  If we encountered a direction switch while an
 125      invalid designation was present, or an invalid designation
 126      just after a direction switch (i.e. no valid designation
 127      encountered yet), we insert the direction-switch escape
 128      sequence literally into the output stream, and later on
 129      insert the corresponding direction-restoring escape sequence
 130      literally also. */
 131   unsigned int switched_dir_and_no_valid_charset_yet :1;
 132   unsigned int invalid_switch_dir :1;
 133
 134   /* Tells the decoder to output the escape sequence literally
 135      even though it was valid.  Used in the games we play to
 136      avoid lossage when we encounter invalid designations. */
 137   unsigned int output_literally :1;
 138   /* We encountered a direction switch followed by an invalid
 139      designation.  We didn't output the direction switch
 140      literally because we didn't know about the invalid designation;
 141      but we have to do so now. */
 142   unsigned int output_direction_sequence :1;
 143 };
 144
 145 EXFUN (Fcopy_coding_system, 2);
 146 struct detection_state;
 147 static int detect_coding_sjis (struct detection_state *st,
 148                                CONST unsigned char *src,
 149                                unsigned int n);
 150 static void decode_coding_sjis (Lstream *decoding,
 151                                 CONST unsigned char *src,
 152                                 unsigned_char_dynarr *dst,
 153                                 unsigned int n);
 154 static void encode_coding_sjis (Lstream *encoding,
 155                                 CONST unsigned char *src,
 156                                 unsigned_char_dynarr *dst,
 157                                 unsigned int n);
 158 static int detect_coding_big5 (struct detection_state *st,
 159                                CONST unsigned char *src,
 160                                unsigned int n);
 161 static void decode_coding_big5 (Lstream *decoding,
 162                                 CONST unsigned char *src,
 163                                 unsigned_char_dynarr *dst, unsigned int n);
 164 static void encode_coding_big5 (Lstream *encoding,
 165                                 CONST unsigned char *src,
 166                                 unsigned_char_dynarr *dst, unsigned int n);
 167 static int postprocess_iso2022_mask (int mask);
 168 static void reset_iso2022 (Lisp_Object coding_system,
 169                            struct iso2022_decoder *iso);
 170 static int detect_coding_iso2022 (struct detection_state *st,
 171                                   CONST unsigned char *src,
 172                                   unsigned int n);
 173 static void decode_coding_iso2022 (Lstream *decoding,
 174                                    CONST unsigned char *src,
 175                                    unsigned_char_dynarr *dst, unsigned int n);
 176 static void encode_coding_iso2022 (Lstream *encoding,
 177                                    CONST unsigned char *src,
 178                                    unsigned_char_dynarr *dst, unsigned int n);
 179 static void decode_coding_no_conversion (Lstream *decoding,
 180                                          CONST unsigned char *src,
 181                                          unsigned_char_dynarr *dst,
 182                                          unsigned int n);
 183 static void encode_coding_no_conversion (Lstream *encoding,
 184                                          CONST unsigned char *src,
 185                                          unsigned_char_dynarr *dst,
 186                                          unsigned int n);
 187 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 188                          unsigned_char_dynarr *dst, unsigned int n);
 189 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 190                          unsigned_char_dynarr *dst, unsigned int n);
 191
 192 typedef struct codesys_prop codesys_prop;
 193 struct codesys_prop
 194 {
 195   Lisp_Object sym;
 196   int prop_type;
 197 };
 198
 199 typedef struct
 200 {
 201   Dynarr_declare (codesys_prop);
 202 } codesys_prop_dynarr;
 203
 204 codesys_prop_dynarr *the_codesys_prop_dynarr;
 205
 206 enum codesys_prop_enum
 207 {
 208   CODESYS_PROP_ALL_OK,
 209   CODESYS_PROP_ISO2022,
 210   CODESYS_PROP_CCL
 211 };
 212
 213 \f
 214 /************************************************************************/
 215 /*                       Coding system functions                        */
 216 /************************************************************************/
 217
 218 static Lisp_Object
 219 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 220 {
 221   struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 222
 223   (markobj) (CODING_SYSTEM_NAME (codesys));
 224   (markobj) (CODING_SYSTEM_DOC_STRING (codesys));
 225   (markobj) (CODING_SYSTEM_MNEMONIC (codesys));
 226   (markobj) (CODING_SYSTEM_EOL_LF (codesys));
 227   (markobj) (CODING_SYSTEM_EOL_CRLF (codesys));
 228   (markobj) (CODING_SYSTEM_EOL_CR (codesys));
 229
 230   switch (CODING_SYSTEM_TYPE (codesys))
 231     {
 232       int i;
 233     case CODESYS_ISO2022:
 234       for (i = 0; i < 4; i++)
 235         (markobj) (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 236       if (codesys->iso2022.input_conv)
 237         {
 238           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 239             {
 240               struct charset_conversion_spec *ccs =
 241                 Dynarr_atp (codesys->iso2022.input_conv, i);
 242               (markobj) (ccs->from_charset);
 243               (markobj) (ccs->to_charset);
 244             }
 245         }
 246       if (codesys->iso2022.output_conv)
 247         {
 248           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 249             {
 250               struct charset_conversion_spec *ccs =
 251                 Dynarr_atp (codesys->iso2022.output_conv, i);
 252               (markobj) (ccs->from_charset);
 253               (markobj) (ccs->to_charset);
 254             }
 255         }
 256       break;
 257
 258     case CODESYS_CCL:
 259       (markobj) (CODING_SYSTEM_CCL_DECODE (codesys));
 260       (markobj) (CODING_SYSTEM_CCL_ENCODE (codesys));
 261       break;
 262     default:
 263       break;
 264     }
 265
 266   (markobj) (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 267   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 268 }
 269
 270 static void
 271 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 272                      int escapeflag)
 273 {
 274   struct Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 275   if (print_readably)
 276     error ("printing unreadable object #<coding_system 0x%x>",
 277            c->header.uid);
 278
 279   write_c_string ("#<coding_system ", printcharfun);
 280   print_internal (c->name, printcharfun, 1);
 281   write_c_string (">", printcharfun);
 282 }
 283
 284 static void
 285 finalize_coding_system (void *header, int for_disksave)
 286 {
 287   struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header;
 288   /* Since coding systems never go away, this function is not
 289      necessary.  But it would be necessary if we changed things
 290      so that coding systems could go away. */
 291   if (!for_disksave) /* see comment in lstream.c */
 292     {
 293       switch (CODING_SYSTEM_TYPE (c))
 294         {
 295         case CODESYS_ISO2022:
 296           if (c->iso2022.input_conv)
 297             {
 298               Dynarr_free (c->iso2022.input_conv);
 299               c->iso2022.input_conv = 0;
 300             }
 301           if (c->iso2022.output_conv)
 302             {
 303               Dynarr_free (c->iso2022.output_conv);
 304               c->iso2022.output_conv = 0;
 305             }
 306           break;
 307
 308         default:
 309           break;
 310         }
 311     }
 312 }
 313
 314 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 315                                mark_coding_system, print_coding_system,
 316                                finalize_coding_system,
 317                                0, 0, struct Lisp_Coding_System);
 318
 319 static enum eol_type
 320 symbol_to_eol_type (Lisp_Object symbol)
 321 {
 322   CHECK_SYMBOL (symbol);
 323   if (NILP (symbol))      return EOL_AUTODETECT;
 324   if (EQ (symbol, Qlf))   return EOL_LF;
 325   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 326   if (EQ (symbol, Qcr))   return EOL_CR;
 327
 328   signal_simple_error ("Unrecognized eol type", symbol);
 329   return EOL_AUTODETECT; /* not reached */
 330 }
 331
 332 static Lisp_Object
 333 eol_type_to_symbol (enum eol_type type)
 334 {
 335   switch (type)
 336     {
 337     case EOL_LF:         return Qlf;
 338     case EOL_CRLF:       return Qcrlf;
 339     case EOL_CR:         return Qcr;
 340     case EOL_AUTODETECT: return Qnil;
 341     default:             abort (); return Qnil; /* not reached */
 342     }
 343 }
 344
 345 static void
 346 setup_eol_coding_systems (struct Lisp_Coding_System *codesys)
 347 {
 348   Lisp_Object codesys_obj;
 349   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 350   char *codesys_name = (char *) alloca (len + 7);
 351   Lisp_Object codesys_name_sym, sub_codesys_obj;
 352
 353   /* kludge */
 354
 355   XSETCODING_SYSTEM (codesys_obj, codesys);
 356
 357   memcpy (codesys_name,
 358           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 359
 360 #define DEFINE_SUB_CODESYS(op_sys, Type) do {   \
 361     strcpy (codesys_name + len, "-" op_sys);    \
 362     codesys_name_sym = intern (codesys_name);   \
 363     sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
 364     XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
 365     CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
 366 } while (0)
 367
 368   DEFINE_SUB_CODESYS("unix", EOL_LF);
 369   DEFINE_SUB_CODESYS("dos",  EOL_CRLF);
 370   DEFINE_SUB_CODESYS("mac",  EOL_CR);
 371 }
 372
 373 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 374 Return t if OBJECT is a coding system.
 375 A coding system is an object that defines how text containing multiple
 376 character sets is encoded into a stream of (typically 8-bit) bytes.
 377 The coding system is used to decode the stream into a series of
 378 characters (which may be from multiple charsets) when the text is read
 379 from a file or process, and is used to encode the text back into the
 380 same format when it is written out to a file or process.
 381
 382 For example, many ISO2022-compliant coding systems (such as Compound
 383 Text, which is used for inter-client data under the X Window System)
 384 use escape sequences to switch between different charsets -- Japanese
 385 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 386 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 387 `make-coding-system' for more information.
 388
 389 Coding systems are normally identified using a symbol, and the
 390 symbol is accepted in place of the actual coding system object whenever
 391 a coding system is called for. (This is similar to how faces work.)
 392 */
 393        (object))
 394 {
 395   return CODING_SYSTEMP (object) ? Qt : Qnil;
 396 }
 397
 398 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 399 Retrieve the coding system of the given name.
 400
 401 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 402 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 403 If there is no such coding system, nil is returned.  Otherwise the
 404 associated coding system object is returned.
 405 */
 406        (coding_system_or_name))
 407 {
 408   if (NILP (coding_system_or_name))
 409     coding_system_or_name = Qbinary;
 410   if (CODING_SYSTEMP (coding_system_or_name))
 411     return coding_system_or_name;
 412   CHECK_SYMBOL (coding_system_or_name);
 413
 414   return Fgethash (coding_system_or_name, Vcoding_system_hashtable, Qnil);
 415 }
 416
 417 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 418 Retrieve the coding system of the given name.
 419 Same as `find-coding-system' except that if there is no such
 420 coding system, an error is signaled instead of returning nil.
 421 */
 422        (name))
 423 {
 424   Lisp_Object coding_system = Ffind_coding_system (name);
 425
 426   if (NILP (coding_system))
 427     signal_simple_error ("No such coding system", name);
 428   return coding_system;
 429 }
 430
 431 /* We store the coding systems in hash tables with the names as the key and the
 432    actual coding system object as the value.  Occasionally we need to use them
 433    in a list format.  These routines provide us with that. */
 434 struct coding_system_list_closure
 435 {
 436   Lisp_Object *coding_system_list;
 437 };
 438
 439 static int
 440 add_coding_system_to_list_mapper (CONST void *hash_key, void *hash_contents,
 441                                   void *coding_system_list_closure)
 442 {
 443   /* This function can GC */
 444   Lisp_Object key, contents;
 445   Lisp_Object *coding_system_list;
 446   struct coding_system_list_closure *cscl =
 447     (struct coding_system_list_closure *) coding_system_list_closure;
 448   CVOID_TO_LISP (key, hash_key);
 449   VOID_TO_LISP (contents, hash_contents);
 450   coding_system_list = cscl->coding_system_list;
 451
 452   *coding_system_list = Fcons (XCODING_SYSTEM (contents)->name,
 453                                *coding_system_list);
 454   return 0;
 455 }
 456
 457 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 458 Return a list of the names of all defined coding systems.
 459 */
 460        ())
 461 {
 462   Lisp_Object coding_system_list = Qnil;
 463   struct gcpro gcpro1;
 464   struct coding_system_list_closure coding_system_list_closure;
 465
 466   GCPRO1 (coding_system_list);
 467   coding_system_list_closure.coding_system_list = &coding_system_list;
 468   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hashtable,
 469                  &coding_system_list_closure);
 470   UNGCPRO;
 471
 472   return coding_system_list;
 473 }
 474
 475 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 476 Return the name of the given coding system.
 477 */
 478        (coding_system))
 479 {
 480   coding_system = Fget_coding_system (coding_system);
 481   return XCODING_SYSTEM_NAME (coding_system);
 482 }
 483
 484 static struct Lisp_Coding_System *
 485 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 486 {
 487   struct Lisp_Coding_System *codesys =
 488     alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system);
 489
 490   zero_lcrecord (codesys);
 491   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 492   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 493   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 494   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 495   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 496   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 497   CODING_SYSTEM_TYPE     (codesys) = type;
 498
 499   if (type == CODESYS_ISO2022)
 500     {
 501       int i;
 502       for (i = 0; i < 4; i++)
 503         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 504     }
 505   else if (type == CODESYS_CCL)
 506     {
 507       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 508       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 509     }
 510
 511   CODING_SYSTEM_NAME (codesys) = name;
 512
 513   return codesys;
 514 }
 515
 516 /* Given a list of charset conversion specs as specified in a Lisp
 517    program, parse it into STORE_HERE. */
 518
 519 static void
 520 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 521                                 Lisp_Object spec_list)
 522 {
 523   Lisp_Object rest;
 524
 525   EXTERNAL_LIST_LOOP (rest, spec_list)
 526     {
 527       Lisp_Object car = XCAR (rest);
 528       Lisp_Object from, to;
 529       struct charset_conversion_spec spec;
 530
 531       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 532         signal_simple_error ("Invalid charset conversion spec", car);
 533       from = Fget_charset (XCAR (car));
 534       to = Fget_charset (XCAR (XCDR (car)));
 535       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 536         signal_simple_error_2
 537           ("Attempted conversion between different charset types",
 538            from, to);
 539       spec.from_charset = from;
 540       spec.to_charset = to;
 541
 542       Dynarr_add (store_here, spec);
 543     }
 544 }
 545
 546 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 547    specs, return the equivalent as the Lisp programmer would see it.
 548
 549    If LOAD_HERE is 0, return Qnil. */
 550
 551 static Lisp_Object
 552 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 553 {
 554   int i;
 555   Lisp_Object result = Qnil;
 556
 557   if (!load_here)
 558     return Qnil;
 559   for (i = 0; i < Dynarr_length (load_here); i++)
 560     {
 561       struct charset_conversion_spec *ccs =
 562         Dynarr_atp (load_here, i);
 563       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 564     }
 565
 566   return Fnreverse (result);
 567 }
 568
 569 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 570 Register symbol NAME as a coding system.
 571
 572 TYPE describes the conversion method used and should be one of
 573
 574 nil or 'undecided
 575      Automatic conversion.  XEmacs attempts to detect the coding system
 576      used in the file.
 577 'no-conversion
 578      No conversion.  Use this for binary files and such.  On output,
 579      graphic characters that are not in ASCII or Latin-1 will be
 580      replaced by a ?. (For a no-conversion-encoded buffer, these
 581      characters will only be present if you explicitly insert them.)
 582 'shift-jis
 583      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 584 'iso2022
 585      Any ISO2022-compliant encoding.  Among other things, this includes
 586      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 587      standard Unix encoding for Japanese and other languages), and
 588      Compound Text (the encoding used in X11).  You can specify more
 589      specific information about the conversion with the FLAGS argument.
 590 'big5
 591      Big5 (the encoding commonly used for Taiwanese).
 592 'ccl
 593      The conversion is performed using a user-written pseudo-code
 594      program.  CCL (Code Conversion Language) is the name of this
 595      pseudo-code.
 596 'internal
 597      Write out or read in the raw contents of the memory representing
 598      the buffer's text.  This is primarily useful for debugging
 599      purposes, and is only enabled when XEmacs has been compiled with
 600      DEBUG_XEMACS defined (via the --debug configure option).
 601      WARNING: Reading in a file using 'internal conversion can result
 602      in an internal inconsistency in the memory representing a
 603      buffer's text, which will produce unpredictable results and may
 604      cause XEmacs to crash.  Under normal circumstances you should
 605      never use 'internal conversion.
 606
 607 DOC-STRING is a string describing the coding system.
 608
 609 PROPS is a property list, describing the specific nature of the
 610 character set.  Recognized properties are:
 611
 612 'mnemonic
 613      String to be displayed in the modeline when this coding system is
 614      active.
 615
 616 'eol-type
 617      End-of-line conversion to be used.  It should be one of
 618
 619         nil
 620                 Automatically detect the end-of-line type (LF, CRLF,
 621                 or CR).  Also generate subsidiary coding systems named
 622                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 623                 identical to this coding system but have an EOL-TYPE
 624                 value of 'lf, 'crlf, and 'cr, respectively.
 625         'lf
 626                 The end of a line is marked externally using ASCII LF.
 627                 Since this is also the way that XEmacs represents an
 628                 end-of-line internally, specifying this option results
 629                 in no end-of-line conversion.  This is the standard
 630                 format for Unix text files.
 631         'crlf
 632                 The end of a line is marked externally using ASCII
 633                 CRLF.  This is the standard format for MS-DOS text
 634                 files.
 635         'cr
 636                 The end of a line is marked externally using ASCII CR.
 637                 This is the standard format for Macintosh text files.
 638         t
 639                 Automatically detect the end-of-line type but do not
 640                 generate subsidiary coding systems.  (This value is
 641                 converted to nil when stored internally, and
 642                 `coding-system-property' will return nil.)
 643
 644 'post-read-conversion
 645      Function called after a file has been read in, to perform the
 646      decoding.  Called with two arguments, BEG and END, denoting
 647      a region of the current buffer to be decoded.
 648
 649 'pre-write-conversion
 650      Function called before a file is written out, to perform the
 651      encoding.  Called with two arguments, BEG and END, denoting
 652      a region of the current buffer to be encoded.
 653
 654
 655 The following additional properties are recognized if TYPE is 'iso2022:
 656
 657 'charset-g0
 658 'charset-g1
 659 'charset-g2
 660 'charset-g3
 661      The character set initially designated to the G0 - G3 registers.
 662      The value should be one of
 663
 664           -- A charset object (designate that character set)
 665           -- nil (do not ever use this register)
 666           -- t (no character set is initially designated to
 667                 the register, but may be later on; this automatically
 668                 sets the corresponding `force-g*-on-output' property)
 669
 670 'force-g0-on-output
 671 'force-g1-on-output
 672 'force-g2-on-output
 673 'force-g2-on-output
 674      If non-nil, send an explicit designation sequence on output before
 675      using the specified register.
 676
 677 'short
 678      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 679      "ESC $ B" on output in place of the full designation sequences
 680      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 681
 682 'no-ascii-eol
 683      If non-nil, don't designate ASCII to G0 at each end of line on output.
 684      Setting this to non-nil also suppresses other state-resetting that
 685      normally happens at the end of a line.
 686
 687 'no-ascii-cntl
 688      If non-nil, don't designate ASCII to G0 before control chars on output.
 689
 690 'seven
 691      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 692      environment.
 693
 694 'lock-shift
 695      If non-nil, use locking-shift (SO/SI) instead of single-shift
 696      or designation by escape sequence.
 697
 698 'no-iso6429
 699      If non-nil, don't use ISO6429's direction specification.
 700
 701 'escape-quoted
 702      If non-nil, literal control characters that are the same as
 703      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 704      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 705      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 706      so that they can be properly distinguished from an escape sequence.
 707      (Note that doing this results in a non-portable encoding.) This
 708      encoding flag is used for byte-compiled files.  Note that ESC
 709      is a good choice for a quoting character because there are no
 710      escape sequences whose second byte is a character from the Control-0
 711      or Control-1 character sets; this is explicitly disallowed by the
 712      ISO2022 standard.
 713
 714 'input-charset-conversion
 715      A list of conversion specifications, specifying conversion of
 716      characters in one charset to another when decoding is performed.
 717      Each specification is a list of two elements: the source charset,
 718      and the destination charset.
 719
 720 'output-charset-conversion
 721      A list of conversion specifications, specifying conversion of
 722      characters in one charset to another when encoding is performed.
 723      The form of each specification is the same as for
 724      'input-charset-conversion.
 725
 726
 727 The following additional properties are recognized (and required)
 728 if TYPE is 'ccl:
 729
 730 'decode
 731      CCL program used for decoding (converting to internal format).
 732
 733 'encode
 734      CCL program used for encoding (converting to external format).
 735 */
 736        (name, type, doc_string, props))
 737 {
 738   struct Lisp_Coding_System *codesys;
 739   Lisp_Object rest, key, value;
 740   enum coding_system_type ty;
 741   int need_to_setup_eol_systems = 1;
 742
 743   /* Convert type to constant */
 744   if (NILP (type) || EQ (type, Qundecided))
 745                                       { ty = CODESYS_AUTODETECT; }
 746   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 747   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 748   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 749   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 750   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 751 #ifdef DEBUG_XEMACS
 752   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 753 #endif
 754   else
 755     signal_simple_error ("Invalid coding system type", type);
 756
 757   CHECK_SYMBOL (name);
 758
 759   codesys = allocate_coding_system (ty, name);
 760
 761   if (NILP (doc_string))
 762     doc_string = build_string ("");
 763   else
 764     CHECK_STRING (doc_string);
 765   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 766
 767   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 768     {
 769       if (EQ (key, Qmnemonic))
 770         {
 771           if (!NILP (value))
 772             CHECK_STRING (value);
 773           CODING_SYSTEM_MNEMONIC (codesys) = value;
 774         }
 775
 776       else if (EQ (key, Qeol_type))
 777         {
 778           need_to_setup_eol_systems = NILP (value);
 779           if (EQ (value, Qt))
 780             value = Qnil;
 781           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 782         }
 783
 784       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 785       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 786       else if (ty == CODESYS_ISO2022)
 787         {
 788 #define FROB_INITIAL_CHARSET(charset_num) \
 789   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 790     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 791
 792           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 793           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 794           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 795           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 796
 797 #define FROB_FORCE_CHARSET(charset_num) \
 798   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 799
 800           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 801           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 802           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 803           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 804
 805 #define FROB_BOOLEAN_PROPERTY(prop) \
 806   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 807
 808           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 809           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 810           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 811           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 812           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 813           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 814           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 815
 816           else if (EQ (key, Qinput_charset_conversion))
 817             {
 818               codesys->iso2022.input_conv =
 819                 Dynarr_new (charset_conversion_spec);
 820               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 821                                               value);
 822             }
 823           else if (EQ (key, Qoutput_charset_conversion))
 824             {
 825               codesys->iso2022.output_conv =
 826                 Dynarr_new (charset_conversion_spec);
 827               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 828                                               value);
 829             }
 830           else
 831             signal_simple_error ("Unrecognized property", key);
 832         }
 833       else if (EQ (type, Qccl))
 834         {
 835           if (EQ (key, Qdecode))
 836             {
 837               CHECK_VECTOR (value);
 838               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 839             }
 840           else if (EQ (key, Qencode))
 841             {
 842               CHECK_VECTOR (value);
 843               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 844             }
 845           else
 846             signal_simple_error ("Unrecognized property", key);
 847         }
 848       else
 849         signal_simple_error ("Unrecognized property", key);
 850     }
 851
 852   if (need_to_setup_eol_systems)
 853     setup_eol_coding_systems (codesys);
 854
 855   {
 856     Lisp_Object codesys_obj;
 857     XSETCODING_SYSTEM (codesys_obj, codesys);
 858     Fputhash (name, codesys_obj, Vcoding_system_hashtable);
 859     return codesys_obj;
 860   }
 861 }
 862
 863 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 864 Copy OLD-CODING-SYSTEM to NEW-NAME.
 865 If NEW-NAME does not name an existing coding system, a new one will
 866 be created.
 867 */
 868        (old_coding_system, new_name))
 869 {
 870   Lisp_Object new_coding_system;
 871   old_coding_system = Fget_coding_system (old_coding_system);
 872   new_coding_system = Ffind_coding_system (new_name);
 873   if (NILP (new_coding_system))
 874     {
 875       XSETCODING_SYSTEM (new_coding_system,
 876                          allocate_coding_system
 877                          (XCODING_SYSTEM_TYPE (old_coding_system),
 878                           new_name));
 879       Fputhash (new_name, new_coding_system, Vcoding_system_hashtable);
 880     }
 881
 882   {
 883     struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 884     struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 885     memcpy (((char *) to  ) + sizeof (to->header),
 886             ((char *) from) + sizeof (from->header),
 887             sizeof (*from) - sizeof (from->header));
 888     to->name = new_name;
 889   }
 890   return new_coding_system;
 891 }
 892
 893 static Lisp_Object
 894 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
 895 {
 896   struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
 897   Lisp_Object new_coding_system;
 898
 899   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
 900     return coding_system;
 901
 902   switch (type)
 903     {
 904     case EOL_AUTODETECT: return coding_system;
 905     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
 906     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
 907     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
 908     default:       abort ();
 909     }
 910
 911   return NILP (new_coding_system) ? coding_system : new_coding_system;
 912 }
 913
 914 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
 915 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
 916 */
 917        (coding_system, eol_type))
 918 {
 919   coding_system = Fget_coding_system (coding_system);
 920
 921   return subsidiary_coding_system (coding_system,
 922                                    symbol_to_eol_type (eol_type));
 923 }
 924
 925 \f
 926 /************************************************************************/
 927 /*                         Coding system accessors                      */
 928 /************************************************************************/
 929
 930 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
 931 Return the doc string for CODING-SYSTEM.
 932 */
 933        (coding_system))
 934 {
 935   coding_system = Fget_coding_system (coding_system);
 936   return XCODING_SYSTEM_DOC_STRING (coding_system);
 937 }
 938
 939 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
 940 Return the type of CODING-SYSTEM.
 941 */
 942        (coding_system))
 943 {
 944   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
 945     {
 946     case CODESYS_AUTODETECT:    return Qundecided;
 947     case CODESYS_SHIFT_JIS:     return Qshift_jis;
 948     case CODESYS_ISO2022:       return Qiso2022;
 949     case CODESYS_BIG5:          return Qbig5;
 950     case CODESYS_CCL:           return Qccl;
 951     case CODESYS_NO_CONVERSION: return Qno_conversion;
 952 #ifdef DEBUG_XEMACS
 953     case CODESYS_INTERNAL:      return Qinternal;
 954 #endif
 955     default:
 956       abort ();
 957     }
 958
 959   return Qnil; /* not reached */
 960 }
 961
 962 static
 963 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
 964 {
 965   Lisp_Object cs
 966     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
 967
 968   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
 969 }
 970
 971 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
 972 Return initial charset of CODING-SYSTEM designated to GNUM.
 973 GNUM allows 0 .. 3.
 974 */
 975        (coding_system, gnum))
 976 {
 977   coding_system = Fget_coding_system (coding_system);
 978   CHECK_INT (gnum);
 979
 980   return coding_system_charset (coding_system, XINT (gnum));
 981 }
 982
 983 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
 984 Return the PROP property of CODING-SYSTEM.
 985 */
 986        (coding_system, prop))
 987 {
 988   int i, ok = 0;
 989   enum coding_system_type type;
 990
 991   coding_system = Fget_coding_system (coding_system);
 992   CHECK_SYMBOL (prop);
 993   type = XCODING_SYSTEM_TYPE (coding_system);
 994
 995   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
 996     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
 997       {
 998         ok = 1;
 999         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1000           {
1001           case CODESYS_PROP_ALL_OK:
1002             break;
1003
1004           case CODESYS_PROP_ISO2022:
1005             if (type != CODESYS_ISO2022)
1006               signal_simple_error
1007                 ("Property only valid in ISO2022 coding systems",
1008                  prop);
1009             break;
1010
1011           case CODESYS_PROP_CCL:
1012             if (type != CODESYS_CCL)
1013               signal_simple_error
1014                 ("Property only valid in CCL coding systems",
1015                  prop);
1016             break;
1017
1018           default:
1019             abort ();
1020           }
1021       }
1022
1023   if (!ok)
1024     signal_simple_error ("Unrecognized property", prop);
1025
1026   if (EQ (prop, Qname))
1027     return XCODING_SYSTEM_NAME (coding_system);
1028   else if (EQ (prop, Qtype))
1029     return Fcoding_system_type (coding_system);
1030   else if (EQ (prop, Qdoc_string))
1031     return XCODING_SYSTEM_DOC_STRING (coding_system);
1032   else if (EQ (prop, Qmnemonic))
1033     return XCODING_SYSTEM_MNEMONIC (coding_system);
1034   else if (EQ (prop, Qeol_type))
1035     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1036   else if (EQ (prop, Qeol_lf))
1037     return XCODING_SYSTEM_EOL_LF (coding_system);
1038   else if (EQ (prop, Qeol_crlf))
1039     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1040   else if (EQ (prop, Qeol_cr))
1041     return XCODING_SYSTEM_EOL_CR (coding_system);
1042   else if (EQ (prop, Qpost_read_conversion))
1043     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1044   else if (EQ (prop, Qpre_write_conversion))
1045     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1046   else if (type == CODESYS_ISO2022)
1047     {
1048       if (EQ (prop, Qcharset_g0))
1049         return coding_system_charset (coding_system, 0);
1050       else if (EQ (prop, Qcharset_g1))
1051         return coding_system_charset (coding_system, 1);
1052       else if (EQ (prop, Qcharset_g2))
1053         return coding_system_charset (coding_system, 2);
1054       else if (EQ (prop, Qcharset_g3))
1055         return coding_system_charset (coding_system, 3);
1056
1057 #define FORCE_CHARSET(charset_num) \
1058   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1059    (coding_system, charset_num) ? Qt : Qnil)
1060
1061       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1062       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1063       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1064       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1065
1066 #define LISP_BOOLEAN(prop) \
1067   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1068
1069       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1070       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1071       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1072       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1073       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1074       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1075       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1076
1077       else if (EQ (prop, Qinput_charset_conversion))
1078         return
1079           unparse_charset_conversion_specs
1080             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1081       else if (EQ (prop, Qoutput_charset_conversion))
1082         return
1083           unparse_charset_conversion_specs
1084             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1085       else
1086         abort ();
1087     }
1088   else if (type == CODESYS_CCL)
1089     {
1090       if (EQ (prop, Qdecode))
1091         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1092       else if (EQ (prop, Qencode))
1093         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1094       else
1095         abort ();
1096     }
1097   else
1098     abort ();
1099
1100   return Qnil; /* not reached */
1101 }
1102
1103 \f
1104 /************************************************************************/
1105 /*                       Coding category functions                      */
1106 /************************************************************************/
1107
1108 static int
1109 decode_coding_category (Lisp_Object symbol)
1110 {
1111   int i;
1112
1113   CHECK_SYMBOL (symbol);
1114   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1115     if (EQ (coding_category_symbol[i], symbol))
1116       return i;
1117
1118   signal_simple_error ("Unrecognized coding category", symbol);
1119   return 0; /* not reached */
1120 }
1121
1122 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1123 Return a list of all recognized coding categories.
1124 */
1125        ())
1126 {
1127   int i;
1128   Lisp_Object list = Qnil;
1129
1130   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1131     list = Fcons (coding_category_symbol[i], list);
1132   return list;
1133 }
1134
1135 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1136 Change the priority order of the coding categories.
1137 LIST should be list of coding categories, in descending order of
1138 priority.  Unspecified coding categories will be lower in priority
1139 than all specified ones, in the same relative order they were in
1140 previously.
1141 */
1142        (list))
1143 {
1144   int category_to_priority[CODING_CATEGORY_LAST + 1];
1145   int i, j;
1146   Lisp_Object rest;
1147
1148   /* First generate a list that maps coding categories to priorities. */
1149
1150   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1151     category_to_priority[i] = -1;
1152
1153   /* Highest priority comes from the specified list. */
1154   i = 0;
1155   EXTERNAL_LIST_LOOP (rest, list)
1156     {
1157       int cat = decode_coding_category (XCAR (rest));
1158
1159       if (category_to_priority[cat] >= 0)
1160         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1161       category_to_priority[cat] = i++;
1162     }
1163
1164   /* Now go through the existing categories by priority to retrieve
1165      the categories not yet specified and preserve their priority
1166      order. */
1167   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1168     {
1169       int cat = coding_category_by_priority[j];
1170       if (category_to_priority[cat] < 0)
1171         category_to_priority[cat] = i++;
1172     }
1173
1174   /* Now we need to construct the inverse of the mapping we just
1175      constructed. */
1176
1177   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1178     coding_category_by_priority[category_to_priority[i]] = i;
1179
1180   /* Phew!  That was confusing. */
1181   return Qnil;
1182 }
1183
1184 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1185 Return a list of coding categories in descending order of priority.
1186 */
1187        ())
1188 {
1189   int i;
1190   Lisp_Object list = Qnil;
1191
1192   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1193     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1194                   list);
1195   return list;
1196 }
1197
1198 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1199 Change the coding system associated with a coding category.
1200 */
1201        (coding_category, coding_system))
1202 {
1203   int cat = decode_coding_category (coding_category);
1204
1205   coding_system = Fget_coding_system (coding_system);
1206   coding_category_system[cat] = coding_system;
1207   return Qnil;
1208 }
1209
1210 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1211 Return the coding system associated with a coding category.
1212 */
1213        (coding_category))
1214 {
1215   int cat = decode_coding_category (coding_category);
1216   Lisp_Object sys = coding_category_system[cat];
1217
1218   if (!NILP (sys))
1219     return XCODING_SYSTEM_NAME (sys);
1220   return Qnil;
1221 }
1222
1223 \f
1224 /************************************************************************/
1225 /*                     Detecting the encoding of data                   */
1226 /************************************************************************/
1227
1228 struct detection_state
1229 {
1230   enum eol_type eol_type;
1231   int seen_non_ascii;
1232   int mask;
1233
1234   struct
1235   {
1236     int mask;
1237     int in_second_byte;
1238   }
1239   big5;
1240
1241   struct
1242   {
1243     int mask;
1244     int in_second_byte;
1245   }
1246   shift_jis;
1247
1248   struct
1249   {
1250     int mask;
1251     int initted;
1252     struct iso2022_decoder iso;
1253     unsigned int flags;
1254     int high_byte_count;
1255     unsigned int saw_single_shift:1;
1256   }
1257   iso2022;
1258
1259   struct
1260   {
1261     int seen_anything;
1262     int just_saw_cr;
1263   }
1264   eol;
1265 };
1266
1267 static int
1268 acceptable_control_char_p (int c)
1269 {
1270   switch (c)
1271     {
1272       /* Allow and ignore control characters that you might
1273          reasonably see in a text file */
1274     case '\r':
1275     case '\n':
1276     case '\t':
1277     case  7: /* bell */
1278     case  8: /* backspace */
1279     case 11: /* vertical tab */
1280     case 12: /* form feed */
1281     case 26: /* MS-DOS C-z junk */
1282     case 31: /* '^_' -- for info */
1283       return 1;
1284     default:
1285       return 0;
1286     }
1287 }
1288
1289 static int
1290 mask_has_at_most_one_bit_p (int mask)
1291 {
1292   /* Perhaps the only thing useful you learn from intensive Microsoft
1293      technical interviews */
1294   return (mask & (mask - 1)) == 0;
1295 }
1296
1297 static enum eol_type
1298 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1299                  unsigned int n)
1300 {
1301   int c;
1302
1303   while (n--)
1304     {
1305       c = *src++;
1306       if (c == '\r')
1307         st->eol.just_saw_cr = 1;
1308       else
1309         {
1310           if (c == '\n')
1311             {
1312               if (st->eol.just_saw_cr)
1313                 return EOL_CRLF;
1314               else if (st->eol.seen_anything)
1315                 return EOL_LF;
1316             }
1317           else if (st->eol.just_saw_cr)
1318             return EOL_CR;
1319           st->eol.just_saw_cr = 0;
1320         }
1321       st->eol.seen_anything = 1;
1322     }
1323
1324   return EOL_AUTODETECT;
1325 }
1326
1327 /* Attempt to determine the encoding and EOL type of the given text.
1328    Before calling this function for the first type, you must initialize
1329    st->eol_type as appropriate and initialize st->mask to ~0.
1330
1331    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1332    not yet known.
1333
1334    st->mask holds the determined coding category mask, or ~0 if only
1335    ASCII has been seen so far.
1336
1337    Returns:
1338
1339    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1340         is present in st->mask
1341    1 == definitive answers are here for both st->eol_type and st->mask
1342 */
1343
1344 static int
1345 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1346                     unsigned int n, int just_do_eol)
1347 {
1348   int c;
1349
1350   if (st->eol_type == EOL_AUTODETECT)
1351     st->eol_type = detect_eol_type (st, src, n);
1352
1353   if (just_do_eol)
1354     return st->eol_type != EOL_AUTODETECT;
1355
1356   if (!st->seen_non_ascii)
1357     {
1358       for (; n; n--, src++)
1359         {
1360           c = *src;
1361           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1362             {
1363               st->seen_non_ascii = 1;
1364               st->shift_jis.mask = ~0;
1365               st->big5.mask = ~0;
1366               st->iso2022.mask = ~0;
1367               break;
1368             }
1369         }
1370     }
1371
1372   if (!n)
1373     return 0;
1374
1375   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1376     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1377   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1378     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1379   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1380     st->big5.mask = detect_coding_big5 (st, src, n);
1381
1382   st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask;
1383
1384   {
1385     int retval = mask_has_at_most_one_bit_p (st->mask);
1386     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1387     return retval && st->eol_type != EOL_AUTODETECT;
1388   }
1389 }
1390
1391 static Lisp_Object
1392 coding_system_from_mask (int mask)
1393 {
1394   if (mask == ~0)
1395     {
1396       /* If the file was entirely or basically ASCII, use the
1397          default value of `buffer-file-coding-system'. */
1398       Lisp_Object retval =
1399         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1400       if (!NILP (retval))
1401         {
1402           retval = Ffind_coding_system (retval);
1403           if (NILP (retval))
1404             {
1405               warn_when_safe
1406                 (Qbad_variable, Qwarning,
1407                  "Invalid `default-buffer-file-coding-system', set to nil");
1408               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1409             }
1410         }
1411       if (NILP (retval))
1412         retval = Fget_coding_system (Qno_conversion);
1413       return retval;
1414     }
1415   else
1416     {
1417       int i;
1418       int cat = -1;
1419
1420       mask = postprocess_iso2022_mask (mask);
1421
1422       /* Look through the coding categories by priority and find
1423          the first one that is allowed. */
1424       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1425         {
1426           cat = coding_category_by_priority[i];
1427           if ((mask & (1 << cat)) &&
1428               !NILP (coding_category_system[cat]))
1429             break;
1430         }
1431       if (cat >= 0)
1432         return coding_category_system[cat];
1433       else
1434         return Fget_coding_system (Qno_conversion);
1435     }
1436 }
1437
1438 /* Given a seekable read stream and potential coding system and EOL type
1439    as specified, do any autodetection that is called for.  If the
1440    coding system and/or EOL type are not autodetect, they will be left
1441    alone; but this function will never return an autodetect coding system
1442    or EOL type.
1443
1444    This function does not automatically fetch subsidiary coding systems;
1445    that should be unnecessary with the explicit eol-type argument. */
1446
1447 void
1448 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1449                               enum eol_type *eol_type_in_out)
1450 {
1451   struct detection_state decst;
1452
1453   if (*eol_type_in_out == EOL_AUTODETECT)
1454     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1455
1456   xzero (decst);
1457   decst.eol_type = *eol_type_in_out;
1458   decst.mask = ~0;
1459
1460   /* If autodetection is called for, do it now. */
1461   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1462       *eol_type_in_out == EOL_AUTODETECT)
1463     {
1464
1465       while (1)
1466         {
1467           unsigned char random_buffer[4096];
1468           int nread;
1469
1470           nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1471           if (!nread)
1472             break;
1473           if (detect_coding_type (&decst, random_buffer, nread,
1474                                   XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1475                                   CODESYS_AUTODETECT))
1476             break;
1477         }
1478
1479       *eol_type_in_out = decst.eol_type;
1480       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1481         *codesys_in_out = coding_system_from_mask (decst.mask);
1482     }
1483
1484   /* If we absolutely can't determine the EOL type, just assume LF. */
1485   if (*eol_type_in_out == EOL_AUTODETECT)
1486     *eol_type_in_out = EOL_LF;
1487
1488   Lstream_rewind (stream);
1489 }
1490
1491 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1492 Detect coding system of the text in the region between START and END.
1493 Returned a list of possible coding systems ordered by priority.
1494 If only ASCII characters are found, it returns 'undecided or one of
1495 its subsidiary coding systems according to a detected end-of-line
1496 type.  Optional arg BUFFER defaults to the current buffer.
1497 */
1498        (start, end, buffer))
1499 {
1500   Lisp_Object val = Qnil;
1501   struct buffer *buf = decode_buffer (buffer, 0);
1502   Bufpos b, e;
1503   Lisp_Object instream, lb_instream;
1504   Lstream *istr, *lb_istr;
1505   struct detection_state decst;
1506   struct gcpro gcpro1, gcpro2;
1507
1508   get_buffer_range_char (buf, start, end, &b, &e, 0);
1509   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1510   lb_istr = XLSTREAM (lb_instream);
1511   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1512   istr = XLSTREAM (instream);
1513   GCPRO2 (instream, lb_instream);
1514   xzero (decst);
1515   decst.eol_type = EOL_AUTODETECT;
1516   decst.mask = ~0;
1517   while (1)
1518     {
1519       unsigned char random_buffer[4096];
1520       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1521
1522       if (!nread)
1523         break;
1524       if (detect_coding_type (&decst, random_buffer, nread, 0))
1525         break;
1526     }
1527
1528   if (decst.mask == ~0)
1529     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1530                                     decst.eol_type);
1531   else
1532     {
1533       int i;
1534
1535       val = Qnil;
1536
1537       decst.mask = postprocess_iso2022_mask (decst.mask);
1538
1539       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1540         {
1541           int sys = coding_category_by_priority[i];
1542           if (decst.mask & (1 << sys))
1543             {
1544               Lisp_Object codesys = coding_category_system[sys];
1545               if (!NILP (codesys))
1546                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1547               val = Fcons (codesys, val);
1548             }
1549         }
1550     }
1551   Lstream_close (istr);
1552   UNGCPRO;
1553   Lstream_delete (istr);
1554   Lstream_delete (lb_istr);
1555   return val;
1556 }
1557
1558 \f
1559 /************************************************************************/
1560 /*           Converting to internal Mule format ("decoding")            */
1561 /************************************************************************/
1562
1563 /* A decoding stream is a stream used for decoding text (i.e.
1564    converting from some external format to internal format).
1565    The decoding-stream object keeps track of the actual coding
1566    stream, the stream that is at the other end, and data that
1567    needs to be persistent across the lifetime of the stream. */
1568
1569 /* Handle the EOL stuff related to just-read-in character C.
1570    EOL_TYPE is the EOL type of the coding stream.
1571    FLAGS is the current value of FLAGS in the coding stream, and may
1572    be modified by this macro.  (The macro only looks at the
1573    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1574    bytes are to be written.  You need to also define a local goto
1575    label "label_continue_loop" that is at the end of the main
1576    character-reading loop.
1577
1578    If C is a CR character, then this macro handles it entirely and
1579    jumps to label_continue_loop.  Otherwise, this macro does not add
1580    anything to DST, and continues normally.  You should continue
1581    processing C normally after this macro. */
1582
1583 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1584 do {                                                            \
1585   if (c == '\r')                                                \
1586     {                                                           \
1587       if (eol_type == EOL_CR)                                   \
1588         Dynarr_add (dst, '\n');                                 \
1589       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1590         Dynarr_add (dst, c);                                    \
1591       else                                                      \
1592         flags |= CODING_STATE_CR;                               \
1593       goto label_continue_loop;                                 \
1594     }                                                           \
1595   else if (flags & CODING_STATE_CR)                             \
1596     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1597       if (c != '\n')                                            \
1598         Dynarr_add (dst, '\r');                                 \
1599       flags &= ~CODING_STATE_CR;                                \
1600     }                                                           \
1601 } while (0)
1602
1603 /* C should be a binary character in the range 0 - 255; convert
1604    to internal format and add to Dynarr DST. */
1605
1606 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1607 do {                                            \
1608   if (BYTE_ASCII_P (c))                         \
1609     Dynarr_add (dst, c);                        \
1610   else if (BYTE_C1_P (c))                       \
1611     {                                           \
1612       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1613       Dynarr_add (dst, c + 0x20);               \
1614     }                                           \
1615   else                                          \
1616     {                                           \
1617       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1618       Dynarr_add (dst, c);                      \
1619     }                                           \
1620 } while (0)
1621
1622 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1623 do {                                    \
1624   if (ch)                               \
1625     {                                   \
1626       DECODE_ADD_BINARY_CHAR (ch, dst); \
1627       ch = 0;                           \
1628     }                                   \
1629 } while (0)
1630
1631 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1632 do {                                    \
1633   DECODE_OUTPUT_PARTIAL_CHAR (ch);      \
1634   if ((flags & CODING_STATE_END) &&     \
1635       (flags & CODING_STATE_CR))        \
1636     Dynarr_add (dst, '\r');             \
1637 } while (0)
1638
1639 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1640
1641 struct decoding_stream
1642 {
1643   /* Coding system that governs the conversion. */
1644   struct Lisp_Coding_System *codesys;
1645
1646   /* Stream that we read the encoded data from or
1647      write the decoded data to. */
1648   Lstream *other_end;
1649
1650   /* If we are reading, then we can return only a fixed amount of
1651      data, so if the conversion resulted in too much data, we store it
1652      here for retrieval the next time around. */
1653   unsigned_char_dynarr *runoff;
1654
1655   /* FLAGS holds flags indicating the current state of the decoding.
1656      Some of these flags are dependent on the coding system. */
1657   unsigned int flags;
1658
1659   /* CH holds a partially built-up character.  Since we only deal
1660      with one- and two-byte characters at the moment, we only use
1661      this to store the first byte of a two-byte character. */
1662   unsigned int ch;
1663
1664   /* EOL_TYPE specifies the type of end-of-line conversion that
1665      currently applies.  We need to keep this separate from the
1666      EOL type stored in CODESYS because the latter might indicate
1667      automatic EOL-type detection while the former will always
1668      indicate a particular EOL type. */
1669   enum eol_type eol_type;
1670
1671   /* Additional ISO2022 information.  We define the structure above
1672      because it's also needed by the detection routines. */
1673   struct iso2022_decoder iso2022;
1674
1675   /* Additional information (the state of the running CCL program)
1676      used by the CCL decoder. */
1677   struct ccl_program ccl;
1678
1679   struct detection_state decst;
1680 };
1681
1682 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1683 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1684 static int decoding_rewinder   (Lstream *stream);
1685 static int decoding_seekable_p (Lstream *stream);
1686 static int decoding_flusher    (Lstream *stream);
1687 static int decoding_closer     (Lstream *stream);
1688 static Lisp_Object decoding_marker (Lisp_Object stream,
1689                                     void (*markobj) (Lisp_Object));
1690
1691 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1692                                sizeof (struct decoding_stream));
1693
1694 static Lisp_Object
1695 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1696 {
1697   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1698   Lisp_Object str_obj;
1699
1700   /* We do not need to mark the coding systems or charsets stored
1701      within the stream because they are stored in a global list
1702      and automatically marked. */
1703
1704   XSETLSTREAM (str_obj, str);
1705   (markobj) (str_obj);
1706   if (str->imp->marker)
1707     return (str->imp->marker) (str_obj, markobj);
1708   else
1709     return Qnil;
1710 }
1711
1712 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1713    so we read data from the other end, decode it, and store it into DATA. */
1714
1715 static int
1716 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1717 {
1718   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1719   unsigned char *orig_data = data;
1720   int read_size;
1721   int error_occurred = 0;
1722
1723   /* We need to interface to mule_decode(), which expects to take some
1724      amount of data and store the result into a Dynarr.  We have
1725      mule_decode() store into str->runoff, and take data from there
1726      as necessary. */
1727
1728   /* We loop until we have enough data, reading chunks from the other
1729      end and decoding it. */
1730   while (1)
1731     {
1732       /* Take data from the runoff if we can.  Make sure to take at
1733          most SIZE bytes, and delete the data from the runoff. */
1734       if (Dynarr_length (str->runoff) > 0)
1735         {
1736           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1737           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1738           Dynarr_delete_many (str->runoff, 0, chunk);
1739           data += chunk;
1740           size -= chunk;
1741         }
1742
1743       if (size == 0)
1744         break; /* No more room for data */
1745
1746       if (str->flags & CODING_STATE_END)
1747         /* This means that on the previous iteration, we hit the EOF on
1748            the other end.  We loop once more so that mule_decode() can
1749            output any final stuff it may be holding, or any "go back
1750            to a sane state" escape sequences. (This latter makes sense
1751            during encoding.) */
1752         break;
1753
1754       /* Exhausted the runoff, so get some more.  DATA has at least
1755          SIZE bytes left of storage in it, so it's OK to read directly
1756          into it.  (We'll be overwriting above, after we've decoded it
1757          into the runoff.) */
1758       read_size = Lstream_read (str->other_end, data, size);
1759       if (read_size < 0)
1760         {
1761           error_occurred = 1;
1762           break;
1763         }
1764       if (read_size == 0)
1765         /* There might be some more end data produced in the translation.
1766            See the comment above. */
1767         str->flags |= CODING_STATE_END;
1768       mule_decode (stream, data, str->runoff, read_size);
1769     }
1770
1771   if (data - orig_data == 0)
1772     return error_occurred ? -1 : 0;
1773   else
1774     return data - orig_data;
1775 }
1776
1777 static int
1778 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1779 {
1780   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1781   int retval;
1782
1783   /* Decode all our data into the runoff, and then attempt to write
1784      it all out to the other end.  Remove whatever chunk we succeeded
1785      in writing. */
1786   mule_decode (stream, data, str->runoff, size);
1787   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1788                           Dynarr_length (str->runoff));
1789   if (retval > 0)
1790     Dynarr_delete_many (str->runoff, 0, retval);
1791   /* Do NOT return retval.  The return value indicates how much
1792      of the incoming data was written, not how many bytes were
1793      written. */
1794   return size;
1795 }
1796
1797 static void
1798 reset_decoding_stream (struct decoding_stream *str)
1799 {
1800   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1801     {
1802       Lisp_Object coding_system;
1803       XSETCODING_SYSTEM (coding_system, str->codesys);
1804       reset_iso2022 (coding_system, &str->iso2022);
1805     }
1806   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1807     {
1808       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1809     }
1810
1811   str->flags = str->ch = 0;
1812 }
1813
1814 static int
1815 decoding_rewinder (Lstream *stream)
1816 {
1817   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1818   reset_decoding_stream (str);
1819   Dynarr_reset (str->runoff);
1820   return Lstream_rewind (str->other_end);
1821 }
1822
1823 static int
1824 decoding_seekable_p (Lstream *stream)
1825 {
1826   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1827   return Lstream_seekable_p (str->other_end);
1828 }
1829
1830 static int
1831 decoding_flusher (Lstream *stream)
1832 {
1833   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1834   return Lstream_flush (str->other_end);
1835 }
1836
1837 static int
1838 decoding_closer (Lstream *stream)
1839 {
1840   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1841   if (stream->flags & LSTREAM_FL_WRITE)
1842     {
1843       str->flags |= CODING_STATE_END;
1844       decoding_writer (stream, 0, 0);
1845     }
1846   Dynarr_free (str->runoff);
1847   if (str->iso2022.composite_chars)
1848     Dynarr_free (str->iso2022.composite_chars);
1849   return Lstream_close (str->other_end);
1850 }
1851
1852 Lisp_Object
1853 decoding_stream_coding_system (Lstream *stream)
1854 {
1855   Lisp_Object coding_system;
1856   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1857
1858   XSETCODING_SYSTEM (coding_system, str->codesys);
1859   return subsidiary_coding_system (coding_system, str->eol_type);
1860 }
1861
1862 void
1863 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1864 {
1865   struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1866   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1867   str->codesys = cs;
1868   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1869     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1870   reset_decoding_stream (str);
1871 }
1872
1873 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
1874    stream for writing, no automatic code detection will be performed.
1875    The reason for this is that automatic code detection requires a
1876    seekable input.  Things will also fail if you open a decoding
1877    stream for reading using a non-fully-specified coding system and
1878    a non-seekable input stream. */
1879
1880 static Lisp_Object
1881 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1882                         CONST char *mode)
1883 {
1884   Lstream *lstr = Lstream_new (lstream_decoding, mode);
1885   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1886   Lisp_Object obj;
1887
1888   xzero (*str);
1889   str->other_end = stream;
1890   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1891   str->eol_type = EOL_AUTODETECT;
1892   if (!strcmp (mode, "r")
1893       && Lstream_seekable_p (stream))
1894     /* We can determine the coding system now. */
1895     determine_real_coding_system (stream, &codesys, &str->eol_type);
1896   set_decoding_stream_coding_system (lstr, codesys);
1897   str->decst.eol_type = str->eol_type;
1898   str->decst.mask = ~0;
1899   XSETLSTREAM (obj, lstr);
1900   return obj;
1901 }
1902
1903 Lisp_Object
1904 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
1905 {
1906   return make_decoding_stream_1 (stream, codesys, "r");
1907 }
1908
1909 Lisp_Object
1910 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
1911 {
1912   return make_decoding_stream_1 (stream, codesys, "w");
1913 }
1914
1915 /* Note: the decode_coding_* functions all take the same
1916    arguments as mule_decode(), which is to say some SRC data of
1917    size N, which is to be stored into dynamic array DST.
1918    DECODING is the stream within which the decoding is
1919    taking place, but no data is actually read from or
1920    written to that stream; that is handled in decoding_reader()
1921    or decoding_writer().  This allows the same functions to
1922    be used for both reading and writing. */
1923
1924 static void
1925 mule_decode (Lstream *decoding, CONST unsigned char *src,
1926              unsigned_char_dynarr *dst, unsigned int n)
1927 {
1928   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
1929
1930   /* If necessary, do encoding-detection now.  We do this when
1931      we're a writing stream or a non-seekable reading stream,
1932      meaning that we can't just process the whole input,
1933      rewind, and start over. */
1934
1935   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
1936       str->eol_type == EOL_AUTODETECT)
1937     {
1938       Lisp_Object codesys;
1939
1940       XSETCODING_SYSTEM (codesys, str->codesys);
1941       detect_coding_type (&str->decst, src, n,
1942                           CODING_SYSTEM_TYPE (str->codesys) !=
1943                           CODESYS_AUTODETECT);
1944       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
1945           str->decst.mask != ~0)
1946         /* #### This is cheesy.  What we really ought to do is
1947            buffer up a certain amount of data so as to get a
1948            less random result. */
1949         codesys = coding_system_from_mask (str->decst.mask);
1950       str->eol_type = str->decst.eol_type;
1951       if (XCODING_SYSTEM (codesys) != str->codesys)
1952         {
1953           /* Preserve the CODING_STATE_END flag in case it was set.
1954              If we erase it, bad things might happen. */
1955           int was_end = str->flags & CODING_STATE_END;
1956           set_decoding_stream_coding_system (decoding, codesys);
1957           if (was_end)
1958             str->flags |= CODING_STATE_END;
1959         }
1960     }
1961
1962   switch (CODING_SYSTEM_TYPE (str->codesys))
1963     {
1964 #ifdef DEBUG_XEMACS
1965     case CODESYS_INTERNAL:
1966       Dynarr_add_many (dst, src, n);
1967       break;
1968 #endif
1969     case CODESYS_AUTODETECT:
1970       /* If we got this far and still haven't decided on the coding
1971          system, then do no conversion. */
1972     case CODESYS_NO_CONVERSION:
1973       decode_coding_no_conversion (decoding, src, dst, n);
1974       break;
1975     case CODESYS_SHIFT_JIS:
1976       decode_coding_sjis (decoding, src, dst, n);
1977       break;
1978     case CODESYS_BIG5:
1979       decode_coding_big5 (decoding, src, dst, n);
1980       break;
1981     case CODESYS_CCL:
1982       ccl_driver (&str->ccl, src, dst, n, 0);
1983       break;
1984     case CODESYS_ISO2022:
1985       decode_coding_iso2022 (decoding, src, dst, n);
1986       break;
1987     default:
1988       abort ();
1989     }
1990 }
1991
1992 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
1993 Decode the text between START and END which is encoded in CODING-SYSTEM.
1994 This is useful if you've read in encoded text from a file without decoding
1995 it (e.g. you read in a JIS-formatted file but used the `binary' or
1996 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
1997 Return length of decoded text.
1998 BUFFER defaults to the current buffer if unspecified.
1999 */
2000        (start, end, coding_system, buffer))
2001 {
2002   Bufpos b, e;
2003   struct buffer *buf = decode_buffer (buffer, 0);
2004   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2005   Lstream *istr, *ostr;
2006   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2007
2008   get_buffer_range_char (buf, start, end, &b, &e, 0);
2009
2010   barf_if_buffer_read_only (buf, b, e);
2011
2012   coding_system = Fget_coding_system (coding_system);
2013   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2014   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2015   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2016                                               coding_system);
2017   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2018                                            Fget_coding_system (Qbinary));
2019   istr = XLSTREAM (instream);
2020   ostr = XLSTREAM (outstream);
2021   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2022
2023   /* The chain of streams looks like this:
2024
2025      [BUFFER] <----- send through
2026                      ------> [ENCODE AS BINARY]
2027                              ------> [DECODE AS SPECIFIED]
2028                                      ------> [BUFFER]
2029    */
2030
2031   while (1)
2032     {
2033       char tempbuf[1024]; /* some random amount */
2034       Bufpos newpos, even_newer_pos;
2035       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2036       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2037
2038       if (!size_in_bytes)
2039         break;
2040       newpos = lisp_buffer_stream_startpos (istr);
2041       Lstream_write (ostr, tempbuf, size_in_bytes);
2042       even_newer_pos = lisp_buffer_stream_startpos (istr);
2043       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2044                            even_newer_pos, 0);
2045     }
2046   Lstream_close (istr);
2047   Lstream_close (ostr);
2048   UNGCPRO;
2049   Lstream_delete (istr);
2050   Lstream_delete (ostr);
2051   Lstream_delete (XLSTREAM (de_outstream));
2052   Lstream_delete (XLSTREAM (lb_outstream));
2053   return Qnil;
2054 }
2055
2056 \f
2057 /************************************************************************/
2058 /*           Converting to an external encoding ("encoding")            */
2059 /************************************************************************/
2060
2061 /* An encoding stream is an output stream.  When you create the
2062    stream, you specify the coding system that governs the encoding
2063    and another stream that the resulting encoded data is to be
2064    sent to, and then start sending data to it. */
2065
2066 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2067
2068 struct encoding_stream
2069 {
2070   /* Coding system that governs the conversion. */
2071   struct Lisp_Coding_System *codesys;
2072
2073   /* Stream that we read the encoded data from or
2074      write the decoded data to. */
2075   Lstream *other_end;
2076
2077   /* If we are reading, then we can return only a fixed amount of
2078      data, so if the conversion resulted in too much data, we store it
2079      here for retrieval the next time around. */
2080   unsigned_char_dynarr *runoff;
2081
2082   /* FLAGS holds flags indicating the current state of the encoding.
2083      Some of these flags are dependent on the coding system. */
2084   unsigned int flags;
2085
2086   /* CH holds a partially built-up character.  Since we only deal
2087      with one- and two-byte characters at the moment, we only use
2088      this to store the first byte of a two-byte character. */
2089   unsigned int ch;
2090
2091   /* Additional information used by the ISO2022 encoder. */
2092   struct
2093     {
2094       /* CHARSET holds the character sets currently assigned to the G0
2095          through G3 registers.  It is initialized from the array
2096          INITIAL_CHARSET in CODESYS. */
2097       Lisp_Object charset[4];
2098
2099       /* Which registers are currently invoked into the left (GL) and
2100          right (GR) halves of the 8-bit encoding space? */
2101       int register_left, register_right;
2102
2103       /* Whether we need to explicitly designate the charset in the
2104          G? register before using it.  It is initialized from the
2105          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2106       unsigned char force_charset_on_output[4];
2107
2108       /* Other state variables that need to be preserved across
2109          invocations. */
2110       Lisp_Object current_charset;
2111       int current_half;
2112       int current_char_boundary;
2113     } iso2022;
2114
2115   /* Additional information (the state of the running CCL program)
2116      used by the CCL encoder. */
2117   struct ccl_program ccl;
2118 };
2119
2120 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2121 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2122                             size_t size);
2123 static int encoding_rewinder   (Lstream *stream);
2124 static int encoding_seekable_p (Lstream *stream);
2125 static int encoding_flusher    (Lstream *stream);
2126 static int encoding_closer     (Lstream *stream);
2127 static Lisp_Object encoding_marker (Lisp_Object stream,
2128                                     void (*markobj) (Lisp_Object));
2129
2130 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2131                                sizeof (struct encoding_stream));
2132
2133 static Lisp_Object
2134 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2135 {
2136   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2137   Lisp_Object str_obj;
2138
2139   /* We do not need to mark the coding systems or charsets stored
2140      within the stream because they are stored in a global list
2141      and automatically marked. */
2142
2143   XSETLSTREAM (str_obj, str);
2144   (markobj) (str_obj);
2145   if (str->imp->marker)
2146     return (str->imp->marker) (str_obj, markobj);
2147   else
2148     return Qnil;
2149 }
2150
2151 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2152    so we read data from the other end, encode it, and store it into DATA. */
2153
2154 static int
2155 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2156 {
2157   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2158   unsigned char *orig_data = data;
2159   int read_size;
2160   int error_occurred = 0;
2161
2162   /* We need to interface to mule_encode(), which expects to take some
2163      amount of data and store the result into a Dynarr.  We have
2164      mule_encode() store into str->runoff, and take data from there
2165      as necessary. */
2166
2167   /* We loop until we have enough data, reading chunks from the other
2168      end and encoding it. */
2169   while (1)
2170     {
2171       /* Take data from the runoff if we can.  Make sure to take at
2172          most SIZE bytes, and delete the data from the runoff. */
2173       if (Dynarr_length (str->runoff) > 0)
2174         {
2175           int chunk = min ((int) size, Dynarr_length (str->runoff));
2176           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2177           Dynarr_delete_many (str->runoff, 0, chunk);
2178           data += chunk;
2179           size -= chunk;
2180         }
2181
2182       if (size == 0)
2183         break; /* No more room for data */
2184
2185       if (str->flags & CODING_STATE_END)
2186         /* This means that on the previous iteration, we hit the EOF on
2187            the other end.  We loop once more so that mule_encode() can
2188            output any final stuff it may be holding, or any "go back
2189            to a sane state" escape sequences. (This latter makes sense
2190            during encoding.) */
2191         break;
2192
2193       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2194          left of storage in it, so it's OK to read directly into it.
2195          (We'll be overwriting above, after we've encoded it into the
2196          runoff.) */
2197       read_size = Lstream_read (str->other_end, data, size);
2198       if (read_size < 0)
2199         {
2200           error_occurred = 1;
2201           break;
2202         }
2203       if (read_size == 0)
2204         /* There might be some more end data produced in the translation.
2205            See the comment above. */
2206         str->flags |= CODING_STATE_END;
2207       mule_encode (stream, data, str->runoff, read_size);
2208     }
2209
2210   if (data == orig_data)
2211     return error_occurred ? -1 : 0;
2212   else
2213     return data - orig_data;
2214 }
2215
2216 static int
2217 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2218 {
2219   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2220   int retval;
2221
2222   /* Encode all our data into the runoff, and then attempt to write
2223      it all out to the other end.  Remove whatever chunk we succeeded
2224      in writing. */
2225   mule_encode (stream, data, str->runoff, size);
2226   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2227                           Dynarr_length (str->runoff));
2228   if (retval > 0)
2229     Dynarr_delete_many (str->runoff, 0, retval);
2230   /* Do NOT return retval.  The return value indicates how much
2231      of the incoming data was written, not how many bytes were
2232      written. */
2233   return size;
2234 }
2235
2236 static void
2237 reset_encoding_stream (struct encoding_stream *str)
2238 {
2239   switch (CODING_SYSTEM_TYPE (str->codesys))
2240     {
2241     case CODESYS_ISO2022:
2242       {
2243         int i;
2244
2245         for (i = 0; i < 4; i++)
2246           {
2247             str->iso2022.charset[i] =
2248               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2249             str->iso2022.force_charset_on_output[i] =
2250               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2251           }
2252         str->iso2022.register_left = 0;
2253         str->iso2022.register_right = 1;
2254         str->iso2022.current_charset = Qnil;
2255         str->iso2022.current_half = 0;
2256         str->iso2022.current_char_boundary = 1;
2257         break;
2258       }
2259     case CODESYS_CCL:
2260       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2261       break;
2262     default:
2263       break;
2264     }
2265
2266   str->flags = str->ch = 0;
2267 }
2268
2269 static int
2270 encoding_rewinder (Lstream *stream)
2271 {
2272   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2273   reset_encoding_stream (str);
2274   Dynarr_reset (str->runoff);
2275   return Lstream_rewind (str->other_end);
2276 }
2277
2278 static int
2279 encoding_seekable_p (Lstream *stream)
2280 {
2281   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2282   return Lstream_seekable_p (str->other_end);
2283 }
2284
2285 static int
2286 encoding_flusher (Lstream *stream)
2287 {
2288   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2289   return Lstream_flush (str->other_end);
2290 }
2291
2292 static int
2293 encoding_closer (Lstream *stream)
2294 {
2295   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2296   if (stream->flags & LSTREAM_FL_WRITE)
2297     {
2298       str->flags |= CODING_STATE_END;
2299       encoding_writer (stream, 0, 0);
2300     }
2301   Dynarr_free (str->runoff);
2302   return Lstream_close (str->other_end);
2303 }
2304
2305 Lisp_Object
2306 encoding_stream_coding_system (Lstream *stream)
2307 {
2308   Lisp_Object coding_system;
2309   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2310
2311   XSETCODING_SYSTEM (coding_system, str->codesys);
2312   return coding_system;
2313 }
2314
2315 void
2316 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2317 {
2318   struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2319   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2320   str->codesys = cs;
2321   reset_encoding_stream (str);
2322 }
2323
2324 static Lisp_Object
2325 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2326                         CONST char *mode)
2327 {
2328   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2329   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2330   Lisp_Object obj;
2331
2332   xzero (*str);
2333   str->runoff = Dynarr_new (unsigned_char);
2334   str->other_end = stream;
2335   set_encoding_stream_coding_system (lstr, codesys);
2336   XSETLSTREAM (obj, lstr);
2337   return obj;
2338 }
2339
2340 Lisp_Object
2341 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2342 {
2343   return make_encoding_stream_1 (stream, codesys, "r");
2344 }
2345
2346 Lisp_Object
2347 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2348 {
2349   return make_encoding_stream_1 (stream, codesys, "w");
2350 }
2351
2352 /* Convert N bytes of internally-formatted data stored in SRC to an
2353    external format, according to the encoding stream ENCODING.
2354    Store the encoded data into DST. */
2355
2356 static void
2357 mule_encode (Lstream *encoding, CONST unsigned char *src,
2358              unsigned_char_dynarr *dst, unsigned int n)
2359 {
2360   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2361
2362   switch (CODING_SYSTEM_TYPE (str->codesys))
2363     {
2364 #ifdef DEBUG_XEMACS
2365     case CODESYS_INTERNAL:
2366       Dynarr_add_many (dst, src, n);
2367       break;
2368 #endif
2369     case CODESYS_AUTODETECT:
2370       /* If we got this far and still haven't decided on the coding
2371          system, then do no conversion. */
2372     case CODESYS_NO_CONVERSION:
2373       encode_coding_no_conversion (encoding, src, dst, n);
2374       break;
2375     case CODESYS_SHIFT_JIS:
2376       encode_coding_sjis (encoding, src, dst, n);
2377       break;
2378     case CODESYS_BIG5:
2379       encode_coding_big5 (encoding, src, dst, n);
2380       break;
2381     case CODESYS_CCL:
2382       ccl_driver (&str->ccl, src, dst, n, 0);
2383       break;
2384     case CODESYS_ISO2022:
2385       encode_coding_iso2022 (encoding, src, dst, n);
2386       break;
2387     default:
2388       abort ();
2389     }
2390 }
2391
2392 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2393 Encode the text between START and END using CODING-SYSTEM.
2394 This will, for example, convert Japanese characters into stuff such as
2395 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2396 text.  BUFFER defaults to the current buffer if unspecified.
2397 */
2398        (start, end, coding_system, buffer))
2399 {
2400   Bufpos b, e;
2401   struct buffer *buf = decode_buffer (buffer, 0);
2402   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2403   Lstream *istr, *ostr;
2404   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2405
2406   get_buffer_range_char (buf, start, end, &b, &e, 0);
2407
2408   barf_if_buffer_read_only (buf, b, e);
2409
2410   coding_system = Fget_coding_system (coding_system);
2411   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2412   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2413   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2414                                               Fget_coding_system (Qbinary));
2415   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2416                                            coding_system);
2417   istr = XLSTREAM (instream);
2418   ostr = XLSTREAM (outstream);
2419   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2420   /* The chain of streams looks like this:
2421
2422      [BUFFER] <----- send through
2423                      ------> [ENCODE AS SPECIFIED]
2424                              ------> [DECODE AS BINARY]
2425                                      ------> [BUFFER]
2426    */
2427   while (1)
2428     {
2429       char tempbuf[1024]; /* some random amount */
2430       Bufpos newpos, even_newer_pos;
2431       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2432       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2433
2434       if (!size_in_bytes)
2435         break;
2436       newpos = lisp_buffer_stream_startpos (istr);
2437       Lstream_write (ostr, tempbuf, size_in_bytes);
2438       even_newer_pos = lisp_buffer_stream_startpos (istr);
2439       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2440                            even_newer_pos, 0);
2441     }
2442
2443   {
2444     Charcount retlen =
2445       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2446     Lstream_close (istr);
2447     Lstream_close (ostr);
2448     UNGCPRO;
2449     Lstream_delete (istr);
2450     Lstream_delete (ostr);
2451     Lstream_delete (XLSTREAM (de_outstream));
2452     Lstream_delete (XLSTREAM (lb_outstream));
2453     return make_int (retlen);
2454   }
2455 }
2456
2457 \f
2458 /************************************************************************/
2459 /*                          Shift-JIS methods                           */
2460 /************************************************************************/
2461
2462 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2463    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2464    as is.  A character of JISX0201-Kana (TYPE94 character set) is
2465    encoded by "position-code + 0x80".  A character of JISX0208
2466    (TYPE94x94 character set) is encoded in 2-byte but two
2467    position-codes are divided and shifted so that it fit in the range
2468    below.
2469
2470    --- CODE RANGE of Shift-JIS ---
2471    (character set)      (range)
2472    ASCII                0x00 .. 0x7F
2473    JISX0201-Kana        0xA0 .. 0xDF
2474    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2475             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2476    -------------------------------
2477
2478 */
2479
2480 /* Is this the first byte of a Shift-JIS two-byte char? */
2481
2482 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2483   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2484
2485 /* Is this the second byte of a Shift-JIS two-byte char? */
2486
2487 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2488   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2489
2490 #define BYTE_SJIS_KATAKANA_P(c) \
2491   ((c) >= 0xA1 && (c) <= 0xDF)
2492
2493 static int
2494 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2495                     unsigned int n)
2496 {
2497   int c;
2498
2499   while (n--)
2500     {
2501       c = *src++;
2502       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2503         return 0;
2504       if (st->shift_jis.in_second_byte)
2505         {
2506           st->shift_jis.in_second_byte = 0;
2507           if (c < 0x40)
2508             return 0;
2509         }
2510       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2511         st->shift_jis.in_second_byte = 1;
2512     }
2513   return CODING_CATEGORY_SHIFT_JIS_MASK;
2514 }
2515
2516 /* Convert Shift-JIS data to internal format. */
2517
2518 static void
2519 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2520                     unsigned_char_dynarr *dst, unsigned int n)
2521 {
2522   unsigned char c;
2523   unsigned int flags, ch;
2524   enum eol_type eol_type;
2525   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2526
2527   CODING_STREAM_DECOMPOSE (str, flags, ch);
2528   eol_type = str->eol_type;
2529
2530   while (n--)
2531     {
2532       c = *src++;
2533
2534       if (ch)
2535         {
2536           /* Previous character was first byte of Shift-JIS Kanji char. */
2537           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2538             {
2539               unsigned char e1, e2;
2540
2541               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2542               DECODE_SJIS (ch, c, e1, e2);
2543               Dynarr_add (dst, e1);
2544               Dynarr_add (dst, e2);
2545             }
2546           else
2547             {
2548               DECODE_ADD_BINARY_CHAR (ch, dst);
2549               DECODE_ADD_BINARY_CHAR (c, dst);
2550             }
2551           ch = 0;
2552         }
2553       else
2554         {
2555           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2556           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2557             ch = c;
2558           else if (BYTE_SJIS_KATAKANA_P (c))
2559             {
2560               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2561               Dynarr_add (dst, c);
2562             }
2563           else
2564             DECODE_ADD_BINARY_CHAR (c, dst);
2565         }
2566     label_continue_loop:;
2567     }
2568
2569   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2570
2571   CODING_STREAM_COMPOSE (str, flags, ch);
2572 }
2573
2574 /* Convert internally-formatted data to Shift-JIS. */
2575
2576 static void
2577 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2578                     unsigned_char_dynarr *dst, unsigned int n)
2579 {
2580   unsigned char c;
2581   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2582   unsigned int flags, ch;
2583   enum eol_type eol_type;
2584
2585   CODING_STREAM_DECOMPOSE (str, flags, ch);
2586   eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2587
2588   while (n--)
2589     {
2590       c = *src++;
2591       if (c == '\n')
2592         {
2593           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2594             Dynarr_add (dst, '\r');
2595           if (eol_type != EOL_CR)
2596             Dynarr_add (dst, '\n');
2597           ch = 0;
2598         }
2599       else if (BYTE_ASCII_P (c))
2600         {
2601           Dynarr_add (dst, c);
2602           ch = 0;
2603         }
2604       else if (BUFBYTE_LEADING_BYTE_P (c))
2605         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2606               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2607               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2608       else if (ch)
2609         {
2610           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2611             {
2612               Dynarr_add (dst, c);
2613               ch = 0;
2614             }
2615           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2616                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2617             ch = c;
2618           else
2619             {
2620               unsigned char j1, j2;
2621               ENCODE_SJIS (ch, c, j1, j2);
2622               Dynarr_add (dst, j1);
2623               Dynarr_add (dst, j2);
2624               ch = 0;
2625             }
2626         }
2627     }
2628
2629   CODING_STREAM_COMPOSE (str, flags, ch);
2630 }
2631
2632 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2633 Decode a JISX0208 character of Shift-JIS coding-system.
2634 CODE is the character code in Shift-JIS as a cons of type bytes.
2635 Return the corresponding character.
2636 */
2637        (code))
2638 {
2639   unsigned char c1, c2, s1, s2;
2640
2641   CHECK_CONS (code);
2642   CHECK_INT (XCAR (code));
2643   CHECK_INT (XCDR (code));
2644   s1 = XINT (XCAR (code));
2645   s2 = XINT (XCDR (code));
2646   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2647       BYTE_SJIS_TWO_BYTE_2_P (s2))
2648     {
2649       DECODE_SJIS (s1, s2, c1, c2);
2650       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2651                                    c1 & 0x7F, c2 & 0x7F));
2652     }
2653   else
2654     return Qnil;
2655 }
2656
2657 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2658 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2659 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2660 */
2661        (ch))
2662 {
2663   Lisp_Object charset;
2664   int c1, c2, s1, s2;
2665
2666   CHECK_CHAR_COERCE_INT (ch);
2667   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2668   if (EQ (charset, Vcharset_japanese_jisx0208))
2669     {
2670       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2671       return Fcons (make_int (s1), make_int (s2));
2672     }
2673   else
2674     return Qnil;
2675 }
2676
2677 \f
2678 /************************************************************************/
2679 /*                            Big5 methods                              */
2680 /************************************************************************/
2681
2682 /* BIG5 is a coding system encoding two character sets: ASCII and
2683    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2684    character set and is encoded in two-byte.
2685
2686    --- CODE RANGE of BIG5 ---
2687    (character set)      (range)
2688    ASCII                0x00 .. 0x7F
2689    Big5 (1st byte)      0xA1 .. 0xFE
2690         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2691    --------------------------
2692
2693    Since the number of characters in Big5 is larger than maximum
2694    characters in Emacs' charset (96x96), it can't be handled as one
2695    charset.  So, in Emacs, Big5 is devided into two: `charset-big5-1'
2696    and `charset-big5-2'.  Both <type>s are TYPE94x94.  The former
2697    contains frequently used characters and the latter contains less
2698    frequently used characters.  */
2699
2700 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2701   ((c) >= 0xA1 && (c) <= 0xFE)
2702
2703 /* Is this the second byte of a Shift-JIS two-byte char? */
2704
2705 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2706   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2707
2708 /* Number of Big5 characters which have the same code in 1st byte.  */
2709
2710 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2711
2712 /* Code conversion macros.  These are macros because they are used in
2713    inner loops during code conversion.
2714
2715    Note that temporary variables in macros introduce the classic
2716    dynamic-scoping problems with variable names.  We use capital-
2717    lettered variables in the assumption that XEmacs does not use
2718    capital letters in variables except in a very formalized way
2719    (e.g. Qstring). */
2720
2721 /* Convert Big5 code (b1, b2) into its internal string representation
2722    (lb, c1, c2). */
2723
2724 /* There is a much simpler way to split the Big5 charset into two.
2725    For the moment I'm going to leave the algorithm as-is because it
2726    claims to separate out the most-used characters into a single
2727    charset, which perhaps will lead to optimizations in various
2728    places.
2729
2730    The way the algorithm works is something like this:
2731
2732    Big5 can be viewed as a 94x157 charset, where the row is
2733    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2734    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
2735    the split between low and high column numbers is apparently
2736    meaningless; ascending rows produce less and less frequent chars.
2737    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2738    the first charset, and the upper half (0xC9 .. 0xFE) to the
2739    second.  To do the conversion, we convert the character into
2740    a single number where 0 .. 156 is the first row, 157 .. 313
2741    is the second, etc.  That way, the characters are ordered by
2742    decreasing frequency.  Then we just chop the space in two
2743    and coerce the result into a 94x94 space.
2744    */
2745
2746 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
2747 {                                                                       \
2748   int B1 = b1, B2 = b2;                                                 \
2749   unsigned int I                                                        \
2750     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
2751                                                                         \
2752   if (B1 < 0xC9)                                                        \
2753     {                                                                   \
2754       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
2755     }                                                                   \
2756   else                                                                  \
2757     {                                                                   \
2758       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
2759       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
2760     }                                                                   \
2761   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
2762   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
2763 } while (0)
2764
2765 /* Convert the internal string representation of a Big5 character
2766    (lb, c1, c2) into Big5 code (b1, b2). */
2767
2768 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
2769 {                                                                       \
2770   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
2771                                                                         \
2772   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
2773     {                                                                   \
2774       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
2775     }                                                                   \
2776   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
2777   b2 = I % BIG5_SAME_ROW;                                               \
2778   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
2779 } while (0)
2780
2781 static int
2782 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2783                     unsigned int n)
2784 {
2785   int c;
2786
2787   while (n--)
2788     {
2789       c = *src++;
2790       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2791           (c >= 0x80 && c <= 0xA0))
2792         return 0;
2793       if (st->big5.in_second_byte)
2794         {
2795           st->big5.in_second_byte = 0;
2796           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2797             return 0;
2798         }
2799       else if (c >= 0xA1)
2800         st->big5.in_second_byte = 1;
2801     }
2802   return CODING_CATEGORY_BIG5_MASK;
2803 }
2804
2805 /* Convert Big5 data to internal format. */
2806
2807 static void
2808 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2809                     unsigned_char_dynarr *dst, unsigned int n)
2810 {
2811   unsigned char c;
2812   unsigned int flags, ch;
2813   enum eol_type eol_type;
2814   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2815
2816   CODING_STREAM_DECOMPOSE (str, flags, ch);
2817   eol_type = str->eol_type;
2818
2819   while (n--)
2820     {
2821       c = *src++;
2822       if (ch)
2823         {
2824           /* Previous character was first byte of Big5 char. */
2825           if (BYTE_BIG5_TWO_BYTE_2_P (c))
2826             {
2827               unsigned char b1, b2, b3;
2828               DECODE_BIG5 (ch, c, b1, b2, b3);
2829               Dynarr_add (dst, b1);
2830               Dynarr_add (dst, b2);
2831               Dynarr_add (dst, b3);
2832             }
2833           else
2834             {
2835               DECODE_ADD_BINARY_CHAR (ch, dst);
2836               DECODE_ADD_BINARY_CHAR (c, dst);
2837             }
2838           ch = 0;
2839         }
2840       else
2841         {
2842           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2843           if (BYTE_BIG5_TWO_BYTE_1_P (c))
2844             ch = c;
2845           else
2846             DECODE_ADD_BINARY_CHAR (c, dst);
2847         }
2848     label_continue_loop:;
2849     }
2850
2851   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2852
2853   CODING_STREAM_COMPOSE (str, flags, ch);
2854 }
2855
2856 /* Convert internally-formatted data to Big5. */
2857
2858 static void
2859 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2860                     unsigned_char_dynarr *dst, unsigned int n)
2861 {
2862   unsigned char c;
2863   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2864   unsigned int flags, ch;
2865   enum eol_type eol_type;
2866
2867   CODING_STREAM_DECOMPOSE (str, flags, ch);
2868   eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2869
2870   while (n--)
2871     {
2872       c = *src++;
2873       if (c == '\n')
2874         {
2875           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2876             Dynarr_add (dst, '\r');
2877           if (eol_type != EOL_CR)
2878             Dynarr_add (dst, '\n');
2879         }
2880       else if (BYTE_ASCII_P (c))
2881         {
2882           /* ASCII. */
2883           Dynarr_add (dst, c);
2884         }
2885       else if (BUFBYTE_LEADING_BYTE_P (c))
2886         {
2887           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
2888               c == LEADING_BYTE_CHINESE_BIG5_2)
2889             {
2890               /* A recognized leading byte. */
2891               ch = c;
2892               continue; /* not done with this character. */
2893             }
2894           /* otherwise just ignore this character. */
2895         }
2896       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
2897                ch == LEADING_BYTE_CHINESE_BIG5_2)
2898         {
2899           /* Previous char was a recognized leading byte. */
2900           ch = (ch << 8) | c;
2901           continue; /* not done with this character. */
2902         }
2903       else if (ch)
2904         {
2905           /* Encountering second byte of a Big5 character. */
2906           unsigned char b1, b2;
2907
2908           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
2909           Dynarr_add (dst, b1);
2910           Dynarr_add (dst, b2);
2911         }
2912
2913       ch = 0;
2914     }
2915
2916   CODING_STREAM_COMPOSE (str, flags, ch);
2917 }
2918
2919
2920 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
2921 Decode a Big5 character CODE of BIG5 coding-system.
2922 CODE is the character code in BIG5, a cons of two integers.
2923 Return the corresponding character.
2924 */
2925        (code))
2926 {
2927   unsigned char c1, c2, b1, b2;
2928
2929   CHECK_CONS (code);
2930   CHECK_INT (XCAR (code));
2931   CHECK_INT (XCDR (code));
2932   b1 = XINT (XCAR (code));
2933   b2 = XINT (XCDR (code));
2934   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
2935       BYTE_BIG5_TWO_BYTE_2_P (b2))
2936     {
2937       int leading_byte;
2938       Lisp_Object charset;
2939       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
2940       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
2941       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
2942     }
2943   else
2944     return Qnil;
2945 }
2946
2947 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
2948 Encode the Big5 character CH to BIG5 coding-system.
2949 Return the corresponding character code in Big5.
2950 */
2951        (ch))
2952 {
2953   Lisp_Object charset;
2954   int c1, c2, b1, b2;
2955
2956   CHECK_CHAR_COERCE_INT (ch);
2957   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2958   if (EQ (charset, Vcharset_chinese_big5_1) ||
2959       EQ (charset, Vcharset_chinese_big5_2))
2960     {
2961       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
2962                    b1, b2);
2963       return Fcons (make_int (b1), make_int (b2));
2964     }
2965   else
2966     return Qnil;
2967 }
2968
2969 \f
2970 /************************************************************************/
2971 /*                           ISO2022 methods                            */
2972 /************************************************************************/
2973
2974 /* The following note describes the coding system ISO2022 briefly.
2975    Since the intention of this note is to help understanding of the
2976    programs in this file, some parts are NOT ACCURATE or OVERLY
2977    SIMPLIFIED.  For thorough understanding, please refer to the
2978    original document of ISO2022.
2979
2980    ISO2022 provides many mechanisms to encode several character sets
2981    in 7-bit and 8-bit environments.  If one chooses 7-bit environment,
2982    all text is encoded by codes of less than 128.  This may make the
2983    encoded text a little bit longer, but the text get more stability
2984    to pass through several gateways (some of them strip off MSB).
2985
2986    There are two kind of character sets: control character set and
2987    graphic character set.  The former contains control characters such
2988    as `newline' and `escape' to provide control functions (control
2989    functions are provided also by escape sequence).  The latter
2990    contains graphic characters such as 'A' and '-'.  Emacs recognizes
2991    two control character sets and many graphic character sets.
2992
2993    Graphic character sets are classified into one of four types,
2994    according to the dimension and number of characters in the set:
2995    TYPE94, TYPE96, TYPE94x94, and TYPE96x96.  In addition, each
2996    character set is assigned an identification byte, unique for each
2997    type, called "final character" (denoted as <F> hereafter).  The <F>
2998    of each character set is decided by ECMA(*) when it is registered
2999    in ISO.  Code range of <F> is 0x30..0x7F (0x30..0x3F are for
3000    private use only).
3001
3002    Note (*): ECMA = European Computer Manufacturers Association
3003
3004    Here are examples of graphic character set [NAME(<F>)]:
3005         o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3006         o TYPE96 -- right-half-of-ISO8859-1('A'), ...
3007         o TYPE94x94 -- GB2312('A'), JISX0208('B'), ...
3008         o TYPE96x96 -- none for the moment
3009
3010    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
3011         C0 [0x00..0x1F] -- control character plane 0
3012         GL [0x20..0x7F] -- graphic character plane 0
3013         C1 [0x80..0x9F] -- control character plane 1
3014         GR [0xA0..0xFF] -- graphic character plane 1
3015
3016    A control character set is directly designated and invoked to C0 or
3017    C1 by an escape sequence.  The most common case is that:
3018    - ISO646's  control character set is designated/invoked to C0, and
3019    - ISO6429's control character set is designated/invoked to C1,
3020    and usually these designations/invocations are omitted in encoded
3021    text.  In a 7-bit environment, only C0 can be used, and a control
3022    character for C1 is encoded by an appropriate escape sequence to
3023    fit into the environment.  All control characters for C1 are
3024    defined to have corresponding escape sequences.
3025
3026    A graphic character set is at first designated to one of four
3027    graphic registers (G0 through G3), then these graphic registers are
3028    invoked to GL or GR.  These designations and invocations can be
3029    done independently.  The most common case is that G0 is invoked to
3030    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3031    these invocations and designations are omitted in encoded text.
3032    In a 7-bit environment, only GL can be used.
3033
3034    When a graphic character set of TYPE94 or TYPE94x94 is invoked to
3035    GL, codes 0x20 and 0x7F of the GL area work as control characters
3036    SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area
3037    should not be used.
3038
3039    There are two ways of invocation: locking-shift and single-shift.
3040    With locking-shift, the invocation lasts until the next different
3041    invocation, whereas with single-shift, the invocation works only
3042    for the following character and doesn't affect locking-shift.
3043    Invocations are done by the following control characters or escape
3044    sequences.
3045
3046    ----------------------------------------------------------------------
3047    abbrev  function                  cntrl escape seq   description
3048    ----------------------------------------------------------------------
3049    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3050    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3051    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR
3052    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3053    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR
3054    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3055    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR
3056    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3057    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3058    ----------------------------------------------------------------------
3059    The first four are for locking-shift.  Control characters for these
3060    functions are defined by macros ISO_CODE_XXX in `coding.h'.
3061
3062    Designations are done by the following escape sequences.
3063    ----------------------------------------------------------------------
3064    escape sequence      description
3065    ----------------------------------------------------------------------
3066    ESC '(' <F>          designate TYPE94<F> to G0
3067    ESC ')' <F>          designate TYPE94<F> to G1
3068    ESC '*' <F>          designate TYPE94<F> to G2
3069    ESC '+' <F>          designate TYPE94<F> to G3
3070    ESC ',' <F>          designate TYPE96<F> to G0 (*)
3071    ESC '-' <F>          designate TYPE96<F> to G1
3072    ESC '.' <F>          designate TYPE96<F> to G2
3073    ESC '/' <F>          designate TYPE96<F> to G3
3074    ESC '$' '(' <F>      designate TYPE94x94<F> to G0 (**)
3075    ESC '$' ')' <F>      designate TYPE94x94<F> to G1
3076    ESC '$' '*' <F>      designate TYPE94x94<F> to G2
3077    ESC '$' '+' <F>      designate TYPE94x94<F> to G3
3078    ESC '$' ',' <F>      designate TYPE96x96<F> to G0 (*)
3079    ESC '$' '-' <F>      designate TYPE96x96<F> to G1
3080    ESC '$' '.' <F>      designate TYPE96x96<F> to G2
3081    ESC '$' '/' <F>      designate TYPE96x96<F> to G3
3082    ----------------------------------------------------------------------
3083    In this list, "TYPE94<F>" means a graphic character set of type TYPE94
3084    and final character <F>, and etc.
3085
3086    Note (*): Although these designations are not allowed in ISO2022,
3087    Emacs accepts them on decoding, and produces them on encoding
3088    TYPE96 or TYPE96x96 character set in a coding system which is
3089    characterized as 7-bit environment, non-locking-shift, and
3090    non-single-shift.
3091
3092    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3093    '(' can be omitted.  We call this as "short-form" here after.
3094
3095    Now you may notice that there are a lot of ways for encoding the
3096    same multilingual text in ISO2022.  Actually, there exist many
3097    coding systems such as Compound Text (used in X's inter client
3098    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3099    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3100    localized platforms), and all of these are variants of ISO2022.
3101
3102    In addition to the above, Emacs handles two more kinds of escape
3103    sequences: ISO6429's direction specification and Emacs' private
3104    sequence for specifying character composition.
3105
3106    ISO6429's direction specification takes the following format:
3107         o CSI ']'      -- end of the current direction
3108         o CSI '0' ']'  -- end of the current direction
3109         o CSI '1' ']'  -- start of left-to-right text
3110         o CSI '2' ']'  -- start of right-to-left text
3111    The control character CSI (0x9B: control sequence introducer) is
3112    abbreviated to the escape sequence ESC '[' in 7-bit environment.
3113
3114    Character composition specification takes the following format:
3115         o ESC '0' -- start character composition
3116         o ESC '1' -- end character composition
3117    Since these are not standard escape sequences of any ISO, the use
3118    of them for these meanings is restricted to Emacs only.  */
3119
3120 static void
3121 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3122 {
3123   int i;
3124
3125   for (i = 0; i < 4; i++)
3126     {
3127       if (!NILP (coding_system))
3128         iso->charset[i] =
3129           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3130       else
3131         iso->charset[i] = Qt;
3132       iso->invalid_designated[i] = 0;
3133     }
3134   iso->esc = ISO_ESC_NOTHING;
3135   iso->esc_bytes_index = 0;
3136   iso->register_left = 0;
3137   iso->register_right = 1;
3138   iso->switched_dir_and_no_valid_charset_yet = 0;
3139   iso->invalid_switch_dir = 0;
3140   iso->output_direction_sequence = 0;
3141   iso->output_literally = 0;
3142   if (iso->composite_chars)
3143     Dynarr_reset (iso->composite_chars);
3144 }
3145
3146 static int
3147 fit_to_be_escape_quoted (unsigned char c)
3148 {
3149   switch (c)
3150     {
3151     case ISO_CODE_ESC:
3152     case ISO_CODE_CSI:
3153     case ISO_CODE_SS2:
3154     case ISO_CODE_SS3:
3155     case ISO_CODE_SO:
3156     case ISO_CODE_SI:
3157       return 1;
3158
3159     default:
3160       return 0;
3161     }
3162 }
3163
3164 /* Parse one byte of an ISO2022 escape sequence.
3165    If the result is an invalid escape sequence, return 0 and
3166    do not change anything in STR.  Otherwise, if the result is
3167    an incomplete escape sequence, update ISO2022.ESC and
3168    ISO2022.ESC_BYTES and return -1.  Otherwise, update
3169    all the state variables (but not ISO2022.ESC_BYTES) and
3170    return 1.
3171
3172    If CHECK_INVALID_CHARSETS is non-zero, check for designation
3173    or invocation of an invalid character set and treat that as
3174    an unrecognized escape sequence. */
3175
3176 static int
3177 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3178                    unsigned char c, unsigned int *flags,
3179                    int check_invalid_charsets)
3180 {
3181   /* (1) If we're at the end of a designation sequence, CS is the
3182      charset being designated and REG is the register to designate
3183      it to.
3184
3185      (2) If we're at the end of a locking-shift sequence, REG is
3186      the register to invoke and HALF (0 == left, 1 == right) is
3187      the half to invoke it into.
3188
3189      (3) If we're at the end of a single-shift sequence, REG is
3190      the register to invoke. */
3191   Lisp_Object cs = Qnil;
3192   int reg, half;
3193
3194   /* NOTE: This code does goto's all over the fucking place.
3195      The reason for this is that we're basically implementing
3196      a state machine here, and hierarchical languages like C
3197      don't really provide a clean way of doing this. */
3198
3199   if (! (*flags & CODING_STATE_ESCAPE))
3200     /* At beginning of escape sequence; we need to reset our
3201        escape-state variables. */
3202     iso->esc = ISO_ESC_NOTHING;
3203
3204   iso->output_literally = 0;
3205   iso->output_direction_sequence = 0;
3206
3207   switch (iso->esc)
3208     {
3209     case ISO_ESC_NOTHING:
3210       iso->esc_bytes_index = 0;
3211       switch (c)
3212         {
3213         case ISO_CODE_ESC:      /* Start escape sequence */
3214           *flags |= CODING_STATE_ESCAPE;
3215           iso->esc = ISO_ESC;
3216           goto not_done;
3217
3218         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
3219           *flags |= CODING_STATE_ESCAPE;
3220           iso->esc = ISO_ESC_5_11;
3221           goto not_done;
3222
3223         case ISO_CODE_SO:       /* locking shift 1 */
3224           reg = 1; half = 0;
3225           goto locking_shift;
3226         case ISO_CODE_SI:       /* locking shift 0 */
3227           reg = 0; half = 0;
3228           goto locking_shift;
3229
3230         case ISO_CODE_SS2:      /* single shift */
3231           reg = 2;
3232           goto single_shift;
3233         case ISO_CODE_SS3:      /* single shift */
3234           reg = 3;
3235           goto single_shift;
3236
3237         default:                        /* Other control characters */
3238           return 0;
3239         }
3240
3241     case ISO_ESC:
3242       switch (c)
3243         {
3244           /**** single shift ****/
3245
3246         case 'N':       /* single shift 2 */
3247           reg = 2;
3248           goto single_shift;
3249         case 'O':       /* single shift 3 */
3250           reg = 3;
3251           goto single_shift;
3252
3253           /**** locking shift ****/
3254
3255         case '~':       /* locking shift 1 right */
3256           reg = 1; half = 1;
3257           goto locking_shift;
3258         case 'n':       /* locking shift 2 */
3259           reg = 2; half = 0;
3260           goto locking_shift;
3261         case '}':       /* locking shift 2 right */
3262           reg = 2; half = 1;
3263           goto locking_shift;
3264         case 'o':       /* locking shift 3 */
3265           reg = 3; half = 0;
3266           goto locking_shift;
3267         case '|':       /* locking shift 3 right */
3268           reg = 3; half = 1;
3269           goto locking_shift;
3270
3271           /**** composite ****/
3272
3273         case '0':
3274           iso->esc = ISO_ESC_START_COMPOSITE;
3275           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3276             CODING_STATE_COMPOSITE;
3277           return 1;
3278
3279         case '1':
3280           iso->esc = ISO_ESC_END_COMPOSITE;
3281           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3282             ~CODING_STATE_COMPOSITE;
3283           return 1;
3284
3285           /**** directionality ****/
3286
3287         case '[':
3288           iso->esc = ISO_ESC_5_11;
3289           goto not_done;
3290
3291           /**** designation ****/
3292
3293         case '$':       /* multibyte charset prefix */
3294           iso->esc = ISO_ESC_2_4;
3295           goto not_done;
3296
3297         default:
3298           if (0x28 <= c && c <= 0x2F)
3299             {
3300               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3301               goto not_done;
3302             }
3303
3304           /* This function is called with CODESYS equal to nil when
3305              doing coding-system detection. */
3306           if (!NILP (codesys)
3307               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3308               && fit_to_be_escape_quoted (c))
3309             {
3310               iso->esc = ISO_ESC_LITERAL;
3311               *flags &= CODING_STATE_ISO2022_LOCK;
3312               return 1;
3313             }
3314
3315           /* bzzzt! */
3316           return 0;
3317         }
3318
3319
3320
3321       /**** directionality ****/
3322
3323     case ISO_ESC_5_11:          /* ISO6429 direction control */
3324       if (c == ']')
3325         {
3326           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3327           goto directionality;
3328         }
3329       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
3330       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3331       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3332       else               return 0;
3333       goto not_done;
3334
3335     case ISO_ESC_5_11_0:
3336       if (c == ']')
3337         {
3338           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3339           goto directionality;
3340         }
3341       return 0;
3342
3343     case ISO_ESC_5_11_1:
3344       if (c == ']')
3345         {
3346           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3347           goto directionality;
3348         }
3349       return 0;
3350
3351     case ISO_ESC_5_11_2:
3352       if (c == ']')
3353         {
3354           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
3355           goto directionality;
3356         }
3357       return 0;
3358
3359     directionality:
3360       iso->esc = ISO_ESC_DIRECTIONALITY;
3361       /* Various junk here to attempt to preserve the direction sequences
3362          literally in the text if they would otherwise be swallowed due
3363          to invalid designations that don't show up as actual charset
3364          changes in the text. */
3365       if (iso->invalid_switch_dir)
3366         {
3367           /* We already inserted a direction switch literally into the
3368              text.  We assume (#### this may not be right) that the
3369              next direction switch is the one going the other way,
3370              and we need to output that literally as well. */
3371           iso->output_literally = 1;
3372           iso->invalid_switch_dir = 0;
3373         }
3374       else
3375         {
3376           int jj;
3377
3378           /* If we are in the thrall of an invalid designation,
3379            then stick the directionality sequence literally into the
3380            output stream so it ends up in the original text again. */
3381           for (jj = 0; jj < 4; jj++)
3382             if (iso->invalid_designated[jj])
3383               break;
3384           if (jj < 4)
3385             {
3386               iso->output_literally = 1;
3387               iso->invalid_switch_dir = 1;
3388             }
3389           else
3390             /* Indicate that we haven't yet seen a valid designation,
3391                so that if a switch-dir is directly followed by an
3392                invalid designation, both get inserted literally. */
3393             iso->switched_dir_and_no_valid_charset_yet = 1;
3394         }
3395       return 1;
3396
3397
3398       /**** designation ****/
3399
3400     case ISO_ESC_2_4:
3401       if (0x28 <= c && c <= 0x2F)
3402         {
3403           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
3404           goto not_done;
3405         }
3406       if (0x40 <= c && c <= 0x42)
3407         {
3408           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
3409                                       *flags & CODING_STATE_R2L ?
3410                                       CHARSET_RIGHT_TO_LEFT :
3411                                       CHARSET_LEFT_TO_RIGHT);
3412           reg = 0;
3413           goto designated;
3414         }
3415       return 0;
3416
3417     default:
3418       {
3419         int type =-1;
3420
3421         if (c < '0' || c > '~')
3422           return 0; /* bad final byte */
3423
3424         if (iso->esc >= ISO_ESC_2_8 &&
3425             iso->esc <= ISO_ESC_2_15)
3426           {
3427             type = ((iso->esc >= ISO_ESC_2_12) ?
3428                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
3429             reg = (iso->esc - ISO_ESC_2_8) & 3;
3430           }
3431         else if (iso->esc >= ISO_ESC_2_4_8 &&
3432                  iso->esc <= ISO_ESC_2_4_15)
3433           {
3434             type = ((iso->esc >= ISO_ESC_2_4_12) ?
3435                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
3436             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
3437           }
3438         else
3439           {
3440             /* Can this ever be reached? -slb */
3441             abort();
3442           }
3443
3444         cs = CHARSET_BY_ATTRIBUTES (type, c,
3445                                     *flags & CODING_STATE_R2L ?
3446                                     CHARSET_RIGHT_TO_LEFT :
3447                                     CHARSET_LEFT_TO_RIGHT);
3448         goto designated;
3449       }
3450     }
3451
3452  not_done:
3453   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
3454   return -1;
3455
3456  single_shift:
3457   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
3458     /* can't invoke something that ain't there. */
3459     return 0;
3460   iso->esc = ISO_ESC_SINGLE_SHIFT;
3461   *flags &= CODING_STATE_ISO2022_LOCK;
3462   if (reg == 2)
3463     *flags |= CODING_STATE_SS2;
3464   else
3465     *flags |= CODING_STATE_SS3;
3466   return 1;
3467
3468  locking_shift:
3469   if (check_invalid_charsets &&
3470       !CHARSETP (iso->charset[reg]))
3471     /* can't invoke something that ain't there. */
3472     return 0;
3473   if (half)
3474     iso->register_right = reg;
3475   else
3476     iso->register_left = reg;
3477   *flags &= CODING_STATE_ISO2022_LOCK;
3478   iso->esc = ISO_ESC_LOCKING_SHIFT;
3479   return 1;
3480
3481  designated:
3482   if (NILP (cs) && check_invalid_charsets)
3483     {
3484       iso->invalid_designated[reg] = 1;
3485       iso->charset[reg] = Vcharset_ascii;
3486       iso->esc = ISO_ESC_DESIGNATE;
3487       *flags &= CODING_STATE_ISO2022_LOCK;
3488       iso->output_literally = 1;
3489       if (iso->switched_dir_and_no_valid_charset_yet)
3490         {
3491           /* We encountered a switch-direction followed by an
3492              invalid designation.  Ensure that the switch-direction
3493              gets outputted; otherwise it will probably get eaten
3494              when the text is written out again. */
3495           iso->switched_dir_and_no_valid_charset_yet = 0;
3496           iso->output_direction_sequence = 1;
3497           /* And make sure that the switch-dir going the other
3498              way gets outputted, as well. */
3499           iso->invalid_switch_dir = 1;
3500         }
3501       return 1;
3502     }
3503   /* This function is called with CODESYS equal to nil when
3504      doing coding-system detection. */
3505   if (!NILP (codesys))
3506     {
3507       charset_conversion_spec_dynarr *dyn =
3508         XCODING_SYSTEM (codesys)->iso2022.input_conv;
3509
3510       if (dyn)
3511         {
3512           int i;
3513
3514           for (i = 0; i < Dynarr_length (dyn); i++)
3515             {
3516               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
3517               if (EQ (cs, spec->from_charset))
3518                 cs = spec->to_charset;
3519             }
3520         }
3521     }
3522
3523   iso->charset[reg] = cs;
3524   iso->esc = ISO_ESC_DESIGNATE;
3525   *flags &= CODING_STATE_ISO2022_LOCK;
3526   if (iso->invalid_designated[reg])
3527     {
3528       iso->invalid_designated[reg] = 0;
3529       iso->output_literally = 1;
3530     }
3531   if (iso->switched_dir_and_no_valid_charset_yet)
3532     iso->switched_dir_and_no_valid_charset_yet = 0;
3533   return 1;
3534 }
3535
3536 static int
3537 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
3538                        unsigned int n)
3539 {
3540   int c;
3541   int mask;
3542
3543   /* #### There are serious deficiencies in the recognition mechanism
3544      here.  This needs to be much smarter if it's going to cut it. */
3545
3546   if (!st->iso2022.initted)
3547     {
3548       reset_iso2022 (Qnil, &st->iso2022.iso);
3549       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
3550                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3551                           CODING_CATEGORY_ISO_8_1_MASK |
3552                           CODING_CATEGORY_ISO_8_2_MASK |
3553                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
3554       st->iso2022.flags = 0;
3555       st->iso2022.high_byte_count = 0;
3556       st->iso2022.saw_single_shift = 0;
3557       st->iso2022.initted = 1;
3558     }
3559
3560   mask = st->iso2022.mask;
3561
3562   while (n--)
3563     {
3564       c = *src++;
3565       if (c >= 0xA0)
3566         {
3567           mask &= ~CODING_CATEGORY_ISO_7_MASK;
3568           st->iso2022.high_byte_count++;
3569         }
3570       else
3571         {
3572           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
3573             {
3574               if (st->iso2022.high_byte_count & 1)
3575                 /* odd number of high bytes; assume not iso-8-2 */
3576                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3577             }
3578           st->iso2022.high_byte_count = 0;
3579           st->iso2022.saw_single_shift = 0;
3580           if (c > 0x80)
3581             mask &= ~CODING_CATEGORY_ISO_7_MASK;
3582         }
3583       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
3584           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
3585         { /* control chars */
3586           switch (c)
3587             {
3588               /* Allow and ignore control characters that you might
3589                  reasonably see in a text file */
3590             case '\r':
3591             case '\n':
3592             case '\t':
3593             case  7: /* bell */
3594             case  8: /* backspace */
3595             case 11: /* vertical tab */
3596             case 12: /* form feed */
3597             case 26: /* MS-DOS C-z junk */
3598             case 31: /* '^_' -- for info */
3599               goto label_continue_loop;
3600
3601             default:
3602               break;
3603             }
3604         }
3605
3606       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
3607           || BYTE_C1_P (c))
3608         {
3609           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
3610                                  &st->iso2022.flags, 0))
3611             {
3612               switch (st->iso2022.iso.esc)
3613                 {
3614                 case ISO_ESC_DESIGNATE:
3615                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
3616                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3617                   break;
3618                 case ISO_ESC_LOCKING_SHIFT:
3619                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
3620                   goto ran_out_of_chars;
3621                 case ISO_ESC_SINGLE_SHIFT:
3622                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
3623                   st->iso2022.saw_single_shift = 1;
3624                   break;
3625                 default:
3626                   break;
3627                 }
3628             }
3629           else
3630             {
3631               mask = 0;
3632               goto ran_out_of_chars;
3633             }
3634         }
3635     label_continue_loop:;
3636     }
3637
3638  ran_out_of_chars:
3639
3640   return mask;
3641 }
3642
3643 static int
3644 postprocess_iso2022_mask (int mask)
3645 {
3646   /* #### kind of cheesy */
3647   /* If seven-bit ISO is allowed, then assume that the encoding is
3648      entirely seven-bit and turn off the eight-bit ones. */
3649   if (mask & CODING_CATEGORY_ISO_7_MASK)
3650     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3651                CODING_CATEGORY_ISO_8_1_MASK |
3652                CODING_CATEGORY_ISO_8_2_MASK);
3653   return mask;
3654 }
3655
3656 /* If FLAGS is a null pointer or specifies right-to-left motion,
3657    output a switch-dir-to-left-to-right sequence to DST.
3658    Also update FLAGS if it is not a null pointer.
3659    If INTERNAL_P is set, we are outputting in internal format and
3660    need to handle the CSI differently. */
3661
3662 static void
3663 restore_left_to_right_direction (struct Lisp_Coding_System *codesys,
3664                                  unsigned_char_dynarr *dst,
3665                                  unsigned int *flags,
3666                                  int internal_p)
3667 {
3668   if (!flags || (*flags & CODING_STATE_R2L))
3669     {
3670       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3671         {
3672           Dynarr_add (dst, ISO_CODE_ESC);
3673           Dynarr_add (dst, '[');
3674         }
3675       else if (internal_p)
3676         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3677       else
3678         Dynarr_add (dst, ISO_CODE_CSI);
3679       Dynarr_add (dst, '0');
3680       Dynarr_add (dst, ']');
3681       if (flags)
3682         *flags &= ~CODING_STATE_R2L;
3683     }
3684 }
3685
3686 /* If FLAGS is a null pointer or specifies a direction different from
3687    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
3688    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
3689    sequence to DST.  Also update FLAGS if it is not a null pointer.
3690    If INTERNAL_P is set, we are outputting in internal format and
3691    need to handle the CSI differently. */
3692
3693 static void
3694 ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys,
3695                           unsigned_char_dynarr *dst, unsigned int *flags,
3696                           int internal_p)
3697 {
3698   if ((!flags || (*flags & CODING_STATE_R2L)) &&
3699       direction == CHARSET_LEFT_TO_RIGHT)
3700     restore_left_to_right_direction (codesys, dst, flags, internal_p);
3701   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
3702            && (!flags || !(*flags & CODING_STATE_R2L)) &&
3703            direction == CHARSET_RIGHT_TO_LEFT)
3704     {
3705       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3706         {
3707           Dynarr_add (dst, ISO_CODE_ESC);
3708           Dynarr_add (dst, '[');
3709         }
3710       else if (internal_p)
3711         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3712       else
3713         Dynarr_add (dst, ISO_CODE_CSI);
3714       Dynarr_add (dst, '2');
3715       Dynarr_add (dst, ']');
3716       if (flags)
3717         *flags |= CODING_STATE_R2L;
3718     }
3719 }
3720
3721 /* Convert ISO2022-format data to internal format. */
3722
3723 static void
3724 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
3725                        unsigned_char_dynarr *dst, unsigned int n)
3726 {
3727   unsigned char c;
3728   unsigned int flags, ch;
3729   enum eol_type eol_type;
3730   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3731   Lisp_Object coding_system;
3732   unsigned_char_dynarr *real_dst = dst;
3733
3734   CODING_STREAM_DECOMPOSE (str, flags, ch);
3735   eol_type = str->eol_type;
3736   XSETCODING_SYSTEM (coding_system, str->codesys);
3737
3738   if (flags & CODING_STATE_COMPOSITE)
3739     dst = str->iso2022.composite_chars;
3740
3741   while (n--)
3742     {
3743       c = *src++;
3744       if (flags & CODING_STATE_ESCAPE)
3745         {       /* Within ESC sequence */
3746           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
3747                                           c, &flags, 1);
3748
3749           if (retval)
3750             {
3751               switch (str->iso2022.esc)
3752                 {
3753                 case ISO_ESC_START_COMPOSITE:
3754                   if (str->iso2022.composite_chars)
3755                     Dynarr_reset (str->iso2022.composite_chars);
3756                   else
3757                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
3758                   dst = str->iso2022.composite_chars;
3759                   break;
3760                 case ISO_ESC_END_COMPOSITE:
3761                   {
3762                     Bufbyte comstr[MAX_EMCHAR_LEN];
3763                     Bytecount len;
3764                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
3765                                                          Dynarr_length (dst));
3766                     dst = real_dst;
3767                     len = set_charptr_emchar (comstr, emch);
3768                     Dynarr_add_many (dst, comstr, len);
3769                     break;
3770                   }
3771
3772                 case ISO_ESC_LITERAL:
3773                   DECODE_ADD_BINARY_CHAR (c, dst);
3774                   break;
3775
3776                 default:
3777                   /* Everything else handled already */
3778                   break;
3779                 }
3780             }
3781
3782           /* Attempted error recovery. */
3783           if (str->iso2022.output_direction_sequence)
3784             ensure_correct_direction (flags & CODING_STATE_R2L ?
3785                                       CHARSET_RIGHT_TO_LEFT :
3786                                       CHARSET_LEFT_TO_RIGHT,
3787                                       str->codesys, dst, 0, 1);
3788           /* More error recovery. */
3789           if (!retval || str->iso2022.output_literally)
3790             {
3791               /* Output the (possibly invalid) sequence */
3792               int i;
3793               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
3794                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
3795               flags &= CODING_STATE_ISO2022_LOCK;
3796               if (!retval)
3797                 n++, src--;/* Repeat the loop with the same character. */
3798               else
3799                 {
3800                   /* No sense in reprocessing the final byte of the
3801                      escape sequence; it could mess things up anyway.
3802                      Just add it now. */
3803                   DECODE_ADD_BINARY_CHAR (c, dst);
3804                 }
3805             }
3806           ch = 0;
3807         }
3808       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
3809         { /* Control characters */
3810
3811           /***** Error-handling *****/
3812
3813           /* If we were in the middle of a character, dump out the
3814              partial character. */
3815           DECODE_OUTPUT_PARTIAL_CHAR (ch);
3816
3817           /* If we just saw a single-shift character, dump it out.
3818              This may dump out the wrong sort of single-shift character,
3819              but least it will give an indication that something went
3820              wrong. */
3821           if (flags & CODING_STATE_SS2)
3822             {
3823               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
3824               flags &= ~CODING_STATE_SS2;
3825             }
3826           if (flags & CODING_STATE_SS3)
3827             {
3828               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
3829               flags &= ~CODING_STATE_SS3;
3830             }
3831
3832           /***** Now handle the control characters. *****/
3833
3834           /* Handle CR/LF */
3835           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3836
3837           flags &= CODING_STATE_ISO2022_LOCK;
3838
3839           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
3840             DECODE_ADD_BINARY_CHAR (c, dst);
3841         }
3842       else
3843         {                       /* Graphic characters */
3844           Lisp_Object charset;
3845           int lb;
3846           int reg;
3847
3848           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3849
3850           /* Now determine the charset. */
3851           reg = ((flags & CODING_STATE_SS2) ? 2
3852                  : (flags & CODING_STATE_SS3) ? 3
3853                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
3854                  : str->iso2022.register_left);
3855           charset = str->iso2022.charset[reg];
3856
3857           /* Error checking: */
3858           if (NILP (charset) || str->iso2022.invalid_designated[reg]
3859               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
3860                   && XCHARSET_CHARS (charset) == 94))
3861             /* Mrmph.  We are trying to invoke a register that has no
3862                or an invalid charset in it, or trying to add a character
3863                outside the range of the charset.  Insert that char literally
3864                to preserve it for the output. */
3865             {
3866               DECODE_OUTPUT_PARTIAL_CHAR (ch);
3867               DECODE_ADD_BINARY_CHAR (c, dst);
3868             }
3869
3870           else
3871             {
3872               /* Things are probably hunky-dorey. */
3873
3874               /* Fetch reverse charset, maybe. */
3875               if (((flags & CODING_STATE_R2L) &&
3876                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
3877                   ||
3878                   (!(flags & CODING_STATE_R2L) &&
3879                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
3880                 {
3881                   Lisp_Object new_charset =
3882                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
3883                   if (!NILP (new_charset))
3884                     charset = new_charset;
3885                 }
3886
3887               lb = XCHARSET_LEADING_BYTE (charset);
3888               switch (XCHARSET_REP_BYTES (charset))
3889                 {
3890                 case 1: /* ASCII */
3891                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
3892                   Dynarr_add (dst, c & 0x7F);
3893                   break;
3894
3895                 case 2: /* one-byte official */
3896                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
3897                   Dynarr_add (dst, lb);
3898                   Dynarr_add (dst, c | 0x80);
3899                   break;
3900
3901                 case 3: /* one-byte private or two-byte official */
3902                   if (XCHARSET_PRIVATE_P (charset))
3903                     {
3904                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
3905                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
3906                       Dynarr_add (dst, lb);
3907                       Dynarr_add (dst, c | 0x80);
3908                     }
3909                   else
3910                     {
3911                       if (ch)
3912                         {
3913                           Dynarr_add (dst, lb);
3914                           Dynarr_add (dst, ch | 0x80);
3915                           Dynarr_add (dst, c | 0x80);
3916                           ch = 0;
3917                         }
3918                       else
3919                         ch = c;
3920                     }
3921                   break;
3922
3923                 default:        /* two-byte private */
3924                   if (ch)
3925                     {
3926                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
3927                       Dynarr_add (dst, lb);
3928                       Dynarr_add (dst, ch | 0x80);
3929                       Dynarr_add (dst, c | 0x80);
3930                       ch = 0;
3931                     }
3932                   else
3933                     ch = c;
3934                 }
3935             }
3936
3937           if (!ch)
3938             flags &= CODING_STATE_ISO2022_LOCK;
3939         }
3940
3941     label_continue_loop:;
3942     }
3943
3944   if (flags & CODING_STATE_END)
3945     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3946
3947   CODING_STREAM_COMPOSE (str, flags, ch);
3948 }
3949
3950
3951 /***** ISO2022 encoder *****/
3952
3953 /* Designate CHARSET into register REG. */
3954
3955 static void
3956 iso2022_designate (Lisp_Object charset, unsigned char reg,
3957                    struct encoding_stream *str, unsigned_char_dynarr *dst)
3958 {
3959   CONST char *inter94 = "()*+", *inter96= ",-./";
3960   unsigned int type;
3961   unsigned char final;
3962   Lisp_Object old_charset = str->iso2022.charset[reg];
3963
3964   str->iso2022.charset[reg] = charset;
3965   if (!CHARSETP (charset))
3966     /* charset might be an initial nil or t. */
3967     return;
3968   type = XCHARSET_TYPE (charset);
3969   final = XCHARSET_FINAL (charset);
3970   if (!str->iso2022.force_charset_on_output[reg] &&
3971       CHARSETP (old_charset) &&
3972       XCHARSET_TYPE (old_charset) == type &&
3973       XCHARSET_FINAL (old_charset) == final)
3974     return;
3975
3976   str->iso2022.force_charset_on_output[reg] = 0;
3977
3978   {
3979     charset_conversion_spec_dynarr *dyn =
3980       str->codesys->iso2022.output_conv;
3981
3982     if (dyn)
3983       {
3984         int i;
3985
3986         for (i = 0; i < Dynarr_length (dyn); i++)
3987           {
3988             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
3989             if (EQ (charset, spec->from_charset))
3990                 charset = spec->to_charset;
3991           }
3992       }
3993   }
3994
3995   Dynarr_add (dst, ISO_CODE_ESC);
3996   switch (type)
3997     {
3998     case CHARSET_TYPE_94:
3999       Dynarr_add (dst, inter94[reg]);
4000       break;
4001     case CHARSET_TYPE_96:
4002       Dynarr_add (dst, inter96[reg]);
4003       break;
4004     case CHARSET_TYPE_94X94:
4005       Dynarr_add (dst, '$');
4006       if (reg != 0
4007           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4008           || final < '@'
4009           || final > 'B')
4010         Dynarr_add (dst, inter94[reg]);
4011       break;
4012     case CHARSET_TYPE_96X96:
4013       Dynarr_add (dst, '$');
4014       Dynarr_add (dst, inter96[reg]);
4015       break;
4016     }
4017   Dynarr_add (dst, final);
4018 }
4019
4020 static void
4021 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4022 {
4023   if (str->iso2022.register_left != 0)
4024     {
4025       Dynarr_add (dst, ISO_CODE_SI);
4026       str->iso2022.register_left = 0;
4027     }
4028 }
4029
4030 static void
4031 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4032 {
4033   if (str->iso2022.register_left != 1)
4034     {
4035       Dynarr_add (dst, ISO_CODE_SO);
4036       str->iso2022.register_left = 1;
4037     }
4038 }
4039
4040 /* Convert internally-formatted data to ISO2022 format. */
4041
4042 static void
4043 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4044                        unsigned_char_dynarr *dst, unsigned int n)
4045 {
4046   unsigned char charmask, c;
4047   unsigned int flags, ch;
4048   enum eol_type eol_type;
4049   unsigned char char_boundary;
4050   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4051   struct Lisp_Coding_System *codesys = str->codesys;
4052   int i;
4053   Lisp_Object charset;
4054   int half;
4055
4056   /* flags for handling composite chars.  We do a little switcharoo
4057      on the source while we're outputting the composite char. */
4058   unsigned int saved_n = 0;
4059   CONST unsigned char *saved_src = NULL;
4060   int in_composite = 0;
4061
4062   CODING_STREAM_DECOMPOSE (str, flags, ch);
4063   eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4064   char_boundary = str->iso2022.current_char_boundary;
4065   charset = str->iso2022.current_charset;
4066   half = str->iso2022.current_half;
4067
4068  back_to_square_n:
4069   while (n--)
4070     {
4071       c = *src++;
4072
4073       if (BYTE_ASCII_P (c))
4074         {               /* Processing ASCII character */
4075           ch = 0;
4076
4077           restore_left_to_right_direction (codesys, dst, &flags, 0);
4078
4079           /* Make sure G0 contains ASCII */
4080           if ((c > ' ' && c < ISO_CODE_DEL) ||
4081               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4082             {
4083               ensure_normal_shift (str, dst);
4084               iso2022_designate (Vcharset_ascii, 0, str, dst);
4085             }
4086
4087           /* If necessary, restore everything to the default state
4088              at end-of-line */
4089           if (c == '\n' &&
4090               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4091             {
4092               restore_left_to_right_direction (codesys, dst, &flags, 0);
4093
4094               ensure_normal_shift (str, dst);
4095
4096               for (i = 0; i < 4; i++)
4097                 {
4098                   Lisp_Object initial_charset =
4099                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4100                   iso2022_designate (initial_charset, i, str, dst);
4101                 }
4102             }
4103           if (c == '\n')
4104             {
4105               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4106                 Dynarr_add (dst, '\r');
4107               if (eol_type != EOL_CR)
4108                 Dynarr_add (dst, c);
4109             }
4110           else
4111             {
4112               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4113                   && fit_to_be_escape_quoted (c))
4114                 Dynarr_add (dst, ISO_CODE_ESC);
4115               Dynarr_add (dst, c);
4116             }
4117           char_boundary = 1;
4118         }
4119
4120       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4121         { /* Processing Leading Byte */
4122           ch = 0;
4123           charset = CHARSET_BY_LEADING_BYTE (c);
4124           if (LEADING_BYTE_PREFIX_P(c))
4125             ch = c;
4126           else if (!EQ (charset, Vcharset_control_1)
4127                    && !EQ (charset, Vcharset_composite))
4128             {
4129               int reg;
4130
4131               ensure_correct_direction (XCHARSET_DIRECTION (charset),
4132                                         codesys, dst, &flags, 0);
4133
4134               /* Now determine which register to use. */
4135               reg = -1;
4136               for (i = 0; i < 4; i++)
4137                 {
4138                   if (EQ (charset, str->iso2022.charset[i]) ||
4139                       EQ (charset,
4140                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4141                     {
4142                       reg = i;
4143                       break;
4144                     }
4145                 }
4146
4147               if (reg == -1)
4148                 {
4149                   if (XCHARSET_GRAPHIC (charset) != 0)
4150                     {
4151                       if (!NILP (str->iso2022.charset[1]) &&
4152                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4153                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4154                         reg = 1;
4155                       else if (!NILP (str->iso2022.charset[2]))
4156                         reg = 2;
4157                       else if (!NILP (str->iso2022.charset[3]))
4158                         reg = 3;
4159                       else
4160                         reg = 0;
4161                     }
4162                   else
4163                     reg = 0;
4164                 }
4165
4166               iso2022_designate (charset, reg, str, dst);
4167
4168               /* Now invoke that register. */
4169               switch (reg)
4170                 {
4171                 case 0:
4172                   ensure_normal_shift (str, dst);
4173                   half = 0;
4174                   break;
4175
4176                 case 1:
4177                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4178                     {
4179                       ensure_shift_out (str, dst);
4180                       half = 0;
4181                     }
4182                   else
4183                     half = 1;
4184                   break;
4185
4186                 case 2:
4187                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4188                     {
4189                       Dynarr_add (dst, ISO_CODE_ESC);
4190                       Dynarr_add (dst, 'N');
4191                       half = 0;
4192                     }
4193                   else
4194                     {
4195                       Dynarr_add (dst, ISO_CODE_SS2);
4196                       half = 1;
4197                     }
4198                   break;
4199
4200                 case 3:
4201                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4202                     {
4203                       Dynarr_add (dst, ISO_CODE_ESC);
4204                       Dynarr_add (dst, 'O');
4205                       half = 0;
4206                     }
4207                   else
4208                     {
4209                       Dynarr_add (dst, ISO_CODE_SS3);
4210                       half = 1;
4211                     }
4212                   break;
4213
4214                 default:
4215                   abort ();
4216                 }
4217             }
4218           char_boundary = 0;
4219         }
4220       else
4221         {                       /* Processing Non-ASCII character */
4222           charmask = (half == 0 ? 0x7F : 0xFF);
4223           char_boundary = 1;
4224           if (EQ (charset, Vcharset_control_1))
4225             {
4226               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4227                   && fit_to_be_escape_quoted (c))
4228                 Dynarr_add (dst, ISO_CODE_ESC);
4229               /* you asked for it ... */
4230               Dynarr_add (dst, c - 0x20);
4231             }
4232           else
4233             {
4234               switch (XCHARSET_REP_BYTES (charset))
4235                 {
4236                 case 2:
4237                   Dynarr_add (dst, c & charmask);
4238                   break;
4239                 case 3:
4240                   if (XCHARSET_PRIVATE_P (charset))
4241                     {
4242                       Dynarr_add (dst, c & charmask);
4243                       ch = 0;
4244                     }
4245                   else if (ch)
4246                     {
4247                       if (EQ (charset, Vcharset_composite))
4248                         {
4249                           if (in_composite)
4250                             {
4251                               /* #### Bother! We don't know how to
4252                                  handle this yet. */
4253                               Dynarr_add (dst, '~');
4254                             }
4255                           else
4256                             {
4257                               Emchar emch = MAKE_CHAR (Vcharset_composite,
4258                                                        ch & 0x7F, c & 0x7F);
4259                               Lisp_Object lstr = composite_char_string (emch);
4260                               saved_n = n;
4261                               saved_src = src;
4262                               in_composite = 1;
4263                               src = XSTRING_DATA   (lstr);
4264                               n   = XSTRING_LENGTH (lstr);
4265                               Dynarr_add (dst, ISO_CODE_ESC);
4266                               Dynarr_add (dst, '0'); /* start composing */
4267                             }
4268                         }
4269                       else
4270                         {
4271                           Dynarr_add (dst, ch & charmask);
4272                           Dynarr_add (dst, c & charmask);
4273                         }
4274                       ch = 0;
4275                     }
4276                   else
4277                     {
4278                       ch = c;
4279                       char_boundary = 0;
4280                     }
4281                   break;
4282                 case 4:
4283                   if (ch)
4284                     {
4285                       Dynarr_add (dst, ch & charmask);
4286                       Dynarr_add (dst, c & charmask);
4287                       ch = 0;
4288                     }
4289                   else
4290                     {
4291                       ch = c;
4292                       char_boundary = 0;
4293                     }
4294                   break;
4295                 default:
4296                   abort ();
4297                 }
4298             }
4299         }
4300     }
4301
4302   if (in_composite)
4303     {
4304       n = saved_n;
4305       src = saved_src;
4306       in_composite = 0;
4307       Dynarr_add (dst, ISO_CODE_ESC);
4308       Dynarr_add (dst, '1'); /* end composing */
4309       goto back_to_square_n; /* Wheeeeeeeee ..... */
4310     }
4311
4312   if (char_boundary && flags & CODING_STATE_END)
4313     {
4314       restore_left_to_right_direction (codesys, dst, &flags, 0);
4315       ensure_normal_shift (str, dst);
4316       for (i = 0; i < 4; i++)
4317         {
4318           Lisp_Object initial_charset =
4319             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4320           iso2022_designate (initial_charset, i, str, dst);
4321         }
4322     }
4323
4324   CODING_STREAM_COMPOSE (str, flags, ch);
4325   str->iso2022.current_char_boundary = char_boundary;
4326   str->iso2022.current_charset = charset;
4327   str->iso2022.current_half = half;
4328
4329   /* Verbum caro factum est! */
4330 }
4331
4332 \f
4333 /************************************************************************/
4334 /*                     No-conversion methods                            */
4335 /************************************************************************/
4336
4337 /* This is used when reading in "binary" files -- i.e. files that may
4338    contain all 256 possible byte values and that are not to be
4339    interpreted as being in any particular decoding. */
4340 static void
4341 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4342                              unsigned_char_dynarr *dst, unsigned int n)
4343 {
4344   unsigned char c;
4345   unsigned int flags, ch;
4346   enum eol_type eol_type;
4347   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4348
4349   CODING_STREAM_DECOMPOSE (str, flags, ch);
4350   eol_type = str->eol_type;
4351
4352   while (n--)
4353     {
4354       c = *src++;
4355
4356       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4357       DECODE_ADD_BINARY_CHAR (c, dst);
4358     label_continue_loop:;
4359     }
4360
4361   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4362
4363   CODING_STREAM_COMPOSE (str, flags, ch);
4364 }
4365
4366 static void
4367 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4368                              unsigned_char_dynarr *dst, unsigned int n)
4369 {
4370   unsigned char c;
4371   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4372   unsigned int flags, ch;
4373   enum eol_type eol_type;
4374
4375   CODING_STREAM_DECOMPOSE (str, flags, ch);
4376   eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4377
4378   while (n--)
4379     {
4380       c = *src++;
4381       if (c == '\n')
4382         {
4383           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4384             Dynarr_add (dst, '\r');
4385           if (eol_type != EOL_CR)
4386             Dynarr_add (dst, '\n');
4387           ch = 0;
4388         }
4389       else if (BYTE_ASCII_P (c))
4390         {
4391           assert (ch == 0);
4392           Dynarr_add (dst, c);
4393         }
4394       else if (BUFBYTE_LEADING_BYTE_P (c))
4395         {
4396           assert (ch == 0);
4397           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
4398               c == LEADING_BYTE_CONTROL_1)
4399             ch = c;
4400           else
4401             Dynarr_add (dst, '~'); /* untranslatable character */
4402         }
4403       else
4404         {
4405           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
4406             Dynarr_add (dst, c);
4407           else if (ch == LEADING_BYTE_CONTROL_1)
4408             {
4409               assert (c < 0xC0);
4410               Dynarr_add (dst, c - 0x20);
4411             }
4412           /* else it should be the second or third byte of an
4413              untranslatable character, so ignore it */
4414           ch = 0;
4415         }
4416     }
4417
4418   CODING_STREAM_COMPOSE (str, flags, ch);
4419 }
4420
4421 \f
4422 /************************************************************************/
4423 /*                   Simple internal/external functions                 */
4424 /************************************************************************/
4425
4426 static Extbyte_dynarr *conversion_out_dynarr;
4427 static Bufbyte_dynarr *conversion_in_dynarr;
4428
4429 /* Determine coding system from coding format */
4430
4431 #define FILE_NAME_CODING_SYSTEM                         \
4432  ((NILP (Vfile_name_coding_system) ||                   \
4433    (EQ ((Vfile_name_coding_system), Qbinary))) ?        \
4434   Qnil : Fget_coding_system (Vfile_name_coding_system))
4435
4436 /* #### not correct for all values of `fmt'! */
4437 #define FMT_CODING_SYSTEM(fmt)                                  \
4438  (((fmt) == FORMAT_FILENAME) ? FILE_NAME_CODING_SYSTEM     :    \
4439   ((fmt) == FORMAT_CTEXT   ) ? Fget_coding_system (Qctext) :    \
4440   ((fmt) == FORMAT_TERMINAL) ? FILE_NAME_CODING_SYSTEM     :    \
4441   Qnil)
4442
4443 CONST Extbyte *
4444 convert_to_external_format (CONST Bufbyte *ptr,
4445                             Bytecount len,
4446                             Extcount *len_out,
4447                             enum external_data_format fmt)
4448 {
4449   Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
4450
4451   if (!conversion_out_dynarr)
4452     conversion_out_dynarr = Dynarr_new (Extbyte);
4453   else
4454     Dynarr_reset (conversion_out_dynarr);
4455
4456   if (NILP (coding_system))
4457     {
4458       CONST Bufbyte *end = ptr + len;
4459
4460       for (; ptr < end;)
4461         {
4462           Bufbyte c =
4463             (BYTE_ASCII_P (*ptr))                  ? *ptr :
4464             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
4465             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
4466             '~';
4467
4468           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
4469           INC_CHARPTR (ptr);
4470         }
4471
4472 #ifdef ERROR_CHECK_BUFPOS
4473       assert (ptr == end);
4474 #endif
4475     }
4476   else
4477     {
4478       Lisp_Object instream, outstream, da_outstream;
4479       Lstream *istr, *ostr;
4480       struct gcpro gcpro1, gcpro2, gcpro3;
4481       char tempbuf[1024]; /* some random amount */
4482
4483       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4484       da_outstream = make_dynarr_output_stream
4485         ((unsigned_char_dynarr *) conversion_out_dynarr);
4486       outstream =
4487         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
4488       istr = XLSTREAM (instream);
4489       ostr = XLSTREAM (outstream);
4490       GCPRO3 (instream, outstream, da_outstream);
4491       while (1)
4492         {
4493           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4494           if (!size_in_bytes)
4495             break;
4496           Lstream_write (ostr, tempbuf, size_in_bytes);
4497         }
4498       Lstream_close (istr);
4499       Lstream_close (ostr);
4500       UNGCPRO;
4501       Lstream_delete (istr);
4502       Lstream_delete (ostr);
4503       Lstream_delete (XLSTREAM (da_outstream));
4504     }
4505
4506   *len_out = Dynarr_length (conversion_out_dynarr);
4507   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
4508   return Dynarr_atp (conversion_out_dynarr, 0);
4509 }
4510
4511 CONST Bufbyte *
4512 convert_from_external_format (CONST Extbyte *ptr,
4513                               Extcount len,
4514                               Bytecount *len_out,
4515                               enum external_data_format fmt)
4516 {
4517   Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
4518
4519   if (!conversion_in_dynarr)
4520     conversion_in_dynarr = Dynarr_new (Bufbyte);
4521   else
4522     Dynarr_reset (conversion_in_dynarr);
4523
4524   if (NILP (coding_system))
4525     {
4526       CONST Extbyte *end = ptr + len;
4527       for (; ptr < end; ptr++)
4528         {
4529           Extbyte c = *ptr;
4530           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
4531         }
4532     }
4533   else
4534     {
4535       Lisp_Object instream, outstream, da_outstream;
4536       Lstream *istr, *ostr;
4537       struct gcpro gcpro1, gcpro2, gcpro3;
4538       char tempbuf[1024]; /* some random amount */
4539
4540       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4541       da_outstream = make_dynarr_output_stream
4542         ((unsigned_char_dynarr *) conversion_in_dynarr);
4543       outstream =
4544         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
4545       istr = XLSTREAM (instream);
4546       ostr = XLSTREAM (outstream);
4547       GCPRO3 (instream, outstream, da_outstream);
4548       while (1)
4549         {
4550           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4551           if (!size_in_bytes)
4552             break;
4553           Lstream_write (ostr, tempbuf, size_in_bytes);
4554         }
4555       Lstream_close (istr);
4556       Lstream_close (ostr);
4557       UNGCPRO;
4558       Lstream_delete (istr);
4559       Lstream_delete (ostr);
4560       Lstream_delete (XLSTREAM (da_outstream));
4561     }
4562
4563   *len_out = Dynarr_length (conversion_in_dynarr);
4564   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
4565   return Dynarr_atp (conversion_in_dynarr, 0);
4566 }
4567
4568 \f
4569 /************************************************************************/
4570 /*                             Initialization                           */
4571 /************************************************************************/
4572
4573 void
4574 syms_of_mule_coding (void)
4575 {
4576   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
4577   deferror (&Qcoding_system_error, "coding-system-error",
4578             "Coding-system error", Qio_error);
4579
4580   DEFSUBR (Fcoding_system_p);
4581   DEFSUBR (Ffind_coding_system);
4582   DEFSUBR (Fget_coding_system);
4583   DEFSUBR (Fcoding_system_list);
4584   DEFSUBR (Fcoding_system_name);
4585   DEFSUBR (Fmake_coding_system);
4586   DEFSUBR (Fcopy_coding_system);
4587   DEFSUBR (Fsubsidiary_coding_system);
4588
4589   DEFSUBR (Fcoding_system_type);
4590   DEFSUBR (Fcoding_system_doc_string);
4591   DEFSUBR (Fcoding_system_charset);
4592   DEFSUBR (Fcoding_system_property);
4593
4594   DEFSUBR (Fcoding_category_list);
4595   DEFSUBR (Fset_coding_priority_list);
4596   DEFSUBR (Fcoding_priority_list);
4597   DEFSUBR (Fset_coding_category_system);
4598   DEFSUBR (Fcoding_category_system);
4599
4600   DEFSUBR (Fdetect_coding_region);
4601   DEFSUBR (Fdecode_coding_region);
4602   DEFSUBR (Fencode_coding_region);
4603   DEFSUBR (Fdecode_shift_jis_char);
4604   DEFSUBR (Fencode_shift_jis_char);
4605   DEFSUBR (Fdecode_big5_char);
4606   DEFSUBR (Fencode_big5_char);
4607
4608   defsymbol (&Qcoding_system_p, "coding-system-p");
4609
4610   defsymbol (&Qbig5, "big5");
4611   defsymbol (&Qshift_jis, "shift-jis");
4612   defsymbol (&Qno_conversion, "no-conversion");
4613   defsymbol (&Qccl, "ccl");
4614   defsymbol (&Qiso2022, "iso2022");
4615
4616   defsymbol (&Qmnemonic, "mnemonic");
4617   defsymbol (&Qeol_type, "eol-type");
4618   defsymbol (&Qpost_read_conversion, "post-read-conversion");
4619   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
4620
4621   defsymbol (&Qcr, "cr");
4622   defsymbol (&Qlf, "lf");
4623   defsymbol (&Qcrlf, "crlf");
4624   defsymbol (&Qeol_cr, "eol-cr");
4625   defsymbol (&Qeol_lf, "eol-lf");
4626   defsymbol (&Qeol_crlf, "eol-crlf");
4627
4628   defsymbol (&Qcharset_g0, "charset-g0");
4629   defsymbol (&Qcharset_g1, "charset-g1");
4630   defsymbol (&Qcharset_g2, "charset-g2");
4631   defsymbol (&Qcharset_g3, "charset-g3");
4632   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
4633   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
4634   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
4635   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
4636   defsymbol (&Qshort, "short");
4637   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
4638   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
4639   defsymbol (&Qseven, "seven");
4640   defsymbol (&Qlock_shift, "lock-shift");
4641   defsymbol (&Qno_iso6429, "no-iso6429");
4642   defsymbol (&Qescape_quoted, "escape-quoted");
4643   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
4644   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
4645
4646   defsymbol (&Qencode, "encode");
4647   defsymbol (&Qdecode, "decode");
4648
4649   defsymbol (&Qctext, "ctext");
4650
4651   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
4652              "shift-jis");
4653   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
4654              "iso-7");
4655   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
4656              "iso-8-designate");
4657   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
4658              "iso-8-1");
4659   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
4660              "iso-8-2");
4661   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
4662              "iso-lock-shift");
4663   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
4664              "big5");
4665   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
4666              "no-conversion");
4667 }
4668
4669 void
4670 lstream_type_create_mule_coding (void)
4671 {
4672   LSTREAM_HAS_METHOD (decoding, reader);
4673   LSTREAM_HAS_METHOD (decoding, writer);
4674   LSTREAM_HAS_METHOD (decoding, rewinder);
4675   LSTREAM_HAS_METHOD (decoding, seekable_p);
4676   LSTREAM_HAS_METHOD (decoding, flusher);
4677   LSTREAM_HAS_METHOD (decoding, closer);
4678   LSTREAM_HAS_METHOD (decoding, marker);
4679
4680   LSTREAM_HAS_METHOD (encoding, reader);
4681   LSTREAM_HAS_METHOD (encoding, writer);
4682   LSTREAM_HAS_METHOD (encoding, rewinder);
4683   LSTREAM_HAS_METHOD (encoding, seekable_p);
4684   LSTREAM_HAS_METHOD (encoding, flusher);
4685   LSTREAM_HAS_METHOD (encoding, closer);
4686   LSTREAM_HAS_METHOD (encoding, marker);
4687 }
4688
4689 void
4690 vars_of_mule_coding (void)
4691 {
4692   int i;
4693
4694   /* Initialize to something reasonable ... */
4695   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
4696     {
4697       coding_category_system[i] = Qnil;
4698       coding_category_by_priority[i] = i;
4699     }
4700
4701   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
4702 Coding system used for TTY keyboard input.
4703 Not used under a windowing system.
4704 */ );
4705   Vkeyboard_coding_system = Qnil;
4706
4707   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
4708 Coding system used for TTY display output.
4709 Not used under a windowing system.
4710 */ );
4711   Vterminal_coding_system = Qnil;
4712
4713   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
4714 Overriding coding system used when writing a file or process.
4715 You should *bind* this, not set it.  If this is non-nil, it specifies
4716 the coding system that will be used when a file or process is read
4717 in, and overrides `buffer-file-coding-system-for-read',
4718 `insert-file-contents-pre-hook', etc.  Use those variables instead of
4719 this one for permanent changes to the environment.
4720 */ );
4721   Vcoding_system_for_read = Qnil;
4722
4723   DEFVAR_LISP ("coding-system-for-write",
4724                &Vcoding_system_for_write /*
4725 Overriding coding system used when writing a file or process.
4726 You should *bind* this, not set it.  If this is non-nil, it specifies
4727 the coding system that will be used when a file or process is wrote
4728 in, and overrides `buffer-file-coding-system',
4729 `write-region-pre-hook', etc.  Use those variables instead of this one
4730 for permanent changes to the environment.
4731 */ );
4732   Vcoding_system_for_write = Qnil;
4733
4734   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
4735 Coding system used to convert pathnames when accessing files.
4736 */ );
4737   Vfile_name_coding_system = Qnil;
4738
4739   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
4740 Non-nil means the buffer contents are regarded as multi-byte form
4741 of characters, not a binary code.  This affects the display, file I/O,
4742 and behaviors of various editing commands.
4743
4744 Setting this to nil does not do anything.
4745 */ );
4746   enable_multibyte_characters = 1;
4747 }
4748
4749 void
4750 complex_vars_of_mule_coding (void)
4751 {
4752   staticpro (&Vcoding_system_hashtable);
4753   Vcoding_system_hashtable = make_lisp_hashtable (50, HASHTABLE_NONWEAK,
4754                                                   HASHTABLE_EQ);
4755
4756   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
4757
4758 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
4759 {                                               \
4760   struct codesys_prop csp;                      \
4761   csp.sym = (Sym);                              \
4762   csp.prop_type = (Prop_Type);                  \
4763   Dynarr_add (the_codesys_prop_dynarr, csp);    \
4764 } while (0)
4765
4766   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
4767   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
4768   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
4769   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
4770   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
4771   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
4772   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
4773
4774   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
4775   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
4776   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
4777   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
4778   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
4779   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
4780   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
4781   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
4782   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
4783   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
4784   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
4785   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
4786   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
4787   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
4788   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
4789   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
4790   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
4791
4792   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
4793   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
4794
4795   /* Need to create this here or we're really screwed. */
4796   Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
4797                        list2 (Qmnemonic, build_string ("Noconv")));
4798
4799   Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
4800                        Qbinary);
4801
4802   /* Need this for bootstrapping */
4803   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
4804     Fget_coding_system (Qno_conversion);
4805 }
4806
4807 #endif