src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28 #include "buffer.h"
  29 #include "elhash.h"
  30 #include "insdel.h"
  31 #include "lstream.h"
  32 #ifdef MULE
  33 #include "mule-ccl.h"
  34 #include "chartab.h"
  35 #endif
  36 #include "file-coding.h"
  37
  38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  39
  40 Lisp_Object Vkeyboard_coding_system;
  41 Lisp_Object Vterminal_coding_system;
  42 Lisp_Object Vcoding_system_for_read;
  43 Lisp_Object Vcoding_system_for_write;
  44 Lisp_Object Vfile_name_coding_system;
  45
  46 /* Table of symbols identifying each coding category. */
  47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  48
  49 /* Coding system currently associated with each coding category. */
  50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  51
  52 /* Table of all coding categories in decreasing order of priority.
  53    This describes a permutation of the possible coding categories. */
  54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  55
  56 Lisp_Object Qcoding_system_p;
  57
  58 Lisp_Object Qno_conversion, Qccl, Qiso2022;
  59 /* Qinternal in general.c */
  60
  61 Lisp_Object Qmnemonic, Qeol_type;
  62 Lisp_Object Qcr, Qcrlf, Qlf;
  63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  64 Lisp_Object Qpost_read_conversion;
  65 Lisp_Object Qpre_write_conversion;
  66
  67 #ifdef MULE
  68 Lisp_Object Qucs4, Qutf8;
  69 Lisp_Object Qbig5, Qshift_jis;
  70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  73 Lisp_Object Qno_iso6429;
  74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  75 Lisp_Object Qctext, Qescape_quoted;
  76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  77 #endif
  78 Lisp_Object Qencode, Qdecode;
  79
  80 Lisp_Object Vcoding_system_hash_table;
  81
  82 int enable_multibyte_characters;
  83
  84 #ifdef MULE
  85 /* Additional information used by the ISO2022 decoder and detector. */
  86 struct iso2022_decoder
  87 {
  88   /* CHARSET holds the character sets currently assigned to the G0
  89      through G3 variables.  It is initialized from the array
  90      INITIAL_CHARSET in CODESYS. */
  91   Lisp_Object charset[4];
  92
  93   /* Which registers are currently invoked into the left (GL) and
  94      right (GR) halves of the 8-bit encoding space? */
  95   int register_left, register_right;
  96
  97   /* ISO_ESC holds a value indicating part of an escape sequence
  98      that has already been seen. */
  99   enum iso_esc_flag esc;
 100
 101   /* This records the bytes we've seen so far in an escape sequence,
 102      in case the sequence is invalid (we spit out the bytes unchanged). */
 103   unsigned char esc_bytes[8];
 104
 105   /* Index for next byte to store in ISO escape sequence. */
 106   int esc_bytes_index;
 107
 108 #ifdef ENABLE_COMPOSITE_CHARS
 109   /* Stuff seen so far when composing a string. */
 110   unsigned_char_dynarr *composite_chars;
 111 #endif
 112
 113   /* If we saw an invalid designation sequence for a particular
 114      register, we flag it here and switch to ASCII.  The next time we
 115      see a valid designation for this register, we turn off the flag
 116      and do the designation normally, but pretend the sequence was
 117      invalid.  The effect of all this is that (most of the time) the
 118      escape sequences for both the switch to the unknown charset, and
 119      the switch back to the known charset, get inserted literally into
 120      the buffer and saved out as such.  The hope is that we can
 121      preserve the escape sequences so that the resulting written out
 122      file makes sense.  If we don't do any of this, the designation
 123      to the invalid charset will be preserved but that switch back
 124      to the known charset will probably get eaten because it was
 125      the same charset that was already present in the register. */
 126   unsigned char invalid_designated[4];
 127
 128   /* We try to do similar things as above for direction-switching
 129      sequences.  If we encountered a direction switch while an
 130      invalid designation was present, or an invalid designation
 131      just after a direction switch (i.e. no valid designation
 132      encountered yet), we insert the direction-switch escape
 133      sequence literally into the output stream, and later on
 134      insert the corresponding direction-restoring escape sequence
 135      literally also. */
 136   unsigned int switched_dir_and_no_valid_charset_yet :1;
 137   unsigned int invalid_switch_dir :1;
 138
 139   /* Tells the decoder to output the escape sequence literally
 140      even though it was valid.  Used in the games we play to
 141      avoid lossage when we encounter invalid designations. */
 142   unsigned int output_literally :1;
 143   /* We encountered a direction switch followed by an invalid
 144      designation.  We didn't output the direction switch
 145      literally because we didn't know about the invalid designation;
 146      but we have to do so now. */
 147   unsigned int output_direction_sequence :1;
 148 };
 149 #endif /* MULE */
 150 EXFUN (Fcopy_coding_system, 2);
 151 #ifdef MULE
 152 struct detection_state;
 153 static int detect_coding_sjis (struct detection_state *st,
 154                                CONST unsigned char *src,
 155                                unsigned int n);
 156 static void decode_coding_sjis (Lstream *decoding,
 157                                 CONST unsigned char *src,
 158                                 unsigned_char_dynarr *dst,
 159                                 unsigned int n);
 160 static void encode_coding_sjis (Lstream *encoding,
 161                                 CONST unsigned char *src,
 162                                 unsigned_char_dynarr *dst,
 163                                 unsigned int n);
 164 static int detect_coding_big5 (struct detection_state *st,
 165                                CONST unsigned char *src,
 166                                unsigned int n);
 167 static void decode_coding_big5 (Lstream *decoding,
 168                                 CONST unsigned char *src,
 169                                 unsigned_char_dynarr *dst, unsigned int n);
 170 static void encode_coding_big5 (Lstream *encoding,
 171                                 CONST unsigned char *src,
 172                                 unsigned_char_dynarr *dst, unsigned int n);
 173 static int detect_coding_ucs4 (struct detection_state *st,
 174                                CONST unsigned char *src,
 175                                unsigned int n);
 176 static void decode_coding_ucs4 (Lstream *decoding,
 177                                 CONST unsigned char *src,
 178                                 unsigned_char_dynarr *dst, unsigned int n);
 179 static void encode_coding_ucs4 (Lstream *encoding,
 180                                 CONST unsigned char *src,
 181                                 unsigned_char_dynarr *dst, unsigned int n);
 182 static int detect_coding_utf8 (struct detection_state *st,
 183                                CONST unsigned char *src,
 184                                unsigned int n);
 185 static void decode_coding_utf8 (Lstream *decoding,
 186                                 CONST unsigned char *src,
 187                                 unsigned_char_dynarr *dst, unsigned int n);
 188 static void encode_coding_utf8 (Lstream *encoding,
 189                                 CONST unsigned char *src,
 190                                 unsigned_char_dynarr *dst, unsigned int n);
 191 static int postprocess_iso2022_mask (int mask);
 192 static void reset_iso2022 (Lisp_Object coding_system,
 193                            struct iso2022_decoder *iso);
 194 static int detect_coding_iso2022 (struct detection_state *st,
 195                                   CONST unsigned char *src,
 196                                   unsigned int n);
 197 static void decode_coding_iso2022 (Lstream *decoding,
 198                                    CONST unsigned char *src,
 199                                    unsigned_char_dynarr *dst, unsigned int n);
 200 static void encode_coding_iso2022 (Lstream *encoding,
 201                                    CONST unsigned char *src,
 202                                    unsigned_char_dynarr *dst, unsigned int n);
 203 #endif /* MULE */
 204 static void decode_coding_no_conversion (Lstream *decoding,
 205                                          CONST unsigned char *src,
 206                                          unsigned_char_dynarr *dst,
 207                                          unsigned int n);
 208 static void encode_coding_no_conversion (Lstream *encoding,
 209                                          CONST unsigned char *src,
 210                                          unsigned_char_dynarr *dst,
 211                                          unsigned int n);
 212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 213                          unsigned_char_dynarr *dst, unsigned int n);
 214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 215                          unsigned_char_dynarr *dst, unsigned int n);
 216
 217 typedef struct codesys_prop codesys_prop;
 218 struct codesys_prop
 219 {
 220   Lisp_Object sym;
 221   int prop_type;
 222 };
 223
 224 typedef struct
 225 {
 226   Dynarr_declare (codesys_prop);
 227 } codesys_prop_dynarr;
 228
 229 codesys_prop_dynarr *the_codesys_prop_dynarr;
 230
 231 enum codesys_prop_enum
 232 {
 233   CODESYS_PROP_ALL_OK,
 234   CODESYS_PROP_ISO2022,
 235   CODESYS_PROP_CCL
 236 };
 237
 238 \f
 239 /************************************************************************/
 240 /*                       Coding system functions                        */
 241 /************************************************************************/
 242
 243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 245 static void finalize_coding_system (void *header, int for_disksave);
 246
 247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 248                                mark_coding_system, print_coding_system,
 249                                finalize_coding_system,
 250                                0, 0, struct Lisp_Coding_System);
 251
 252 static Lisp_Object
 253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 254 {
 255   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 256
 257   markobj (CODING_SYSTEM_NAME (codesys));
 258   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 259   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 260   markobj (CODING_SYSTEM_EOL_LF (codesys));
 261   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 262   markobj (CODING_SYSTEM_EOL_CR (codesys));
 263
 264   switch (CODING_SYSTEM_TYPE (codesys))
 265     {
 266 #ifdef MULE
 267       int i;
 268     case CODESYS_ISO2022:
 269       for (i = 0; i < 4; i++)
 270         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 271       if (codesys->iso2022.input_conv)
 272         {
 273           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 274             {
 275               struct charset_conversion_spec *ccs =
 276                 Dynarr_atp (codesys->iso2022.input_conv, i);
 277               markobj (ccs->from_charset);
 278               markobj (ccs->to_charset);
 279             }
 280         }
 281       if (codesys->iso2022.output_conv)
 282         {
 283           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 284             {
 285               struct charset_conversion_spec *ccs =
 286                 Dynarr_atp (codesys->iso2022.output_conv, i);
 287               markobj (ccs->from_charset);
 288               markobj (ccs->to_charset);
 289             }
 290         }
 291       break;
 292
 293     case CODESYS_CCL:
 294       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 295       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 296       break;
 297 #endif /* MULE */
 298     default:
 299       break;
 300     }
 301
 302   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 303   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 304 }
 305
 306 static void
 307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 308                      int escapeflag)
 309 {
 310   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 311   if (print_readably)
 312     error ("printing unreadable object #<coding_system 0x%x>",
 313            c->header.uid);
 314
 315   write_c_string ("#<coding_system ", printcharfun);
 316   print_internal (c->name, printcharfun, 1);
 317   write_c_string (">", printcharfun);
 318 }
 319
 320 static void
 321 finalize_coding_system (void *header, int for_disksave)
 322 {
 323   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 324   /* Since coding systems never go away, this function is not
 325      necessary.  But it would be necessary if we changed things
 326      so that coding systems could go away. */
 327   if (!for_disksave) /* see comment in lstream.c */
 328     {
 329       switch (CODING_SYSTEM_TYPE (c))
 330         {
 331 #ifdef MULE
 332         case CODESYS_ISO2022:
 333           if (c->iso2022.input_conv)
 334             {
 335               Dynarr_free (c->iso2022.input_conv);
 336               c->iso2022.input_conv = 0;
 337             }
 338           if (c->iso2022.output_conv)
 339             {
 340               Dynarr_free (c->iso2022.output_conv);
 341               c->iso2022.output_conv = 0;
 342             }
 343           break;
 344 #endif /* MULE */
 345         default:
 346           break;
 347         }
 348     }
 349 }
 350
 351 static enum eol_type
 352 symbol_to_eol_type (Lisp_Object symbol)
 353 {
 354   CHECK_SYMBOL (symbol);
 355   if (NILP (symbol))      return EOL_AUTODETECT;
 356   if (EQ (symbol, Qlf))   return EOL_LF;
 357   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 358   if (EQ (symbol, Qcr))   return EOL_CR;
 359
 360   signal_simple_error ("Unrecognized eol type", symbol);
 361   return EOL_AUTODETECT; /* not reached */
 362 }
 363
 364 static Lisp_Object
 365 eol_type_to_symbol (enum eol_type type)
 366 {
 367   switch (type)
 368     {
 369     default: abort ();
 370     case EOL_LF:         return Qlf;
 371     case EOL_CRLF:       return Qcrlf;
 372     case EOL_CR:         return Qcr;
 373     case EOL_AUTODETECT: return Qnil;
 374     }
 375 }
 376
 377 static void
 378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 379 {
 380   Lisp_Object codesys_obj;
 381   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 382   char *codesys_name = (char *) alloca (len + 7);
 383   int mlen = -1;
 384   char *codesys_mnemonic=0;
 385
 386   Lisp_Object codesys_name_sym, sub_codesys_obj;
 387
 388   /* kludge */
 389
 390   XSETCODING_SYSTEM (codesys_obj, codesys);
 391
 392   memcpy (codesys_name,
 393           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 394
 395   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 396     {
 397       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 398       codesys_mnemonic = (char *) alloca (mlen + 7);
 399       memcpy (codesys_mnemonic,
 400               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 401     }
 402
 403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 404   strcpy (codesys_name + len, "-" op_sys);                                      \
 405   if (mlen != -1)                                                               \
 406     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 407   codesys_name_sym = intern (codesys_name);                                     \
 408   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 409   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 410   if (mlen != -1)                                                               \
 411     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 412       build_string (codesys_mnemonic);                                          \
 413   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 414 } while (0)
 415
 416   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 417   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 418   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 419 }
 420
 421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 422 Return t if OBJECT is a coding system.
 423 A coding system is an object that defines how text containing multiple
 424 character sets is encoded into a stream of (typically 8-bit) bytes.
 425 The coding system is used to decode the stream into a series of
 426 characters (which may be from multiple charsets) when the text is read
 427 from a file or process, and is used to encode the text back into the
 428 same format when it is written out to a file or process.
 429
 430 For example, many ISO2022-compliant coding systems (such as Compound
 431 Text, which is used for inter-client data under the X Window System)
 432 use escape sequences to switch between different charsets -- Japanese
 433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 435 `make-coding-system' for more information.
 436
 437 Coding systems are normally identified using a symbol, and the
 438 symbol is accepted in place of the actual coding system object whenever
 439 a coding system is called for. (This is similar to how faces work.)
 440 */
 441        (object))
 442 {
 443   return CODING_SYSTEMP (object) ? Qt : Qnil;
 444 }
 445
 446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 447 Retrieve the coding system of the given name.
 448
 449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 450 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 451 If there is no such coding system, nil is returned.  Otherwise the
 452 associated coding system object is returned.
 453 */
 454        (coding_system_or_name))
 455 {
 456   if (CODING_SYSTEMP (coding_system_or_name))
 457     return coding_system_or_name;
 458
 459   if (NILP (coding_system_or_name))
 460     coding_system_or_name = Qbinary;
 461   else
 462     CHECK_SYMBOL (coding_system_or_name);
 463
 464   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 465 }
 466
 467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 468 Retrieve the coding system of the given name.
 469 Same as `find-coding-system' except that if there is no such
 470 coding system, an error is signaled instead of returning nil.
 471 */
 472        (name))
 473 {
 474   Lisp_Object coding_system = Ffind_coding_system (name);
 475
 476   if (NILP (coding_system))
 477     signal_simple_error ("No such coding system", name);
 478   return coding_system;
 479 }
 480
 481 /* We store the coding systems in hash tables with the names as the key and the
 482    actual coding system object as the value.  Occasionally we need to use them
 483    in a list format.  These routines provide us with that. */
 484 struct coding_system_list_closure
 485 {
 486   Lisp_Object *coding_system_list;
 487 };
 488
 489 static int
 490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 491                                   void *coding_system_list_closure)
 492 {
 493   /* This function can GC */
 494   struct coding_system_list_closure *cscl =
 495     (struct coding_system_list_closure *) coding_system_list_closure;
 496   Lisp_Object *coding_system_list = cscl->coding_system_list;
 497
 498   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 499                                *coding_system_list);
 500   return 0;
 501 }
 502
 503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 504 Return a list of the names of all defined coding systems.
 505 */
 506        ())
 507 {
 508   Lisp_Object coding_system_list = Qnil;
 509   struct gcpro gcpro1;
 510   struct coding_system_list_closure coding_system_list_closure;
 511
 512   GCPRO1 (coding_system_list);
 513   coding_system_list_closure.coding_system_list = &coding_system_list;
 514   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 515                  &coding_system_list_closure);
 516   UNGCPRO;
 517
 518   return coding_system_list;
 519 }
 520
 521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 522 Return the name of the given coding system.
 523 */
 524        (coding_system))
 525 {
 526   coding_system = Fget_coding_system (coding_system);
 527   return XCODING_SYSTEM_NAME (coding_system);
 528 }
 529
 530 static Lisp_Coding_System *
 531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 532 {
 533   Lisp_Coding_System *codesys =
 534     alloc_lcrecord_type (Lisp_Coding_System, lrecord_coding_system);
 535
 536   zero_lcrecord (codesys);
 537   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 538   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 539   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 540   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 541   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 542   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 543   CODING_SYSTEM_TYPE     (codesys) = type;
 544   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 545 #ifdef MULE
 546   if (type == CODESYS_ISO2022)
 547     {
 548       int i;
 549       for (i = 0; i < 4; i++)
 550         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 551     }
 552   else if (type == CODESYS_CCL)
 553     {
 554       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 555       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 556     }
 557 #endif /* MULE */
 558   CODING_SYSTEM_NAME (codesys) = name;
 559
 560   return codesys;
 561 }
 562
 563 #ifdef MULE
 564 /* Given a list of charset conversion specs as specified in a Lisp
 565    program, parse it into STORE_HERE. */
 566
 567 static void
 568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 569                                 Lisp_Object spec_list)
 570 {
 571   Lisp_Object rest;
 572
 573   EXTERNAL_LIST_LOOP (rest, spec_list)
 574     {
 575       Lisp_Object car = XCAR (rest);
 576       Lisp_Object from, to;
 577       struct charset_conversion_spec spec;
 578
 579       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 580         signal_simple_error ("Invalid charset conversion spec", car);
 581       from = Fget_charset (XCAR (car));
 582       to = Fget_charset (XCAR (XCDR (car)));
 583       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 584         signal_simple_error_2
 585           ("Attempted conversion between different charset types",
 586            from, to);
 587       spec.from_charset = from;
 588       spec.to_charset = to;
 589
 590       Dynarr_add (store_here, spec);
 591     }
 592 }
 593
 594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 595    specs, return the equivalent as the Lisp programmer would see it.
 596
 597    If LOAD_HERE is 0, return Qnil. */
 598
 599 static Lisp_Object
 600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 601 {
 602   int i;
 603   Lisp_Object result;
 604
 605   if (!load_here)
 606     return Qnil;
 607   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 608     {
 609       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 610       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 611     }
 612
 613   return Fnreverse (result);
 614 }
 615
 616 #endif /* MULE */
 617
 618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 619 Register symbol NAME as a coding system.
 620
 621 TYPE describes the conversion method used and should be one of
 622
 623 nil or 'undecided
 624      Automatic conversion.  XEmacs attempts to detect the coding system
 625      used in the file.
 626 'no-conversion
 627      No conversion.  Use this for binary files and such.  On output,
 628      graphic characters that are not in ASCII or Latin-1 will be
 629      replaced by a ?. (For a no-conversion-encoded buffer, these
 630      characters will only be present if you explicitly insert them.)
 631 'shift-jis
 632      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 633 'ucs-4
 634      ISO 10646 UCS-4 encoding.
 635 'utf-8
 636      ISO 10646 UTF-8 encoding.
 637 'iso2022
 638      Any ISO2022-compliant encoding.  Among other things, this includes
 639      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 640      standard Unix encoding for Japanese and other languages), and
 641      Compound Text (the encoding used in X11).  You can specify more
 642      specific information about the conversion with the FLAGS argument.
 643 'big5
 644      Big5 (the encoding commonly used for Taiwanese).
 645 'ccl
 646      The conversion is performed using a user-written pseudo-code
 647      program.  CCL (Code Conversion Language) is the name of this
 648      pseudo-code.
 649 'internal
 650      Write out or read in the raw contents of the memory representing
 651      the buffer's text.  This is primarily useful for debugging
 652      purposes, and is only enabled when XEmacs has been compiled with
 653      DEBUG_XEMACS defined (via the --debug configure option).
 654      WARNING: Reading in a file using 'internal conversion can result
 655      in an internal inconsistency in the memory representing a
 656      buffer's text, which will produce unpredictable results and may
 657      cause XEmacs to crash.  Under normal circumstances you should
 658      never use 'internal conversion.
 659
 660 DOC-STRING is a string describing the coding system.
 661
 662 PROPS is a property list, describing the specific nature of the
 663 character set.  Recognized properties are:
 664
 665 'mnemonic
 666      String to be displayed in the modeline when this coding system is
 667      active.
 668
 669 'eol-type
 670      End-of-line conversion to be used.  It should be one of
 671
 672         nil
 673                 Automatically detect the end-of-line type (LF, CRLF,
 674                 or CR).  Also generate subsidiary coding systems named
 675                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 676                 identical to this coding system but have an EOL-TYPE
 677                 value of 'lf, 'crlf, and 'cr, respectively.
 678         'lf
 679                 The end of a line is marked externally using ASCII LF.
 680                 Since this is also the way that XEmacs represents an
 681                 end-of-line internally, specifying this option results
 682                 in no end-of-line conversion.  This is the standard
 683                 format for Unix text files.
 684         'crlf
 685                 The end of a line is marked externally using ASCII
 686                 CRLF.  This is the standard format for MS-DOS text
 687                 files.
 688         'cr
 689                 The end of a line is marked externally using ASCII CR.
 690                 This is the standard format for Macintosh text files.
 691         t
 692                 Automatically detect the end-of-line type but do not
 693                 generate subsidiary coding systems.  (This value is
 694                 converted to nil when stored internally, and
 695                 `coding-system-property' will return nil.)
 696
 697 'post-read-conversion
 698      Function called after a file has been read in, to perform the
 699      decoding.  Called with two arguments, BEG and END, denoting
 700      a region of the current buffer to be decoded.
 701
 702 'pre-write-conversion
 703      Function called before a file is written out, to perform the
 704      encoding.  Called with two arguments, BEG and END, denoting
 705      a region of the current buffer to be encoded.
 706
 707
 708 The following additional properties are recognized if TYPE is 'iso2022:
 709
 710 'charset-g0
 711 'charset-g1
 712 'charset-g2
 713 'charset-g3
 714      The character set initially designated to the G0 - G3 registers.
 715      The value should be one of
 716
 717           -- A charset object (designate that character set)
 718           -- nil (do not ever use this register)
 719           -- t (no character set is initially designated to
 720                 the register, but may be later on; this automatically
 721                 sets the corresponding `force-g*-on-output' property)
 722
 723 'force-g0-on-output
 724 'force-g1-on-output
 725 'force-g2-on-output
 726 'force-g2-on-output
 727      If non-nil, send an explicit designation sequence on output before
 728      using the specified register.
 729
 730 'short
 731      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 732      "ESC $ B" on output in place of the full designation sequences
 733      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 734
 735 'no-ascii-eol
 736      If non-nil, don't designate ASCII to G0 at each end of line on output.
 737      Setting this to non-nil also suppresses other state-resetting that
 738      normally happens at the end of a line.
 739
 740 'no-ascii-cntl
 741      If non-nil, don't designate ASCII to G0 before control chars on output.
 742
 743 'seven
 744      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 745      environment.
 746
 747 'lock-shift
 748      If non-nil, use locking-shift (SO/SI) instead of single-shift
 749      or designation by escape sequence.
 750
 751 'no-iso6429
 752      If non-nil, don't use ISO6429's direction specification.
 753
 754 'escape-quoted
 755      If non-nil, literal control characters that are the same as
 756      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 757      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 758      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 759      so that they can be properly distinguished from an escape sequence.
 760      (Note that doing this results in a non-portable encoding.) This
 761      encoding flag is used for byte-compiled files.  Note that ESC
 762      is a good choice for a quoting character because there are no
 763      escape sequences whose second byte is a character from the Control-0
 764      or Control-1 character sets; this is explicitly disallowed by the
 765      ISO2022 standard.
 766
 767 'input-charset-conversion
 768      A list of conversion specifications, specifying conversion of
 769      characters in one charset to another when decoding is performed.
 770      Each specification is a list of two elements: the source charset,
 771      and the destination charset.
 772
 773 'output-charset-conversion
 774      A list of conversion specifications, specifying conversion of
 775      characters in one charset to another when encoding is performed.
 776      The form of each specification is the same as for
 777      'input-charset-conversion.
 778
 779
 780 The following additional properties are recognized (and required)
 781 if TYPE is 'ccl:
 782
 783 'decode
 784      CCL program used for decoding (converting to internal format).
 785
 786 'encode
 787      CCL program used for encoding (converting to external format).
 788 */
 789        (name, type, doc_string, props))
 790 {
 791   Lisp_Coding_System *codesys;
 792   Lisp_Object rest, key, value;
 793   enum coding_system_type ty;
 794   int need_to_setup_eol_systems = 1;
 795
 796   /* Convert type to constant */
 797   if (NILP (type) || EQ (type, Qundecided))
 798                                       { ty = CODESYS_AUTODETECT; }
 799 #ifdef MULE
 800   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 801   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 802   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 803   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 804   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 805   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 806 #endif
 807   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 808 #ifdef DEBUG_XEMACS
 809   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 810 #endif
 811   else
 812     signal_simple_error ("Invalid coding system type", type);
 813
 814   CHECK_SYMBOL (name);
 815
 816   codesys = allocate_coding_system (ty, name);
 817
 818   if (NILP (doc_string))
 819     doc_string = build_string ("");
 820   else
 821     CHECK_STRING (doc_string);
 822   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 823
 824   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 825     {
 826       if (EQ (key, Qmnemonic))
 827         {
 828           if (!NILP (value))
 829             CHECK_STRING (value);
 830           CODING_SYSTEM_MNEMONIC (codesys) = value;
 831         }
 832
 833       else if (EQ (key, Qeol_type))
 834         {
 835           need_to_setup_eol_systems = NILP (value);
 836           if (EQ (value, Qt))
 837             value = Qnil;
 838           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 839         }
 840
 841       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 842       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 843 #ifdef MULE
 844       else if (ty == CODESYS_ISO2022)
 845         {
 846 #define FROB_INITIAL_CHARSET(charset_num) \
 847   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 848     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 849
 850           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 851           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 852           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 853           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 854
 855 #define FROB_FORCE_CHARSET(charset_num) \
 856   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 857
 858           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 859           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 860           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 861           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 862
 863 #define FROB_BOOLEAN_PROPERTY(prop) \
 864   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 865
 866           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 867           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 868           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 869           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 870           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 871           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 872           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 873
 874           else if (EQ (key, Qinput_charset_conversion))
 875             {
 876               codesys->iso2022.input_conv =
 877                 Dynarr_new (charset_conversion_spec);
 878               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 879                                               value);
 880             }
 881           else if (EQ (key, Qoutput_charset_conversion))
 882             {
 883               codesys->iso2022.output_conv =
 884                 Dynarr_new (charset_conversion_spec);
 885               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 886                                               value);
 887             }
 888           else
 889             signal_simple_error ("Unrecognized property", key);
 890         }
 891       else if (EQ (type, Qccl))
 892         {
 893           if (EQ (key, Qdecode))
 894             {
 895               CHECK_VECTOR (value);
 896               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 897             }
 898           else if (EQ (key, Qencode))
 899             {
 900               CHECK_VECTOR (value);
 901               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 902             }
 903           else
 904             signal_simple_error ("Unrecognized property", key);
 905         }
 906 #endif /* MULE */
 907       else
 908         signal_simple_error ("Unrecognized property", key);
 909     }
 910
 911   if (need_to_setup_eol_systems)
 912     setup_eol_coding_systems (codesys);
 913
 914   {
 915     Lisp_Object codesys_obj;
 916     XSETCODING_SYSTEM (codesys_obj, codesys);
 917     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 918     return codesys_obj;
 919   }
 920 }
 921
 922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 923 Copy OLD-CODING-SYSTEM to NEW-NAME.
 924 If NEW-NAME does not name an existing coding system, a new one will
 925 be created.
 926 */
 927        (old_coding_system, new_name))
 928 {
 929   Lisp_Object new_coding_system;
 930   old_coding_system = Fget_coding_system (old_coding_system);
 931   new_coding_system = Ffind_coding_system (new_name);
 932   if (NILP (new_coding_system))
 933     {
 934       XSETCODING_SYSTEM (new_coding_system,
 935                          allocate_coding_system
 936                          (XCODING_SYSTEM_TYPE (old_coding_system),
 937                           new_name));
 938       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 939     }
 940
 941   {
 942     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 943     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 944     memcpy (((char *) to  ) + sizeof (to->header),
 945             ((char *) from) + sizeof (from->header),
 946             sizeof (*from) - sizeof (from->header));
 947     to->name = new_name;
 948   }
 949   return new_coding_system;
 950 }
 951
 952 static Lisp_Object
 953 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
 954 {
 955   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
 956   Lisp_Object new_coding_system;
 957
 958   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
 959     return coding_system;
 960
 961   switch (type)
 962     {
 963     case EOL_AUTODETECT: return coding_system;
 964     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
 965     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
 966     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
 967     default:       abort ();
 968     }
 969
 970   return NILP (new_coding_system) ? coding_system : new_coding_system;
 971 }
 972
 973 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
 974 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
 975 */
 976        (coding_system, eol_type))
 977 {
 978   coding_system = Fget_coding_system (coding_system);
 979
 980   return subsidiary_coding_system (coding_system,
 981                                    symbol_to_eol_type (eol_type));
 982 }
 983
 984 \f
 985 /************************************************************************/
 986 /*                         Coding system accessors                      */
 987 /************************************************************************/
 988
 989 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
 990 Return the doc string for CODING-SYSTEM.
 991 */
 992        (coding_system))
 993 {
 994   coding_system = Fget_coding_system (coding_system);
 995   return XCODING_SYSTEM_DOC_STRING (coding_system);
 996 }
 997
 998 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
 999 Return the type of CODING-SYSTEM.
1000 */
1001        (coding_system))
1002 {
1003   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1004     {
1005     default: abort ();
1006     case CODESYS_AUTODETECT:    return Qundecided;
1007 #ifdef MULE
1008     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1009     case CODESYS_ISO2022:       return Qiso2022;
1010     case CODESYS_BIG5:          return Qbig5;
1011     case CODESYS_UCS4:          return Qucs4;
1012     case CODESYS_UTF8:          return Qutf8;
1013     case CODESYS_CCL:           return Qccl;
1014 #endif
1015     case CODESYS_NO_CONVERSION: return Qno_conversion;
1016 #ifdef DEBUG_XEMACS
1017     case CODESYS_INTERNAL:      return Qinternal;
1018 #endif
1019     }
1020 }
1021
1022 #ifdef MULE
1023 static
1024 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1025 {
1026   Lisp_Object cs
1027     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1028
1029   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1030 }
1031
1032 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1033 Return initial charset of CODING-SYSTEM designated to GNUM.
1034 GNUM allows 0 .. 3.
1035 */
1036        (coding_system, gnum))
1037 {
1038   coding_system = Fget_coding_system (coding_system);
1039   CHECK_INT (gnum);
1040
1041   return coding_system_charset (coding_system, XINT (gnum));
1042 }
1043 #endif /* MULE */
1044
1045 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1046 Return the PROP property of CODING-SYSTEM.
1047 */
1048        (coding_system, prop))
1049 {
1050   int i, ok = 0;
1051   enum coding_system_type type;
1052
1053   coding_system = Fget_coding_system (coding_system);
1054   CHECK_SYMBOL (prop);
1055   type = XCODING_SYSTEM_TYPE (coding_system);
1056
1057   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1058     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1059       {
1060         ok = 1;
1061         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1062           {
1063           case CODESYS_PROP_ALL_OK:
1064             break;
1065 #ifdef MULE
1066           case CODESYS_PROP_ISO2022:
1067             if (type != CODESYS_ISO2022)
1068               signal_simple_error
1069                 ("Property only valid in ISO2022 coding systems",
1070                  prop);
1071             break;
1072
1073           case CODESYS_PROP_CCL:
1074             if (type != CODESYS_CCL)
1075               signal_simple_error
1076                 ("Property only valid in CCL coding systems",
1077                  prop);
1078             break;
1079 #endif /* MULE */
1080           default:
1081             abort ();
1082           }
1083       }
1084
1085   if (!ok)
1086     signal_simple_error ("Unrecognized property", prop);
1087
1088   if (EQ (prop, Qname))
1089     return XCODING_SYSTEM_NAME (coding_system);
1090   else if (EQ (prop, Qtype))
1091     return Fcoding_system_type (coding_system);
1092   else if (EQ (prop, Qdoc_string))
1093     return XCODING_SYSTEM_DOC_STRING (coding_system);
1094   else if (EQ (prop, Qmnemonic))
1095     return XCODING_SYSTEM_MNEMONIC (coding_system);
1096   else if (EQ (prop, Qeol_type))
1097     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1098   else if (EQ (prop, Qeol_lf))
1099     return XCODING_SYSTEM_EOL_LF (coding_system);
1100   else if (EQ (prop, Qeol_crlf))
1101     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1102   else if (EQ (prop, Qeol_cr))
1103     return XCODING_SYSTEM_EOL_CR (coding_system);
1104   else if (EQ (prop, Qpost_read_conversion))
1105     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1106   else if (EQ (prop, Qpre_write_conversion))
1107     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1108 #ifdef MULE
1109   else if (type == CODESYS_ISO2022)
1110     {
1111       if (EQ (prop, Qcharset_g0))
1112         return coding_system_charset (coding_system, 0);
1113       else if (EQ (prop, Qcharset_g1))
1114         return coding_system_charset (coding_system, 1);
1115       else if (EQ (prop, Qcharset_g2))
1116         return coding_system_charset (coding_system, 2);
1117       else if (EQ (prop, Qcharset_g3))
1118         return coding_system_charset (coding_system, 3);
1119
1120 #define FORCE_CHARSET(charset_num) \
1121   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1122    (coding_system, charset_num) ? Qt : Qnil)
1123
1124       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1125       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1126       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1127       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1128
1129 #define LISP_BOOLEAN(prop) \
1130   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1131
1132       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1133       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1134       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1135       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1136       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1137       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1138       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1139
1140       else if (EQ (prop, Qinput_charset_conversion))
1141         return
1142           unparse_charset_conversion_specs
1143             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1144       else if (EQ (prop, Qoutput_charset_conversion))
1145         return
1146           unparse_charset_conversion_specs
1147             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1148       else
1149         abort ();
1150     }
1151   else if (type == CODESYS_CCL)
1152     {
1153       if (EQ (prop, Qdecode))
1154         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1155       else if (EQ (prop, Qencode))
1156         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1157       else
1158         abort ();
1159     }
1160 #endif /* MULE */
1161   else
1162     abort ();
1163
1164   return Qnil; /* not reached */
1165 }
1166
1167 \f
1168 /************************************************************************/
1169 /*                       Coding category functions                      */
1170 /************************************************************************/
1171
1172 static int
1173 decode_coding_category (Lisp_Object symbol)
1174 {
1175   int i;
1176
1177   CHECK_SYMBOL (symbol);
1178   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1179     if (EQ (coding_category_symbol[i], symbol))
1180       return i;
1181
1182   signal_simple_error ("Unrecognized coding category", symbol);
1183   return 0; /* not reached */
1184 }
1185
1186 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1187 Return a list of all recognized coding categories.
1188 */
1189        ())
1190 {
1191   int i;
1192   Lisp_Object list = Qnil;
1193
1194   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1195     list = Fcons (coding_category_symbol[i], list);
1196   return list;
1197 }
1198
1199 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1200 Change the priority order of the coding categories.
1201 LIST should be list of coding categories, in descending order of
1202 priority.  Unspecified coding categories will be lower in priority
1203 than all specified ones, in the same relative order they were in
1204 previously.
1205 */
1206        (list))
1207 {
1208   int category_to_priority[CODING_CATEGORY_LAST + 1];
1209   int i, j;
1210   Lisp_Object rest;
1211
1212   /* First generate a list that maps coding categories to priorities. */
1213
1214   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1215     category_to_priority[i] = -1;
1216
1217   /* Highest priority comes from the specified list. */
1218   i = 0;
1219   EXTERNAL_LIST_LOOP (rest, list)
1220     {
1221       int cat = decode_coding_category (XCAR (rest));
1222
1223       if (category_to_priority[cat] >= 0)
1224         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1225       category_to_priority[cat] = i++;
1226     }
1227
1228   /* Now go through the existing categories by priority to retrieve
1229      the categories not yet specified and preserve their priority
1230      order. */
1231   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1232     {
1233       int cat = coding_category_by_priority[j];
1234       if (category_to_priority[cat] < 0)
1235         category_to_priority[cat] = i++;
1236     }
1237
1238   /* Now we need to construct the inverse of the mapping we just
1239      constructed. */
1240
1241   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1242     coding_category_by_priority[category_to_priority[i]] = i;
1243
1244   /* Phew!  That was confusing. */
1245   return Qnil;
1246 }
1247
1248 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1249 Return a list of coding categories in descending order of priority.
1250 */
1251        ())
1252 {
1253   int i;
1254   Lisp_Object list = Qnil;
1255
1256   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1257     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1258                   list);
1259   return list;
1260 }
1261
1262 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1263 Change the coding system associated with a coding category.
1264 */
1265        (coding_category, coding_system))
1266 {
1267   int cat = decode_coding_category (coding_category);
1268
1269   coding_system = Fget_coding_system (coding_system);
1270   coding_category_system[cat] = coding_system;
1271   return Qnil;
1272 }
1273
1274 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1275 Return the coding system associated with a coding category.
1276 */
1277        (coding_category))
1278 {
1279   int cat = decode_coding_category (coding_category);
1280   Lisp_Object sys = coding_category_system[cat];
1281
1282   if (!NILP (sys))
1283     return XCODING_SYSTEM_NAME (sys);
1284   return Qnil;
1285 }
1286
1287 \f
1288 /************************************************************************/
1289 /*                     Detecting the encoding of data                   */
1290 /************************************************************************/
1291
1292 struct detection_state
1293 {
1294   enum eol_type eol_type;
1295   int seen_non_ascii;
1296   int mask;
1297 #ifdef MULE
1298   struct
1299     {
1300       int mask;
1301       int in_second_byte;
1302     }
1303   big5;
1304
1305   struct
1306     {
1307       int mask;
1308       int in_second_byte;
1309     }
1310   shift_jis;
1311
1312   struct
1313     {
1314       int mask;
1315       int in_byte;
1316   }
1317   ucs4;
1318
1319   struct
1320     {
1321       int mask;
1322       int in_byte;
1323     }
1324   utf8;
1325
1326   struct
1327     {
1328       int mask;
1329       int initted;
1330       struct iso2022_decoder iso;
1331       unsigned int flags;
1332       int high_byte_count;
1333       unsigned int saw_single_shift:1;
1334     }
1335   iso2022;
1336 #endif
1337   struct
1338     {
1339       int seen_anything;
1340       int just_saw_cr;
1341     }
1342   eol;
1343 };
1344
1345 static int
1346 acceptable_control_char_p (int c)
1347 {
1348   switch (c)
1349     {
1350       /* Allow and ignore control characters that you might
1351          reasonably see in a text file */
1352     case '\r':
1353     case '\n':
1354     case '\t':
1355     case  7: /* bell */
1356     case  8: /* backspace */
1357     case 11: /* vertical tab */
1358     case 12: /* form feed */
1359     case 26: /* MS-DOS C-z junk */
1360     case 31: /* '^_' -- for info */
1361       return 1;
1362     default:
1363       return 0;
1364     }
1365 }
1366
1367 static int
1368 mask_has_at_most_one_bit_p (int mask)
1369 {
1370   /* Perhaps the only thing useful you learn from intensive Microsoft
1371      technical interviews */
1372   return (mask & (mask - 1)) == 0;
1373 }
1374
1375 static enum eol_type
1376 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1377                  unsigned int n)
1378 {
1379   int c;
1380
1381   while (n--)
1382     {
1383       c = *src++;
1384       if (c == '\r')
1385         st->eol.just_saw_cr = 1;
1386       else
1387         {
1388           if (c == '\n')
1389             {
1390               if (st->eol.just_saw_cr)
1391                 return EOL_CRLF;
1392               else if (st->eol.seen_anything)
1393                 return EOL_LF;
1394             }
1395           else if (st->eol.just_saw_cr)
1396             return EOL_CR;
1397           st->eol.just_saw_cr = 0;
1398         }
1399       st->eol.seen_anything = 1;
1400     }
1401
1402   return EOL_AUTODETECT;
1403 }
1404
1405 /* Attempt to determine the encoding and EOL type of the given text.
1406    Before calling this function for the first type, you must initialize
1407    st->eol_type as appropriate and initialize st->mask to ~0.
1408
1409    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1410    not yet known.
1411
1412    st->mask holds the determined coding category mask, or ~0 if only
1413    ASCII has been seen so far.
1414
1415    Returns:
1416
1417    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1418         is present in st->mask
1419    1 == definitive answers are here for both st->eol_type and st->mask
1420 */
1421
1422 static int
1423 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1424                     unsigned int n, int just_do_eol)
1425 {
1426   int c;
1427
1428   if (st->eol_type == EOL_AUTODETECT)
1429     st->eol_type = detect_eol_type (st, src, n);
1430
1431   if (just_do_eol)
1432     return st->eol_type != EOL_AUTODETECT;
1433
1434   if (!st->seen_non_ascii)
1435     {
1436       for (; n; n--, src++)
1437         {
1438           c = *src;
1439           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1440             {
1441               st->seen_non_ascii = 1;
1442 #ifdef MULE
1443               st->shift_jis.mask = ~0;
1444               st->big5.mask = ~0;
1445               st->ucs4.mask = ~0;
1446               st->utf8.mask = ~0;
1447               st->iso2022.mask = ~0;
1448 #endif
1449               break;
1450             }
1451         }
1452     }
1453
1454   if (!n)
1455     return 0;
1456 #ifdef MULE
1457   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1458     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1459   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1460     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1461   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1462     st->big5.mask = detect_coding_big5 (st, src, n);
1463   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1464     st->utf8.mask = detect_coding_utf8 (st, src, n);
1465   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1466     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1467
1468   st->mask
1469     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1470     | st->utf8.mask | st->ucs4.mask;
1471 #endif
1472   {
1473     int retval = mask_has_at_most_one_bit_p (st->mask);
1474     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1475     return retval && st->eol_type != EOL_AUTODETECT;
1476   }
1477 }
1478
1479 static Lisp_Object
1480 coding_system_from_mask (int mask)
1481 {
1482   if (mask == ~0)
1483     {
1484       /* If the file was entirely or basically ASCII, use the
1485          default value of `buffer-file-coding-system'. */
1486       Lisp_Object retval =
1487         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1488       if (!NILP (retval))
1489         {
1490           retval = Ffind_coding_system (retval);
1491           if (NILP (retval))
1492             {
1493               warn_when_safe
1494                 (Qbad_variable, Qwarning,
1495                  "Invalid `default-buffer-file-coding-system', set to nil");
1496               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1497             }
1498         }
1499       if (NILP (retval))
1500         retval = Fget_coding_system (Qno_conversion);
1501       return retval;
1502     }
1503   else
1504     {
1505       int i;
1506       int cat = -1;
1507 #ifdef MULE
1508       mask = postprocess_iso2022_mask (mask);
1509 #endif
1510       /* Look through the coding categories by priority and find
1511          the first one that is allowed. */
1512       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1513         {
1514           cat = coding_category_by_priority[i];
1515           if ((mask & (1 << cat)) &&
1516               !NILP (coding_category_system[cat]))
1517             break;
1518         }
1519       if (cat >= 0)
1520         return coding_category_system[cat];
1521       else
1522         return Fget_coding_system (Qno_conversion);
1523     }
1524 }
1525
1526 /* Given a seekable read stream and potential coding system and EOL type
1527    as specified, do any autodetection that is called for.  If the
1528    coding system and/or EOL type are not autodetect, they will be left
1529    alone; but this function will never return an autodetect coding system
1530    or EOL type.
1531
1532    This function does not automatically fetch subsidiary coding systems;
1533    that should be unnecessary with the explicit eol-type argument. */
1534
1535 void
1536 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1537                               enum eol_type *eol_type_in_out)
1538 {
1539   struct detection_state decst;
1540
1541   if (*eol_type_in_out == EOL_AUTODETECT)
1542     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1543
1544   xzero (decst);
1545   decst.eol_type = *eol_type_in_out;
1546   decst.mask = ~0;
1547
1548   /* If autodetection is called for, do it now. */
1549   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1550       *eol_type_in_out == EOL_AUTODETECT)
1551     {
1552
1553       while (1)
1554         {
1555           unsigned char random_buffer[4096];
1556           int nread;
1557
1558           nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1559           if (!nread)
1560             break;
1561           if (detect_coding_type (&decst, random_buffer, nread,
1562                                   XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1563                                   CODESYS_AUTODETECT))
1564             break;
1565         }
1566
1567       *eol_type_in_out = decst.eol_type;
1568       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1569         *codesys_in_out = coding_system_from_mask (decst.mask);
1570     }
1571
1572   /* If we absolutely can't determine the EOL type, just assume LF. */
1573   if (*eol_type_in_out == EOL_AUTODETECT)
1574     *eol_type_in_out = EOL_LF;
1575
1576   Lstream_rewind (stream);
1577 }
1578
1579 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1580 Detect coding system of the text in the region between START and END.
1581 Returned a list of possible coding systems ordered by priority.
1582 If only ASCII characters are found, it returns 'undecided or one of
1583 its subsidiary coding systems according to a detected end-of-line
1584 type.  Optional arg BUFFER defaults to the current buffer.
1585 */
1586        (start, end, buffer))
1587 {
1588   Lisp_Object val = Qnil;
1589   struct buffer *buf = decode_buffer (buffer, 0);
1590   Bufpos b, e;
1591   Lisp_Object instream, lb_instream;
1592   Lstream *istr, *lb_istr;
1593   struct detection_state decst;
1594   struct gcpro gcpro1, gcpro2;
1595
1596   get_buffer_range_char (buf, start, end, &b, &e, 0);
1597   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1598   lb_istr = XLSTREAM (lb_instream);
1599   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1600   istr = XLSTREAM (instream);
1601   GCPRO2 (instream, lb_instream);
1602   xzero (decst);
1603   decst.eol_type = EOL_AUTODETECT;
1604   decst.mask = ~0;
1605   while (1)
1606     {
1607       unsigned char random_buffer[4096];
1608       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1609
1610       if (!nread)
1611         break;
1612       if (detect_coding_type (&decst, random_buffer, nread, 0))
1613         break;
1614     }
1615
1616   if (decst.mask == ~0)
1617     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1618                                     decst.eol_type);
1619   else
1620     {
1621       int i;
1622
1623       val = Qnil;
1624 #ifdef MULE
1625       decst.mask = postprocess_iso2022_mask (decst.mask);
1626 #endif
1627       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1628         {
1629           int sys = coding_category_by_priority[i];
1630           if (decst.mask & (1 << sys))
1631             {
1632               Lisp_Object codesys = coding_category_system[sys];
1633               if (!NILP (codesys))
1634                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1635               val = Fcons (codesys, val);
1636             }
1637         }
1638     }
1639   Lstream_close (istr);
1640   UNGCPRO;
1641   Lstream_delete (istr);
1642   Lstream_delete (lb_istr);
1643   return val;
1644 }
1645
1646 \f
1647 /************************************************************************/
1648 /*           Converting to internal Mule format ("decoding")            */
1649 /************************************************************************/
1650
1651 /* A decoding stream is a stream used for decoding text (i.e.
1652    converting from some external format to internal format).
1653    The decoding-stream object keeps track of the actual coding
1654    stream, the stream that is at the other end, and data that
1655    needs to be persistent across the lifetime of the stream. */
1656
1657 /* Handle the EOL stuff related to just-read-in character C.
1658    EOL_TYPE is the EOL type of the coding stream.
1659    FLAGS is the current value of FLAGS in the coding stream, and may
1660    be modified by this macro.  (The macro only looks at the
1661    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1662    bytes are to be written.  You need to also define a local goto
1663    label "label_continue_loop" that is at the end of the main
1664    character-reading loop.
1665
1666    If C is a CR character, then this macro handles it entirely and
1667    jumps to label_continue_loop.  Otherwise, this macro does not add
1668    anything to DST, and continues normally.  You should continue
1669    processing C normally after this macro. */
1670
1671 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1672 do {                                                            \
1673   if (c == '\r')                                                \
1674     {                                                           \
1675       if (eol_type == EOL_CR)                                   \
1676         Dynarr_add (dst, '\n');                                 \
1677       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1678         Dynarr_add (dst, c);                                    \
1679       else                                                      \
1680         flags |= CODING_STATE_CR;                               \
1681       goto label_continue_loop;                                 \
1682     }                                                           \
1683   else if (flags & CODING_STATE_CR)                             \
1684     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1685       if (c != '\n')                                            \
1686         Dynarr_add (dst, '\r');                                 \
1687       flags &= ~CODING_STATE_CR;                                \
1688     }                                                           \
1689 } while (0)
1690
1691 /* C should be a binary character in the range 0 - 255; convert
1692    to internal format and add to Dynarr DST. */
1693
1694 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1695 do {                                            \
1696   if (BYTE_ASCII_P (c))                         \
1697     Dynarr_add (dst, c);                        \
1698   else if (BYTE_C1_P (c))                       \
1699     {                                           \
1700       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1701       Dynarr_add (dst, c + 0x20);               \
1702     }                                           \
1703   else                                          \
1704     {                                           \
1705       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1706       Dynarr_add (dst, c);                      \
1707     }                                           \
1708 } while (0)
1709
1710 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1711 do {                                    \
1712   if (ch)                               \
1713     {                                   \
1714       DECODE_ADD_BINARY_CHAR (ch, dst); \
1715       ch = 0;                           \
1716     }                                   \
1717 } while (0)
1718
1719 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1720 do {                                    \
1721   if (flags & CODING_STATE_END)         \
1722     {                                   \
1723       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1724       if (flags & CODING_STATE_CR)      \
1725         Dynarr_add (dst, '\r');         \
1726     }                                   \
1727 } while (0)
1728
1729 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1730
1731 struct decoding_stream
1732 {
1733   /* Coding system that governs the conversion. */
1734   Lisp_Coding_System *codesys;
1735
1736   /* Stream that we read the encoded data from or
1737      write the decoded data to. */
1738   Lstream *other_end;
1739
1740   /* If we are reading, then we can return only a fixed amount of
1741      data, so if the conversion resulted in too much data, we store it
1742      here for retrieval the next time around. */
1743   unsigned_char_dynarr *runoff;
1744
1745   /* FLAGS holds flags indicating the current state of the decoding.
1746      Some of these flags are dependent on the coding system. */
1747   unsigned int flags;
1748
1749   /* CH holds a partially built-up character.  Since we only deal
1750      with one- and two-byte characters at the moment, we only use
1751      this to store the first byte of a two-byte character. */
1752   unsigned int ch;
1753
1754   /* EOL_TYPE specifies the type of end-of-line conversion that
1755      currently applies.  We need to keep this separate from the
1756      EOL type stored in CODESYS because the latter might indicate
1757      automatic EOL-type detection while the former will always
1758      indicate a particular EOL type. */
1759   enum eol_type eol_type;
1760 #ifdef MULE
1761   /* Additional ISO2022 information.  We define the structure above
1762      because it's also needed by the detection routines. */
1763   struct iso2022_decoder iso2022;
1764
1765   /* Additional information (the state of the running CCL program)
1766      used by the CCL decoder. */
1767   struct ccl_program ccl;
1768 #endif
1769   struct detection_state decst;
1770 };
1771
1772 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1773 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1774 static int decoding_rewinder   (Lstream *stream);
1775 static int decoding_seekable_p (Lstream *stream);
1776 static int decoding_flusher    (Lstream *stream);
1777 static int decoding_closer     (Lstream *stream);
1778
1779 static Lisp_Object decoding_marker (Lisp_Object stream,
1780                                     void (*markobj) (Lisp_Object));
1781
1782 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1783                                sizeof (struct decoding_stream));
1784
1785 static Lisp_Object
1786 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1787 {
1788   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1789   Lisp_Object str_obj;
1790
1791   /* We do not need to mark the coding systems or charsets stored
1792      within the stream because they are stored in a global list
1793      and automatically marked. */
1794
1795   XSETLSTREAM (str_obj, str);
1796   markobj (str_obj);
1797   if (str->imp->marker)
1798     return (str->imp->marker) (str_obj, markobj);
1799   else
1800     return Qnil;
1801 }
1802
1803 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1804    so we read data from the other end, decode it, and store it into DATA. */
1805
1806 static int
1807 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1808 {
1809   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1810   unsigned char *orig_data = data;
1811   int read_size;
1812   int error_occurred = 0;
1813
1814   /* We need to interface to mule_decode(), which expects to take some
1815      amount of data and store the result into a Dynarr.  We have
1816      mule_decode() store into str->runoff, and take data from there
1817      as necessary. */
1818
1819   /* We loop until we have enough data, reading chunks from the other
1820      end and decoding it. */
1821   while (1)
1822     {
1823       /* Take data from the runoff if we can.  Make sure to take at
1824          most SIZE bytes, and delete the data from the runoff. */
1825       if (Dynarr_length (str->runoff) > 0)
1826         {
1827           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1828           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1829           Dynarr_delete_many (str->runoff, 0, chunk);
1830           data += chunk;
1831           size -= chunk;
1832         }
1833
1834       if (size == 0)
1835         break; /* No more room for data */
1836
1837       if (str->flags & CODING_STATE_END)
1838         /* This means that on the previous iteration, we hit the EOF on
1839            the other end.  We loop once more so that mule_decode() can
1840            output any final stuff it may be holding, or any "go back
1841            to a sane state" escape sequences. (This latter makes sense
1842            during encoding.) */
1843         break;
1844
1845       /* Exhausted the runoff, so get some more.  DATA has at least
1846          SIZE bytes left of storage in it, so it's OK to read directly
1847          into it.  (We'll be overwriting above, after we've decoded it
1848          into the runoff.) */
1849       read_size = Lstream_read (str->other_end, data, size);
1850       if (read_size < 0)
1851         {
1852           error_occurred = 1;
1853           break;
1854         }
1855       if (read_size == 0)
1856         /* There might be some more end data produced in the translation.
1857            See the comment above. */
1858         str->flags |= CODING_STATE_END;
1859       mule_decode (stream, data, str->runoff, read_size);
1860     }
1861
1862   if (data - orig_data == 0)
1863     return error_occurred ? -1 : 0;
1864   else
1865     return data - orig_data;
1866 }
1867
1868 static int
1869 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1870 {
1871   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1872   int retval;
1873
1874   /* Decode all our data into the runoff, and then attempt to write
1875      it all out to the other end.  Remove whatever chunk we succeeded
1876      in writing. */
1877   mule_decode (stream, data, str->runoff, size);
1878   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1879                           Dynarr_length (str->runoff));
1880   if (retval > 0)
1881     Dynarr_delete_many (str->runoff, 0, retval);
1882   /* Do NOT return retval.  The return value indicates how much
1883      of the incoming data was written, not how many bytes were
1884      written. */
1885   return size;
1886 }
1887
1888 static void
1889 reset_decoding_stream (struct decoding_stream *str)
1890 {
1891 #ifdef MULE
1892   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1893     {
1894       Lisp_Object coding_system;
1895       XSETCODING_SYSTEM (coding_system, str->codesys);
1896       reset_iso2022 (coding_system, &str->iso2022);
1897     }
1898   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1899     {
1900       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1901     }
1902 #endif /* MULE */
1903   str->flags = str->ch = 0;
1904 }
1905
1906 static int
1907 decoding_rewinder (Lstream *stream)
1908 {
1909   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1910   reset_decoding_stream (str);
1911   Dynarr_reset (str->runoff);
1912   return Lstream_rewind (str->other_end);
1913 }
1914
1915 static int
1916 decoding_seekable_p (Lstream *stream)
1917 {
1918   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1919   return Lstream_seekable_p (str->other_end);
1920 }
1921
1922 static int
1923 decoding_flusher (Lstream *stream)
1924 {
1925   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1926   return Lstream_flush (str->other_end);
1927 }
1928
1929 static int
1930 decoding_closer (Lstream *stream)
1931 {
1932   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1933   if (stream->flags & LSTREAM_FL_WRITE)
1934     {
1935       str->flags |= CODING_STATE_END;
1936       decoding_writer (stream, 0, 0);
1937     }
1938   Dynarr_free (str->runoff);
1939 #ifdef MULE
1940 #ifdef ENABLE_COMPOSITE_CHARS
1941   if (str->iso2022.composite_chars)
1942     Dynarr_free (str->iso2022.composite_chars);
1943 #endif
1944 #endif
1945   return Lstream_close (str->other_end);
1946 }
1947
1948 Lisp_Object
1949 decoding_stream_coding_system (Lstream *stream)
1950 {
1951   Lisp_Object coding_system;
1952   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1953
1954   XSETCODING_SYSTEM (coding_system, str->codesys);
1955   return subsidiary_coding_system (coding_system, str->eol_type);
1956 }
1957
1958 void
1959 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1960 {
1961   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1962   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1963   str->codesys = cs;
1964   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1965     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1966   reset_decoding_stream (str);
1967 }
1968
1969 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
1970    stream for writing, no automatic code detection will be performed.
1971    The reason for this is that automatic code detection requires a
1972    seekable input.  Things will also fail if you open a decoding
1973    stream for reading using a non-fully-specified coding system and
1974    a non-seekable input stream. */
1975
1976 static Lisp_Object
1977 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1978                         CONST char *mode)
1979 {
1980   Lstream *lstr = Lstream_new (lstream_decoding, mode);
1981   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1982   Lisp_Object obj;
1983
1984   xzero (*str);
1985   str->other_end = stream;
1986   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1987   str->eol_type = EOL_AUTODETECT;
1988   if (!strcmp (mode, "r")
1989       && Lstream_seekable_p (stream))
1990     /* We can determine the coding system now. */
1991     determine_real_coding_system (stream, &codesys, &str->eol_type);
1992   set_decoding_stream_coding_system (lstr, codesys);
1993   str->decst.eol_type = str->eol_type;
1994   str->decst.mask = ~0;
1995   XSETLSTREAM (obj, lstr);
1996   return obj;
1997 }
1998
1999 Lisp_Object
2000 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2001 {
2002   return make_decoding_stream_1 (stream, codesys, "r");
2003 }
2004
2005 Lisp_Object
2006 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2007 {
2008   return make_decoding_stream_1 (stream, codesys, "w");
2009 }
2010
2011 /* Note: the decode_coding_* functions all take the same
2012    arguments as mule_decode(), which is to say some SRC data of
2013    size N, which is to be stored into dynamic array DST.
2014    DECODING is the stream within which the decoding is
2015    taking place, but no data is actually read from or
2016    written to that stream; that is handled in decoding_reader()
2017    or decoding_writer().  This allows the same functions to
2018    be used for both reading and writing. */
2019
2020 static void
2021 mule_decode (Lstream *decoding, CONST unsigned char *src,
2022              unsigned_char_dynarr *dst, unsigned int n)
2023 {
2024   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2025
2026   /* If necessary, do encoding-detection now.  We do this when
2027      we're a writing stream or a non-seekable reading stream,
2028      meaning that we can't just process the whole input,
2029      rewind, and start over. */
2030
2031   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2032       str->eol_type == EOL_AUTODETECT)
2033     {
2034       Lisp_Object codesys;
2035
2036       XSETCODING_SYSTEM (codesys, str->codesys);
2037       detect_coding_type (&str->decst, src, n,
2038                           CODING_SYSTEM_TYPE (str->codesys) !=
2039                           CODESYS_AUTODETECT);
2040       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2041           str->decst.mask != ~0)
2042         /* #### This is cheesy.  What we really ought to do is
2043            buffer up a certain amount of data so as to get a
2044            less random result. */
2045         codesys = coding_system_from_mask (str->decst.mask);
2046       str->eol_type = str->decst.eol_type;
2047       if (XCODING_SYSTEM (codesys) != str->codesys)
2048         {
2049           /* Preserve the CODING_STATE_END flag in case it was set.
2050              If we erase it, bad things might happen. */
2051           int was_end = str->flags & CODING_STATE_END;
2052           set_decoding_stream_coding_system (decoding, codesys);
2053           if (was_end)
2054             str->flags |= CODING_STATE_END;
2055         }
2056     }
2057
2058   switch (CODING_SYSTEM_TYPE (str->codesys))
2059     {
2060 #ifdef DEBUG_XEMACS
2061     case CODESYS_INTERNAL:
2062       Dynarr_add_many (dst, src, n);
2063       break;
2064 #endif
2065     case CODESYS_AUTODETECT:
2066       /* If we got this far and still haven't decided on the coding
2067          system, then do no conversion. */
2068     case CODESYS_NO_CONVERSION:
2069       decode_coding_no_conversion (decoding, src, dst, n);
2070       break;
2071 #ifdef MULE
2072     case CODESYS_SHIFT_JIS:
2073       decode_coding_sjis (decoding, src, dst, n);
2074       break;
2075     case CODESYS_BIG5:
2076       decode_coding_big5 (decoding, src, dst, n);
2077       break;
2078     case CODESYS_UCS4:
2079       decode_coding_ucs4 (decoding, src, dst, n);
2080       break;
2081     case CODESYS_UTF8:
2082       decode_coding_utf8 (decoding, src, dst, n);
2083       break;
2084     case CODESYS_CCL:
2085       ccl_driver (&str->ccl, src, dst, n, 0);
2086       break;
2087     case CODESYS_ISO2022:
2088       decode_coding_iso2022 (decoding, src, dst, n);
2089       break;
2090 #endif /* MULE */
2091     default:
2092       abort ();
2093     }
2094 }
2095
2096 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2097 Decode the text between START and END which is encoded in CODING-SYSTEM.
2098 This is useful if you've read in encoded text from a file without decoding
2099 it (e.g. you read in a JIS-formatted file but used the `binary' or
2100 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2101 Return length of decoded text.
2102 BUFFER defaults to the current buffer if unspecified.
2103 */
2104        (start, end, coding_system, buffer))
2105 {
2106   Bufpos b, e;
2107   struct buffer *buf = decode_buffer (buffer, 0);
2108   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2109   Lstream *istr, *ostr;
2110   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2111
2112   get_buffer_range_char (buf, start, end, &b, &e, 0);
2113
2114   barf_if_buffer_read_only (buf, b, e);
2115
2116   coding_system = Fget_coding_system (coding_system);
2117   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2118   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2119   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2120                                               coding_system);
2121   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2122                                            Fget_coding_system (Qbinary));
2123   istr = XLSTREAM (instream);
2124   ostr = XLSTREAM (outstream);
2125   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2126
2127   /* The chain of streams looks like this:
2128
2129      [BUFFER] <----- send through
2130                      ------> [ENCODE AS BINARY]
2131                              ------> [DECODE AS SPECIFIED]
2132                                      ------> [BUFFER]
2133    */
2134
2135   while (1)
2136     {
2137       char tempbuf[1024]; /* some random amount */
2138       Bufpos newpos, even_newer_pos;
2139       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2140       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2141
2142       if (!size_in_bytes)
2143         break;
2144       newpos = lisp_buffer_stream_startpos (istr);
2145       Lstream_write (ostr, tempbuf, size_in_bytes);
2146       even_newer_pos = lisp_buffer_stream_startpos (istr);
2147       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2148                            even_newer_pos, 0);
2149     }
2150   Lstream_close (istr);
2151   Lstream_close (ostr);
2152   UNGCPRO;
2153   Lstream_delete (istr);
2154   Lstream_delete (ostr);
2155   Lstream_delete (XLSTREAM (de_outstream));
2156   Lstream_delete (XLSTREAM (lb_outstream));
2157   return Qnil;
2158 }
2159
2160 \f
2161 /************************************************************************/
2162 /*           Converting to an external encoding ("encoding")            */
2163 /************************************************************************/
2164
2165 /* An encoding stream is an output stream.  When you create the
2166    stream, you specify the coding system that governs the encoding
2167    and another stream that the resulting encoded data is to be
2168    sent to, and then start sending data to it. */
2169
2170 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2171
2172 struct encoding_stream
2173 {
2174   /* Coding system that governs the conversion. */
2175   Lisp_Coding_System *codesys;
2176
2177   /* Stream that we read the encoded data from or
2178      write the decoded data to. */
2179   Lstream *other_end;
2180
2181   /* If we are reading, then we can return only a fixed amount of
2182      data, so if the conversion resulted in too much data, we store it
2183      here for retrieval the next time around. */
2184   unsigned_char_dynarr *runoff;
2185
2186   /* FLAGS holds flags indicating the current state of the encoding.
2187      Some of these flags are dependent on the coding system. */
2188   unsigned int flags;
2189
2190   /* CH holds a partially built-up character.  Since we only deal
2191      with one- and two-byte characters at the moment, we only use
2192      this to store the first byte of a two-byte character. */
2193   unsigned int ch;
2194 #ifdef MULE
2195   /* Additional information used by the ISO2022 encoder. */
2196   struct
2197     {
2198       /* CHARSET holds the character sets currently assigned to the G0
2199          through G3 registers.  It is initialized from the array
2200          INITIAL_CHARSET in CODESYS. */
2201       Lisp_Object charset[4];
2202
2203       /* Which registers are currently invoked into the left (GL) and
2204          right (GR) halves of the 8-bit encoding space? */
2205       int register_left, register_right;
2206
2207       /* Whether we need to explicitly designate the charset in the
2208          G? register before using it.  It is initialized from the
2209          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2210       unsigned char force_charset_on_output[4];
2211
2212       /* Other state variables that need to be preserved across
2213          invocations. */
2214       Lisp_Object current_charset;
2215       int current_half;
2216       int current_char_boundary;
2217     } iso2022;
2218
2219   /* Additional information (the state of the running CCL program)
2220      used by the CCL encoder. */
2221   struct ccl_program ccl;
2222 #endif /* MULE */
2223 };
2224
2225 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2226 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2227                             size_t size);
2228 static int encoding_rewinder   (Lstream *stream);
2229 static int encoding_seekable_p (Lstream *stream);
2230 static int encoding_flusher    (Lstream *stream);
2231 static int encoding_closer     (Lstream *stream);
2232
2233 static Lisp_Object encoding_marker (Lisp_Object stream,
2234                                     void (*markobj) (Lisp_Object));
2235
2236 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2237                                sizeof (struct encoding_stream));
2238
2239 static Lisp_Object
2240 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2241 {
2242   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2243   Lisp_Object str_obj;
2244
2245   /* We do not need to mark the coding systems or charsets stored
2246      within the stream because they are stored in a global list
2247      and automatically marked. */
2248
2249   XSETLSTREAM (str_obj, str);
2250   markobj (str_obj);
2251   if (str->imp->marker)
2252     return (str->imp->marker) (str_obj, markobj);
2253   else
2254     return Qnil;
2255 }
2256
2257 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2258    so we read data from the other end, encode it, and store it into DATA. */
2259
2260 static int
2261 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2262 {
2263   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2264   unsigned char *orig_data = data;
2265   int read_size;
2266   int error_occurred = 0;
2267
2268   /* We need to interface to mule_encode(), which expects to take some
2269      amount of data and store the result into a Dynarr.  We have
2270      mule_encode() store into str->runoff, and take data from there
2271      as necessary. */
2272
2273   /* We loop until we have enough data, reading chunks from the other
2274      end and encoding it. */
2275   while (1)
2276     {
2277       /* Take data from the runoff if we can.  Make sure to take at
2278          most SIZE bytes, and delete the data from the runoff. */
2279       if (Dynarr_length (str->runoff) > 0)
2280         {
2281           int chunk = min ((int) size, Dynarr_length (str->runoff));
2282           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2283           Dynarr_delete_many (str->runoff, 0, chunk);
2284           data += chunk;
2285           size -= chunk;
2286         }
2287
2288       if (size == 0)
2289         break; /* No more room for data */
2290
2291       if (str->flags & CODING_STATE_END)
2292         /* This means that on the previous iteration, we hit the EOF on
2293            the other end.  We loop once more so that mule_encode() can
2294            output any final stuff it may be holding, or any "go back
2295            to a sane state" escape sequences. (This latter makes sense
2296            during encoding.) */
2297         break;
2298
2299       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2300          left of storage in it, so it's OK to read directly into it.
2301          (We'll be overwriting above, after we've encoded it into the
2302          runoff.) */
2303       read_size = Lstream_read (str->other_end, data, size);
2304       if (read_size < 0)
2305         {
2306           error_occurred = 1;
2307           break;
2308         }
2309       if (read_size == 0)
2310         /* There might be some more end data produced in the translation.
2311            See the comment above. */
2312         str->flags |= CODING_STATE_END;
2313       mule_encode (stream, data, str->runoff, read_size);
2314     }
2315
2316   if (data == orig_data)
2317     return error_occurred ? -1 : 0;
2318   else
2319     return data - orig_data;
2320 }
2321
2322 static int
2323 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2324 {
2325   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2326   int retval;
2327
2328   /* Encode all our data into the runoff, and then attempt to write
2329      it all out to the other end.  Remove whatever chunk we succeeded
2330      in writing. */
2331   mule_encode (stream, data, str->runoff, size);
2332   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2333                           Dynarr_length (str->runoff));
2334   if (retval > 0)
2335     Dynarr_delete_many (str->runoff, 0, retval);
2336   /* Do NOT return retval.  The return value indicates how much
2337      of the incoming data was written, not how many bytes were
2338      written. */
2339   return size;
2340 }
2341
2342 static void
2343 reset_encoding_stream (struct encoding_stream *str)
2344 {
2345 #ifdef MULE
2346   switch (CODING_SYSTEM_TYPE (str->codesys))
2347     {
2348     case CODESYS_ISO2022:
2349       {
2350         int i;
2351
2352         for (i = 0; i < 4; i++)
2353           {
2354             str->iso2022.charset[i] =
2355               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2356             str->iso2022.force_charset_on_output[i] =
2357               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2358           }
2359         str->iso2022.register_left = 0;
2360         str->iso2022.register_right = 1;
2361         str->iso2022.current_charset = Qnil;
2362         str->iso2022.current_half = 0;
2363         str->iso2022.current_char_boundary = 1;
2364         break;
2365       }
2366     case CODESYS_CCL:
2367       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2368       break;
2369     default:
2370       break;
2371     }
2372 #endif /* MULE */
2373
2374   str->flags = str->ch = 0;
2375 }
2376
2377 static int
2378 encoding_rewinder (Lstream *stream)
2379 {
2380   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2381   reset_encoding_stream (str);
2382   Dynarr_reset (str->runoff);
2383   return Lstream_rewind (str->other_end);
2384 }
2385
2386 static int
2387 encoding_seekable_p (Lstream *stream)
2388 {
2389   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2390   return Lstream_seekable_p (str->other_end);
2391 }
2392
2393 static int
2394 encoding_flusher (Lstream *stream)
2395 {
2396   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2397   return Lstream_flush (str->other_end);
2398 }
2399
2400 static int
2401 encoding_closer (Lstream *stream)
2402 {
2403   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2404   if (stream->flags & LSTREAM_FL_WRITE)
2405     {
2406       str->flags |= CODING_STATE_END;
2407       encoding_writer (stream, 0, 0);
2408     }
2409   Dynarr_free (str->runoff);
2410   return Lstream_close (str->other_end);
2411 }
2412
2413 Lisp_Object
2414 encoding_stream_coding_system (Lstream *stream)
2415 {
2416   Lisp_Object coding_system;
2417   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2418
2419   XSETCODING_SYSTEM (coding_system, str->codesys);
2420   return coding_system;
2421 }
2422
2423 void
2424 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2425 {
2426   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2427   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2428   str->codesys = cs;
2429   reset_encoding_stream (str);
2430 }
2431
2432 static Lisp_Object
2433 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2434                         CONST char *mode)
2435 {
2436   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2437   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2438   Lisp_Object obj;
2439
2440   xzero (*str);
2441   str->runoff = Dynarr_new (unsigned_char);
2442   str->other_end = stream;
2443   set_encoding_stream_coding_system (lstr, codesys);
2444   XSETLSTREAM (obj, lstr);
2445   return obj;
2446 }
2447
2448 Lisp_Object
2449 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2450 {
2451   return make_encoding_stream_1 (stream, codesys, "r");
2452 }
2453
2454 Lisp_Object
2455 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2456 {
2457   return make_encoding_stream_1 (stream, codesys, "w");
2458 }
2459
2460 /* Convert N bytes of internally-formatted data stored in SRC to an
2461    external format, according to the encoding stream ENCODING.
2462    Store the encoded data into DST. */
2463
2464 static void
2465 mule_encode (Lstream *encoding, CONST unsigned char *src,
2466              unsigned_char_dynarr *dst, unsigned int n)
2467 {
2468   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2469
2470   switch (CODING_SYSTEM_TYPE (str->codesys))
2471     {
2472 #ifdef DEBUG_XEMACS
2473     case CODESYS_INTERNAL:
2474       Dynarr_add_many (dst, src, n);
2475       break;
2476 #endif
2477     case CODESYS_AUTODETECT:
2478       /* If we got this far and still haven't decided on the coding
2479          system, then do no conversion. */
2480     case CODESYS_NO_CONVERSION:
2481       encode_coding_no_conversion (encoding, src, dst, n);
2482       break;
2483 #ifdef MULE
2484     case CODESYS_SHIFT_JIS:
2485       encode_coding_sjis (encoding, src, dst, n);
2486       break;
2487     case CODESYS_BIG5:
2488       encode_coding_big5 (encoding, src, dst, n);
2489       break;
2490     case CODESYS_UCS4:
2491       encode_coding_ucs4 (encoding, src, dst, n);
2492       break;
2493     case CODESYS_UTF8:
2494       encode_coding_utf8 (encoding, src, dst, n);
2495       break;
2496     case CODESYS_CCL:
2497       ccl_driver (&str->ccl, src, dst, n, 0);
2498       break;
2499     case CODESYS_ISO2022:
2500       encode_coding_iso2022 (encoding, src, dst, n);
2501       break;
2502 #endif /* MULE */
2503     default:
2504       abort ();
2505     }
2506 }
2507
2508 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2509 Encode the text between START and END using CODING-SYSTEM.
2510 This will, for example, convert Japanese characters into stuff such as
2511 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2512 text.  BUFFER defaults to the current buffer if unspecified.
2513 */
2514        (start, end, coding_system, buffer))
2515 {
2516   Bufpos b, e;
2517   struct buffer *buf = decode_buffer (buffer, 0);
2518   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2519   Lstream *istr, *ostr;
2520   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2521
2522   get_buffer_range_char (buf, start, end, &b, &e, 0);
2523
2524   barf_if_buffer_read_only (buf, b, e);
2525
2526   coding_system = Fget_coding_system (coding_system);
2527   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2528   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2529   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2530                                               Fget_coding_system (Qbinary));
2531   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2532                                            coding_system);
2533   istr = XLSTREAM (instream);
2534   ostr = XLSTREAM (outstream);
2535   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2536   /* The chain of streams looks like this:
2537
2538      [BUFFER] <----- send through
2539                      ------> [ENCODE AS SPECIFIED]
2540                              ------> [DECODE AS BINARY]
2541                                      ------> [BUFFER]
2542    */
2543   while (1)
2544     {
2545       char tempbuf[1024]; /* some random amount */
2546       Bufpos newpos, even_newer_pos;
2547       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2548       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2549
2550       if (!size_in_bytes)
2551         break;
2552       newpos = lisp_buffer_stream_startpos (istr);
2553       Lstream_write (ostr, tempbuf, size_in_bytes);
2554       even_newer_pos = lisp_buffer_stream_startpos (istr);
2555       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2556                            even_newer_pos, 0);
2557     }
2558
2559   {
2560     Charcount retlen =
2561       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2562     Lstream_close (istr);
2563     Lstream_close (ostr);
2564     UNGCPRO;
2565     Lstream_delete (istr);
2566     Lstream_delete (ostr);
2567     Lstream_delete (XLSTREAM (de_outstream));
2568     Lstream_delete (XLSTREAM (lb_outstream));
2569     return make_int (retlen);
2570   }
2571 }
2572
2573 #ifdef MULE
2574 \f
2575 /************************************************************************/
2576 /*                          Shift-JIS methods                           */
2577 /************************************************************************/
2578
2579 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2580    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2581    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2582    encoded by "position-code + 0x80".  A character of JISX0208
2583    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2584    position-codes are divided and shifted so that it fit in the range
2585    below.
2586
2587    --- CODE RANGE of Shift-JIS ---
2588    (character set)      (range)
2589    ASCII                0x00 .. 0x7F
2590    JISX0201-Kana        0xA0 .. 0xDF
2591    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2592             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2593    -------------------------------
2594
2595 */
2596
2597 /* Is this the first byte of a Shift-JIS two-byte char? */
2598
2599 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2600   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2601
2602 /* Is this the second byte of a Shift-JIS two-byte char? */
2603
2604 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2605   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2606
2607 #define BYTE_SJIS_KATAKANA_P(c) \
2608   ((c) >= 0xA1 && (c) <= 0xDF)
2609
2610 static int
2611 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2612                     unsigned int n)
2613 {
2614   int c;
2615
2616   while (n--)
2617     {
2618       c = *src++;
2619       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2620         return 0;
2621       if (st->shift_jis.in_second_byte)
2622         {
2623           st->shift_jis.in_second_byte = 0;
2624           if (c < 0x40)
2625             return 0;
2626         }
2627       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2628         st->shift_jis.in_second_byte = 1;
2629     }
2630   return CODING_CATEGORY_SHIFT_JIS_MASK;
2631 }
2632
2633 /* Convert Shift-JIS data to internal format. */
2634
2635 static void
2636 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2637                     unsigned_char_dynarr *dst, unsigned int n)
2638 {
2639   unsigned char c;
2640   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2641   unsigned int flags  = str->flags;
2642   unsigned int ch     = str->ch;
2643   eol_type_t eol_type = str->eol_type;
2644
2645   while (n--)
2646     {
2647       c = *src++;
2648
2649       if (ch)
2650         {
2651           /* Previous character was first byte of Shift-JIS Kanji char. */
2652           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2653             {
2654               unsigned char e1, e2;
2655
2656               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2657               DECODE_SJIS (ch, c, e1, e2);
2658               Dynarr_add (dst, e1);
2659               Dynarr_add (dst, e2);
2660             }
2661           else
2662             {
2663               DECODE_ADD_BINARY_CHAR (ch, dst);
2664               DECODE_ADD_BINARY_CHAR (c, dst);
2665             }
2666           ch = 0;
2667         }
2668       else
2669         {
2670           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2671           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2672             ch = c;
2673           else if (BYTE_SJIS_KATAKANA_P (c))
2674             {
2675               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2676               Dynarr_add (dst, c);
2677             }
2678           else
2679             DECODE_ADD_BINARY_CHAR (c, dst);
2680         }
2681     label_continue_loop:;
2682     }
2683
2684   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2685
2686   str->flags = flags;
2687   str->ch    = ch;
2688 }
2689
2690 /* Convert internally-formatted data to Shift-JIS. */
2691
2692 static void
2693 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2694                     unsigned_char_dynarr *dst, unsigned int n)
2695 {
2696   unsigned char c;
2697   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2698   unsigned int flags  = str->flags;
2699   unsigned int ch     = str->ch;
2700   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2701
2702   while (n--)
2703     {
2704       c = *src++;
2705       if (c == '\n')
2706         {
2707           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2708             Dynarr_add (dst, '\r');
2709           if (eol_type != EOL_CR)
2710             Dynarr_add (dst, '\n');
2711           ch = 0;
2712         }
2713       else if (BYTE_ASCII_P (c))
2714         {
2715           Dynarr_add (dst, c);
2716           ch = 0;
2717         }
2718       else if (BUFBYTE_LEADING_BYTE_P (c))
2719         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2720               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2721               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2722       else if (ch)
2723         {
2724           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2725             {
2726               Dynarr_add (dst, c);
2727               ch = 0;
2728             }
2729           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2730                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2731             ch = c;
2732           else
2733             {
2734               unsigned char j1, j2;
2735               ENCODE_SJIS (ch, c, j1, j2);
2736               Dynarr_add (dst, j1);
2737               Dynarr_add (dst, j2);
2738               ch = 0;
2739             }
2740         }
2741     }
2742
2743   str->flags = flags;
2744   str->ch    = ch;
2745 }
2746
2747 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2748 Decode a JISX0208 character of Shift-JIS coding-system.
2749 CODE is the character code in Shift-JIS as a cons of type bytes.
2750 Return the corresponding character.
2751 */
2752        (code))
2753 {
2754   unsigned char c1, c2, s1, s2;
2755
2756   CHECK_CONS (code);
2757   CHECK_INT (XCAR (code));
2758   CHECK_INT (XCDR (code));
2759   s1 = XINT (XCAR (code));
2760   s2 = XINT (XCDR (code));
2761   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2762       BYTE_SJIS_TWO_BYTE_2_P (s2))
2763     {
2764       DECODE_SJIS (s1, s2, c1, c2);
2765       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2766                                    c1 & 0x7F, c2 & 0x7F));
2767     }
2768   else
2769     return Qnil;
2770 }
2771
2772 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2773 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2774 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2775 */
2776        (ch))
2777 {
2778   Lisp_Object charset;
2779   int c1, c2, s1, s2;
2780
2781   CHECK_CHAR_COERCE_INT (ch);
2782   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2783   if (EQ (charset, Vcharset_japanese_jisx0208))
2784     {
2785       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2786       return Fcons (make_int (s1), make_int (s2));
2787     }
2788   else
2789     return Qnil;
2790 }
2791
2792 \f
2793 /************************************************************************/
2794 /*                            Big5 methods                              */
2795 /************************************************************************/
2796
2797 /* BIG5 is a coding system encoding two character sets: ASCII and
2798    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2799    character set and is encoded in two-byte.
2800
2801    --- CODE RANGE of BIG5 ---
2802    (character set)      (range)
2803    ASCII                0x00 .. 0x7F
2804    Big5 (1st byte)      0xA1 .. 0xFE
2805         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2806    --------------------------
2807
2808    Since the number of characters in Big5 is larger than maximum
2809    characters in Emacs' charset (96x96), it can't be handled as one
2810    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2811    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
2812    contains frequently used characters and the latter contains less
2813    frequently used characters.  */
2814
2815 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2816   ((c) >= 0xA1 && (c) <= 0xFE)
2817
2818 /* Is this the second byte of a Shift-JIS two-byte char? */
2819
2820 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2821   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2822
2823 /* Number of Big5 characters which have the same code in 1st byte.  */
2824
2825 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2826
2827 /* Code conversion macros.  These are macros because they are used in
2828    inner loops during code conversion.
2829
2830    Note that temporary variables in macros introduce the classic
2831    dynamic-scoping problems with variable names.  We use capital-
2832    lettered variables in the assumption that XEmacs does not use
2833    capital letters in variables except in a very formalized way
2834    (e.g. Qstring). */
2835
2836 /* Convert Big5 code (b1, b2) into its internal string representation
2837    (lb, c1, c2). */
2838
2839 /* There is a much simpler way to split the Big5 charset into two.
2840    For the moment I'm going to leave the algorithm as-is because it
2841    claims to separate out the most-used characters into a single
2842    charset, which perhaps will lead to optimizations in various
2843    places.
2844
2845    The way the algorithm works is something like this:
2846
2847    Big5 can be viewed as a 94x157 charset, where the row is
2848    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2849    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
2850    the split between low and high column numbers is apparently
2851    meaningless; ascending rows produce less and less frequent chars.
2852    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2853    the first charset, and the upper half (0xC9 .. 0xFE) to the
2854    second.  To do the conversion, we convert the character into
2855    a single number where 0 .. 156 is the first row, 157 .. 313
2856    is the second, etc.  That way, the characters are ordered by
2857    decreasing frequency.  Then we just chop the space in two
2858    and coerce the result into a 94x94 space.
2859    */
2860
2861 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
2862 {                                                                       \
2863   int B1 = b1, B2 = b2;                                                 \
2864   unsigned int I                                                        \
2865     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
2866                                                                         \
2867   if (B1 < 0xC9)                                                        \
2868     {                                                                   \
2869       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
2870     }                                                                   \
2871   else                                                                  \
2872     {                                                                   \
2873       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
2874       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
2875     }                                                                   \
2876   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
2877   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
2878 } while (0)
2879
2880 /* Convert the internal string representation of a Big5 character
2881    (lb, c1, c2) into Big5 code (b1, b2). */
2882
2883 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
2884 {                                                                       \
2885   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
2886                                                                         \
2887   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
2888     {                                                                   \
2889       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
2890     }                                                                   \
2891   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
2892   b2 = I % BIG5_SAME_ROW;                                               \
2893   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
2894 } while (0)
2895
2896 static int
2897 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2898                     unsigned int n)
2899 {
2900   int c;
2901
2902   while (n--)
2903     {
2904       c = *src++;
2905       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2906           (c >= 0x80 && c <= 0xA0))
2907         return 0;
2908       if (st->big5.in_second_byte)
2909         {
2910           st->big5.in_second_byte = 0;
2911           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2912             return 0;
2913         }
2914       else if (c >= 0xA1)
2915         st->big5.in_second_byte = 1;
2916     }
2917   return CODING_CATEGORY_BIG5_MASK;
2918 }
2919
2920 /* Convert Big5 data to internal format. */
2921
2922 static void
2923 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2924                     unsigned_char_dynarr *dst, unsigned int n)
2925 {
2926   unsigned char c;
2927   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2928   unsigned int flags  = str->flags;
2929   unsigned int ch     = str->ch;
2930   eol_type_t eol_type = str->eol_type;
2931
2932   while (n--)
2933     {
2934       c = *src++;
2935       if (ch)
2936         {
2937           /* Previous character was first byte of Big5 char. */
2938           if (BYTE_BIG5_TWO_BYTE_2_P (c))
2939             {
2940               unsigned char b1, b2, b3;
2941               DECODE_BIG5 (ch, c, b1, b2, b3);
2942               Dynarr_add (dst, b1);
2943               Dynarr_add (dst, b2);
2944               Dynarr_add (dst, b3);
2945             }
2946           else
2947             {
2948               DECODE_ADD_BINARY_CHAR (ch, dst);
2949               DECODE_ADD_BINARY_CHAR (c, dst);
2950             }
2951           ch = 0;
2952         }
2953       else
2954         {
2955           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2956           if (BYTE_BIG5_TWO_BYTE_1_P (c))
2957             ch = c;
2958           else
2959             DECODE_ADD_BINARY_CHAR (c, dst);
2960         }
2961     label_continue_loop:;
2962     }
2963
2964   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2965
2966   str->flags = flags;
2967   str->ch    = ch;
2968 }
2969
2970 /* Convert internally-formatted data to Big5. */
2971
2972 static void
2973 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2974                     unsigned_char_dynarr *dst, unsigned int n)
2975 {
2976   unsigned char c;
2977   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2978   unsigned int flags  = str->flags;
2979   unsigned int ch     = str->ch;
2980   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2981
2982   while (n--)
2983     {
2984       c = *src++;
2985       if (c == '\n')
2986         {
2987           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2988             Dynarr_add (dst, '\r');
2989           if (eol_type != EOL_CR)
2990             Dynarr_add (dst, '\n');
2991         }
2992       else if (BYTE_ASCII_P (c))
2993         {
2994           /* ASCII. */
2995           Dynarr_add (dst, c);
2996         }
2997       else if (BUFBYTE_LEADING_BYTE_P (c))
2998         {
2999           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3000               c == LEADING_BYTE_CHINESE_BIG5_2)
3001             {
3002               /* A recognized leading byte. */
3003               ch = c;
3004               continue; /* not done with this character. */
3005             }
3006           /* otherwise just ignore this character. */
3007         }
3008       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3009                ch == LEADING_BYTE_CHINESE_BIG5_2)
3010         {
3011           /* Previous char was a recognized leading byte. */
3012           ch = (ch << 8) | c;
3013           continue; /* not done with this character. */
3014         }
3015       else if (ch)
3016         {
3017           /* Encountering second byte of a Big5 character. */
3018           unsigned char b1, b2;
3019
3020           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3021           Dynarr_add (dst, b1);
3022           Dynarr_add (dst, b2);
3023         }
3024
3025       ch = 0;
3026     }
3027
3028   str->flags = flags;
3029   str->ch    = ch;
3030 }
3031
3032
3033 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3034 Decode a Big5 character CODE of BIG5 coding-system.
3035 CODE is the character code in BIG5, a cons of two integers.
3036 Return the corresponding character.
3037 */
3038        (code))
3039 {
3040   unsigned char c1, c2, b1, b2;
3041
3042   CHECK_CONS (code);
3043   CHECK_INT (XCAR (code));
3044   CHECK_INT (XCDR (code));
3045   b1 = XINT (XCAR (code));
3046   b2 = XINT (XCDR (code));
3047   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3048       BYTE_BIG5_TWO_BYTE_2_P (b2))
3049     {
3050       int leading_byte;
3051       Lisp_Object charset;
3052       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3053       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3054       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3055     }
3056   else
3057     return Qnil;
3058 }
3059
3060 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3061 Encode the Big5 character CH to BIG5 coding-system.
3062 Return the corresponding character code in Big5.
3063 */
3064        (ch))
3065 {
3066   Lisp_Object charset;
3067   int c1, c2, b1, b2;
3068
3069   CHECK_CHAR_COERCE_INT (ch);
3070   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3071   if (EQ (charset, Vcharset_chinese_big5_1) ||
3072       EQ (charset, Vcharset_chinese_big5_2))
3073     {
3074       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3075                    b1, b2);
3076       return Fcons (make_int (b1), make_int (b2));
3077     }
3078   else
3079     return Qnil;
3080 }
3081
3082 \f
3083 /************************************************************************/
3084 /*                           UCS-4 methods                              */
3085 /*                                                                      */
3086 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3087 /*                                                                      */
3088 /************************************************************************/
3089
3090 Lisp_Object ucs_to_mule_table[65536];
3091 Lisp_Object mule_to_ucs_table;
3092
3093 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3094 Map UCS-4 code CODE to Mule character CHARACTER.
3095
3096 Return T on success, NIL on failure.
3097 */
3098        (code, character))
3099 {
3100   unsigned int c;
3101
3102   CHECK_CHAR (character);
3103   CHECK_INT (code);
3104   c = XINT (code);
3105
3106   if (c < sizeof (ucs_to_mule_table))
3107     {
3108       ucs_to_mule_table[c] = character;
3109       return Qt;
3110     }
3111   else
3112     return Qnil;
3113 }
3114
3115 static Lisp_Object
3116 ucs_to_char (unsigned long code)
3117 {
3118   if (code < sizeof (ucs_to_mule_table))
3119     {
3120       return ucs_to_mule_table[code];
3121     }
3122   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3123     {
3124       unsigned int c;
3125
3126       code -= 0xe00000;
3127       c = code % (94 * 94);
3128       return make_char
3129         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3130                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3131                      CHARSET_LEFT_TO_RIGHT),
3132                     c / 94 + 33, c % 94 + 33));
3133     }
3134   else
3135     return Qnil;
3136 }
3137
3138 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3139 Return Mule character corresponding to UCS code CODE (a positive integer).
3140 */
3141        (code))
3142 {
3143   CHECK_NATNUM (code);
3144   return ucs_to_char (XINT (code));
3145 }
3146
3147 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3148 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3149 */
3150        (character, code))
3151 {
3152   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3153           Fset_char_ucs is more restrictive on index arg, but should
3154           check code arg in a char_table method. */
3155   CHECK_CHAR (character);
3156   CHECK_NATNUM (code);
3157   return Fput_char_table (character, code, mule_to_ucs_table);
3158 }
3159
3160 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3161 Return the UCS code (a positive integer) corresponding to CHARACTER.
3162 */
3163        (character))
3164 {
3165   return Fget_char_table (character, mule_to_ucs_table);
3166 }
3167
3168 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3169    JIS X 0208 double-width `=' instead.
3170    #### do something more appropriate (use blob?)
3171         Danger, Will Robinson!  Data loss.  Should we signal user? */
3172 static void
3173 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3174 {
3175   Lisp_Object chr = ucs_to_char (ch);
3176
3177   if (! NILP (chr))
3178     {
3179       Bufbyte work[MAX_EMCHAR_LEN];
3180       int len;
3181
3182       ch = XCHAR (chr);
3183       len = (ch < 128) ?
3184         simple_set_charptr_emchar (work, ch) :
3185         non_ascii_set_charptr_emchar (work, ch);
3186       Dynarr_add_many (dst, work, len);
3187     }
3188   else
3189     {
3190       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3191       Dynarr_add (dst, 34 + 128);
3192       Dynarr_add (dst, 46 + 128);
3193     }
3194 }
3195
3196 static unsigned long
3197 mule_char_to_ucs4 (Lisp_Object charset,
3198                    unsigned char h, unsigned char l)
3199 {
3200   Lisp_Object code
3201     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3202                        mule_to_ucs_table);
3203
3204   if (INTP (code))
3205     {
3206       return XINT (code);
3207     }
3208   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3209             (XCHARSET_CHARS (charset) == 94) )
3210     {
3211       unsigned char final = XCHARSET_FINAL (charset);
3212
3213       if ( ('@' <= final) && (final < 0x7f) )
3214         {
3215           return 0xe00000 + (final - '@') * 94 * 94
3216             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3217         }
3218       else
3219         {
3220           return '?';
3221         }
3222     }
3223   else
3224     {
3225       return '?';
3226     }
3227 }
3228
3229 static void
3230 encode_ucs4 (Lisp_Object charset,
3231              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3232 {
3233   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3234   Dynarr_add (dst,  code >> 24);
3235   Dynarr_add (dst, (code >> 16) & 255);
3236   Dynarr_add (dst, (code >>  8) & 255);
3237   Dynarr_add (dst,  code        & 255);
3238 }
3239
3240 static int
3241 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3242                     unsigned int n)
3243 {
3244   while (n--)
3245     {
3246       int c = *src++;
3247       switch (st->ucs4.in_byte)
3248         {
3249         case 0:
3250           if (c >= 128)
3251             return 0;
3252           else
3253             st->ucs4.in_byte++;
3254           break;
3255         case 3:
3256           st->ucs4.in_byte = 0;
3257           break;
3258         default:
3259           st->ucs4.in_byte++;
3260         }
3261     }
3262   return CODING_CATEGORY_UCS4_MASK;
3263 }
3264
3265 static void
3266 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3267                     unsigned_char_dynarr *dst, unsigned int n)
3268 {
3269   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3270   unsigned int flags = str->flags;
3271   unsigned int ch    = str->ch;
3272
3273   while (n--)
3274     {
3275       unsigned char c = *src++;
3276       switch (flags)
3277         {
3278         case 0:
3279           ch = c;
3280           flags = 3;
3281           break;
3282         case 1:
3283           decode_ucs4 ( ( ch << 8 ) | c, dst);
3284           ch = 0;
3285           flags = 0;
3286           break;
3287         default:
3288           ch = ( ch << 8 ) | c;
3289           flags--;
3290         }
3291     }
3292   if (flags & CODING_STATE_END)
3293     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3294
3295   str->flags = flags;
3296   str->ch    = ch;
3297 }
3298
3299 static void
3300 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3301                     unsigned_char_dynarr *dst, unsigned int n)
3302 {
3303   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3304   unsigned int flags = str->flags;
3305   unsigned int ch = str->ch;
3306   unsigned char char_boundary = str->iso2022.current_char_boundary;
3307   Lisp_Object charset = str->iso2022.current_charset;
3308
3309 #ifdef ENABLE_COMPOSITE_CHARS
3310   /* flags for handling composite chars.  We do a little switcharoo
3311      on the source while we're outputting the composite char. */
3312   unsigned int saved_n = 0;
3313   CONST unsigned char *saved_src = NULL;
3314   int in_composite = 0;
3315
3316  back_to_square_n:
3317 #endif
3318
3319   while (n--)
3320     {
3321       unsigned char c = *src++;
3322
3323       if (BYTE_ASCII_P (c))
3324         {               /* Processing ASCII character */
3325           ch = 0;
3326           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3327           char_boundary = 1;
3328         }
3329       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3330         { /* Processing Leading Byte */
3331           ch = 0;
3332           charset = CHARSET_BY_LEADING_BYTE (c);
3333           if (LEADING_BYTE_PREFIX_P(c))
3334             ch = c;
3335           char_boundary = 0;
3336         }
3337       else
3338         {                       /* Processing Non-ASCII character */
3339           char_boundary = 1;
3340           if (EQ (charset, Vcharset_control_1))
3341             {
3342               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3343             }
3344           else
3345             {
3346               switch (XCHARSET_REP_BYTES (charset))
3347                 {
3348                 case 2:
3349                   encode_ucs4 (charset, c, 0, dst);
3350                   break;
3351                 case 3:
3352                   if (XCHARSET_PRIVATE_P (charset))
3353                     {
3354                       encode_ucs4 (charset, c, 0, dst);
3355                       ch = 0;
3356                     }
3357                   else if (ch)
3358                     {
3359 #ifdef ENABLE_COMPOSITE_CHARS
3360                       if (EQ (charset, Vcharset_composite))
3361                         {
3362                           if (in_composite)
3363                             {
3364                               /* #### Bother! We don't know how to
3365                                  handle this yet. */
3366                               Dynarr_add (dst, 0);
3367                               Dynarr_add (dst, 0);
3368                               Dynarr_add (dst, 0);
3369                               Dynarr_add (dst, '~');
3370                             }
3371                           else
3372                             {
3373                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3374                                                        ch & 0x7F, c & 0x7F);
3375                               Lisp_Object lstr = composite_char_string (emch);
3376                               saved_n = n;
3377                               saved_src = src;
3378                               in_composite = 1;
3379                               src = XSTRING_DATA   (lstr);
3380                               n   = XSTRING_LENGTH (lstr);
3381                             }
3382                         }
3383                       else
3384 #endif /* ENABLE_COMPOSITE_CHARS */
3385                         {
3386                           encode_ucs4(charset, ch, c, dst);
3387                         }
3388                       ch = 0;
3389                     }
3390                   else
3391                     {
3392                       ch = c;
3393                       char_boundary = 0;
3394                     }
3395                   break;
3396                 case 4:
3397                   if (ch)
3398                     {
3399                       encode_ucs4 (charset, ch, c, dst);
3400                       ch = 0;
3401                     }
3402                   else
3403                     {
3404                       ch = c;
3405                       char_boundary = 0;
3406                     }
3407                   break;
3408                 default:
3409                   abort ();
3410                 }
3411             }
3412         }
3413     }
3414
3415 #ifdef ENABLE_COMPOSITE_CHARS
3416   if (in_composite)
3417     {
3418       n = saved_n;
3419       src = saved_src;
3420       in_composite = 0;
3421       goto back_to_square_n; /* Wheeeeeeeee ..... */
3422     }
3423 #endif /* ENABLE_COMPOSITE_CHARS */
3424
3425   str->flags = flags;
3426   str->ch = ch;
3427   str->iso2022.current_char_boundary = char_boundary;
3428   str->iso2022.current_charset = charset;
3429
3430   /* Verbum caro factum est! */
3431 }
3432
3433 \f
3434 /************************************************************************/
3435 /*                           UTF-8 methods                              */
3436 /************************************************************************/
3437
3438 static int
3439 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3440                     unsigned int n)
3441 {
3442   while (n--)
3443     {
3444       unsigned char c = *src++;
3445       switch (st->utf8.in_byte)
3446         {
3447         case 0:
3448           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3449             return 0;
3450           else if (c >= 0xfc)
3451             st->utf8.in_byte = 5;
3452           else if (c >= 0xf8)
3453             st->utf8.in_byte = 4;
3454           else if (c >= 0xf0)
3455             st->utf8.in_byte = 3;
3456           else if (c >= 0xe0)
3457             st->utf8.in_byte = 2;
3458           else if (c >= 0xc0)
3459             st->utf8.in_byte = 1;
3460           else if (c >= 0x80)
3461             return 0;
3462           break;
3463         default:
3464           if ((c & 0xc0) != 0x80)
3465             return 0;
3466           else
3467             st->utf8.in_byte--;
3468         }
3469     }
3470   return CODING_CATEGORY_UTF8_MASK;
3471 }
3472
3473 static void
3474 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3475                     unsigned_char_dynarr *dst, unsigned int n)
3476 {
3477   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3478   unsigned int flags  = str->flags;
3479   unsigned int ch     = str->ch;
3480   eol_type_t eol_type = str->eol_type;
3481
3482   while (n--)
3483     {
3484       unsigned char c = *src++;
3485       switch (flags)
3486         {
3487         case 0:
3488           if ( c >= 0xfc )
3489             {
3490               ch = c & 0x01;
3491               flags = 5;
3492             }
3493           else if ( c >= 0xf8 )
3494             {
3495               ch = c & 0x03;
3496               flags = 4;
3497             }
3498           else if ( c >= 0xf0 )
3499             {
3500               ch = c & 0x07;
3501               flags = 3;
3502             }
3503           else if ( c >= 0xe0 )
3504             {
3505               ch = c & 0x0f;
3506               flags = 2;
3507             }
3508           else if ( c >= 0xc0 )
3509             {
3510               ch = c & 0x1f;
3511               flags = 1;
3512             }
3513           else
3514             {
3515               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3516               decode_ucs4 (c, dst);
3517             }
3518           break;
3519         case 1:
3520           ch = ( ch << 6 ) | ( c & 0x3f );
3521           decode_ucs4 (ch, dst);
3522           ch = 0;
3523           flags = 0;
3524           break;
3525         default:
3526           ch = ( ch << 6 ) | ( c & 0x3f );
3527           flags--;
3528         }
3529     label_continue_loop:;
3530     }
3531
3532   if (flags & CODING_STATE_END)
3533     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3534
3535   str->flags = flags;
3536   str->ch    = ch;
3537 }
3538
3539 static void
3540 encode_utf8 (Lisp_Object charset,
3541              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3542 {
3543   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3544   if ( code <= 0x7f )
3545     {
3546       Dynarr_add (dst, code);
3547     }
3548   else if ( code <= 0x7ff )
3549     {
3550       Dynarr_add (dst, (code >> 6) | 0xc0);
3551       Dynarr_add (dst, (code & 0x3f) | 0x80);
3552     }
3553   else if ( code <= 0xffff )
3554     {
3555       Dynarr_add (dst,  (code >> 12) | 0xe0);
3556       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3557       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3558     }
3559   else if ( code <= 0x1fffff )
3560     {
3561       Dynarr_add (dst,  (code >> 18) | 0xf0);
3562       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3563       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3564       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3565     }
3566   else if ( code <= 0x3ffffff )
3567     {
3568       Dynarr_add (dst,  (code >> 24) | 0xf8);
3569       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3570       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3571       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3572       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3573     }
3574   else
3575     {
3576       Dynarr_add (dst,  (code >> 30) | 0xfc);
3577       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3578       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3579       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3580       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3581       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3582     }
3583 }
3584
3585 static void
3586 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3587                     unsigned_char_dynarr *dst, unsigned int n)
3588 {
3589   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3590   unsigned int flags  = str->flags;
3591   unsigned int ch     = str->ch;
3592   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3593   unsigned char char_boundary = str->iso2022.current_char_boundary;
3594   Lisp_Object charset = str->iso2022.current_charset;
3595
3596 #ifdef ENABLE_COMPOSITE_CHARS
3597   /* flags for handling composite chars.  We do a little switcharoo
3598      on the source while we're outputting the composite char. */
3599   unsigned int saved_n = 0;
3600   CONST unsigned char *saved_src = NULL;
3601   int in_composite = 0;
3602
3603  back_to_square_n:
3604 #endif /* ENABLE_COMPOSITE_CHARS */
3605
3606   while (n--)
3607     {
3608       unsigned char c = *src++;
3609
3610       if (BYTE_ASCII_P (c))
3611         {               /* Processing ASCII character */
3612           ch = 0;
3613           if (c == '\n')
3614             {
3615               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3616                 Dynarr_add (dst, '\r');
3617               if (eol_type != EOL_CR)
3618                 Dynarr_add (dst, c);
3619             }
3620           else
3621             encode_utf8 (Vcharset_ascii, c, 0, dst);
3622           char_boundary = 1;
3623         }
3624       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3625         { /* Processing Leading Byte */
3626           ch = 0;
3627           charset = CHARSET_BY_LEADING_BYTE (c);
3628           if (LEADING_BYTE_PREFIX_P(c))
3629             ch = c;
3630           char_boundary = 0;
3631         }
3632       else
3633         {                       /* Processing Non-ASCII character */
3634           char_boundary = 1;
3635           if (EQ (charset, Vcharset_control_1))
3636             {
3637               encode_utf8 (Vcharset_control_1, c, 0, dst);
3638             }
3639           else
3640             {
3641               switch (XCHARSET_REP_BYTES (charset))
3642                 {
3643                 case 2:
3644                   encode_utf8 (charset, c, 0, dst);
3645                   break;
3646                 case 3:
3647                   if (XCHARSET_PRIVATE_P (charset))
3648                     {
3649                       encode_utf8 (charset, c, 0, dst);
3650                       ch = 0;
3651                     }
3652                   else if (ch)
3653                     {
3654 #ifdef ENABLE_COMPOSITE_CHARS
3655                       if (EQ (charset, Vcharset_composite))
3656                         {
3657                           if (in_composite)
3658                             {
3659                               /* #### Bother! We don't know how to
3660                                  handle this yet. */
3661                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
3662                             }
3663                           else
3664                             {
3665                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3666                                                        ch & 0x7F, c & 0x7F);
3667                               Lisp_Object lstr = composite_char_string (emch);
3668                               saved_n = n;
3669                               saved_src = src;
3670                               in_composite = 1;
3671                               src = XSTRING_DATA   (lstr);
3672                               n   = XSTRING_LENGTH (lstr);
3673                             }
3674                         }
3675                       else
3676 #endif /* ENABLE_COMPOSITE_CHARS */
3677                         {
3678                           encode_utf8 (charset, ch, c, dst);
3679                         }
3680                       ch = 0;
3681                     }
3682                   else
3683                     {
3684                       ch = c;
3685                       char_boundary = 0;
3686                     }
3687                   break;
3688                 case 4:
3689                   if (ch)
3690                     {
3691                       encode_utf8 (charset, ch, c, dst);
3692                       ch = 0;
3693                     }
3694                   else
3695                     {
3696                       ch = c;
3697                       char_boundary = 0;
3698                     }
3699                   break;
3700                 default:
3701                   abort ();
3702                 }
3703             }
3704         }
3705     }
3706
3707 #ifdef ENABLE_COMPOSITE_CHARS
3708   if (in_composite)
3709     {
3710       n = saved_n;
3711       src = saved_src;
3712       in_composite = 0;
3713       goto back_to_square_n; /* Wheeeeeeeee ..... */
3714     }
3715 #endif
3716
3717   str->flags = flags;
3718   str->ch    = ch;
3719   str->iso2022.current_char_boundary = char_boundary;
3720   str->iso2022.current_charset = charset;
3721
3722   /* Verbum caro factum est! */
3723 }
3724
3725 \f
3726 /************************************************************************/
3727 /*                           ISO2022 methods                            */
3728 /************************************************************************/
3729
3730 /* The following note describes the coding system ISO2022 briefly.
3731    Since the intention of this note is to help understand the
3732    functions in this file, some parts are NOT ACCURATE or OVERLY
3733    SIMPLIFIED.  For thorough understanding, please refer to the
3734    original document of ISO2022.
3735
3736    ISO2022 provides many mechanisms to encode several character sets
3737    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3738    is encoded using bytes less than 128.  This may make the encoded
3739    text a little bit longer, but the text passes more easily through
3740    several gateways, some of which strip off MSB (Most Signigant Bit).
3741
3742    There are two kinds of character sets: control character set and
3743    graphic character set.  The former contains control characters such
3744    as `newline' and `escape' to provide control functions (control
3745    functions are also provided by escape sequences).  The latter
3746    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3747    two control character sets and many graphic character sets.
3748
3749    Graphic character sets are classified into one of the following
3750    four classes, according to the number of bytes (DIMENSION) and
3751    number of characters in one dimension (CHARS) of the set:
3752    - DIMENSION1_CHARS94
3753    - DIMENSION1_CHARS96
3754    - DIMENSION2_CHARS94
3755    - DIMENSION2_CHARS96
3756
3757    In addition, each character set is assigned an identification tag,
3758    unique for each set, called "final character" (denoted as <F>
3759    hereafter).  The <F> of each character set is decided by ECMA(*)
3760    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3761    (0x30..0x3F are for private use only).
3762
3763    Note (*): ECMA = European Computer Manufacturers Association
3764
3765    Here are examples of graphic character set [NAME(<F>)]:
3766         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3767         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3768         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3769         o DIMENSION2_CHARS96 -- none for the moment
3770
3771    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3772         C0 [0x00..0x1F] -- control character plane 0
3773         GL [0x20..0x7F] -- graphic character plane 0
3774         C1 [0x80..0x9F] -- control character plane 1
3775         GR [0xA0..0xFF] -- graphic character plane 1
3776
3777    A control character set is directly designated and invoked to C0 or
3778    C1 by an escape sequence.  The most common case is that:
3779    - ISO646's  control character set is designated/invoked to C0, and
3780    - ISO6429's control character set is designated/invoked to C1,
3781    and usually these designations/invocations are omitted in encoded
3782    text.  In a 7-bit environment, only C0 can be used, and a control
3783    character for C1 is encoded by an appropriate escape sequence to
3784    fit into the environment.  All control characters for C1 are
3785    defined to have corresponding escape sequences.
3786
3787    A graphic character set is at first designated to one of four
3788    graphic registers (G0 through G3), then these graphic registers are
3789    invoked to GL or GR.  These designations and invocations can be
3790    done independently.  The most common case is that G0 is invoked to
3791    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3792    these invocations and designations are omitted in encoded text.
3793    In a 7-bit environment, only GL can be used.
3794
3795    When a graphic character set of CHARS94 is invoked to GL, codes
3796    0x20 and 0x7F of the GL area work as control characters SPACE and
3797    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3798    be used.
3799
3800    There are two ways of invocation: locking-shift and single-shift.
3801    With locking-shift, the invocation lasts until the next different
3802    invocation, whereas with single-shift, the invocation affects the
3803    following character only and doesn't affect the locking-shift
3804    state.  Invocations are done by the following control characters or
3805    escape sequences:
3806
3807    ----------------------------------------------------------------------
3808    abbrev  function                  cntrl escape seq   description
3809    ----------------------------------------------------------------------
3810    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3811    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3812    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3813    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3814    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3815    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3816    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3817    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3818    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3819    ----------------------------------------------------------------------
3820    (*) These are not used by any known coding system.
3821
3822    Control characters for these functions are defined by macros
3823    ISO_CODE_XXX in `coding.h'.
3824
3825    Designations are done by the following escape sequences:
3826    ----------------------------------------------------------------------
3827    escape sequence      description
3828    ----------------------------------------------------------------------
3829    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3830    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3831    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3832    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3833    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3834    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3835    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3836    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3837    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3838    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3839    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3840    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3841    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3842    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3843    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3844    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3845    ----------------------------------------------------------------------
3846
3847    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3848    of dimension 1, chars 94, and final character <F>, etc...
3849
3850    Note (*): Although these designations are not allowed in ISO2022,
3851    Emacs accepts them on decoding, and produces them on encoding
3852    CHARS96 character sets in a coding system which is characterized as
3853    7-bit environment, non-locking-shift, and non-single-shift.
3854
3855    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3856    '(' can be omitted.  We refer to this as "short-form" hereafter.
3857
3858    Now you may notice that there are a lot of ways for encoding the
3859    same multilingual text in ISO2022.  Actually, there exist many
3860    coding systems such as Compound Text (used in X11's inter client
3861    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3862    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3863    localized platforms), and all of these are variants of ISO2022.
3864
3865    In addition to the above, Emacs handles two more kinds of escape
3866    sequences: ISO6429's direction specification and Emacs' private
3867    sequence for specifying character composition.
3868
3869    ISO6429's direction specification takes the following form:
3870         o CSI ']'      -- end of the current direction
3871         o CSI '0' ']'  -- end of the current direction
3872         o CSI '1' ']'  -- start of left-to-right text
3873         o CSI '2' ']'  -- start of right-to-left text
3874    The control character CSI (0x9B: control sequence introducer) is
3875    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3876
3877    Character composition specification takes the following form:
3878         o ESC '0' -- start character composition
3879         o ESC '1' -- end character composition
3880    Since these are not standard escape sequences of any ISO standard,
3881    their use with these meanings is restricted to Emacs only.  */
3882
3883 static void
3884 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3885 {
3886   int i;
3887
3888   for (i = 0; i < 4; i++)
3889     {
3890       if (!NILP (coding_system))
3891         iso->charset[i] =
3892           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3893       else
3894         iso->charset[i] = Qt;
3895       iso->invalid_designated[i] = 0;
3896     }
3897   iso->esc = ISO_ESC_NOTHING;
3898   iso->esc_bytes_index = 0;
3899   iso->register_left = 0;
3900   iso->register_right = 1;
3901   iso->switched_dir_and_no_valid_charset_yet = 0;
3902   iso->invalid_switch_dir = 0;
3903   iso->output_direction_sequence = 0;
3904   iso->output_literally = 0;
3905 #ifdef ENABLE_COMPOSITE_CHARS
3906   if (iso->composite_chars)
3907     Dynarr_reset (iso->composite_chars);
3908 #endif
3909 }
3910
3911 static int
3912 fit_to_be_escape_quoted (unsigned char c)
3913 {
3914   switch (c)
3915     {
3916     case ISO_CODE_ESC:
3917     case ISO_CODE_CSI:
3918     case ISO_CODE_SS2:
3919     case ISO_CODE_SS3:
3920     case ISO_CODE_SO:
3921     case ISO_CODE_SI:
3922       return 1;
3923
3924     default:
3925       return 0;
3926     }
3927 }
3928
3929 /* Parse one byte of an ISO2022 escape sequence.
3930    If the result is an invalid escape sequence, return 0 and
3931    do not change anything in STR.  Otherwise, if the result is
3932    an incomplete escape sequence, update ISO2022.ESC and
3933    ISO2022.ESC_BYTES and return -1.  Otherwise, update
3934    all the state variables (but not ISO2022.ESC_BYTES) and
3935    return 1.
3936
3937    If CHECK_INVALID_CHARSETS is non-zero, check for designation
3938    or invocation of an invalid character set and treat that as
3939    an unrecognized escape sequence. */
3940
3941 static int
3942 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3943                    unsigned char c, unsigned int *flags,
3944                    int check_invalid_charsets)
3945 {
3946   /* (1) If we're at the end of a designation sequence, CS is the
3947      charset being designated and REG is the register to designate
3948      it to.
3949
3950      (2) If we're at the end of a locking-shift sequence, REG is
3951      the register to invoke and HALF (0 == left, 1 == right) is
3952      the half to invoke it into.
3953
3954      (3) If we're at the end of a single-shift sequence, REG is
3955      the register to invoke. */
3956   Lisp_Object cs = Qnil;
3957   int reg, half;
3958
3959   /* NOTE: This code does goto's all over the fucking place.
3960      The reason for this is that we're basically implementing
3961      a state machine here, and hierarchical languages like C
3962      don't really provide a clean way of doing this. */
3963
3964   if (! (*flags & CODING_STATE_ESCAPE))
3965     /* At beginning of escape sequence; we need to reset our
3966        escape-state variables. */
3967     iso->esc = ISO_ESC_NOTHING;
3968
3969   iso->output_literally = 0;
3970   iso->output_direction_sequence = 0;
3971
3972   switch (iso->esc)
3973     {
3974     case ISO_ESC_NOTHING:
3975       iso->esc_bytes_index = 0;
3976       switch (c)
3977         {
3978         case ISO_CODE_ESC:      /* Start escape sequence */
3979           *flags |= CODING_STATE_ESCAPE;
3980           iso->esc = ISO_ESC;
3981           goto not_done;
3982
3983         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
3984           *flags |= CODING_STATE_ESCAPE;
3985           iso->esc = ISO_ESC_5_11;
3986           goto not_done;
3987
3988         case ISO_CODE_SO:       /* locking shift 1 */
3989           reg = 1; half = 0;
3990           goto locking_shift;
3991         case ISO_CODE_SI:       /* locking shift 0 */
3992           reg = 0; half = 0;
3993           goto locking_shift;
3994
3995         case ISO_CODE_SS2:      /* single shift */
3996           reg = 2;
3997           goto single_shift;
3998         case ISO_CODE_SS3:      /* single shift */
3999           reg = 3;
4000           goto single_shift;
4001
4002         default:                        /* Other control characters */
4003           return 0;
4004         }
4005
4006     case ISO_ESC:
4007       switch (c)
4008         {
4009           /**** single shift ****/
4010
4011         case 'N':       /* single shift 2 */
4012           reg = 2;
4013           goto single_shift;
4014         case 'O':       /* single shift 3 */
4015           reg = 3;
4016           goto single_shift;
4017
4018           /**** locking shift ****/
4019
4020         case '~':       /* locking shift 1 right */
4021           reg = 1; half = 1;
4022           goto locking_shift;
4023         case 'n':       /* locking shift 2 */
4024           reg = 2; half = 0;
4025           goto locking_shift;
4026         case '}':       /* locking shift 2 right */
4027           reg = 2; half = 1;
4028           goto locking_shift;
4029         case 'o':       /* locking shift 3 */
4030           reg = 3; half = 0;
4031           goto locking_shift;
4032         case '|':       /* locking shift 3 right */
4033           reg = 3; half = 1;
4034           goto locking_shift;
4035
4036 #ifdef ENABLE_COMPOSITE_CHARS
4037           /**** composite ****/
4038
4039         case '0':
4040           iso->esc = ISO_ESC_START_COMPOSITE;
4041           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4042             CODING_STATE_COMPOSITE;
4043           return 1;
4044
4045         case '1':
4046           iso->esc = ISO_ESC_END_COMPOSITE;
4047           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4048             ~CODING_STATE_COMPOSITE;
4049           return 1;
4050 #endif /* ENABLE_COMPOSITE_CHARS */
4051
4052           /**** directionality ****/
4053
4054         case '[':
4055           iso->esc = ISO_ESC_5_11;
4056           goto not_done;
4057
4058           /**** designation ****/
4059
4060         case '$':       /* multibyte charset prefix */
4061           iso->esc = ISO_ESC_2_4;
4062           goto not_done;
4063
4064         default:
4065           if (0x28 <= c && c <= 0x2F)
4066             {
4067               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4068               goto not_done;
4069             }
4070
4071           /* This function is called with CODESYS equal to nil when
4072              doing coding-system detection. */
4073           if (!NILP (codesys)
4074               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4075               && fit_to_be_escape_quoted (c))
4076             {
4077               iso->esc = ISO_ESC_LITERAL;
4078               *flags &= CODING_STATE_ISO2022_LOCK;
4079               return 1;
4080             }
4081
4082           /* bzzzt! */
4083           return 0;
4084         }
4085
4086
4087
4088       /**** directionality ****/
4089
4090     case ISO_ESC_5_11:          /* ISO6429 direction control */
4091       if (c == ']')
4092         {
4093           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4094           goto directionality;
4095         }
4096       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4097       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4098       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4099       else               return 0;
4100       goto not_done;
4101
4102     case ISO_ESC_5_11_0:
4103       if (c == ']')
4104         {
4105           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4106           goto directionality;
4107         }
4108       return 0;
4109
4110     case ISO_ESC_5_11_1:
4111       if (c == ']')
4112         {
4113           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4114           goto directionality;
4115         }
4116       return 0;
4117
4118     case ISO_ESC_5_11_2:
4119       if (c == ']')
4120         {
4121           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4122           goto directionality;
4123         }
4124       return 0;
4125
4126     directionality:
4127       iso->esc = ISO_ESC_DIRECTIONALITY;
4128       /* Various junk here to attempt to preserve the direction sequences
4129          literally in the text if they would otherwise be swallowed due
4130          to invalid designations that don't show up as actual charset
4131          changes in the text. */
4132       if (iso->invalid_switch_dir)
4133         {
4134           /* We already inserted a direction switch literally into the
4135              text.  We assume (#### this may not be right) that the
4136              next direction switch is the one going the other way,
4137              and we need to output that literally as well. */
4138           iso->output_literally = 1;
4139           iso->invalid_switch_dir = 0;
4140         }
4141       else
4142         {
4143           int jj;
4144
4145           /* If we are in the thrall of an invalid designation,
4146            then stick the directionality sequence literally into the
4147            output stream so it ends up in the original text again. */
4148           for (jj = 0; jj < 4; jj++)
4149             if (iso->invalid_designated[jj])
4150               break;
4151           if (jj < 4)
4152             {
4153               iso->output_literally = 1;
4154               iso->invalid_switch_dir = 1;
4155             }
4156           else
4157             /* Indicate that we haven't yet seen a valid designation,
4158                so that if a switch-dir is directly followed by an
4159                invalid designation, both get inserted literally. */
4160             iso->switched_dir_and_no_valid_charset_yet = 1;
4161         }
4162       return 1;
4163
4164
4165       /**** designation ****/
4166
4167     case ISO_ESC_2_4:
4168       if (0x28 <= c && c <= 0x2F)
4169         {
4170           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4171           goto not_done;
4172         }
4173       if (0x40 <= c && c <= 0x42)
4174         {
4175           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4176                                       *flags & CODING_STATE_R2L ?
4177                                       CHARSET_RIGHT_TO_LEFT :
4178                                       CHARSET_LEFT_TO_RIGHT);
4179           reg = 0;
4180           goto designated;
4181         }
4182       return 0;
4183
4184     default:
4185       {
4186         int type =-1;
4187
4188         if (c < '0' || c > '~')
4189           return 0; /* bad final byte */
4190
4191         if (iso->esc >= ISO_ESC_2_8 &&
4192             iso->esc <= ISO_ESC_2_15)
4193           {
4194             type = ((iso->esc >= ISO_ESC_2_12) ?
4195                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4196             reg = (iso->esc - ISO_ESC_2_8) & 3;
4197           }
4198         else if (iso->esc >= ISO_ESC_2_4_8 &&
4199                  iso->esc <= ISO_ESC_2_4_15)
4200           {
4201             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4202                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4203             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4204           }
4205         else
4206           {
4207             /* Can this ever be reached? -slb */
4208             abort();
4209           }
4210
4211         cs = CHARSET_BY_ATTRIBUTES (type, c,
4212                                     *flags & CODING_STATE_R2L ?
4213                                     CHARSET_RIGHT_TO_LEFT :
4214                                     CHARSET_LEFT_TO_RIGHT);
4215         goto designated;
4216       }
4217     }
4218
4219  not_done:
4220   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4221   return -1;
4222
4223  single_shift:
4224   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4225     /* can't invoke something that ain't there. */
4226     return 0;
4227   iso->esc = ISO_ESC_SINGLE_SHIFT;
4228   *flags &= CODING_STATE_ISO2022_LOCK;
4229   if (reg == 2)
4230     *flags |= CODING_STATE_SS2;
4231   else
4232     *flags |= CODING_STATE_SS3;
4233   return 1;
4234
4235  locking_shift:
4236   if (check_invalid_charsets &&
4237       !CHARSETP (iso->charset[reg]))
4238     /* can't invoke something that ain't there. */
4239     return 0;
4240   if (half)
4241     iso->register_right = reg;
4242   else
4243     iso->register_left = reg;
4244   *flags &= CODING_STATE_ISO2022_LOCK;
4245   iso->esc = ISO_ESC_LOCKING_SHIFT;
4246   return 1;
4247
4248  designated:
4249   if (NILP (cs) && check_invalid_charsets)
4250     {
4251       iso->invalid_designated[reg] = 1;
4252       iso->charset[reg] = Vcharset_ascii;
4253       iso->esc = ISO_ESC_DESIGNATE;
4254       *flags &= CODING_STATE_ISO2022_LOCK;
4255       iso->output_literally = 1;
4256       if (iso->switched_dir_and_no_valid_charset_yet)
4257         {
4258           /* We encountered a switch-direction followed by an
4259              invalid designation.  Ensure that the switch-direction
4260              gets outputted; otherwise it will probably get eaten
4261              when the text is written out again. */
4262           iso->switched_dir_and_no_valid_charset_yet = 0;
4263           iso->output_direction_sequence = 1;
4264           /* And make sure that the switch-dir going the other
4265              way gets outputted, as well. */
4266           iso->invalid_switch_dir = 1;
4267         }
4268       return 1;
4269     }
4270   /* This function is called with CODESYS equal to nil when
4271      doing coding-system detection. */
4272   if (!NILP (codesys))
4273     {
4274       charset_conversion_spec_dynarr *dyn =
4275         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4276
4277       if (dyn)
4278         {
4279           int i;
4280
4281           for (i = 0; i < Dynarr_length (dyn); i++)
4282             {
4283               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4284               if (EQ (cs, spec->from_charset))
4285                 cs = spec->to_charset;
4286             }
4287         }
4288     }
4289
4290   iso->charset[reg] = cs;
4291   iso->esc = ISO_ESC_DESIGNATE;
4292   *flags &= CODING_STATE_ISO2022_LOCK;
4293   if (iso->invalid_designated[reg])
4294     {
4295       iso->invalid_designated[reg] = 0;
4296       iso->output_literally = 1;
4297     }
4298   if (iso->switched_dir_and_no_valid_charset_yet)
4299     iso->switched_dir_and_no_valid_charset_yet = 0;
4300   return 1;
4301 }
4302
4303 static int
4304 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4305                        unsigned int n)
4306 {
4307   int mask;
4308
4309   /* #### There are serious deficiencies in the recognition mechanism
4310      here.  This needs to be much smarter if it's going to cut it.
4311      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4312      it should be detected as Latin-1.
4313      All the ISO2022 stuff in this file should be synced up with the
4314      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4315      Perhaps we should wait till R2L works in FSF Emacs? */
4316
4317   if (!st->iso2022.initted)
4318     {
4319       reset_iso2022 (Qnil, &st->iso2022.iso);
4320       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4321                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4322                           CODING_CATEGORY_ISO_8_1_MASK |
4323                           CODING_CATEGORY_ISO_8_2_MASK |
4324                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4325       st->iso2022.flags = 0;
4326       st->iso2022.high_byte_count = 0;
4327       st->iso2022.saw_single_shift = 0;
4328       st->iso2022.initted = 1;
4329     }
4330
4331   mask = st->iso2022.mask;
4332
4333   while (n--)
4334     {
4335       int c = *src++;
4336       if (c >= 0xA0)
4337         {
4338           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4339           st->iso2022.high_byte_count++;
4340         }
4341       else
4342         {
4343           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4344             {
4345               if (st->iso2022.high_byte_count & 1)
4346                 /* odd number of high bytes; assume not iso-8-2 */
4347                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4348             }
4349           st->iso2022.high_byte_count = 0;
4350           st->iso2022.saw_single_shift = 0;
4351           if (c > 0x80)
4352             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4353         }
4354       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4355           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4356         { /* control chars */
4357           switch (c)
4358             {
4359               /* Allow and ignore control characters that you might
4360                  reasonably see in a text file */
4361             case '\r':
4362             case '\n':
4363             case '\t':
4364             case  7: /* bell */
4365             case  8: /* backspace */
4366             case 11: /* vertical tab */
4367             case 12: /* form feed */
4368             case 26: /* MS-DOS C-z junk */
4369             case 31: /* '^_' -- for info */
4370               goto label_continue_loop;
4371
4372             default:
4373               break;
4374             }
4375         }
4376
4377       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4378           || BYTE_C1_P (c))
4379         {
4380           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4381                                  &st->iso2022.flags, 0))
4382             {
4383               switch (st->iso2022.iso.esc)
4384                 {
4385                 case ISO_ESC_DESIGNATE:
4386                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4387                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4388                   break;
4389                 case ISO_ESC_LOCKING_SHIFT:
4390                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4391                   goto ran_out_of_chars;
4392                 case ISO_ESC_SINGLE_SHIFT:
4393                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4394                   st->iso2022.saw_single_shift = 1;
4395                   break;
4396                 default:
4397                   break;
4398                 }
4399             }
4400           else
4401             {
4402               mask = 0;
4403               goto ran_out_of_chars;
4404             }
4405         }
4406     label_continue_loop:;
4407     }
4408
4409  ran_out_of_chars:
4410
4411   return mask;
4412 }
4413
4414 static int
4415 postprocess_iso2022_mask (int mask)
4416 {
4417   /* #### kind of cheesy */
4418   /* If seven-bit ISO is allowed, then assume that the encoding is
4419      entirely seven-bit and turn off the eight-bit ones. */
4420   if (mask & CODING_CATEGORY_ISO_7_MASK)
4421     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4422                CODING_CATEGORY_ISO_8_1_MASK |
4423                CODING_CATEGORY_ISO_8_2_MASK);
4424   return mask;
4425 }
4426
4427 /* If FLAGS is a null pointer or specifies right-to-left motion,
4428    output a switch-dir-to-left-to-right sequence to DST.
4429    Also update FLAGS if it is not a null pointer.
4430    If INTERNAL_P is set, we are outputting in internal format and
4431    need to handle the CSI differently. */
4432
4433 static void
4434 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4435                                  unsigned_char_dynarr *dst,
4436                                  unsigned int *flags,
4437                                  int internal_p)
4438 {
4439   if (!flags || (*flags & CODING_STATE_R2L))
4440     {
4441       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4442         {
4443           Dynarr_add (dst, ISO_CODE_ESC);
4444           Dynarr_add (dst, '[');
4445         }
4446       else if (internal_p)
4447         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4448       else
4449         Dynarr_add (dst, ISO_CODE_CSI);
4450       Dynarr_add (dst, '0');
4451       Dynarr_add (dst, ']');
4452       if (flags)
4453         *flags &= ~CODING_STATE_R2L;
4454     }
4455 }
4456
4457 /* If FLAGS is a null pointer or specifies a direction different from
4458    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4459    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4460    sequence to DST.  Also update FLAGS if it is not a null pointer.
4461    If INTERNAL_P is set, we are outputting in internal format and
4462    need to handle the CSI differently. */
4463
4464 static void
4465 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4466                           unsigned_char_dynarr *dst, unsigned int *flags,
4467                           int internal_p)
4468 {
4469   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4470       direction == CHARSET_LEFT_TO_RIGHT)
4471     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4472   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4473            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4474            direction == CHARSET_RIGHT_TO_LEFT)
4475     {
4476       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4477         {
4478           Dynarr_add (dst, ISO_CODE_ESC);
4479           Dynarr_add (dst, '[');
4480         }
4481       else if (internal_p)
4482         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4483       else
4484         Dynarr_add (dst, ISO_CODE_CSI);
4485       Dynarr_add (dst, '2');
4486       Dynarr_add (dst, ']');
4487       if (flags)
4488         *flags |= CODING_STATE_R2L;
4489     }
4490 }
4491
4492 /* Convert ISO2022-format data to internal format. */
4493
4494 static void
4495 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4496                        unsigned_char_dynarr *dst, unsigned int n)
4497 {
4498   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4499   unsigned int flags  = str->flags;
4500   unsigned int ch     = str->ch;
4501   eol_type_t eol_type = str->eol_type;
4502 #ifdef ENABLE_COMPOSITE_CHARS
4503   unsigned_char_dynarr *real_dst = dst;
4504 #endif
4505   Lisp_Object coding_system;
4506
4507   XSETCODING_SYSTEM (coding_system, str->codesys);
4508
4509 #ifdef ENABLE_COMPOSITE_CHARS
4510   if (flags & CODING_STATE_COMPOSITE)
4511     dst = str->iso2022.composite_chars;
4512 #endif /* ENABLE_COMPOSITE_CHARS */
4513
4514   while (n--)
4515     {
4516       unsigned char c = *src++;
4517       if (flags & CODING_STATE_ESCAPE)
4518         {       /* Within ESC sequence */
4519           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4520                                           c, &flags, 1);
4521
4522           if (retval)
4523             {
4524               switch (str->iso2022.esc)
4525                 {
4526 #ifdef ENABLE_COMPOSITE_CHARS
4527                 case ISO_ESC_START_COMPOSITE:
4528                   if (str->iso2022.composite_chars)
4529                     Dynarr_reset (str->iso2022.composite_chars);
4530                   else
4531                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4532                   dst = str->iso2022.composite_chars;
4533                   break;
4534                 case ISO_ESC_END_COMPOSITE:
4535                   {
4536                     Bufbyte comstr[MAX_EMCHAR_LEN];
4537                     Bytecount len;
4538                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4539                                                          Dynarr_length (dst));
4540                     dst = real_dst;
4541                     len = set_charptr_emchar (comstr, emch);
4542                     Dynarr_add_many (dst, comstr, len);
4543                     break;
4544                   }
4545 #endif /* ENABLE_COMPOSITE_CHARS */
4546
4547                 case ISO_ESC_LITERAL:
4548                   DECODE_ADD_BINARY_CHAR (c, dst);
4549                   break;
4550
4551                 default:
4552                   /* Everything else handled already */
4553                   break;
4554                 }
4555             }
4556
4557           /* Attempted error recovery. */
4558           if (str->iso2022.output_direction_sequence)
4559             ensure_correct_direction (flags & CODING_STATE_R2L ?
4560                                       CHARSET_RIGHT_TO_LEFT :
4561                                       CHARSET_LEFT_TO_RIGHT,
4562                                       str->codesys, dst, 0, 1);
4563           /* More error recovery. */
4564           if (!retval || str->iso2022.output_literally)
4565             {
4566               /* Output the (possibly invalid) sequence */
4567               int i;
4568               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4569                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4570               flags &= CODING_STATE_ISO2022_LOCK;
4571               if (!retval)
4572                 n++, src--;/* Repeat the loop with the same character. */
4573               else
4574                 {
4575                   /* No sense in reprocessing the final byte of the
4576                      escape sequence; it could mess things up anyway.
4577                      Just add it now. */
4578                   DECODE_ADD_BINARY_CHAR (c, dst);
4579                 }
4580             }
4581           ch = 0;
4582         }
4583       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4584         { /* Control characters */
4585
4586           /***** Error-handling *****/
4587
4588           /* If we were in the middle of a character, dump out the
4589              partial character. */
4590           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4591
4592           /* If we just saw a single-shift character, dump it out.
4593              This may dump out the wrong sort of single-shift character,
4594              but least it will give an indication that something went
4595              wrong. */
4596           if (flags & CODING_STATE_SS2)
4597             {
4598               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4599               flags &= ~CODING_STATE_SS2;
4600             }
4601           if (flags & CODING_STATE_SS3)
4602             {
4603               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4604               flags &= ~CODING_STATE_SS3;
4605             }
4606
4607           /***** Now handle the control characters. *****/
4608
4609           /* Handle CR/LF */
4610           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4611
4612           flags &= CODING_STATE_ISO2022_LOCK;
4613
4614           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4615             DECODE_ADD_BINARY_CHAR (c, dst);
4616         }
4617       else
4618         {                       /* Graphic characters */
4619           Lisp_Object charset;
4620           int lb;
4621           int reg;
4622
4623           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4624
4625           /* Now determine the charset. */
4626           reg = ((flags & CODING_STATE_SS2) ? 2
4627                  : (flags & CODING_STATE_SS3) ? 3
4628                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4629                  : str->iso2022.register_left);
4630           charset = str->iso2022.charset[reg];
4631
4632           /* Error checking: */
4633           if (! CHARSETP (charset)
4634               || str->iso2022.invalid_designated[reg]
4635               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4636                   && XCHARSET_CHARS (charset) == 94))
4637             /* Mrmph.  We are trying to invoke a register that has no
4638                or an invalid charset in it, or trying to add a character
4639                outside the range of the charset.  Insert that char literally
4640                to preserve it for the output. */
4641             {
4642               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4643               DECODE_ADD_BINARY_CHAR (c, dst);
4644             }
4645
4646           else
4647             {
4648               /* Things are probably hunky-dorey. */
4649
4650               /* Fetch reverse charset, maybe. */
4651               if (((flags & CODING_STATE_R2L) &&
4652                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4653                   ||
4654                   (!(flags & CODING_STATE_R2L) &&
4655                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4656                 {
4657                   Lisp_Object new_charset =
4658                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4659                   if (!NILP (new_charset))
4660                     charset = new_charset;
4661                 }
4662
4663               lb = XCHARSET_LEADING_BYTE (charset);
4664               switch (XCHARSET_REP_BYTES (charset))
4665                 {
4666                 case 1: /* ASCII */
4667                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4668                   Dynarr_add (dst, c & 0x7F);
4669                   break;
4670
4671                 case 2: /* one-byte official */
4672                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4673                   Dynarr_add (dst, lb);
4674                   Dynarr_add (dst, c | 0x80);
4675                   break;
4676
4677                 case 3: /* one-byte private or two-byte official */
4678                   if (XCHARSET_PRIVATE_P (charset))
4679                     {
4680                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4681                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4682                       Dynarr_add (dst, lb);
4683                       Dynarr_add (dst, c | 0x80);
4684                     }
4685                   else
4686                     {
4687                       if (ch)
4688                         {
4689                           Dynarr_add (dst, lb);
4690                           Dynarr_add (dst, ch | 0x80);
4691                           Dynarr_add (dst, c | 0x80);
4692                           ch = 0;
4693                         }
4694                       else
4695                         ch = c;
4696                     }
4697                   break;
4698
4699                 default:        /* two-byte private */
4700                   if (ch)
4701                     {
4702                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4703                       Dynarr_add (dst, lb);
4704                       Dynarr_add (dst, ch | 0x80);
4705                       Dynarr_add (dst, c | 0x80);
4706                       ch = 0;
4707                     }
4708                   else
4709                     ch = c;
4710                 }
4711             }
4712
4713           if (!ch)
4714             flags &= CODING_STATE_ISO2022_LOCK;
4715         }
4716
4717     label_continue_loop:;
4718     }
4719
4720   if (flags & CODING_STATE_END)
4721     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4722
4723   str->flags = flags;
4724   str->ch    = ch;
4725 }
4726
4727
4728 /***** ISO2022 encoder *****/
4729
4730 /* Designate CHARSET into register REG. */
4731
4732 static void
4733 iso2022_designate (Lisp_Object charset, unsigned char reg,
4734                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4735 {
4736   static CONST char inter94[] = "()*+";
4737   static CONST char inter96[] = ",-./";
4738   unsigned int type;
4739   unsigned char final;
4740   Lisp_Object old_charset = str->iso2022.charset[reg];
4741
4742   str->iso2022.charset[reg] = charset;
4743   if (!CHARSETP (charset))
4744     /* charset might be an initial nil or t. */
4745     return;
4746   type = XCHARSET_TYPE (charset);
4747   final = XCHARSET_FINAL (charset);
4748   if (!str->iso2022.force_charset_on_output[reg] &&
4749       CHARSETP (old_charset) &&
4750       XCHARSET_TYPE (old_charset) == type &&
4751       XCHARSET_FINAL (old_charset) == final)
4752     return;
4753
4754   str->iso2022.force_charset_on_output[reg] = 0;
4755
4756   {
4757     charset_conversion_spec_dynarr *dyn =
4758       str->codesys->iso2022.output_conv;
4759
4760     if (dyn)
4761       {
4762         int i;
4763
4764         for (i = 0; i < Dynarr_length (dyn); i++)
4765           {
4766             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4767             if (EQ (charset, spec->from_charset))
4768                 charset = spec->to_charset;
4769           }
4770       }
4771   }
4772
4773   Dynarr_add (dst, ISO_CODE_ESC);
4774   switch (type)
4775     {
4776     case CHARSET_TYPE_94:
4777       Dynarr_add (dst, inter94[reg]);
4778       break;
4779     case CHARSET_TYPE_96:
4780       Dynarr_add (dst, inter96[reg]);
4781       break;
4782     case CHARSET_TYPE_94X94:
4783       Dynarr_add (dst, '$');
4784       if (reg != 0
4785           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4786           || final < '@'
4787           || final > 'B')
4788         Dynarr_add (dst, inter94[reg]);
4789       break;
4790     case CHARSET_TYPE_96X96:
4791       Dynarr_add (dst, '$');
4792       Dynarr_add (dst, inter96[reg]);
4793       break;
4794     }
4795   Dynarr_add (dst, final);
4796 }
4797
4798 static void
4799 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4800 {
4801   if (str->iso2022.register_left != 0)
4802     {
4803       Dynarr_add (dst, ISO_CODE_SI);
4804       str->iso2022.register_left = 0;
4805     }
4806 }
4807
4808 static void
4809 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4810 {
4811   if (str->iso2022.register_left != 1)
4812     {
4813       Dynarr_add (dst, ISO_CODE_SO);
4814       str->iso2022.register_left = 1;
4815     }
4816 }
4817
4818 /* Convert internally-formatted data to ISO2022 format. */
4819
4820 static void
4821 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4822                        unsigned_char_dynarr *dst, unsigned int n)
4823 {
4824   unsigned char charmask, c;
4825   unsigned char char_boundary;
4826   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4827   unsigned int flags          = str->flags;
4828   unsigned int ch             = str->ch;
4829   Lisp_Coding_System *codesys = str->codesys;
4830   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4831   int i;
4832   Lisp_Object charset;
4833   int half;
4834
4835 #ifdef ENABLE_COMPOSITE_CHARS
4836   /* flags for handling composite chars.  We do a little switcharoo
4837      on the source while we're outputting the composite char. */
4838   unsigned int saved_n = 0;
4839   CONST unsigned char *saved_src = NULL;
4840   int in_composite = 0;
4841 #endif /* ENABLE_COMPOSITE_CHARS */
4842
4843   char_boundary = str->iso2022.current_char_boundary;
4844   charset = str->iso2022.current_charset;
4845   half = str->iso2022.current_half;
4846
4847 #ifdef ENABLE_COMPOSITE_CHARS
4848  back_to_square_n:
4849 #endif
4850   while (n--)
4851     {
4852       c = *src++;
4853
4854       if (BYTE_ASCII_P (c))
4855         {               /* Processing ASCII character */
4856           ch = 0;
4857
4858           restore_left_to_right_direction (codesys, dst, &flags, 0);
4859
4860           /* Make sure G0 contains ASCII */
4861           if ((c > ' ' && c < ISO_CODE_DEL) ||
4862               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4863             {
4864               ensure_normal_shift (str, dst);
4865               iso2022_designate (Vcharset_ascii, 0, str, dst);
4866             }
4867
4868           /* If necessary, restore everything to the default state
4869              at end-of-line */
4870           if (c == '\n' &&
4871               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4872             {
4873               restore_left_to_right_direction (codesys, dst, &flags, 0);
4874
4875               ensure_normal_shift (str, dst);
4876
4877               for (i = 0; i < 4; i++)
4878                 {
4879                   Lisp_Object initial_charset =
4880                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4881                   iso2022_designate (initial_charset, i, str, dst);
4882                 }
4883             }
4884           if (c == '\n')
4885             {
4886               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4887                 Dynarr_add (dst, '\r');
4888               if (eol_type != EOL_CR)
4889                 Dynarr_add (dst, c);
4890             }
4891           else
4892             {
4893               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4894                   && fit_to_be_escape_quoted (c))
4895                 Dynarr_add (dst, ISO_CODE_ESC);
4896               Dynarr_add (dst, c);
4897             }
4898           char_boundary = 1;
4899         }
4900
4901       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4902         { /* Processing Leading Byte */
4903           ch = 0;
4904           charset = CHARSET_BY_LEADING_BYTE (c);
4905           if (LEADING_BYTE_PREFIX_P(c))
4906             ch = c;
4907           else if (!EQ (charset, Vcharset_control_1)
4908 #ifdef ENABLE_COMPOSITE_CHARS
4909                    && !EQ (charset, Vcharset_composite)
4910 #endif
4911                    )
4912             {
4913               int reg;
4914
4915               ensure_correct_direction (XCHARSET_DIRECTION (charset),
4916                                         codesys, dst, &flags, 0);
4917
4918               /* Now determine which register to use. */
4919               reg = -1;
4920               for (i = 0; i < 4; i++)
4921                 {
4922                   if (EQ (charset, str->iso2022.charset[i]) ||
4923                       EQ (charset,
4924                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4925                     {
4926                       reg = i;
4927                       break;
4928                     }
4929                 }
4930
4931               if (reg == -1)
4932                 {
4933                   if (XCHARSET_GRAPHIC (charset) != 0)
4934                     {
4935                       if (!NILP (str->iso2022.charset[1]) &&
4936                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4937                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4938                         reg = 1;
4939                       else if (!NILP (str->iso2022.charset[2]))
4940                         reg = 2;
4941                       else if (!NILP (str->iso2022.charset[3]))
4942                         reg = 3;
4943                       else
4944                         reg = 0;
4945                     }
4946                   else
4947                     reg = 0;
4948                 }
4949
4950               iso2022_designate (charset, reg, str, dst);
4951
4952               /* Now invoke that register. */
4953               switch (reg)
4954                 {
4955                 case 0:
4956                   ensure_normal_shift (str, dst);
4957                   half = 0;
4958                   break;
4959
4960                 case 1:
4961                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4962                     {
4963                       ensure_shift_out (str, dst);
4964                       half = 0;
4965                     }
4966                   else
4967                     half = 1;
4968                   break;
4969
4970                 case 2:
4971                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4972                     {
4973                       Dynarr_add (dst, ISO_CODE_ESC);
4974                       Dynarr_add (dst, 'N');
4975                       half = 0;
4976                     }
4977                   else
4978                     {
4979                       Dynarr_add (dst, ISO_CODE_SS2);
4980                       half = 1;
4981                     }
4982                   break;
4983
4984                 case 3:
4985                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4986                     {
4987                       Dynarr_add (dst, ISO_CODE_ESC);
4988                       Dynarr_add (dst, 'O');
4989                       half = 0;
4990                     }
4991                   else
4992                     {
4993                       Dynarr_add (dst, ISO_CODE_SS3);
4994                       half = 1;
4995                     }
4996                   break;
4997
4998                 default:
4999                   abort ();
5000                 }
5001             }
5002           char_boundary = 0;
5003         }
5004       else
5005         {                       /* Processing Non-ASCII character */
5006           charmask = (half == 0 ? 0x7F : 0xFF);
5007           char_boundary = 1;
5008           if (EQ (charset, Vcharset_control_1))
5009             {
5010               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5011                   && fit_to_be_escape_quoted (c))
5012                 Dynarr_add (dst, ISO_CODE_ESC);
5013               /* you asked for it ... */
5014               Dynarr_add (dst, c - 0x20);
5015             }
5016           else
5017             {
5018               switch (XCHARSET_REP_BYTES (charset))
5019                 {
5020                 case 2:
5021                   Dynarr_add (dst, c & charmask);
5022                   break;
5023                 case 3:
5024                   if (XCHARSET_PRIVATE_P (charset))
5025                     {
5026                       Dynarr_add (dst, c & charmask);
5027                       ch = 0;
5028                     }
5029                   else if (ch)
5030                     {
5031 #ifdef ENABLE_COMPOSITE_CHARS
5032                       if (EQ (charset, Vcharset_composite))
5033                         {
5034                           if (in_composite)
5035                             {
5036                               /* #### Bother! We don't know how to
5037                                  handle this yet. */
5038                               Dynarr_add (dst, '~');
5039                             }
5040                           else
5041                             {
5042                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5043                                                        ch & 0x7F, c & 0x7F);
5044                               Lisp_Object lstr = composite_char_string (emch);
5045                               saved_n = n;
5046                               saved_src = src;
5047                               in_composite = 1;
5048                               src = XSTRING_DATA   (lstr);
5049                               n   = XSTRING_LENGTH (lstr);
5050                               Dynarr_add (dst, ISO_CODE_ESC);
5051                               Dynarr_add (dst, '0'); /* start composing */
5052                             }
5053                         }
5054                       else
5055 #endif /* ENABLE_COMPOSITE_CHARS */
5056                         {
5057                           Dynarr_add (dst, ch & charmask);
5058                           Dynarr_add (dst, c & charmask);
5059                         }
5060                       ch = 0;
5061                     }
5062                   else
5063                     {
5064                       ch = c;
5065                       char_boundary = 0;
5066                     }
5067                   break;
5068                 case 4:
5069                   if (ch)
5070                     {
5071                       Dynarr_add (dst, ch & charmask);
5072                       Dynarr_add (dst, c & charmask);
5073                       ch = 0;
5074                     }
5075                   else
5076                     {
5077                       ch = c;
5078                       char_boundary = 0;
5079                     }
5080                   break;
5081                 default:
5082                   abort ();
5083                 }
5084             }
5085         }
5086     }
5087
5088 #ifdef ENABLE_COMPOSITE_CHARS
5089   if (in_composite)
5090     {
5091       n = saved_n;
5092       src = saved_src;
5093       in_composite = 0;
5094       Dynarr_add (dst, ISO_CODE_ESC);
5095       Dynarr_add (dst, '1'); /* end composing */
5096       goto back_to_square_n; /* Wheeeeeeeee ..... */
5097     }
5098 #endif /* ENABLE_COMPOSITE_CHARS */
5099
5100   if (char_boundary && flags & CODING_STATE_END)
5101     {
5102       restore_left_to_right_direction (codesys, dst, &flags, 0);
5103       ensure_normal_shift (str, dst);
5104       for (i = 0; i < 4; i++)
5105         {
5106           Lisp_Object initial_charset =
5107             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5108           iso2022_designate (initial_charset, i, str, dst);
5109         }
5110     }
5111
5112   str->flags = flags;
5113   str->ch    = ch;
5114   str->iso2022.current_char_boundary = char_boundary;
5115   str->iso2022.current_charset = charset;
5116   str->iso2022.current_half = half;
5117
5118   /* Verbum caro factum est! */
5119 }
5120 #endif /* MULE */
5121 \f
5122 /************************************************************************/
5123 /*                     No-conversion methods                            */
5124 /************************************************************************/
5125
5126 /* This is used when reading in "binary" files -- i.e. files that may
5127    contain all 256 possible byte values and that are not to be
5128    interpreted as being in any particular decoding. */
5129 static void
5130 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5131                              unsigned_char_dynarr *dst, unsigned int n)
5132 {
5133   unsigned char c;
5134   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5135   unsigned int flags  = str->flags;
5136   unsigned int ch     = str->ch;
5137   eol_type_t eol_type = str->eol_type;
5138
5139   while (n--)
5140     {
5141       c = *src++;
5142
5143       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5144       DECODE_ADD_BINARY_CHAR (c, dst);
5145     label_continue_loop:;
5146     }
5147
5148   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5149
5150   str->flags = flags;
5151   str->ch    = ch;
5152 }
5153
5154 static void
5155 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5156                              unsigned_char_dynarr *dst, unsigned int n)
5157 {
5158   unsigned char c;
5159   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5160   unsigned int flags  = str->flags;
5161   unsigned int ch     = str->ch;
5162   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5163
5164   while (n--)
5165     {
5166       c = *src++;
5167       if (c == '\n')
5168         {
5169           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5170             Dynarr_add (dst, '\r');
5171           if (eol_type != EOL_CR)
5172             Dynarr_add (dst, '\n');
5173           ch = 0;
5174         }
5175       else if (BYTE_ASCII_P (c))
5176         {
5177           assert (ch == 0);
5178           Dynarr_add (dst, c);
5179         }
5180       else if (BUFBYTE_LEADING_BYTE_P (c))
5181         {
5182           assert (ch == 0);
5183           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5184               c == LEADING_BYTE_CONTROL_1)
5185             ch = c;
5186           else
5187             Dynarr_add (dst, '~'); /* untranslatable character */
5188         }
5189       else
5190         {
5191           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5192             Dynarr_add (dst, c);
5193           else if (ch == LEADING_BYTE_CONTROL_1)
5194             {
5195               assert (c < 0xC0);
5196               Dynarr_add (dst, c - 0x20);
5197             }
5198           /* else it should be the second or third byte of an
5199              untranslatable character, so ignore it */
5200           ch = 0;
5201         }
5202     }
5203
5204   str->flags = flags;
5205   str->ch    = ch;
5206 }
5207
5208 \f
5209 /************************************************************************/
5210 /*                   Simple internal/external functions                 */
5211 /************************************************************************/
5212
5213 static Extbyte_dynarr *conversion_out_dynarr;
5214 static Bufbyte_dynarr *conversion_in_dynarr;
5215
5216 /* Determine coding system from coding format */
5217
5218 /* #### not correct for all values of `fmt'! */
5219 static Lisp_Object
5220 external_data_format_to_coding_system (enum external_data_format fmt)
5221 {
5222   switch (fmt)
5223     {
5224     case FORMAT_FILENAME:
5225     case FORMAT_TERMINAL:
5226       if (EQ (Vfile_name_coding_system, Qnil) ||
5227           EQ (Vfile_name_coding_system, Qbinary))
5228         return Qnil;
5229       else
5230         return Fget_coding_system (Vfile_name_coding_system);
5231 #ifdef MULE
5232     case FORMAT_CTEXT:
5233       return Fget_coding_system (Qctext);
5234 #endif
5235     default:
5236       return Qnil;
5237     }
5238 }
5239
5240 Extbyte *
5241 convert_to_external_format (CONST Bufbyte *ptr,
5242                             Bytecount len,
5243                             Extcount *len_out,
5244                             enum external_data_format fmt)
5245 {
5246   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5247
5248   if (!conversion_out_dynarr)
5249     conversion_out_dynarr = Dynarr_new (Extbyte);
5250   else
5251     Dynarr_reset (conversion_out_dynarr);
5252
5253   if (NILP (coding_system))
5254     {
5255       CONST Bufbyte *end = ptr + len;
5256
5257       for (; ptr < end;)
5258         {
5259           Bufbyte c =
5260             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5261             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5262             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5263             '~';
5264
5265           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5266           INC_CHARPTR (ptr);
5267         }
5268
5269 #ifdef ERROR_CHECK_BUFPOS
5270       assert (ptr == end);
5271 #endif
5272     }
5273   else
5274     {
5275       Lisp_Object instream, outstream, da_outstream;
5276       Lstream *istr, *ostr;
5277       struct gcpro gcpro1, gcpro2, gcpro3;
5278       char tempbuf[1024]; /* some random amount */
5279
5280       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5281       da_outstream = make_dynarr_output_stream
5282         ((unsigned_char_dynarr *) conversion_out_dynarr);
5283       outstream =
5284         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5285       istr = XLSTREAM (instream);
5286       ostr = XLSTREAM (outstream);
5287       GCPRO3 (instream, outstream, da_outstream);
5288       while (1)
5289         {
5290           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5291           if (!size_in_bytes)
5292             break;
5293           Lstream_write (ostr, tempbuf, size_in_bytes);
5294         }
5295       Lstream_close (istr);
5296       Lstream_close (ostr);
5297       UNGCPRO;
5298       Lstream_delete (istr);
5299       Lstream_delete (ostr);
5300       Lstream_delete (XLSTREAM (da_outstream));
5301     }
5302
5303   *len_out = Dynarr_length (conversion_out_dynarr);
5304   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5305   return Dynarr_atp (conversion_out_dynarr, 0);
5306 }
5307
5308 Bufbyte *
5309 convert_from_external_format (CONST Extbyte *ptr,
5310                               Extcount len,
5311                               Bytecount *len_out,
5312                               enum external_data_format fmt)
5313 {
5314   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5315
5316   if (!conversion_in_dynarr)
5317     conversion_in_dynarr = Dynarr_new (Bufbyte);
5318   else
5319     Dynarr_reset (conversion_in_dynarr);
5320
5321   if (NILP (coding_system))
5322     {
5323       CONST Extbyte *end = ptr + len;
5324       for (; ptr < end; ptr++)
5325         {
5326           Extbyte c = *ptr;
5327           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5328         }
5329     }
5330   else
5331     {
5332       Lisp_Object instream, outstream, da_outstream;
5333       Lstream *istr, *ostr;
5334       struct gcpro gcpro1, gcpro2, gcpro3;
5335       char tempbuf[1024]; /* some random amount */
5336
5337       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5338       da_outstream = make_dynarr_output_stream
5339         ((unsigned_char_dynarr *) conversion_in_dynarr);
5340       outstream =
5341         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5342       istr = XLSTREAM (instream);
5343       ostr = XLSTREAM (outstream);
5344       GCPRO3 (instream, outstream, da_outstream);
5345       while (1)
5346         {
5347           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5348           if (!size_in_bytes)
5349             break;
5350           Lstream_write (ostr, tempbuf, size_in_bytes);
5351         }
5352       Lstream_close (istr);
5353       Lstream_close (ostr);
5354       UNGCPRO;
5355       Lstream_delete (istr);
5356       Lstream_delete (ostr);
5357       Lstream_delete (XLSTREAM (da_outstream));
5358     }
5359
5360   *len_out = Dynarr_length (conversion_in_dynarr);
5361   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5362   return Dynarr_atp (conversion_in_dynarr, 0);
5363 }
5364
5365 \f
5366 /************************************************************************/
5367 /*                             Initialization                           */
5368 /************************************************************************/
5369
5370 void
5371 syms_of_mule_coding (void)
5372 {
5373   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5374   deferror (&Qcoding_system_error, "coding-system-error",
5375             "Coding-system error", Qio_error);
5376
5377   DEFSUBR (Fcoding_system_p);
5378   DEFSUBR (Ffind_coding_system);
5379   DEFSUBR (Fget_coding_system);
5380   DEFSUBR (Fcoding_system_list);
5381   DEFSUBR (Fcoding_system_name);
5382   DEFSUBR (Fmake_coding_system);
5383   DEFSUBR (Fcopy_coding_system);
5384   DEFSUBR (Fsubsidiary_coding_system);
5385
5386   DEFSUBR (Fcoding_system_type);
5387   DEFSUBR (Fcoding_system_doc_string);
5388 #ifdef MULE
5389   DEFSUBR (Fcoding_system_charset);
5390 #endif
5391   DEFSUBR (Fcoding_system_property);
5392
5393   DEFSUBR (Fcoding_category_list);
5394   DEFSUBR (Fset_coding_priority_list);
5395   DEFSUBR (Fcoding_priority_list);
5396   DEFSUBR (Fset_coding_category_system);
5397   DEFSUBR (Fcoding_category_system);
5398
5399   DEFSUBR (Fdetect_coding_region);
5400   DEFSUBR (Fdecode_coding_region);
5401   DEFSUBR (Fencode_coding_region);
5402 #ifdef MULE
5403   DEFSUBR (Fdecode_shift_jis_char);
5404   DEFSUBR (Fencode_shift_jis_char);
5405   DEFSUBR (Fdecode_big5_char);
5406   DEFSUBR (Fencode_big5_char);
5407   DEFSUBR (Fset_ucs_char);
5408   DEFSUBR (Fucs_char);
5409   DEFSUBR (Fset_char_ucs);
5410   DEFSUBR (Fchar_ucs);
5411 #endif /* MULE */
5412   defsymbol (&Qcoding_system_p, "coding-system-p");
5413   defsymbol (&Qno_conversion, "no-conversion");
5414 #ifdef MULE
5415   defsymbol (&Qbig5, "big5");
5416   defsymbol (&Qshift_jis, "shift-jis");
5417   defsymbol (&Qucs4, "ucs-4");
5418   defsymbol (&Qutf8, "utf-8");
5419   defsymbol (&Qccl, "ccl");
5420   defsymbol (&Qiso2022, "iso2022");
5421 #endif /* MULE */
5422   defsymbol (&Qmnemonic, "mnemonic");
5423   defsymbol (&Qeol_type, "eol-type");
5424   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5425   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5426
5427   defsymbol (&Qcr, "cr");
5428   defsymbol (&Qlf, "lf");
5429   defsymbol (&Qcrlf, "crlf");
5430   defsymbol (&Qeol_cr, "eol-cr");
5431   defsymbol (&Qeol_lf, "eol-lf");
5432   defsymbol (&Qeol_crlf, "eol-crlf");
5433 #ifdef MULE
5434   defsymbol (&Qcharset_g0, "charset-g0");
5435   defsymbol (&Qcharset_g1, "charset-g1");
5436   defsymbol (&Qcharset_g2, "charset-g2");
5437   defsymbol (&Qcharset_g3, "charset-g3");
5438   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5439   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5440   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5441   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5442   defsymbol (&Qno_iso6429, "no-iso6429");
5443   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5444   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5445
5446   defsymbol (&Qshort, "short");
5447   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5448   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5449   defsymbol (&Qseven, "seven");
5450   defsymbol (&Qlock_shift, "lock-shift");
5451   defsymbol (&Qescape_quoted, "escape-quoted");
5452 #endif /* MULE */
5453   defsymbol (&Qencode, "encode");
5454   defsymbol (&Qdecode, "decode");
5455
5456 #ifdef MULE
5457   defsymbol (&Qctext, "ctext");
5458   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5459              "shift-jis");
5460   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5461              "big5");
5462   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5463              "ucs-4");
5464   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5465              "utf-8");
5466   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5467              "iso-7");
5468   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5469              "iso-8-designate");
5470   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5471              "iso-8-1");
5472   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5473              "iso-8-2");
5474   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5475              "iso-lock-shift");
5476 #endif /* MULE */
5477   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5478              "no-conversion");
5479 }
5480
5481 void
5482 lstream_type_create_mule_coding (void)
5483 {
5484   LSTREAM_HAS_METHOD (decoding, reader);
5485   LSTREAM_HAS_METHOD (decoding, writer);
5486   LSTREAM_HAS_METHOD (decoding, rewinder);
5487   LSTREAM_HAS_METHOD (decoding, seekable_p);
5488   LSTREAM_HAS_METHOD (decoding, flusher);
5489   LSTREAM_HAS_METHOD (decoding, closer);
5490   LSTREAM_HAS_METHOD (decoding, marker);
5491
5492   LSTREAM_HAS_METHOD (encoding, reader);
5493   LSTREAM_HAS_METHOD (encoding, writer);
5494   LSTREAM_HAS_METHOD (encoding, rewinder);
5495   LSTREAM_HAS_METHOD (encoding, seekable_p);
5496   LSTREAM_HAS_METHOD (encoding, flusher);
5497   LSTREAM_HAS_METHOD (encoding, closer);
5498   LSTREAM_HAS_METHOD (encoding, marker);
5499 }
5500
5501 void
5502 vars_of_mule_coding (void)
5503 {
5504   int i;
5505
5506   /* Initialize to something reasonable ... */
5507   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5508     {
5509       coding_category_system[i] = Qnil;
5510       coding_category_by_priority[i] = i;
5511     }
5512
5513   Fprovide (intern ("file-coding"));
5514
5515   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5516 Coding system used for TTY keyboard input.
5517 Not used under a windowing system.
5518 */ );
5519   Vkeyboard_coding_system = Qnil;
5520
5521   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5522 Coding system used for TTY display output.
5523 Not used under a windowing system.
5524 */ );
5525   Vterminal_coding_system = Qnil;
5526
5527   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5528 Overriding coding system used when writing a file or process.
5529 You should *bind* this, not set it.  If this is non-nil, it specifies
5530 the coding system that will be used when a file or process is read
5531 in, and overrides `buffer-file-coding-system-for-read',
5532 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5533 this one for permanent changes to the environment.
5534 */ );
5535   Vcoding_system_for_read = Qnil;
5536
5537   DEFVAR_LISP ("coding-system-for-write",
5538                &Vcoding_system_for_write /*
5539 Overriding coding system used when writing a file or process.
5540 You should *bind* this, not set it.  If this is non-nil, it specifies
5541 the coding system that will be used when a file or process is wrote
5542 in, and overrides `buffer-file-coding-system',
5543 `write-region-pre-hook', etc.  Use those variables instead of this one
5544 for permanent changes to the environment.
5545 */ );
5546   Vcoding_system_for_write = Qnil;
5547
5548   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5549 Coding system used to convert pathnames when accessing files.
5550 */ );
5551   Vfile_name_coding_system = Qnil;
5552
5553   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5554 Non-nil means the buffer contents are regarded as multi-byte form
5555 of characters, not a binary code.  This affects the display, file I/O,
5556 and behaviors of various editing commands.
5557
5558 Setting this to nil does not do anything.
5559 */ );
5560   enable_multibyte_characters = 1;
5561 }
5562
5563 void
5564 complex_vars_of_mule_coding (void)
5565 {
5566   staticpro (&Vcoding_system_hash_table);
5567   Vcoding_system_hash_table =
5568     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5569
5570   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5571
5572 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5573 {                                               \
5574   struct codesys_prop csp;                      \
5575   csp.sym = (Sym);                              \
5576   csp.prop_type = (Prop_Type);                  \
5577   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5578 } while (0)
5579
5580   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5581   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5582   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5583   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5584   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5585   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5586   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5587 #ifdef MULE
5588   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5589   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5590   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5591   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5592   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5593   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5594   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5595   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5596   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5597   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5598   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5599   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5600   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5601   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5602   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5603   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5604   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5605
5606   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5607   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5608 #endif /* MULE */
5609   /* Need to create this here or we're really screwed. */
5610   Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
5611                        list2 (Qmnemonic, build_string ("Noconv")));
5612
5613   Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
5614                        Qbinary);
5615
5616   /* Need this for bootstrapping */
5617   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5618     Fget_coding_system (Qno_conversion);
5619
5620 #ifdef MULE
5621   {
5622     unsigned int i;
5623
5624     for (i = 0; i < 65536; i++)
5625       ucs_to_mule_table[i] = Qnil;
5626   }
5627   staticpro (&mule_to_ucs_table);
5628   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5629 #endif /* MULE */
5630 }