src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #ifdef MULE
  34 #include "mule-ccl.h"
  35 #include "chartab.h"
  36 #endif
  37 #include "file-coding.h"
  38
  39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  40
  41 Lisp_Object Vkeyboard_coding_system;
  42 Lisp_Object Vterminal_coding_system;
  43 Lisp_Object Vcoding_system_for_read;
  44 Lisp_Object Vcoding_system_for_write;
  45 Lisp_Object Vfile_name_coding_system;
  46
  47 /* Table of symbols identifying each coding category. */
  48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  49
  50 /* Coding system currently associated with each coding category. */
  51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  52
  53 /* Table of all coding categories in decreasing order of priority.
  54    This describes a permutation of the possible coding categories. */
  55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  56
  57 Lisp_Object Qcoding_system_p;
  58
  59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  60 /* Qinternal in general.c */
  61
  62 Lisp_Object Qmnemonic, Qeol_type;
  63 Lisp_Object Qcr, Qcrlf, Qlf;
  64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  65 Lisp_Object Qpost_read_conversion;
  66 Lisp_Object Qpre_write_conversion;
  67
  68 #ifdef MULE
  69 Lisp_Object Qucs4, Qutf8;
  70 Lisp_Object Qbig5, Qshift_jis;
  71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  74 Lisp_Object Qno_iso6429;
  75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  76 Lisp_Object Qctext, Qescape_quoted;
  77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  78 #endif
  79 Lisp_Object Qencode, Qdecode;
  80
  81 Lisp_Object Vcoding_system_hash_table;
  82
  83 int enable_multibyte_characters;
  84
  85 #ifdef MULE
  86 /* Additional information used by the ISO2022 decoder and detector. */
  87 struct iso2022_decoder
  88 {
  89   /* CHARSET holds the character sets currently assigned to the G0
  90      through G3 variables.  It is initialized from the array
  91      INITIAL_CHARSET in CODESYS. */
  92   Lisp_Object charset[4];
  93
  94   /* Which registers are currently invoked into the left (GL) and
  95      right (GR) halves of the 8-bit encoding space? */
  96   int register_left, register_right;
  97
  98   /* ISO_ESC holds a value indicating part of an escape sequence
  99      that has already been seen. */
 100   enum iso_esc_flag esc;
 101
 102   /* This records the bytes we've seen so far in an escape sequence,
 103      in case the sequence is invalid (we spit out the bytes unchanged). */
 104   unsigned char esc_bytes[8];
 105
 106   /* Index for next byte to store in ISO escape sequence. */
 107   int esc_bytes_index;
 108
 109 #ifdef ENABLE_COMPOSITE_CHARS
 110   /* Stuff seen so far when composing a string. */
 111   unsigned_char_dynarr *composite_chars;
 112 #endif
 113
 114   /* If we saw an invalid designation sequence for a particular
 115      register, we flag it here and switch to ASCII.  The next time we
 116      see a valid designation for this register, we turn off the flag
 117      and do the designation normally, but pretend the sequence was
 118      invalid.  The effect of all this is that (most of the time) the
 119      escape sequences for both the switch to the unknown charset, and
 120      the switch back to the known charset, get inserted literally into
 121      the buffer and saved out as such.  The hope is that we can
 122      preserve the escape sequences so that the resulting written out
 123      file makes sense.  If we don't do any of this, the designation
 124      to the invalid charset will be preserved but that switch back
 125      to the known charset will probably get eaten because it was
 126      the same charset that was already present in the register. */
 127   unsigned char invalid_designated[4];
 128
 129   /* We try to do similar things as above for direction-switching
 130      sequences.  If we encountered a direction switch while an
 131      invalid designation was present, or an invalid designation
 132      just after a direction switch (i.e. no valid designation
 133      encountered yet), we insert the direction-switch escape
 134      sequence literally into the output stream, and later on
 135      insert the corresponding direction-restoring escape sequence
 136      literally also. */
 137   unsigned int switched_dir_and_no_valid_charset_yet :1;
 138   unsigned int invalid_switch_dir :1;
 139
 140   /* Tells the decoder to output the escape sequence literally
 141      even though it was valid.  Used in the games we play to
 142      avoid lossage when we encounter invalid designations. */
 143   unsigned int output_literally :1;
 144   /* We encountered a direction switch followed by an invalid
 145      designation.  We didn't output the direction switch
 146      literally because we didn't know about the invalid designation;
 147      but we have to do so now. */
 148   unsigned int output_direction_sequence :1;
 149 };
 150 #endif /* MULE */
 151 EXFUN (Fcopy_coding_system, 2);
 152 #ifdef MULE
 153 struct detection_state;
 154
 155 static void
 156 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
 157                      unsigned_char_dynarr *dst, unsigned int n);
 158
 159 static int detect_coding_sjis (struct detection_state *st,
 160                                CONST unsigned char *src,
 161                                unsigned int n);
 162 static void decode_coding_sjis (Lstream *decoding,
 163                                 CONST unsigned char *src,
 164                                 unsigned_char_dynarr *dst,
 165                                 unsigned int n);
 166 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 167                             unsigned_char_dynarr *dst, unsigned int *flags);
 168 void char_finish_shift_jis (struct encoding_stream *str,
 169                             unsigned_char_dynarr *dst, unsigned int *flags);
 170
 171 static int detect_coding_big5 (struct detection_state *st,
 172                                CONST unsigned char *src,
 173                                unsigned int n);
 174 static void decode_coding_big5 (Lstream *decoding,
 175                                 CONST unsigned char *src,
 176                                 unsigned_char_dynarr *dst, unsigned int n);
 177 static void encode_coding_big5 (Lstream *encoding,
 178                                 CONST unsigned char *src,
 179                                 unsigned_char_dynarr *dst, unsigned int n);
 180 static int detect_coding_ucs4 (struct detection_state *st,
 181                                CONST unsigned char *src,
 182                                unsigned int n);
 183 static void decode_coding_ucs4 (Lstream *decoding,
 184                                 CONST unsigned char *src,
 185                                 unsigned_char_dynarr *dst, unsigned int n);
 186 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 187                        unsigned_char_dynarr *dst, unsigned int *flags);
 188 void char_finish_ucs4 (struct encoding_stream *str,
 189                        unsigned_char_dynarr *dst, unsigned int *flags);
 190
 191 static int detect_coding_utf8 (struct detection_state *st,
 192                                CONST unsigned char *src,
 193                                unsigned int n);
 194 static void decode_coding_utf8 (Lstream *decoding,
 195                                 CONST unsigned char *src,
 196                                 unsigned_char_dynarr *dst, unsigned int n);
 197 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 198                        unsigned_char_dynarr *dst, unsigned int *flags);
 199 void char_finish_utf8 (struct encoding_stream *str,
 200                        unsigned_char_dynarr *dst, unsigned int *flags);
 201
 202 static int postprocess_iso2022_mask (int mask);
 203 static void reset_iso2022 (Lisp_Object coding_system,
 204                            struct iso2022_decoder *iso);
 205 static int detect_coding_iso2022 (struct detection_state *st,
 206                                   CONST unsigned char *src,
 207                                   unsigned int n);
 208 static void decode_coding_iso2022 (Lstream *decoding,
 209                                    CONST unsigned char *src,
 210                                    unsigned_char_dynarr *dst, unsigned int n);
 211 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 212                           unsigned_char_dynarr *dst, unsigned int *flags);
 213 void char_finish_iso2022 (struct encoding_stream *str,
 214                           unsigned_char_dynarr *dst, unsigned int *flags);
 215 #endif /* MULE */
 216 static void decode_coding_no_conversion (Lstream *decoding,
 217                                          CONST unsigned char *src,
 218                                          unsigned_char_dynarr *dst,
 219                                          unsigned int n);
 220 static void encode_coding_no_conversion (Lstream *encoding,
 221                                          CONST unsigned char *src,
 222                                          unsigned_char_dynarr *dst,
 223                                          unsigned int n);
 224 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 225                          unsigned_char_dynarr *dst, unsigned int n);
 226 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 227                          unsigned_char_dynarr *dst, unsigned int n);
 228
 229 typedef struct codesys_prop codesys_prop;
 230 struct codesys_prop
 231 {
 232   Lisp_Object sym;
 233   int prop_type;
 234 };
 235
 236 typedef struct
 237 {
 238   Dynarr_declare (codesys_prop);
 239 } codesys_prop_dynarr;
 240
 241 codesys_prop_dynarr *the_codesys_prop_dynarr;
 242
 243 enum codesys_prop_enum
 244 {
 245   CODESYS_PROP_ALL_OK,
 246   CODESYS_PROP_ISO2022,
 247   CODESYS_PROP_CCL
 248 };
 249
 250 \f
 251 /************************************************************************/
 252 /*                       Coding system functions                        */
 253 /************************************************************************/
 254
 255 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 256 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 257 static void finalize_coding_system (void *header, int for_disksave);
 258
 259 #ifdef MULE
 260 static const struct lrecord_description ccs_description_1[] = {
 261   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 262   { XD_END }
 263 };
 264
 265 static const struct struct_description ccs_description = {
 266   sizeof(charset_conversion_spec),
 267   ccs_description_1
 268 };
 269
 270 static const struct lrecord_description ccsd_description_1[] = {
 271   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 272   { XD_END }
 273 };
 274
 275 static const struct struct_description ccsd_description = {
 276   sizeof(charset_conversion_spec_dynarr),
 277   ccsd_description_1
 278 };
 279 #endif
 280
 281 static const struct lrecord_description coding_system_description[] = {
 282   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 283   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 284   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 285 #ifdef MULE
 286   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 287   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 288   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 289   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 290 #endif
 291   { XD_END }
 292 };
 293
 294 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 295                                mark_coding_system, print_coding_system,
 296                                finalize_coding_system,
 297                                0, 0, coding_system_description,
 298                                struct Lisp_Coding_System);
 299
 300 static Lisp_Object
 301 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 302 {
 303   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 304
 305   markobj (CODING_SYSTEM_NAME (codesys));
 306   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 307   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 308   markobj (CODING_SYSTEM_EOL_LF (codesys));
 309   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 310   markobj (CODING_SYSTEM_EOL_CR (codesys));
 311
 312   switch (CODING_SYSTEM_TYPE (codesys))
 313     {
 314 #ifdef MULE
 315       int i;
 316     case CODESYS_ISO2022:
 317       for (i = 0; i < 4; i++)
 318         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 319       if (codesys->iso2022.input_conv)
 320         {
 321           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 322             {
 323               struct charset_conversion_spec *ccs =
 324                 Dynarr_atp (codesys->iso2022.input_conv, i);
 325               markobj (ccs->from_charset);
 326               markobj (ccs->to_charset);
 327             }
 328         }
 329       if (codesys->iso2022.output_conv)
 330         {
 331           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 332             {
 333               struct charset_conversion_spec *ccs =
 334                 Dynarr_atp (codesys->iso2022.output_conv, i);
 335               markobj (ccs->from_charset);
 336               markobj (ccs->to_charset);
 337             }
 338         }
 339       break;
 340
 341     case CODESYS_CCL:
 342       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 343       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 344       break;
 345 #endif /* MULE */
 346     default:
 347       break;
 348     }
 349
 350   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 351   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 352 }
 353
 354 static void
 355 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 356                      int escapeflag)
 357 {
 358   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 359   if (print_readably)
 360     error ("printing unreadable object #<coding_system 0x%x>",
 361            c->header.uid);
 362
 363   write_c_string ("#<coding_system ", printcharfun);
 364   print_internal (c->name, printcharfun, 1);
 365   write_c_string (">", printcharfun);
 366 }
 367
 368 static void
 369 finalize_coding_system (void *header, int for_disksave)
 370 {
 371   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 372   /* Since coding systems never go away, this function is not
 373      necessary.  But it would be necessary if we changed things
 374      so that coding systems could go away. */
 375   if (!for_disksave) /* see comment in lstream.c */
 376     {
 377       switch (CODING_SYSTEM_TYPE (c))
 378         {
 379 #ifdef MULE
 380         case CODESYS_ISO2022:
 381           if (c->iso2022.input_conv)
 382             {
 383               Dynarr_free (c->iso2022.input_conv);
 384               c->iso2022.input_conv = 0;
 385             }
 386           if (c->iso2022.output_conv)
 387             {
 388               Dynarr_free (c->iso2022.output_conv);
 389               c->iso2022.output_conv = 0;
 390             }
 391           break;
 392 #endif /* MULE */
 393         default:
 394           break;
 395         }
 396     }
 397 }
 398
 399 static enum eol_type
 400 symbol_to_eol_type (Lisp_Object symbol)
 401 {
 402   CHECK_SYMBOL (symbol);
 403   if (NILP (symbol))      return EOL_AUTODETECT;
 404   if (EQ (symbol, Qlf))   return EOL_LF;
 405   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 406   if (EQ (symbol, Qcr))   return EOL_CR;
 407
 408   signal_simple_error ("Unrecognized eol type", symbol);
 409   return EOL_AUTODETECT; /* not reached */
 410 }
 411
 412 static Lisp_Object
 413 eol_type_to_symbol (enum eol_type type)
 414 {
 415   switch (type)
 416     {
 417     default: abort ();
 418     case EOL_LF:         return Qlf;
 419     case EOL_CRLF:       return Qcrlf;
 420     case EOL_CR:         return Qcr;
 421     case EOL_AUTODETECT: return Qnil;
 422     }
 423 }
 424
 425 static void
 426 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 427 {
 428   Lisp_Object codesys_obj;
 429   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 430   char *codesys_name = (char *) alloca (len + 7);
 431   int mlen = -1;
 432   char *codesys_mnemonic=0;
 433
 434   Lisp_Object codesys_name_sym, sub_codesys_obj;
 435
 436   /* kludge */
 437
 438   XSETCODING_SYSTEM (codesys_obj, codesys);
 439
 440   memcpy (codesys_name,
 441           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 442
 443   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 444     {
 445       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 446       codesys_mnemonic = (char *) alloca (mlen + 7);
 447       memcpy (codesys_mnemonic,
 448               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 449     }
 450
 451 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 452   strcpy (codesys_name + len, "-" op_sys);                                      \
 453   if (mlen != -1)                                                               \
 454     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 455   codesys_name_sym = intern (codesys_name);                                     \
 456   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 457   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 458   if (mlen != -1)                                                               \
 459     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 460       build_string (codesys_mnemonic);                                          \
 461   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 462 } while (0)
 463
 464   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 465   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 466   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 467 }
 468
 469 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 470 Return t if OBJECT is a coding system.
 471 A coding system is an object that defines how text containing multiple
 472 character sets is encoded into a stream of (typically 8-bit) bytes.
 473 The coding system is used to decode the stream into a series of
 474 characters (which may be from multiple charsets) when the text is read
 475 from a file or process, and is used to encode the text back into the
 476 same format when it is written out to a file or process.
 477
 478 For example, many ISO2022-compliant coding systems (such as Compound
 479 Text, which is used for inter-client data under the X Window System)
 480 use escape sequences to switch between different charsets -- Japanese
 481 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 482 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 483 `make-coding-system' for more information.
 484
 485 Coding systems are normally identified using a symbol, and the
 486 symbol is accepted in place of the actual coding system object whenever
 487 a coding system is called for. (This is similar to how faces work.)
 488 */
 489        (object))
 490 {
 491   return CODING_SYSTEMP (object) ? Qt : Qnil;
 492 }
 493
 494 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 495 Retrieve the coding system of the given name.
 496
 497 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 498 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 499 If there is no such coding system, nil is returned.  Otherwise the
 500 associated coding system object is returned.
 501 */
 502        (coding_system_or_name))
 503 {
 504   if (CODING_SYSTEMP (coding_system_or_name))
 505     return coding_system_or_name;
 506
 507   if (NILP (coding_system_or_name))
 508     coding_system_or_name = Qbinary;
 509   else
 510     CHECK_SYMBOL (coding_system_or_name);
 511
 512   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 513 }
 514
 515 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 516 Retrieve the coding system of the given name.
 517 Same as `find-coding-system' except that if there is no such
 518 coding system, an error is signaled instead of returning nil.
 519 */
 520        (name))
 521 {
 522   Lisp_Object coding_system = Ffind_coding_system (name);
 523
 524   if (NILP (coding_system))
 525     signal_simple_error ("No such coding system", name);
 526   return coding_system;
 527 }
 528
 529 /* We store the coding systems in hash tables with the names as the key and the
 530    actual coding system object as the value.  Occasionally we need to use them
 531    in a list format.  These routines provide us with that. */
 532 struct coding_system_list_closure
 533 {
 534   Lisp_Object *coding_system_list;
 535 };
 536
 537 static int
 538 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 539                                   void *coding_system_list_closure)
 540 {
 541   /* This function can GC */
 542   struct coding_system_list_closure *cscl =
 543     (struct coding_system_list_closure *) coding_system_list_closure;
 544   Lisp_Object *coding_system_list = cscl->coding_system_list;
 545
 546   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 547                                *coding_system_list);
 548   return 0;
 549 }
 550
 551 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 552 Return a list of the names of all defined coding systems.
 553 */
 554        ())
 555 {
 556   Lisp_Object coding_system_list = Qnil;
 557   struct gcpro gcpro1;
 558   struct coding_system_list_closure coding_system_list_closure;
 559
 560   GCPRO1 (coding_system_list);
 561   coding_system_list_closure.coding_system_list = &coding_system_list;
 562   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 563                  &coding_system_list_closure);
 564   UNGCPRO;
 565
 566   return coding_system_list;
 567 }
 568
 569 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 570 Return the name of the given coding system.
 571 */
 572        (coding_system))
 573 {
 574   coding_system = Fget_coding_system (coding_system);
 575   return XCODING_SYSTEM_NAME (coding_system);
 576 }
 577
 578 static Lisp_Coding_System *
 579 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 580 {
 581   Lisp_Coding_System *codesys =
 582     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 583
 584   zero_lcrecord (codesys);
 585   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 586   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 587   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 588   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 589   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 590   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 591   CODING_SYSTEM_TYPE     (codesys) = type;
 592   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 593 #ifdef MULE
 594   if (type == CODESYS_ISO2022)
 595     {
 596       int i;
 597       for (i = 0; i < 4; i++)
 598         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 599     }
 600   else if (type == CODESYS_CCL)
 601     {
 602       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 603       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 604     }
 605 #endif /* MULE */
 606   CODING_SYSTEM_NAME (codesys) = name;
 607
 608   return codesys;
 609 }
 610
 611 #ifdef MULE
 612 /* Given a list of charset conversion specs as specified in a Lisp
 613    program, parse it into STORE_HERE. */
 614
 615 static void
 616 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 617                                 Lisp_Object spec_list)
 618 {
 619   Lisp_Object rest;
 620
 621   EXTERNAL_LIST_LOOP (rest, spec_list)
 622     {
 623       Lisp_Object car = XCAR (rest);
 624       Lisp_Object from, to;
 625       struct charset_conversion_spec spec;
 626
 627       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 628         signal_simple_error ("Invalid charset conversion spec", car);
 629       from = Fget_charset (XCAR (car));
 630       to = Fget_charset (XCAR (XCDR (car)));
 631       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 632         signal_simple_error_2
 633           ("Attempted conversion between different charset types",
 634            from, to);
 635       spec.from_charset = from;
 636       spec.to_charset = to;
 637
 638       Dynarr_add (store_here, spec);
 639     }
 640 }
 641
 642 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 643    specs, return the equivalent as the Lisp programmer would see it.
 644
 645    If LOAD_HERE is 0, return Qnil. */
 646
 647 static Lisp_Object
 648 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 649 {
 650   int i;
 651   Lisp_Object result;
 652
 653   if (!load_here)
 654     return Qnil;
 655   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 656     {
 657       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 658       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 659     }
 660
 661   return Fnreverse (result);
 662 }
 663
 664 #endif /* MULE */
 665
 666 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 667 Register symbol NAME as a coding system.
 668
 669 TYPE describes the conversion method used and should be one of
 670
 671 nil or 'undecided
 672      Automatic conversion.  XEmacs attempts to detect the coding system
 673      used in the file.
 674 'no-conversion
 675      No conversion.  Use this for binary files and such.  On output,
 676      graphic characters that are not in ASCII or Latin-1 will be
 677      replaced by a ?. (For a no-conversion-encoded buffer, these
 678      characters will only be present if you explicitly insert them.)
 679 'shift-jis
 680      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 681 'ucs-4
 682      ISO 10646 UCS-4 encoding.
 683 'utf-8
 684      ISO 10646 UTF-8 encoding.
 685 'iso2022
 686      Any ISO2022-compliant encoding.  Among other things, this includes
 687      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 688      standard Unix encoding for Japanese and other languages), and
 689      Compound Text (the encoding used in X11).  You can specify more
 690      specific information about the conversion with the FLAGS argument.
 691 'big5
 692      Big5 (the encoding commonly used for Taiwanese).
 693 'ccl
 694      The conversion is performed using a user-written pseudo-code
 695      program.  CCL (Code Conversion Language) is the name of this
 696      pseudo-code.
 697 'internal
 698      Write out or read in the raw contents of the memory representing
 699      the buffer's text.  This is primarily useful for debugging
 700      purposes, and is only enabled when XEmacs has been compiled with
 701      DEBUG_XEMACS defined (via the --debug configure option).
 702      WARNING: Reading in a file using 'internal conversion can result
 703      in an internal inconsistency in the memory representing a
 704      buffer's text, which will produce unpredictable results and may
 705      cause XEmacs to crash.  Under normal circumstances you should
 706      never use 'internal conversion.
 707
 708 DOC-STRING is a string describing the coding system.
 709
 710 PROPS is a property list, describing the specific nature of the
 711 character set.  Recognized properties are:
 712
 713 'mnemonic
 714      String to be displayed in the modeline when this coding system is
 715      active.
 716
 717 'eol-type
 718      End-of-line conversion to be used.  It should be one of
 719
 720         nil
 721                 Automatically detect the end-of-line type (LF, CRLF,
 722                 or CR).  Also generate subsidiary coding systems named
 723                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 724                 identical to this coding system but have an EOL-TYPE
 725                 value of 'lf, 'crlf, and 'cr, respectively.
 726         'lf
 727                 The end of a line is marked externally using ASCII LF.
 728                 Since this is also the way that XEmacs represents an
 729                 end-of-line internally, specifying this option results
 730                 in no end-of-line conversion.  This is the standard
 731                 format for Unix text files.
 732         'crlf
 733                 The end of a line is marked externally using ASCII
 734                 CRLF.  This is the standard format for MS-DOS text
 735                 files.
 736         'cr
 737                 The end of a line is marked externally using ASCII CR.
 738                 This is the standard format for Macintosh text files.
 739         t
 740                 Automatically detect the end-of-line type but do not
 741                 generate subsidiary coding systems.  (This value is
 742                 converted to nil when stored internally, and
 743                 `coding-system-property' will return nil.)
 744
 745 'post-read-conversion
 746      Function called after a file has been read in, to perform the
 747      decoding.  Called with two arguments, BEG and END, denoting
 748      a region of the current buffer to be decoded.
 749
 750 'pre-write-conversion
 751      Function called before a file is written out, to perform the
 752      encoding.  Called with two arguments, BEG and END, denoting
 753      a region of the current buffer to be encoded.
 754
 755
 756 The following additional properties are recognized if TYPE is 'iso2022:
 757
 758 'charset-g0
 759 'charset-g1
 760 'charset-g2
 761 'charset-g3
 762      The character set initially designated to the G0 - G3 registers.
 763      The value should be one of
 764
 765           -- A charset object (designate that character set)
 766           -- nil (do not ever use this register)
 767           -- t (no character set is initially designated to
 768                 the register, but may be later on; this automatically
 769                 sets the corresponding `force-g*-on-output' property)
 770
 771 'force-g0-on-output
 772 'force-g1-on-output
 773 'force-g2-on-output
 774 'force-g2-on-output
 775      If non-nil, send an explicit designation sequence on output before
 776      using the specified register.
 777
 778 'short
 779      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 780      "ESC $ B" on output in place of the full designation sequences
 781      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 782
 783 'no-ascii-eol
 784      If non-nil, don't designate ASCII to G0 at each end of line on output.
 785      Setting this to non-nil also suppresses other state-resetting that
 786      normally happens at the end of a line.
 787
 788 'no-ascii-cntl
 789      If non-nil, don't designate ASCII to G0 before control chars on output.
 790
 791 'seven
 792      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 793      environment.
 794
 795 'lock-shift
 796      If non-nil, use locking-shift (SO/SI) instead of single-shift
 797      or designation by escape sequence.
 798
 799 'no-iso6429
 800      If non-nil, don't use ISO6429's direction specification.
 801
 802 'escape-quoted
 803      If non-nil, literal control characters that are the same as
 804      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 805      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 806      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 807      so that they can be properly distinguished from an escape sequence.
 808      (Note that doing this results in a non-portable encoding.) This
 809      encoding flag is used for byte-compiled files.  Note that ESC
 810      is a good choice for a quoting character because there are no
 811      escape sequences whose second byte is a character from the Control-0
 812      or Control-1 character sets; this is explicitly disallowed by the
 813      ISO2022 standard.
 814
 815 'input-charset-conversion
 816      A list of conversion specifications, specifying conversion of
 817      characters in one charset to another when decoding is performed.
 818      Each specification is a list of two elements: the source charset,
 819      and the destination charset.
 820
 821 'output-charset-conversion
 822      A list of conversion specifications, specifying conversion of
 823      characters in one charset to another when encoding is performed.
 824      The form of each specification is the same as for
 825      'input-charset-conversion.
 826
 827
 828 The following additional properties are recognized (and required)
 829 if TYPE is 'ccl:
 830
 831 'decode
 832      CCL program used for decoding (converting to internal format).
 833
 834 'encode
 835      CCL program used for encoding (converting to external format).
 836 */
 837        (name, type, doc_string, props))
 838 {
 839   Lisp_Coding_System *codesys;
 840   Lisp_Object rest, key, value;
 841   enum coding_system_type ty;
 842   int need_to_setup_eol_systems = 1;
 843
 844   /* Convert type to constant */
 845   if (NILP (type) || EQ (type, Qundecided))
 846                                       { ty = CODESYS_AUTODETECT; }
 847 #ifdef MULE
 848   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 849   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 850   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 851   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 852   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 853   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 854 #endif
 855   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 856 #ifdef DEBUG_XEMACS
 857   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 858 #endif
 859   else
 860     signal_simple_error ("Invalid coding system type", type);
 861
 862   CHECK_SYMBOL (name);
 863
 864   codesys = allocate_coding_system (ty, name);
 865
 866   if (NILP (doc_string))
 867     doc_string = build_string ("");
 868   else
 869     CHECK_STRING (doc_string);
 870   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 871
 872   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 873     {
 874       if (EQ (key, Qmnemonic))
 875         {
 876           if (!NILP (value))
 877             CHECK_STRING (value);
 878           CODING_SYSTEM_MNEMONIC (codesys) = value;
 879         }
 880
 881       else if (EQ (key, Qeol_type))
 882         {
 883           need_to_setup_eol_systems = NILP (value);
 884           if (EQ (value, Qt))
 885             value = Qnil;
 886           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 887         }
 888
 889       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 890       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 891 #ifdef MULE
 892       else if (ty == CODESYS_ISO2022)
 893         {
 894 #define FROB_INITIAL_CHARSET(charset_num) \
 895   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 896     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 897
 898           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 899           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 900           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 901           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 902
 903 #define FROB_FORCE_CHARSET(charset_num) \
 904   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 905
 906           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 907           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 908           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 909           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 910
 911 #define FROB_BOOLEAN_PROPERTY(prop) \
 912   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 913
 914           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 915           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 916           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 917           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 918           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 919           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 920           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 921
 922           else if (EQ (key, Qinput_charset_conversion))
 923             {
 924               codesys->iso2022.input_conv =
 925                 Dynarr_new (charset_conversion_spec);
 926               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 927                                               value);
 928             }
 929           else if (EQ (key, Qoutput_charset_conversion))
 930             {
 931               codesys->iso2022.output_conv =
 932                 Dynarr_new (charset_conversion_spec);
 933               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 934                                               value);
 935             }
 936           else
 937             signal_simple_error ("Unrecognized property", key);
 938         }
 939       else if (EQ (type, Qccl))
 940         {
 941           if (EQ (key, Qdecode))
 942             {
 943               CHECK_VECTOR (value);
 944               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 945             }
 946           else if (EQ (key, Qencode))
 947             {
 948               CHECK_VECTOR (value);
 949               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 950             }
 951           else
 952             signal_simple_error ("Unrecognized property", key);
 953         }
 954 #endif /* MULE */
 955       else
 956         signal_simple_error ("Unrecognized property", key);
 957     }
 958
 959   if (need_to_setup_eol_systems)
 960     setup_eol_coding_systems (codesys);
 961
 962   {
 963     Lisp_Object codesys_obj;
 964     XSETCODING_SYSTEM (codesys_obj, codesys);
 965     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 966     return codesys_obj;
 967   }
 968 }
 969
 970 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 971 Copy OLD-CODING-SYSTEM to NEW-NAME.
 972 If NEW-NAME does not name an existing coding system, a new one will
 973 be created.
 974 */
 975        (old_coding_system, new_name))
 976 {
 977   Lisp_Object new_coding_system;
 978   old_coding_system = Fget_coding_system (old_coding_system);
 979   new_coding_system = Ffind_coding_system (new_name);
 980   if (NILP (new_coding_system))
 981     {
 982       XSETCODING_SYSTEM (new_coding_system,
 983                          allocate_coding_system
 984                          (XCODING_SYSTEM_TYPE (old_coding_system),
 985                           new_name));
 986       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 987     }
 988
 989   {
 990     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 991     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 992     memcpy (((char *) to  ) + sizeof (to->header),
 993             ((char *) from) + sizeof (from->header),
 994             sizeof (*from) - sizeof (from->header));
 995     to->name = new_name;
 996   }
 997   return new_coding_system;
 998 }
 999
1000 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1001 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1002 */
1003        (alias, coding_system))
1004 {
1005   CHECK_SYMBOL (alias);
1006   if (!NILP (Ffind_coding_system (alias)))
1007     signal_simple_error ("Symbol already names a coding system", alias);
1008   coding_system = Fget_coding_system (coding_system);
1009   Fputhash (alias, coding_system, Vcoding_system_hash_table);
1010
1011   /* Set up aliases for subsidiaries. */
1012   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1013     {
1014       Lisp_Object str;
1015       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1016 #define FROB(type, name)                                                        \
1017       do {                                                                      \
1018         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1019         if (!NILP (subsidiary))                                                 \
1020           Fdefine_coding_system_alias                                           \
1021             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1022       } while (0)
1023       FROB (LF,   "-unix");
1024       FROB (CRLF, "-dos");
1025       FROB (CR,   "-mac");
1026 #undef FROB
1027     }
1028   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1029      but it doesn't look intentional, so I'd rather return something
1030      meaningful or nothing at all. */
1031   return Qnil;
1032 }
1033
1034 static Lisp_Object
1035 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1036 {
1037   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1038   Lisp_Object new_coding_system;
1039
1040   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1041     return coding_system;
1042
1043   switch (type)
1044     {
1045     case EOL_AUTODETECT: return coding_system;
1046     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1047     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1048     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1049     default:       abort ();
1050     }
1051
1052   return NILP (new_coding_system) ? coding_system : new_coding_system;
1053 }
1054
1055 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1056 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1057 */
1058        (coding_system, eol_type))
1059 {
1060   coding_system = Fget_coding_system (coding_system);
1061
1062   return subsidiary_coding_system (coding_system,
1063                                    symbol_to_eol_type (eol_type));
1064 }
1065
1066 \f
1067 /************************************************************************/
1068 /*                         Coding system accessors                      */
1069 /************************************************************************/
1070
1071 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1072 Return the doc string for CODING-SYSTEM.
1073 */
1074        (coding_system))
1075 {
1076   coding_system = Fget_coding_system (coding_system);
1077   return XCODING_SYSTEM_DOC_STRING (coding_system);
1078 }
1079
1080 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1081 Return the type of CODING-SYSTEM.
1082 */
1083        (coding_system))
1084 {
1085   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1086     {
1087     default: abort ();
1088     case CODESYS_AUTODETECT:    return Qundecided;
1089 #ifdef MULE
1090     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1091     case CODESYS_ISO2022:       return Qiso2022;
1092     case CODESYS_BIG5:          return Qbig5;
1093     case CODESYS_UCS4:          return Qucs4;
1094     case CODESYS_UTF8:          return Qutf8;
1095     case CODESYS_CCL:           return Qccl;
1096 #endif
1097     case CODESYS_NO_CONVERSION: return Qno_conversion;
1098 #ifdef DEBUG_XEMACS
1099     case CODESYS_INTERNAL:      return Qinternal;
1100 #endif
1101     }
1102 }
1103
1104 #ifdef MULE
1105 static
1106 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1107 {
1108   Lisp_Object cs
1109     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1110
1111   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1112 }
1113
1114 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1115 Return initial charset of CODING-SYSTEM designated to GNUM.
1116 GNUM allows 0 .. 3.
1117 */
1118        (coding_system, gnum))
1119 {
1120   coding_system = Fget_coding_system (coding_system);
1121   CHECK_INT (gnum);
1122
1123   return coding_system_charset (coding_system, XINT (gnum));
1124 }
1125 #endif /* MULE */
1126
1127 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1128 Return the PROP property of CODING-SYSTEM.
1129 */
1130        (coding_system, prop))
1131 {
1132   int i, ok = 0;
1133   enum coding_system_type type;
1134
1135   coding_system = Fget_coding_system (coding_system);
1136   CHECK_SYMBOL (prop);
1137   type = XCODING_SYSTEM_TYPE (coding_system);
1138
1139   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1140     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1141       {
1142         ok = 1;
1143         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1144           {
1145           case CODESYS_PROP_ALL_OK:
1146             break;
1147 #ifdef MULE
1148           case CODESYS_PROP_ISO2022:
1149             if (type != CODESYS_ISO2022)
1150               signal_simple_error
1151                 ("Property only valid in ISO2022 coding systems",
1152                  prop);
1153             break;
1154
1155           case CODESYS_PROP_CCL:
1156             if (type != CODESYS_CCL)
1157               signal_simple_error
1158                 ("Property only valid in CCL coding systems",
1159                  prop);
1160             break;
1161 #endif /* MULE */
1162           default:
1163             abort ();
1164           }
1165       }
1166
1167   if (!ok)
1168     signal_simple_error ("Unrecognized property", prop);
1169
1170   if (EQ (prop, Qname))
1171     return XCODING_SYSTEM_NAME (coding_system);
1172   else if (EQ (prop, Qtype))
1173     return Fcoding_system_type (coding_system);
1174   else if (EQ (prop, Qdoc_string))
1175     return XCODING_SYSTEM_DOC_STRING (coding_system);
1176   else if (EQ (prop, Qmnemonic))
1177     return XCODING_SYSTEM_MNEMONIC (coding_system);
1178   else if (EQ (prop, Qeol_type))
1179     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1180   else if (EQ (prop, Qeol_lf))
1181     return XCODING_SYSTEM_EOL_LF (coding_system);
1182   else if (EQ (prop, Qeol_crlf))
1183     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1184   else if (EQ (prop, Qeol_cr))
1185     return XCODING_SYSTEM_EOL_CR (coding_system);
1186   else if (EQ (prop, Qpost_read_conversion))
1187     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1188   else if (EQ (prop, Qpre_write_conversion))
1189     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1190 #ifdef MULE
1191   else if (type == CODESYS_ISO2022)
1192     {
1193       if (EQ (prop, Qcharset_g0))
1194         return coding_system_charset (coding_system, 0);
1195       else if (EQ (prop, Qcharset_g1))
1196         return coding_system_charset (coding_system, 1);
1197       else if (EQ (prop, Qcharset_g2))
1198         return coding_system_charset (coding_system, 2);
1199       else if (EQ (prop, Qcharset_g3))
1200         return coding_system_charset (coding_system, 3);
1201
1202 #define FORCE_CHARSET(charset_num) \
1203   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1204    (coding_system, charset_num) ? Qt : Qnil)
1205
1206       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1207       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1208       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1209       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1210
1211 #define LISP_BOOLEAN(prop) \
1212   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1213
1214       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1215       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1216       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1217       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1218       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1219       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1220       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1221
1222       else if (EQ (prop, Qinput_charset_conversion))
1223         return
1224           unparse_charset_conversion_specs
1225             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1226       else if (EQ (prop, Qoutput_charset_conversion))
1227         return
1228           unparse_charset_conversion_specs
1229             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1230       else
1231         abort ();
1232     }
1233   else if (type == CODESYS_CCL)
1234     {
1235       if (EQ (prop, Qdecode))
1236         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1237       else if (EQ (prop, Qencode))
1238         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1239       else
1240         abort ();
1241     }
1242 #endif /* MULE */
1243   else
1244     abort ();
1245
1246   return Qnil; /* not reached */
1247 }
1248
1249 \f
1250 /************************************************************************/
1251 /*                       Coding category functions                      */
1252 /************************************************************************/
1253
1254 static int
1255 decode_coding_category (Lisp_Object symbol)
1256 {
1257   int i;
1258
1259   CHECK_SYMBOL (symbol);
1260   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1261     if (EQ (coding_category_symbol[i], symbol))
1262       return i;
1263
1264   signal_simple_error ("Unrecognized coding category", symbol);
1265   return 0; /* not reached */
1266 }
1267
1268 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1269 Return a list of all recognized coding categories.
1270 */
1271        ())
1272 {
1273   int i;
1274   Lisp_Object list = Qnil;
1275
1276   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1277     list = Fcons (coding_category_symbol[i], list);
1278   return list;
1279 }
1280
1281 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1282 Change the priority order of the coding categories.
1283 LIST should be list of coding categories, in descending order of
1284 priority.  Unspecified coding categories will be lower in priority
1285 than all specified ones, in the same relative order they were in
1286 previously.
1287 */
1288        (list))
1289 {
1290   int category_to_priority[CODING_CATEGORY_LAST + 1];
1291   int i, j;
1292   Lisp_Object rest;
1293
1294   /* First generate a list that maps coding categories to priorities. */
1295
1296   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1297     category_to_priority[i] = -1;
1298
1299   /* Highest priority comes from the specified list. */
1300   i = 0;
1301   EXTERNAL_LIST_LOOP (rest, list)
1302     {
1303       int cat = decode_coding_category (XCAR (rest));
1304
1305       if (category_to_priority[cat] >= 0)
1306         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1307       category_to_priority[cat] = i++;
1308     }
1309
1310   /* Now go through the existing categories by priority to retrieve
1311      the categories not yet specified and preserve their priority
1312      order. */
1313   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1314     {
1315       int cat = coding_category_by_priority[j];
1316       if (category_to_priority[cat] < 0)
1317         category_to_priority[cat] = i++;
1318     }
1319
1320   /* Now we need to construct the inverse of the mapping we just
1321      constructed. */
1322
1323   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1324     coding_category_by_priority[category_to_priority[i]] = i;
1325
1326   /* Phew!  That was confusing. */
1327   return Qnil;
1328 }
1329
1330 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1331 Return a list of coding categories in descending order of priority.
1332 */
1333        ())
1334 {
1335   int i;
1336   Lisp_Object list = Qnil;
1337
1338   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1339     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1340                   list);
1341   return list;
1342 }
1343
1344 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1345 Change the coding system associated with a coding category.
1346 */
1347        (coding_category, coding_system))
1348 {
1349   int cat = decode_coding_category (coding_category);
1350
1351   coding_system = Fget_coding_system (coding_system);
1352   coding_category_system[cat] = coding_system;
1353   return Qnil;
1354 }
1355
1356 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1357 Return the coding system associated with a coding category.
1358 */
1359        (coding_category))
1360 {
1361   int cat = decode_coding_category (coding_category);
1362   Lisp_Object sys = coding_category_system[cat];
1363
1364   if (!NILP (sys))
1365     return XCODING_SYSTEM_NAME (sys);
1366   return Qnil;
1367 }
1368
1369 \f
1370 /************************************************************************/
1371 /*                     Detecting the encoding of data                   */
1372 /************************************************************************/
1373
1374 struct detection_state
1375 {
1376   enum eol_type eol_type;
1377   int seen_non_ascii;
1378   int mask;
1379 #ifdef MULE
1380   struct
1381     {
1382       int mask;
1383       int in_second_byte;
1384     }
1385   big5;
1386
1387   struct
1388     {
1389       int mask;
1390       int in_second_byte;
1391     }
1392   shift_jis;
1393
1394   struct
1395     {
1396       int mask;
1397       int in_byte;
1398   }
1399   ucs4;
1400
1401   struct
1402     {
1403       int mask;
1404       int in_byte;
1405     }
1406   utf8;
1407
1408   struct
1409     {
1410       int mask;
1411       int initted;
1412       struct iso2022_decoder iso;
1413       unsigned int flags;
1414       int high_byte_count;
1415       unsigned int saw_single_shift:1;
1416     }
1417   iso2022;
1418 #endif
1419   struct
1420     {
1421       int seen_anything;
1422       int just_saw_cr;
1423     }
1424   eol;
1425 };
1426
1427 static int
1428 acceptable_control_char_p (int c)
1429 {
1430   switch (c)
1431     {
1432       /* Allow and ignore control characters that you might
1433          reasonably see in a text file */
1434     case '\r':
1435     case '\n':
1436     case '\t':
1437     case  7: /* bell */
1438     case  8: /* backspace */
1439     case 11: /* vertical tab */
1440     case 12: /* form feed */
1441     case 26: /* MS-DOS C-z junk */
1442     case 31: /* '^_' -- for info */
1443       return 1;
1444     default:
1445       return 0;
1446     }
1447 }
1448
1449 static int
1450 mask_has_at_most_one_bit_p (int mask)
1451 {
1452   /* Perhaps the only thing useful you learn from intensive Microsoft
1453      technical interviews */
1454   return (mask & (mask - 1)) == 0;
1455 }
1456
1457 static enum eol_type
1458 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1459                  unsigned int n)
1460 {
1461   int c;
1462
1463   while (n--)
1464     {
1465       c = *src++;
1466       if (c == '\r')
1467         st->eol.just_saw_cr = 1;
1468       else
1469         {
1470           if (c == '\n')
1471             {
1472               if (st->eol.just_saw_cr)
1473                 return EOL_CRLF;
1474               else if (st->eol.seen_anything)
1475                 return EOL_LF;
1476             }
1477           else if (st->eol.just_saw_cr)
1478             return EOL_CR;
1479           st->eol.just_saw_cr = 0;
1480         }
1481       st->eol.seen_anything = 1;
1482     }
1483
1484   return EOL_AUTODETECT;
1485 }
1486
1487 /* Attempt to determine the encoding and EOL type of the given text.
1488    Before calling this function for the first type, you must initialize
1489    st->eol_type as appropriate and initialize st->mask to ~0.
1490
1491    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1492    not yet known.
1493
1494    st->mask holds the determined coding category mask, or ~0 if only
1495    ASCII has been seen so far.
1496
1497    Returns:
1498
1499    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1500         is present in st->mask
1501    1 == definitive answers are here for both st->eol_type and st->mask
1502 */
1503
1504 static int
1505 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1506                     unsigned int n, int just_do_eol)
1507 {
1508   int c;
1509
1510   if (st->eol_type == EOL_AUTODETECT)
1511     st->eol_type = detect_eol_type (st, src, n);
1512
1513   if (just_do_eol)
1514     return st->eol_type != EOL_AUTODETECT;
1515
1516   if (!st->seen_non_ascii)
1517     {
1518       for (; n; n--, src++)
1519         {
1520           c = *src;
1521           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1522             {
1523               st->seen_non_ascii = 1;
1524 #ifdef MULE
1525               st->shift_jis.mask = ~0;
1526               st->big5.mask = ~0;
1527               st->ucs4.mask = ~0;
1528               st->utf8.mask = ~0;
1529               st->iso2022.mask = ~0;
1530 #endif
1531               break;
1532             }
1533         }
1534     }
1535
1536   if (!n)
1537     return 0;
1538 #ifdef MULE
1539   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1540     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1541   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1542     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1543   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1544     st->big5.mask = detect_coding_big5 (st, src, n);
1545   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1546     st->utf8.mask = detect_coding_utf8 (st, src, n);
1547   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1548     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1549
1550   st->mask
1551     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1552     | st->utf8.mask | st->ucs4.mask;
1553 #endif
1554   {
1555     int retval = mask_has_at_most_one_bit_p (st->mask);
1556     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1557     return retval && st->eol_type != EOL_AUTODETECT;
1558   }
1559 }
1560
1561 static Lisp_Object
1562 coding_system_from_mask (int mask)
1563 {
1564   if (mask == ~0)
1565     {
1566       /* If the file was entirely or basically ASCII, use the
1567          default value of `buffer-file-coding-system'. */
1568       Lisp_Object retval =
1569         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1570       if (!NILP (retval))
1571         {
1572           retval = Ffind_coding_system (retval);
1573           if (NILP (retval))
1574             {
1575               warn_when_safe
1576                 (Qbad_variable, Qwarning,
1577                  "Invalid `default-buffer-file-coding-system', set to nil");
1578               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1579             }
1580         }
1581       if (NILP (retval))
1582         retval = Fget_coding_system (Qraw_text);
1583       return retval;
1584     }
1585   else
1586     {
1587       int i;
1588       int cat = -1;
1589 #ifdef MULE
1590       mask = postprocess_iso2022_mask (mask);
1591 #endif
1592       /* Look through the coding categories by priority and find
1593          the first one that is allowed. */
1594       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1595         {
1596           cat = coding_category_by_priority[i];
1597           if ((mask & (1 << cat)) &&
1598               !NILP (coding_category_system[cat]))
1599             break;
1600         }
1601       if (cat >= 0)
1602         return coding_category_system[cat];
1603       else
1604         return Fget_coding_system (Qraw_text);
1605     }
1606 }
1607
1608 /* Given a seekable read stream and potential coding system and EOL type
1609    as specified, do any autodetection that is called for.  If the
1610    coding system and/or EOL type are not autodetect, they will be left
1611    alone; but this function will never return an autodetect coding system
1612    or EOL type.
1613
1614    This function does not automatically fetch subsidiary coding systems;
1615    that should be unnecessary with the explicit eol-type argument. */
1616
1617 void
1618 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1619                               enum eol_type *eol_type_in_out)
1620 {
1621   struct detection_state decst;
1622
1623   if (*eol_type_in_out == EOL_AUTODETECT)
1624     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1625
1626   xzero (decst);
1627   decst.eol_type = *eol_type_in_out;
1628   decst.mask = ~0;
1629
1630   /* If autodetection is called for, do it now. */
1631   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1632       *eol_type_in_out == EOL_AUTODETECT)
1633     {
1634       unsigned char random_buffer[4096];
1635       int nread;
1636       Lisp_Object coding_system = Qnil;
1637
1638       nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1639       if (nread)
1640         {
1641           unsigned char *cp = random_buffer;
1642
1643           while (cp < random_buffer + nread)
1644             {
1645               if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1646                   (*cp++ == 'o') && (cp < random_buffer + nread) &&
1647                   (*cp++ == 'd') && (cp < random_buffer + nread) &&
1648                   (*cp++ == 'i') && (cp < random_buffer + nread) &&
1649                   (*cp++ == 'n') && (cp < random_buffer + nread) &&
1650                   (*cp++ == 'g') && (cp < random_buffer + nread) &&
1651                   (*cp++ == ':') && (cp < random_buffer + nread))
1652                 {
1653                   unsigned char coding_system_name[4096 - 6];
1654                   unsigned char *np = coding_system_name;
1655
1656                   while ( (cp < random_buffer + nread)
1657                           && ((*cp == ' ') || (*cp == '\t')) )
1658                     {
1659                       cp++;
1660                     }
1661                   while ( (cp < random_buffer + nread) &&
1662                           (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1663                     {
1664                       *np++ = *cp++;
1665                     }
1666                   *np = 0;
1667                   coding_system
1668                     = Ffind_coding_system (intern (coding_system_name));
1669                   break;
1670                 }
1671             }
1672           if (EQ(coding_system, Qnil))
1673             do{
1674               if (detect_coding_type (&decst, random_buffer, nread,
1675                                       XCODING_SYSTEM_TYPE (*codesys_in_out)
1676                                       != CODESYS_AUTODETECT))
1677                 break;
1678               nread = Lstream_read (stream,
1679                                     random_buffer, sizeof (random_buffer));
1680               if (!nread)
1681                 break;
1682             } while(1);
1683         }
1684       *eol_type_in_out = decst.eol_type;
1685       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1686         {
1687           if (EQ(coding_system, Qnil))
1688             *codesys_in_out = coding_system_from_mask (decst.mask);
1689           else
1690             *codesys_in_out = coding_system;
1691         }
1692     }
1693   /* If we absolutely can't determine the EOL type, just assume LF. */
1694   if (*eol_type_in_out == EOL_AUTODETECT)
1695     *eol_type_in_out = EOL_LF;
1696
1697   Lstream_rewind (stream);
1698 }
1699
1700 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1701 Detect coding system of the text in the region between START and END.
1702 Returned a list of possible coding systems ordered by priority.
1703 If only ASCII characters are found, it returns 'undecided or one of
1704 its subsidiary coding systems according to a detected end-of-line
1705 type.  Optional arg BUFFER defaults to the current buffer.
1706 */
1707        (start, end, buffer))
1708 {
1709   Lisp_Object val = Qnil;
1710   struct buffer *buf = decode_buffer (buffer, 0);
1711   Bufpos b, e;
1712   Lisp_Object instream, lb_instream;
1713   Lstream *istr, *lb_istr;
1714   struct detection_state decst;
1715   struct gcpro gcpro1, gcpro2;
1716
1717   get_buffer_range_char (buf, start, end, &b, &e, 0);
1718   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1719   lb_istr = XLSTREAM (lb_instream);
1720   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1721   istr = XLSTREAM (instream);
1722   GCPRO2 (instream, lb_instream);
1723   xzero (decst);
1724   decst.eol_type = EOL_AUTODETECT;
1725   decst.mask = ~0;
1726   while (1)
1727     {
1728       unsigned char random_buffer[4096];
1729       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1730
1731       if (!nread)
1732         break;
1733       if (detect_coding_type (&decst, random_buffer, nread, 0))
1734         break;
1735     }
1736
1737   if (decst.mask == ~0)
1738     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1739                                     decst.eol_type);
1740   else
1741     {
1742       int i;
1743
1744       val = Qnil;
1745 #ifdef MULE
1746       decst.mask = postprocess_iso2022_mask (decst.mask);
1747 #endif
1748       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1749         {
1750           int sys = coding_category_by_priority[i];
1751           if (decst.mask & (1 << sys))
1752             {
1753               Lisp_Object codesys = coding_category_system[sys];
1754               if (!NILP (codesys))
1755                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1756               val = Fcons (codesys, val);
1757             }
1758         }
1759     }
1760   Lstream_close (istr);
1761   UNGCPRO;
1762   Lstream_delete (istr);
1763   Lstream_delete (lb_istr);
1764   return val;
1765 }
1766
1767 \f
1768 /************************************************************************/
1769 /*           Converting to internal Mule format ("decoding")            */
1770 /************************************************************************/
1771
1772 /* A decoding stream is a stream used for decoding text (i.e.
1773    converting from some external format to internal format).
1774    The decoding-stream object keeps track of the actual coding
1775    stream, the stream that is at the other end, and data that
1776    needs to be persistent across the lifetime of the stream. */
1777
1778 /* Handle the EOL stuff related to just-read-in character C.
1779    EOL_TYPE is the EOL type of the coding stream.
1780    FLAGS is the current value of FLAGS in the coding stream, and may
1781    be modified by this macro.  (The macro only looks at the
1782    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1783    bytes are to be written.  You need to also define a local goto
1784    label "label_continue_loop" that is at the end of the main
1785    character-reading loop.
1786
1787    If C is a CR character, then this macro handles it entirely and
1788    jumps to label_continue_loop.  Otherwise, this macro does not add
1789    anything to DST, and continues normally.  You should continue
1790    processing C normally after this macro. */
1791
1792 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1793 do {                                                            \
1794   if (c == '\r')                                                \
1795     {                                                           \
1796       if (eol_type == EOL_CR)                                   \
1797         Dynarr_add (dst, '\n');                                 \
1798       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1799         Dynarr_add (dst, c);                                    \
1800       else                                                      \
1801         flags |= CODING_STATE_CR;                               \
1802       goto label_continue_loop;                                 \
1803     }                                                           \
1804   else if (flags & CODING_STATE_CR)                             \
1805     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1806       if (c != '\n')                                            \
1807         Dynarr_add (dst, '\r');                                 \
1808       flags &= ~CODING_STATE_CR;                                \
1809     }                                                           \
1810 } while (0)
1811
1812 /* C should be a binary character in the range 0 - 255; convert
1813    to internal format and add to Dynarr DST. */
1814
1815 #ifdef UTF2000
1816 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1817 do {                                            \
1818   if (BYTE_ASCII_P (c))                         \
1819     Dynarr_add (dst, c);                        \
1820   else                                          \
1821     {                                           \
1822       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1823       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1824     }                                           \
1825 } while (0)
1826
1827 INLINE void
1828 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1829 {
1830   if ( c <= 0x7f )
1831     {
1832       Dynarr_add (dst, c);
1833     }
1834   else if ( c <= 0x7ff )
1835     {
1836       Dynarr_add (dst, (c >> 6) | 0xc0);
1837       Dynarr_add (dst, (c & 0x3f) | 0x80);
1838     }
1839   else if ( c <= 0xffff )
1840     {
1841       Dynarr_add (dst,  (c >> 12) | 0xe0);
1842       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1843       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1844     }
1845   else if ( c <= 0x1fffff )
1846     {
1847       Dynarr_add (dst,  (c >> 18) | 0xf0);
1848       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1849       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1850       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1851     }
1852   else if ( c <= 0x3ffffff )
1853     {
1854       Dynarr_add (dst,  (c >> 24) | 0xf8);
1855       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1856       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1857       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1858       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1859     }
1860   else
1861     {
1862       Dynarr_add (dst,  (c >> 30) | 0xfc);
1863       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1864       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1865       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1866       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1867       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1868     }
1869 }
1870 #else
1871 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1872 do {                                            \
1873   if (BYTE_ASCII_P (c))                         \
1874     Dynarr_add (dst, c);                        \
1875   else if (BYTE_C1_P (c))                       \
1876     {                                           \
1877       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1878       Dynarr_add (dst, c + 0x20);               \
1879     }                                           \
1880   else                                          \
1881     {                                           \
1882       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1883       Dynarr_add (dst, c);                      \
1884     }                                           \
1885 } while (0)
1886 #endif
1887
1888 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1889 do {                                    \
1890   if (ch)                               \
1891     {                                   \
1892       DECODE_ADD_BINARY_CHAR (ch, dst); \
1893       ch = 0;                           \
1894     }                                   \
1895 } while (0)
1896
1897 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1898 do {                                    \
1899   if (flags & CODING_STATE_END)         \
1900     {                                   \
1901       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1902       if (flags & CODING_STATE_CR)      \
1903         Dynarr_add (dst, '\r');         \
1904     }                                   \
1905 } while (0)
1906
1907 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1908
1909 struct decoding_stream
1910 {
1911   /* Coding system that governs the conversion. */
1912   Lisp_Coding_System *codesys;
1913
1914   /* Stream that we read the encoded data from or
1915      write the decoded data to. */
1916   Lstream *other_end;
1917
1918   /* If we are reading, then we can return only a fixed amount of
1919      data, so if the conversion resulted in too much data, we store it
1920      here for retrieval the next time around. */
1921   unsigned_char_dynarr *runoff;
1922
1923   /* FLAGS holds flags indicating the current state of the decoding.
1924      Some of these flags are dependent on the coding system. */
1925   unsigned int flags;
1926
1927   /* CH holds a partially built-up character.  Since we only deal
1928      with one- and two-byte characters at the moment, we only use
1929      this to store the first byte of a two-byte character. */
1930   unsigned int ch;
1931
1932   /* EOL_TYPE specifies the type of end-of-line conversion that
1933      currently applies.  We need to keep this separate from the
1934      EOL type stored in CODESYS because the latter might indicate
1935      automatic EOL-type detection while the former will always
1936      indicate a particular EOL type. */
1937   enum eol_type eol_type;
1938 #ifdef MULE
1939   /* Additional ISO2022 information.  We define the structure above
1940      because it's also needed by the detection routines. */
1941   struct iso2022_decoder iso2022;
1942
1943   /* Additional information (the state of the running CCL program)
1944      used by the CCL decoder. */
1945   struct ccl_program ccl;
1946
1947   /* counter for UTF-8 or UCS-4 */
1948   unsigned char counter;
1949 #endif
1950   struct detection_state decst;
1951 };
1952
1953 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1954 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1955 static int decoding_rewinder   (Lstream *stream);
1956 static int decoding_seekable_p (Lstream *stream);
1957 static int decoding_flusher    (Lstream *stream);
1958 static int decoding_closer     (Lstream *stream);
1959
1960 static Lisp_Object decoding_marker (Lisp_Object stream,
1961                                     void (*markobj) (Lisp_Object));
1962
1963 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1964                                sizeof (struct decoding_stream));
1965
1966 static Lisp_Object
1967 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1968 {
1969   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1970   Lisp_Object str_obj;
1971
1972   /* We do not need to mark the coding systems or charsets stored
1973      within the stream because they are stored in a global list
1974      and automatically marked. */
1975
1976   XSETLSTREAM (str_obj, str);
1977   markobj (str_obj);
1978   if (str->imp->marker)
1979     return (str->imp->marker) (str_obj, markobj);
1980   else
1981     return Qnil;
1982 }
1983
1984 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1985    so we read data from the other end, decode it, and store it into DATA. */
1986
1987 static int
1988 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1989 {
1990   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1991   unsigned char *orig_data = data;
1992   int read_size;
1993   int error_occurred = 0;
1994
1995   /* We need to interface to mule_decode(), which expects to take some
1996      amount of data and store the result into a Dynarr.  We have
1997      mule_decode() store into str->runoff, and take data from there
1998      as necessary. */
1999
2000   /* We loop until we have enough data, reading chunks from the other
2001      end and decoding it. */
2002   while (1)
2003     {
2004       /* Take data from the runoff if we can.  Make sure to take at
2005          most SIZE bytes, and delete the data from the runoff. */
2006       if (Dynarr_length (str->runoff) > 0)
2007         {
2008           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2009           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2010           Dynarr_delete_many (str->runoff, 0, chunk);
2011           data += chunk;
2012           size -= chunk;
2013         }
2014
2015       if (size == 0)
2016         break; /* No more room for data */
2017
2018       if (str->flags & CODING_STATE_END)
2019         /* This means that on the previous iteration, we hit the EOF on
2020            the other end.  We loop once more so that mule_decode() can
2021            output any final stuff it may be holding, or any "go back
2022            to a sane state" escape sequences. (This latter makes sense
2023            during encoding.) */
2024         break;
2025
2026       /* Exhausted the runoff, so get some more.  DATA has at least
2027          SIZE bytes left of storage in it, so it's OK to read directly
2028          into it.  (We'll be overwriting above, after we've decoded it
2029          into the runoff.) */
2030       read_size = Lstream_read (str->other_end, data, size);
2031       if (read_size < 0)
2032         {
2033           error_occurred = 1;
2034           break;
2035         }
2036       if (read_size == 0)
2037         /* There might be some more end data produced in the translation.
2038            See the comment above. */
2039         str->flags |= CODING_STATE_END;
2040       mule_decode (stream, data, str->runoff, read_size);
2041     }
2042
2043   if (data - orig_data == 0)
2044     return error_occurred ? -1 : 0;
2045   else
2046     return data - orig_data;
2047 }
2048
2049 static int
2050 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2051 {
2052   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2053   int retval;
2054
2055   /* Decode all our data into the runoff, and then attempt to write
2056      it all out to the other end.  Remove whatever chunk we succeeded
2057      in writing. */
2058   mule_decode (stream, data, str->runoff, size);
2059   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2060                           Dynarr_length (str->runoff));
2061   if (retval > 0)
2062     Dynarr_delete_many (str->runoff, 0, retval);
2063   /* Do NOT return retval.  The return value indicates how much
2064      of the incoming data was written, not how many bytes were
2065      written. */
2066   return size;
2067 }
2068
2069 static void
2070 reset_decoding_stream (struct decoding_stream *str)
2071 {
2072 #ifdef MULE
2073   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2074     {
2075       Lisp_Object coding_system;
2076       XSETCODING_SYSTEM (coding_system, str->codesys);
2077       reset_iso2022 (coding_system, &str->iso2022);
2078     }
2079   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2080     {
2081       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2082     }
2083   str->counter = 0;
2084 #endif /* MULE */
2085   str->flags = str->ch = 0;
2086 }
2087
2088 static int
2089 decoding_rewinder (Lstream *stream)
2090 {
2091   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2092   reset_decoding_stream (str);
2093   Dynarr_reset (str->runoff);
2094   return Lstream_rewind (str->other_end);
2095 }
2096
2097 static int
2098 decoding_seekable_p (Lstream *stream)
2099 {
2100   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2101   return Lstream_seekable_p (str->other_end);
2102 }
2103
2104 static int
2105 decoding_flusher (Lstream *stream)
2106 {
2107   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2108   return Lstream_flush (str->other_end);
2109 }
2110
2111 static int
2112 decoding_closer (Lstream *stream)
2113 {
2114   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2115   if (stream->flags & LSTREAM_FL_WRITE)
2116     {
2117       str->flags |= CODING_STATE_END;
2118       decoding_writer (stream, 0, 0);
2119     }
2120   Dynarr_free (str->runoff);
2121 #ifdef MULE
2122 #ifdef ENABLE_COMPOSITE_CHARS
2123   if (str->iso2022.composite_chars)
2124     Dynarr_free (str->iso2022.composite_chars);
2125 #endif
2126 #endif
2127   return Lstream_close (str->other_end);
2128 }
2129
2130 Lisp_Object
2131 decoding_stream_coding_system (Lstream *stream)
2132 {
2133   Lisp_Object coding_system;
2134   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2135
2136   XSETCODING_SYSTEM (coding_system, str->codesys);
2137   return subsidiary_coding_system (coding_system, str->eol_type);
2138 }
2139
2140 void
2141 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2142 {
2143   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2144   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2145   str->codesys = cs;
2146   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2147     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2148   reset_decoding_stream (str);
2149 }
2150
2151 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2152    stream for writing, no automatic code detection will be performed.
2153    The reason for this is that automatic code detection requires a
2154    seekable input.  Things will also fail if you open a decoding
2155    stream for reading using a non-fully-specified coding system and
2156    a non-seekable input stream. */
2157
2158 static Lisp_Object
2159 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2160                         CONST char *mode)
2161 {
2162   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2163   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2164   Lisp_Object obj;
2165
2166   xzero (*str);
2167   str->other_end = stream;
2168   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2169   str->eol_type = EOL_AUTODETECT;
2170   if (!strcmp (mode, "r")
2171       && Lstream_seekable_p (stream))
2172     /* We can determine the coding system now. */
2173     determine_real_coding_system (stream, &codesys, &str->eol_type);
2174   set_decoding_stream_coding_system (lstr, codesys);
2175   str->decst.eol_type = str->eol_type;
2176   str->decst.mask = ~0;
2177   XSETLSTREAM (obj, lstr);
2178   return obj;
2179 }
2180
2181 Lisp_Object
2182 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2183 {
2184   return make_decoding_stream_1 (stream, codesys, "r");
2185 }
2186
2187 Lisp_Object
2188 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2189 {
2190   return make_decoding_stream_1 (stream, codesys, "w");
2191 }
2192
2193 /* Note: the decode_coding_* functions all take the same
2194    arguments as mule_decode(), which is to say some SRC data of
2195    size N, which is to be stored into dynamic array DST.
2196    DECODING is the stream within which the decoding is
2197    taking place, but no data is actually read from or
2198    written to that stream; that is handled in decoding_reader()
2199    or decoding_writer().  This allows the same functions to
2200    be used for both reading and writing. */
2201
2202 static void
2203 mule_decode (Lstream *decoding, CONST unsigned char *src,
2204              unsigned_char_dynarr *dst, unsigned int n)
2205 {
2206   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2207
2208   /* If necessary, do encoding-detection now.  We do this when
2209      we're a writing stream or a non-seekable reading stream,
2210      meaning that we can't just process the whole input,
2211      rewind, and start over. */
2212
2213   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2214       str->eol_type == EOL_AUTODETECT)
2215     {
2216       Lisp_Object codesys;
2217
2218       XSETCODING_SYSTEM (codesys, str->codesys);
2219       detect_coding_type (&str->decst, src, n,
2220                           CODING_SYSTEM_TYPE (str->codesys) !=
2221                           CODESYS_AUTODETECT);
2222       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2223           str->decst.mask != ~0)
2224         /* #### This is cheesy.  What we really ought to do is
2225            buffer up a certain amount of data so as to get a
2226            less random result. */
2227         codesys = coding_system_from_mask (str->decst.mask);
2228       str->eol_type = str->decst.eol_type;
2229       if (XCODING_SYSTEM (codesys) != str->codesys)
2230         {
2231           /* Preserve the CODING_STATE_END flag in case it was set.
2232              If we erase it, bad things might happen. */
2233           int was_end = str->flags & CODING_STATE_END;
2234           set_decoding_stream_coding_system (decoding, codesys);
2235           if (was_end)
2236             str->flags |= CODING_STATE_END;
2237         }
2238     }
2239
2240   switch (CODING_SYSTEM_TYPE (str->codesys))
2241     {
2242 #ifdef DEBUG_XEMACS
2243     case CODESYS_INTERNAL:
2244       Dynarr_add_many (dst, src, n);
2245       break;
2246 #endif
2247     case CODESYS_AUTODETECT:
2248       /* If we got this far and still haven't decided on the coding
2249          system, then do no conversion. */
2250     case CODESYS_NO_CONVERSION:
2251       decode_coding_no_conversion (decoding, src, dst, n);
2252       break;
2253 #ifdef MULE
2254     case CODESYS_SHIFT_JIS:
2255       decode_coding_sjis (decoding, src, dst, n);
2256       break;
2257     case CODESYS_BIG5:
2258       decode_coding_big5 (decoding, src, dst, n);
2259       break;
2260     case CODESYS_UCS4:
2261       decode_coding_ucs4 (decoding, src, dst, n);
2262       break;
2263     case CODESYS_UTF8:
2264       decode_coding_utf8 (decoding, src, dst, n);
2265       break;
2266     case CODESYS_CCL:
2267       str->ccl.last_block = str->flags & CODING_STATE_END;
2268       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2269       break;
2270     case CODESYS_ISO2022:
2271       decode_coding_iso2022 (decoding, src, dst, n);
2272       break;
2273 #endif /* MULE */
2274     default:
2275       abort ();
2276     }
2277 }
2278
2279 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2280 Decode the text between START and END which is encoded in CODING-SYSTEM.
2281 This is useful if you've read in encoded text from a file without decoding
2282 it (e.g. you read in a JIS-formatted file but used the `binary' or
2283 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2284 Return length of decoded text.
2285 BUFFER defaults to the current buffer if unspecified.
2286 */
2287        (start, end, coding_system, buffer))
2288 {
2289   Bufpos b, e;
2290   struct buffer *buf = decode_buffer (buffer, 0);
2291   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2292   Lstream *istr, *ostr;
2293   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2294
2295   get_buffer_range_char (buf, start, end, &b, &e, 0);
2296
2297   barf_if_buffer_read_only (buf, b, e);
2298
2299   coding_system = Fget_coding_system (coding_system);
2300   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2301   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2302   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2303                                               coding_system);
2304   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2305                                            Fget_coding_system (Qbinary));
2306   istr = XLSTREAM (instream);
2307   ostr = XLSTREAM (outstream);
2308   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2309
2310   /* The chain of streams looks like this:
2311
2312      [BUFFER] <----- send through
2313                      ------> [ENCODE AS BINARY]
2314                              ------> [DECODE AS SPECIFIED]
2315                                      ------> [BUFFER]
2316    */
2317
2318   while (1)
2319     {
2320       char tempbuf[1024]; /* some random amount */
2321       Bufpos newpos, even_newer_pos;
2322       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2323       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2324
2325       if (!size_in_bytes)
2326         break;
2327       newpos = lisp_buffer_stream_startpos (istr);
2328       Lstream_write (ostr, tempbuf, size_in_bytes);
2329       even_newer_pos = lisp_buffer_stream_startpos (istr);
2330       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2331                            even_newer_pos, 0);
2332     }
2333   Lstream_close (istr);
2334   Lstream_close (ostr);
2335   UNGCPRO;
2336   Lstream_delete (istr);
2337   Lstream_delete (ostr);
2338   Lstream_delete (XLSTREAM (de_outstream));
2339   Lstream_delete (XLSTREAM (lb_outstream));
2340   return Qnil;
2341 }
2342
2343 \f
2344 /************************************************************************/
2345 /*           Converting to an external encoding ("encoding")            */
2346 /************************************************************************/
2347
2348 /* An encoding stream is an output stream.  When you create the
2349    stream, you specify the coding system that governs the encoding
2350    and another stream that the resulting encoded data is to be
2351    sent to, and then start sending data to it. */
2352
2353 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2354
2355 struct encoding_stream
2356 {
2357   /* Coding system that governs the conversion. */
2358   Lisp_Coding_System *codesys;
2359
2360   /* Stream that we read the encoded data from or
2361      write the decoded data to. */
2362   Lstream *other_end;
2363
2364   /* If we are reading, then we can return only a fixed amount of
2365      data, so if the conversion resulted in too much data, we store it
2366      here for retrieval the next time around. */
2367   unsigned_char_dynarr *runoff;
2368
2369   /* FLAGS holds flags indicating the current state of the encoding.
2370      Some of these flags are dependent on the coding system. */
2371   unsigned int flags;
2372
2373   /* CH holds a partially built-up character.  Since we only deal
2374      with one- and two-byte characters at the moment, we only use
2375      this to store the first byte of a two-byte character. */
2376   unsigned int ch;
2377 #ifdef MULE
2378   /* Additional information used by the ISO2022 encoder. */
2379   struct
2380     {
2381       /* CHARSET holds the character sets currently assigned to the G0
2382          through G3 registers.  It is initialized from the array
2383          INITIAL_CHARSET in CODESYS. */
2384       Lisp_Object charset[4];
2385
2386       /* Which registers are currently invoked into the left (GL) and
2387          right (GR) halves of the 8-bit encoding space? */
2388       int register_left, register_right;
2389
2390       /* Whether we need to explicitly designate the charset in the
2391          G? register before using it.  It is initialized from the
2392          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2393       unsigned char force_charset_on_output[4];
2394
2395       /* Other state variables that need to be preserved across
2396          invocations. */
2397       Lisp_Object current_charset;
2398       int current_half;
2399       int current_char_boundary;
2400     } iso2022;
2401
2402   void (*encode_char) (struct encoding_stream *str, Emchar c,
2403                        unsigned_char_dynarr *dst, unsigned int *flags);
2404   void (*finish) (struct encoding_stream *str,
2405                   unsigned_char_dynarr *dst, unsigned int *flags);
2406
2407   /* Additional information (the state of the running CCL program)
2408      used by the CCL encoder. */
2409   struct ccl_program ccl;
2410 #endif /* MULE */
2411 };
2412
2413 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2414 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2415                             size_t size);
2416 static int encoding_rewinder   (Lstream *stream);
2417 static int encoding_seekable_p (Lstream *stream);
2418 static int encoding_flusher    (Lstream *stream);
2419 static int encoding_closer     (Lstream *stream);
2420
2421 static Lisp_Object encoding_marker (Lisp_Object stream,
2422                                     void (*markobj) (Lisp_Object));
2423
2424 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2425                                sizeof (struct encoding_stream));
2426
2427 static Lisp_Object
2428 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2429 {
2430   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2431   Lisp_Object str_obj;
2432
2433   /* We do not need to mark the coding systems or charsets stored
2434      within the stream because they are stored in a global list
2435      and automatically marked. */
2436
2437   XSETLSTREAM (str_obj, str);
2438   markobj (str_obj);
2439   if (str->imp->marker)
2440     return (str->imp->marker) (str_obj, markobj);
2441   else
2442     return Qnil;
2443 }
2444
2445 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2446    so we read data from the other end, encode it, and store it into DATA. */
2447
2448 static int
2449 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2450 {
2451   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2452   unsigned char *orig_data = data;
2453   int read_size;
2454   int error_occurred = 0;
2455
2456   /* We need to interface to mule_encode(), which expects to take some
2457      amount of data and store the result into a Dynarr.  We have
2458      mule_encode() store into str->runoff, and take data from there
2459      as necessary. */
2460
2461   /* We loop until we have enough data, reading chunks from the other
2462      end and encoding it. */
2463   while (1)
2464     {
2465       /* Take data from the runoff if we can.  Make sure to take at
2466          most SIZE bytes, and delete the data from the runoff. */
2467       if (Dynarr_length (str->runoff) > 0)
2468         {
2469           int chunk = min ((int) size, Dynarr_length (str->runoff));
2470           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2471           Dynarr_delete_many (str->runoff, 0, chunk);
2472           data += chunk;
2473           size -= chunk;
2474         }
2475
2476       if (size == 0)
2477         break; /* No more room for data */
2478
2479       if (str->flags & CODING_STATE_END)
2480         /* This means that on the previous iteration, we hit the EOF on
2481            the other end.  We loop once more so that mule_encode() can
2482            output any final stuff it may be holding, or any "go back
2483            to a sane state" escape sequences. (This latter makes sense
2484            during encoding.) */
2485         break;
2486
2487       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2488          left of storage in it, so it's OK to read directly into it.
2489          (We'll be overwriting above, after we've encoded it into the
2490          runoff.) */
2491       read_size = Lstream_read (str->other_end, data, size);
2492       if (read_size < 0)
2493         {
2494           error_occurred = 1;
2495           break;
2496         }
2497       if (read_size == 0)
2498         /* There might be some more end data produced in the translation.
2499            See the comment above. */
2500         str->flags |= CODING_STATE_END;
2501       mule_encode (stream, data, str->runoff, read_size);
2502     }
2503
2504   if (data == orig_data)
2505     return error_occurred ? -1 : 0;
2506   else
2507     return data - orig_data;
2508 }
2509
2510 static int
2511 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2512 {
2513   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2514   int retval;
2515
2516   /* Encode all our data into the runoff, and then attempt to write
2517      it all out to the other end.  Remove whatever chunk we succeeded
2518      in writing. */
2519   mule_encode (stream, data, str->runoff, size);
2520   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2521                           Dynarr_length (str->runoff));
2522   if (retval > 0)
2523     Dynarr_delete_many (str->runoff, 0, retval);
2524   /* Do NOT return retval.  The return value indicates how much
2525      of the incoming data was written, not how many bytes were
2526      written. */
2527   return size;
2528 }
2529
2530 static void
2531 reset_encoding_stream (struct encoding_stream *str)
2532 {
2533 #ifdef MULE
2534   switch (CODING_SYSTEM_TYPE (str->codesys))
2535     {
2536     case CODESYS_ISO2022:
2537       {
2538         int i;
2539
2540         str->encode_char = &char_encode_iso2022;
2541         str->finish = &char_finish_iso2022;
2542         for (i = 0; i < 4; i++)
2543           {
2544             str->iso2022.charset[i] =
2545               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2546             str->iso2022.force_charset_on_output[i] =
2547               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2548           }
2549         str->iso2022.register_left = 0;
2550         str->iso2022.register_right = 1;
2551         str->iso2022.current_charset = Qnil;
2552         str->iso2022.current_half = 0;
2553         break;
2554       }
2555     case CODESYS_CCL:
2556       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2557       break;
2558     case CODESYS_UTF8:
2559       str->encode_char = &char_encode_utf8;
2560       str->finish = &char_finish_utf8;
2561     case CODESYS_UCS4:
2562       str->encode_char = &char_encode_ucs4;
2563       str->finish = &char_finish_ucs4;
2564     case CODESYS_SHIFT_JIS:
2565       str->encode_char = &char_encode_shift_jis;
2566       str->finish = &char_finish_shift_jis;
2567     default:
2568       break;
2569     }
2570 #endif /* MULE */
2571   str->iso2022.current_char_boundary = 0;
2572   str->flags = str->ch = 0;
2573 }
2574
2575 static int
2576 encoding_rewinder (Lstream *stream)
2577 {
2578   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2579   reset_encoding_stream (str);
2580   Dynarr_reset (str->runoff);
2581   return Lstream_rewind (str->other_end);
2582 }
2583
2584 static int
2585 encoding_seekable_p (Lstream *stream)
2586 {
2587   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2588   return Lstream_seekable_p (str->other_end);
2589 }
2590
2591 static int
2592 encoding_flusher (Lstream *stream)
2593 {
2594   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2595   return Lstream_flush (str->other_end);
2596 }
2597
2598 static int
2599 encoding_closer (Lstream *stream)
2600 {
2601   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2602   if (stream->flags & LSTREAM_FL_WRITE)
2603     {
2604       str->flags |= CODING_STATE_END;
2605       encoding_writer (stream, 0, 0);
2606     }
2607   Dynarr_free (str->runoff);
2608   return Lstream_close (str->other_end);
2609 }
2610
2611 Lisp_Object
2612 encoding_stream_coding_system (Lstream *stream)
2613 {
2614   Lisp_Object coding_system;
2615   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2616
2617   XSETCODING_SYSTEM (coding_system, str->codesys);
2618   return coding_system;
2619 }
2620
2621 void
2622 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2623 {
2624   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2625   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2626   str->codesys = cs;
2627   reset_encoding_stream (str);
2628 }
2629
2630 static Lisp_Object
2631 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2632                         CONST char *mode)
2633 {
2634   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2635   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2636   Lisp_Object obj;
2637
2638   xzero (*str);
2639   str->runoff = Dynarr_new (unsigned_char);
2640   str->other_end = stream;
2641   set_encoding_stream_coding_system (lstr, codesys);
2642   XSETLSTREAM (obj, lstr);
2643   return obj;
2644 }
2645
2646 Lisp_Object
2647 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2648 {
2649   return make_encoding_stream_1 (stream, codesys, "r");
2650 }
2651
2652 Lisp_Object
2653 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2654 {
2655   return make_encoding_stream_1 (stream, codesys, "w");
2656 }
2657
2658 /* Convert N bytes of internally-formatted data stored in SRC to an
2659    external format, according to the encoding stream ENCODING.
2660    Store the encoded data into DST. */
2661
2662 static void
2663 mule_encode (Lstream *encoding, CONST unsigned char *src,
2664              unsigned_char_dynarr *dst, unsigned int n)
2665 {
2666   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2667
2668   switch (CODING_SYSTEM_TYPE (str->codesys))
2669     {
2670 #ifdef DEBUG_XEMACS
2671     case CODESYS_INTERNAL:
2672       Dynarr_add_many (dst, src, n);
2673       break;
2674 #endif
2675     case CODESYS_AUTODETECT:
2676       /* If we got this far and still haven't decided on the coding
2677          system, then do no conversion. */
2678     case CODESYS_NO_CONVERSION:
2679       encode_coding_no_conversion (encoding, src, dst, n);
2680       break;
2681 #ifdef MULE
2682     case CODESYS_BIG5:
2683       encode_coding_big5 (encoding, src, dst, n);
2684       break;
2685     case CODESYS_CCL:
2686       str->ccl.last_block = str->flags & CODING_STATE_END;
2687       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2688       break;
2689 #endif /* MULE */
2690     default:
2691       text_encode_generic (encoding, src, dst, n);
2692     }
2693 }
2694
2695 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2696 Encode the text between START and END using CODING-SYSTEM.
2697 This will, for example, convert Japanese characters into stuff such as
2698 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2699 text.  BUFFER defaults to the current buffer if unspecified.
2700 */
2701        (start, end, coding_system, buffer))
2702 {
2703   Bufpos b, e;
2704   struct buffer *buf = decode_buffer (buffer, 0);
2705   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2706   Lstream *istr, *ostr;
2707   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2708
2709   get_buffer_range_char (buf, start, end, &b, &e, 0);
2710
2711   barf_if_buffer_read_only (buf, b, e);
2712
2713   coding_system = Fget_coding_system (coding_system);
2714   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2715   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2716   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2717                                               Fget_coding_system (Qbinary));
2718   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2719                                            coding_system);
2720   istr = XLSTREAM (instream);
2721   ostr = XLSTREAM (outstream);
2722   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2723   /* The chain of streams looks like this:
2724
2725      [BUFFER] <----- send through
2726                      ------> [ENCODE AS SPECIFIED]
2727                              ------> [DECODE AS BINARY]
2728                                      ------> [BUFFER]
2729    */
2730   while (1)
2731     {
2732       char tempbuf[1024]; /* some random amount */
2733       Bufpos newpos, even_newer_pos;
2734       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2735       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2736
2737       if (!size_in_bytes)
2738         break;
2739       newpos = lisp_buffer_stream_startpos (istr);
2740       Lstream_write (ostr, tempbuf, size_in_bytes);
2741       even_newer_pos = lisp_buffer_stream_startpos (istr);
2742       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2743                            even_newer_pos, 0);
2744     }
2745
2746   {
2747     Charcount retlen =
2748       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2749     Lstream_close (istr);
2750     Lstream_close (ostr);
2751     UNGCPRO;
2752     Lstream_delete (istr);
2753     Lstream_delete (ostr);
2754     Lstream_delete (XLSTREAM (de_outstream));
2755     Lstream_delete (XLSTREAM (lb_outstream));
2756     return make_int (retlen);
2757   }
2758 }
2759
2760 #ifdef MULE
2761 \f
2762 static void
2763 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2764                      unsigned_char_dynarr *dst, unsigned int n)
2765 {
2766   unsigned char c;
2767   unsigned char char_boundary;
2768   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2769   unsigned int flags          = str->flags;
2770   Emchar ch                   = str->ch;
2771
2772   char_boundary = str->iso2022.current_char_boundary;
2773
2774   while (n--)
2775     {
2776       c = *src++;
2777
2778       if (char_boundary == 0)
2779         {
2780           if (c >= 0xfc)
2781             {
2782               ch = c & 0x01;
2783               char_boundary = 5;
2784             }
2785           else if (c >= 0xf8)
2786             {
2787               ch = c & 0x03;
2788               char_boundary = 4;
2789             }
2790           else if (c >= 0xf0)
2791             {
2792               ch = c & 0x07;
2793               char_boundary = 3;
2794             }
2795           else if (c >= 0xe0)
2796             {
2797               ch = c & 0x0f;
2798               char_boundary = 2;
2799             }
2800           else if (c >= 0xc0)
2801             {
2802               ch = c & 0x1f;
2803               char_boundary = 1;
2804             }
2805           else
2806             (*str->encode_char) (str, c, dst, &flags);
2807         }
2808       else if (char_boundary == 1)
2809         {
2810           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2811           ch =0;
2812           char_boundary = 0;
2813         }
2814       else
2815         {
2816           ch = (ch << 6) | (c & 0x3f);
2817           char_boundary--;
2818         }
2819     }
2820
2821   if ((char_boundary == 0) && (flags & CODING_STATE_END))
2822     {
2823       (*str->finish) (str, dst, &flags);
2824     }
2825
2826   str->flags = flags;
2827   str->ch    = ch;
2828   str->iso2022.current_char_boundary = char_boundary;
2829 }
2830
2831 \f
2832 /************************************************************************/
2833 /*                          Shift-JIS methods                           */
2834 /************************************************************************/
2835
2836 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2837    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2838    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2839    encoded by "position-code + 0x80".  A character of JISX0208
2840    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2841    position-codes are divided and shifted so that it fit in the range
2842    below.
2843
2844    --- CODE RANGE of Shift-JIS ---
2845    (character set)      (range)
2846    ASCII                0x00 .. 0x7F
2847    JISX0201-Kana        0xA0 .. 0xDF
2848    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2849             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2850    -------------------------------
2851
2852 */
2853
2854 /* Is this the first byte of a Shift-JIS two-byte char? */
2855
2856 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2857   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2858
2859 /* Is this the second byte of a Shift-JIS two-byte char? */
2860
2861 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2862   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2863
2864 #define BYTE_SJIS_KATAKANA_P(c) \
2865   ((c) >= 0xA1 && (c) <= 0xDF)
2866
2867 static int
2868 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2869                     unsigned int n)
2870 {
2871   int c;
2872
2873   while (n--)
2874     {
2875       c = *src++;
2876       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2877         return 0;
2878       if (st->shift_jis.in_second_byte)
2879         {
2880           st->shift_jis.in_second_byte = 0;
2881           if (c < 0x40)
2882             return 0;
2883         }
2884       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2885         st->shift_jis.in_second_byte = 1;
2886     }
2887   return CODING_CATEGORY_SHIFT_JIS_MASK;
2888 }
2889
2890 /* Convert Shift-JIS data to internal format. */
2891
2892 static void
2893 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2894                     unsigned_char_dynarr *dst, unsigned int n)
2895 {
2896   unsigned char c;
2897   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2898   unsigned int flags  = str->flags;
2899   unsigned int ch     = str->ch;
2900   eol_type_t eol_type = str->eol_type;
2901
2902   while (n--)
2903     {
2904       c = *src++;
2905
2906       if (ch)
2907         {
2908           /* Previous character was first byte of Shift-JIS Kanji char. */
2909           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2910             {
2911               unsigned char e1, e2;
2912
2913               DECODE_SJIS (ch, c, e1, e2);
2914 #ifdef UTF2000
2915               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2916                                             e1 & 0x7F,
2917                                             e2 & 0x7F), dst);
2918 #else
2919               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2920               Dynarr_add (dst, e1);
2921               Dynarr_add (dst, e2);
2922 #endif
2923             }
2924           else
2925             {
2926               DECODE_ADD_BINARY_CHAR (ch, dst);
2927               DECODE_ADD_BINARY_CHAR (c, dst);
2928             }
2929           ch = 0;
2930         }
2931       else
2932         {
2933           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2934           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2935             ch = c;
2936           else if (BYTE_SJIS_KATAKANA_P (c))
2937             {
2938 #ifdef UTF2000
2939               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2940                                             c & 0x7F, 0), dst);
2941 #else
2942               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2943               Dynarr_add (dst, c);
2944 #endif
2945             }
2946 #ifdef UTF2000
2947           else if (c > 32)
2948             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
2949                                           c, 0), dst);
2950 #endif
2951           else
2952             DECODE_ADD_BINARY_CHAR (c, dst);
2953         }
2954     label_continue_loop:;
2955     }
2956
2957   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2958
2959   str->flags = flags;
2960   str->ch    = ch;
2961 }
2962
2963 /* Convert internal character representation to Shift_JIS. */
2964
2965 void
2966 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
2967                        unsigned_char_dynarr *dst, unsigned int *flags)
2968 {
2969   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2970
2971   if (ch == '\n')
2972     {
2973       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2974         Dynarr_add (dst, '\r');
2975       if (eol_type != EOL_CR)
2976         Dynarr_add (dst, ch);
2977     }
2978   else
2979     {
2980       Lisp_Object charset;
2981       unsigned int c1, c2, s1, s2;
2982
2983 #ifdef UTF2000
2984       if ( (c1 =
2985             get_byte_from_character_table
2986             (ch, XCHARSET_TO_BYTE1_TABLE (Vcharset_latin_jisx0201))) )
2987         {
2988           charset = Vcharset_latin_jisx0201;
2989           c2 = 0;
2990         }
2991       else
2992 #endif
2993         BREAKUP_CHAR (ch, charset, c1, c2);
2994
2995       if (EQ(charset, Vcharset_katakana_jisx0201))
2996         {
2997           Dynarr_add (dst, c1 | 0x80);
2998         }
2999       else if (c2 == 0)
3000         {
3001           Dynarr_add (dst, c1);
3002         }
3003       else if (EQ(charset, Vcharset_japanese_jisx0208))
3004         {
3005           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3006           Dynarr_add (dst, s1);
3007           Dynarr_add (dst, s2);
3008         }
3009       else
3010         Dynarr_add (dst, '?');
3011     }
3012 }
3013
3014 void
3015 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3016                        unsigned int *flags)
3017 {
3018 }
3019
3020 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3021 Decode a JISX0208 character of Shift-JIS coding-system.
3022 CODE is the character code in Shift-JIS as a cons of type bytes.
3023 Return the corresponding character.
3024 */
3025        (code))
3026 {
3027   unsigned char c1, c2, s1, s2;
3028
3029   CHECK_CONS (code);
3030   CHECK_INT (XCAR (code));
3031   CHECK_INT (XCDR (code));
3032   s1 = XINT (XCAR (code));
3033   s2 = XINT (XCDR (code));
3034   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3035       BYTE_SJIS_TWO_BYTE_2_P (s2))
3036     {
3037       DECODE_SJIS (s1, s2, c1, c2);
3038       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3039                                    c1 & 0x7F, c2 & 0x7F));
3040     }
3041   else
3042     return Qnil;
3043 }
3044
3045 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3046 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3047 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3048 */
3049        (ch))
3050 {
3051   Lisp_Object charset;
3052   int c1, c2, s1, s2;
3053
3054   CHECK_CHAR_COERCE_INT (ch);
3055   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3056   if (EQ (charset, Vcharset_japanese_jisx0208))
3057     {
3058       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3059       return Fcons (make_int (s1), make_int (s2));
3060     }
3061   else
3062     return Qnil;
3063 }
3064
3065 \f
3066 /************************************************************************/
3067 /*                            Big5 methods                              */
3068 /************************************************************************/
3069
3070 /* BIG5 is a coding system encoding two character sets: ASCII and
3071    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3072    character set and is encoded in two-byte.
3073
3074    --- CODE RANGE of BIG5 ---
3075    (character set)      (range)
3076    ASCII                0x00 .. 0x7F
3077    Big5 (1st byte)      0xA1 .. 0xFE
3078         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3079    --------------------------
3080
3081    Since the number of characters in Big5 is larger than maximum
3082    characters in Emacs' charset (96x96), it can't be handled as one
3083    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3084    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3085    contains frequently used characters and the latter contains less
3086    frequently used characters.  */
3087
3088 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3089   ((c) >= 0xA1 && (c) <= 0xFE)
3090
3091 /* Is this the second byte of a Shift-JIS two-byte char? */
3092
3093 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3094   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3095
3096 /* Number of Big5 characters which have the same code in 1st byte.  */
3097
3098 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3099
3100 /* Code conversion macros.  These are macros because they are used in
3101    inner loops during code conversion.
3102
3103    Note that temporary variables in macros introduce the classic
3104    dynamic-scoping problems with variable names.  We use capital-
3105    lettered variables in the assumption that XEmacs does not use
3106    capital letters in variables except in a very formalized way
3107    (e.g. Qstring). */
3108
3109 /* Convert Big5 code (b1, b2) into its internal string representation
3110    (lb, c1, c2). */
3111
3112 /* There is a much simpler way to split the Big5 charset into two.
3113    For the moment I'm going to leave the algorithm as-is because it
3114    claims to separate out the most-used characters into a single
3115    charset, which perhaps will lead to optimizations in various
3116    places.
3117
3118    The way the algorithm works is something like this:
3119
3120    Big5 can be viewed as a 94x157 charset, where the row is
3121    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3122    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3123    the split between low and high column numbers is apparently
3124    meaningless; ascending rows produce less and less frequent chars.
3125    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3126    the first charset, and the upper half (0xC9 .. 0xFE) to the
3127    second.  To do the conversion, we convert the character into
3128    a single number where 0 .. 156 is the first row, 157 .. 313
3129    is the second, etc.  That way, the characters are ordered by
3130    decreasing frequency.  Then we just chop the space in two
3131    and coerce the result into a 94x94 space.
3132    */
3133
3134 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3135 {                                                                       \
3136   int B1 = b1, B2 = b2;                                                 \
3137   unsigned int I                                                        \
3138     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3139                                                                         \
3140   if (B1 < 0xC9)                                                        \
3141     {                                                                   \
3142       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3143     }                                                                   \
3144   else                                                                  \
3145     {                                                                   \
3146       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3147       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3148     }                                                                   \
3149   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3150   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3151 } while (0)
3152
3153 /* Convert the internal string representation of a Big5 character
3154    (lb, c1, c2) into Big5 code (b1, b2). */
3155
3156 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3157 {                                                                       \
3158   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3159                                                                         \
3160   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3161     {                                                                   \
3162       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3163     }                                                                   \
3164   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3165   b2 = I % BIG5_SAME_ROW;                                               \
3166   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3167 } while (0)
3168
3169 static int
3170 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3171                     unsigned int n)
3172 {
3173   int c;
3174
3175   while (n--)
3176     {
3177       c = *src++;
3178       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3179           (c >= 0x80 && c <= 0xA0))
3180         return 0;
3181       if (st->big5.in_second_byte)
3182         {
3183           st->big5.in_second_byte = 0;
3184           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3185             return 0;
3186         }
3187       else if (c >= 0xA1)
3188         st->big5.in_second_byte = 1;
3189     }
3190   return CODING_CATEGORY_BIG5_MASK;
3191 }
3192
3193 /* Convert Big5 data to internal format. */
3194
3195 static void
3196 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3197                     unsigned_char_dynarr *dst, unsigned int n)
3198 {
3199   unsigned char c;
3200   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3201   unsigned int flags  = str->flags;
3202   unsigned int ch     = str->ch;
3203   eol_type_t eol_type = str->eol_type;
3204
3205   while (n--)
3206     {
3207       c = *src++;
3208       if (ch)
3209         {
3210           /* Previous character was first byte of Big5 char. */
3211           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3212             {
3213               unsigned char b1, b2, b3;
3214               DECODE_BIG5 (ch, c, b1, b2, b3);
3215               Dynarr_add (dst, b1);
3216               Dynarr_add (dst, b2);
3217               Dynarr_add (dst, b3);
3218             }
3219           else
3220             {
3221               DECODE_ADD_BINARY_CHAR (ch, dst);
3222               DECODE_ADD_BINARY_CHAR (c, dst);
3223             }
3224           ch = 0;
3225         }
3226       else
3227         {
3228           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3229           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3230             ch = c;
3231           else
3232             DECODE_ADD_BINARY_CHAR (c, dst);
3233         }
3234     label_continue_loop:;
3235     }
3236
3237   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3238
3239   str->flags = flags;
3240   str->ch    = ch;
3241 }
3242
3243 /* Convert internally-formatted data to Big5. */
3244
3245 static void
3246 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3247                     unsigned_char_dynarr *dst, unsigned int n)
3248 {
3249 #ifndef UTF2000
3250   unsigned char c;
3251   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3252   unsigned int flags  = str->flags;
3253   unsigned int ch     = str->ch;
3254   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3255
3256   while (n--)
3257     {
3258       c = *src++;
3259       if (c == '\n')
3260         {
3261           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3262             Dynarr_add (dst, '\r');
3263           if (eol_type != EOL_CR)
3264             Dynarr_add (dst, '\n');
3265         }
3266       else if (BYTE_ASCII_P (c))
3267         {
3268           /* ASCII. */
3269           Dynarr_add (dst, c);
3270         }
3271       else if (BUFBYTE_LEADING_BYTE_P (c))
3272         {
3273           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3274               c == LEADING_BYTE_CHINESE_BIG5_2)
3275             {
3276               /* A recognized leading byte. */
3277               ch = c;
3278               continue; /* not done with this character. */
3279             }
3280           /* otherwise just ignore this character. */
3281         }
3282       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3283                ch == LEADING_BYTE_CHINESE_BIG5_2)
3284         {
3285           /* Previous char was a recognized leading byte. */
3286           ch = (ch << 8) | c;
3287           continue; /* not done with this character. */
3288         }
3289       else if (ch)
3290         {
3291           /* Encountering second byte of a Big5 character. */
3292           unsigned char b1, b2;
3293
3294           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3295           Dynarr_add (dst, b1);
3296           Dynarr_add (dst, b2);
3297         }
3298
3299       ch = 0;
3300     }
3301
3302   str->flags = flags;
3303   str->ch    = ch;
3304 #endif
3305 }
3306
3307
3308 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3309 Decode a Big5 character CODE of BIG5 coding-system.
3310 CODE is the character code in BIG5, a cons of two integers.
3311 Return the corresponding character.
3312 */
3313        (code))
3314 {
3315   unsigned char c1, c2, b1, b2;
3316
3317   CHECK_CONS (code);
3318   CHECK_INT (XCAR (code));
3319   CHECK_INT (XCDR (code));
3320   b1 = XINT (XCAR (code));
3321   b2 = XINT (XCDR (code));
3322   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3323       BYTE_BIG5_TWO_BYTE_2_P (b2))
3324     {
3325       Charset_ID leading_byte;
3326       Lisp_Object charset;
3327       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3328       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3329       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3330     }
3331   else
3332     return Qnil;
3333 }
3334
3335 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3336 Encode the Big5 character CH to BIG5 coding-system.
3337 Return the corresponding character code in Big5.
3338 */
3339        (ch))
3340 {
3341   Lisp_Object charset;
3342   int c1, c2, b1, b2;
3343
3344   CHECK_CHAR_COERCE_INT (ch);
3345   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3346   if (EQ (charset, Vcharset_chinese_big5_1) ||
3347       EQ (charset, Vcharset_chinese_big5_2))
3348     {
3349       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3350                    b1, b2);
3351       return Fcons (make_int (b1), make_int (b2));
3352     }
3353   else
3354     return Qnil;
3355 }
3356
3357 \f
3358 /************************************************************************/
3359 /*                           UCS-4 methods                              */
3360 /************************************************************************/
3361
3362 static int
3363 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3364                     unsigned int n)
3365 {
3366   while (n--)
3367     {
3368       int c = *src++;
3369       switch (st->ucs4.in_byte)
3370         {
3371         case 0:
3372           if (c >= 128)
3373             return 0;
3374           else
3375             st->ucs4.in_byte++;
3376           break;
3377         case 3:
3378           st->ucs4.in_byte = 0;
3379           break;
3380         default:
3381           st->ucs4.in_byte++;
3382         }
3383     }
3384   return CODING_CATEGORY_UCS4_MASK;
3385 }
3386
3387 static void
3388 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3389                     unsigned_char_dynarr *dst, unsigned int n)
3390 {
3391   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3392   unsigned int flags = str->flags;
3393   unsigned int ch    = str->ch;
3394   unsigned char counter = str->counter;
3395
3396   while (n--)
3397     {
3398       unsigned char c = *src++;
3399       switch (counter)
3400         {
3401         case 0:
3402           ch = c;
3403           counter = 3;
3404           break;
3405         case 1:
3406           DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3407           ch = 0;
3408           counter = 0;
3409           break;
3410         default:
3411           ch = ( ch << 8 ) | c;
3412           counter--;
3413         }
3414     }
3415   if (counter & CODING_STATE_END)
3416     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3417
3418   str->flags = flags;
3419   str->ch    = ch;
3420   str->counter = counter;
3421 }
3422
3423 void
3424 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3425                   unsigned_char_dynarr *dst, unsigned int *flags)
3426 {
3427   Dynarr_add (dst,  ch >> 24);
3428   Dynarr_add (dst, (ch >> 16) & 255);
3429   Dynarr_add (dst, (ch >>  8) & 255);
3430   Dynarr_add (dst,  ch        & 255);
3431 }
3432
3433 void
3434 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3435                   unsigned int *flags)
3436 {
3437 }
3438
3439 \f
3440 /************************************************************************/
3441 /*                           UTF-8 methods                              */
3442 /************************************************************************/
3443
3444 static int
3445 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3446                     unsigned int n)
3447 {
3448   while (n--)
3449     {
3450       unsigned char c = *src++;
3451       switch (st->utf8.in_byte)
3452         {
3453         case 0:
3454           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3455             return 0;
3456           else if (c >= 0xfc)
3457             st->utf8.in_byte = 5;
3458           else if (c >= 0xf8)
3459             st->utf8.in_byte = 4;
3460           else if (c >= 0xf0)
3461             st->utf8.in_byte = 3;
3462           else if (c >= 0xe0)
3463             st->utf8.in_byte = 2;
3464           else if (c >= 0xc0)
3465             st->utf8.in_byte = 1;
3466           else if (c >= 0x80)
3467             return 0;
3468           break;
3469         default:
3470           if ((c & 0xc0) != 0x80)
3471             return 0;
3472           else
3473             st->utf8.in_byte--;
3474         }
3475     }
3476   return CODING_CATEGORY_UTF8_MASK;
3477 }
3478
3479 static void
3480 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3481                     unsigned_char_dynarr *dst, unsigned int n)
3482 {
3483   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3484   unsigned int flags  = str->flags;
3485   unsigned int ch     = str->ch;
3486   eol_type_t eol_type = str->eol_type;
3487   unsigned char counter = str->counter;
3488
3489   while (n--)
3490     {
3491       unsigned char c = *src++;
3492       switch (counter)
3493         {
3494         case 0:
3495           if ( c >= 0xfc )
3496             {
3497               ch = c & 0x01;
3498               counter = 5;
3499             }
3500           else if ( c >= 0xf8 )
3501             {
3502               ch = c & 0x03;
3503               counter = 4;
3504             }
3505           else if ( c >= 0xf0 )
3506             {
3507               ch = c & 0x07;
3508               counter = 3;
3509             }
3510           else if ( c >= 0xe0 )
3511             {
3512               ch = c & 0x0f;
3513               counter = 2;
3514             }
3515           else if ( c >= 0xc0 )
3516             {
3517               ch = c & 0x1f;
3518               counter = 1;
3519             }
3520           else
3521             {
3522               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3523               DECODE_ADD_UCS_CHAR (c, dst);
3524             }
3525           break;
3526         case 1:
3527           ch = ( ch << 6 ) | ( c & 0x3f );
3528           DECODE_ADD_UCS_CHAR (ch, dst);
3529           ch = 0;
3530           counter = 0;
3531           break;
3532         default:
3533           ch = ( ch << 6 ) | ( c & 0x3f );
3534           counter--;
3535         }
3536     label_continue_loop:;
3537     }
3538
3539   if (flags & CODING_STATE_END)
3540     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3541
3542   str->flags = flags;
3543   str->ch    = ch;
3544   str->counter = counter;
3545 }
3546
3547 void
3548 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3549                   unsigned_char_dynarr *dst, unsigned int *flags)
3550 {
3551   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3552
3553   if (ch == '\n')
3554     {
3555       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3556         Dynarr_add (dst, '\r');
3557       if (eol_type != EOL_CR)
3558         Dynarr_add (dst, ch);
3559     }
3560   else if (ch <= 0x7f)
3561     {
3562       Dynarr_add (dst, ch);
3563     }
3564   else if (ch <= 0x7ff)
3565     {
3566       Dynarr_add (dst, (ch >> 6) | 0xc0);
3567       Dynarr_add (dst, (ch & 0x3f) | 0x80);
3568     }
3569   else if (ch <= 0xffff)
3570     {
3571       Dynarr_add (dst,  (ch >> 12) | 0xe0);
3572       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3573       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3574     }
3575   else if (ch <= 0x1fffff)
3576     {
3577       Dynarr_add (dst,  (ch >> 18) | 0xf0);
3578       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3579       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3580       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3581     }
3582   else if (ch <= 0x3ffffff)
3583     {
3584       Dynarr_add (dst,  (ch >> 24) | 0xf8);
3585       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3586       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3587       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3588       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3589     }
3590   else
3591     {
3592       Dynarr_add (dst,  (ch >> 30) | 0xfc);
3593       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3594       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3595       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3596       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3597       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3598     }
3599 }
3600
3601 void
3602 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3603                   unsigned int *flags)
3604 {
3605 }
3606
3607 \f
3608 /************************************************************************/
3609 /*                           ISO2022 methods                            */
3610 /************************************************************************/
3611
3612 /* The following note describes the coding system ISO2022 briefly.
3613    Since the intention of this note is to help understand the
3614    functions in this file, some parts are NOT ACCURATE or OVERLY
3615    SIMPLIFIED.  For thorough understanding, please refer to the
3616    original document of ISO2022.
3617
3618    ISO2022 provides many mechanisms to encode several character sets
3619    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3620    is encoded using bytes less than 128.  This may make the encoded
3621    text a little bit longer, but the text passes more easily through
3622    several gateways, some of which strip off MSB (Most Signigant Bit).
3623
3624    There are two kinds of character sets: control character set and
3625    graphic character set.  The former contains control characters such
3626    as `newline' and `escape' to provide control functions (control
3627    functions are also provided by escape sequences).  The latter
3628    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3629    two control character sets and many graphic character sets.
3630
3631    Graphic character sets are classified into one of the following
3632    four classes, according to the number of bytes (DIMENSION) and
3633    number of characters in one dimension (CHARS) of the set:
3634    - DIMENSION1_CHARS94
3635    - DIMENSION1_CHARS96
3636    - DIMENSION2_CHARS94
3637    - DIMENSION2_CHARS96
3638
3639    In addition, each character set is assigned an identification tag,
3640    unique for each set, called "final character" (denoted as <F>
3641    hereafter).  The <F> of each character set is decided by ECMA(*)
3642    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3643    (0x30..0x3F are for private use only).
3644
3645    Note (*): ECMA = European Computer Manufacturers Association
3646
3647    Here are examples of graphic character set [NAME(<F>)]:
3648         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3649         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3650         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3651         o DIMENSION2_CHARS96 -- none for the moment
3652
3653    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3654         C0 [0x00..0x1F] -- control character plane 0
3655         GL [0x20..0x7F] -- graphic character plane 0
3656         C1 [0x80..0x9F] -- control character plane 1
3657         GR [0xA0..0xFF] -- graphic character plane 1
3658
3659    A control character set is directly designated and invoked to C0 or
3660    C1 by an escape sequence.  The most common case is that:
3661    - ISO646's  control character set is designated/invoked to C0, and
3662    - ISO6429's control character set is designated/invoked to C1,
3663    and usually these designations/invocations are omitted in encoded
3664    text.  In a 7-bit environment, only C0 can be used, and a control
3665    character for C1 is encoded by an appropriate escape sequence to
3666    fit into the environment.  All control characters for C1 are
3667    defined to have corresponding escape sequences.
3668
3669    A graphic character set is at first designated to one of four
3670    graphic registers (G0 through G3), then these graphic registers are
3671    invoked to GL or GR.  These designations and invocations can be
3672    done independently.  The most common case is that G0 is invoked to
3673    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3674    these invocations and designations are omitted in encoded text.
3675    In a 7-bit environment, only GL can be used.
3676
3677    When a graphic character set of CHARS94 is invoked to GL, codes
3678    0x20 and 0x7F of the GL area work as control characters SPACE and
3679    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3680    be used.
3681
3682    There are two ways of invocation: locking-shift and single-shift.
3683    With locking-shift, the invocation lasts until the next different
3684    invocation, whereas with single-shift, the invocation affects the
3685    following character only and doesn't affect the locking-shift
3686    state.  Invocations are done by the following control characters or
3687    escape sequences:
3688
3689    ----------------------------------------------------------------------
3690    abbrev  function                  cntrl escape seq   description
3691    ----------------------------------------------------------------------
3692    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3693    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3694    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3695    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3696    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3697    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3698    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3699    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3700    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3701    ----------------------------------------------------------------------
3702    (*) These are not used by any known coding system.
3703
3704    Control characters for these functions are defined by macros
3705    ISO_CODE_XXX in `coding.h'.
3706
3707    Designations are done by the following escape sequences:
3708    ----------------------------------------------------------------------
3709    escape sequence      description
3710    ----------------------------------------------------------------------
3711    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3712    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3713    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3714    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3715    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3716    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3717    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3718    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3719    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3720    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3721    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3722    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3723    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3724    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3725    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3726    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3727    ----------------------------------------------------------------------
3728
3729    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3730    of dimension 1, chars 94, and final character <F>, etc...
3731
3732    Note (*): Although these designations are not allowed in ISO2022,
3733    Emacs accepts them on decoding, and produces them on encoding
3734    CHARS96 character sets in a coding system which is characterized as
3735    7-bit environment, non-locking-shift, and non-single-shift.
3736
3737    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3738    '(' can be omitted.  We refer to this as "short-form" hereafter.
3739
3740    Now you may notice that there are a lot of ways for encoding the
3741    same multilingual text in ISO2022.  Actually, there exist many
3742    coding systems such as Compound Text (used in X11's inter client
3743    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3744    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3745    localized platforms), and all of these are variants of ISO2022.
3746
3747    In addition to the above, Emacs handles two more kinds of escape
3748    sequences: ISO6429's direction specification and Emacs' private
3749    sequence for specifying character composition.
3750
3751    ISO6429's direction specification takes the following form:
3752         o CSI ']'      -- end of the current direction
3753         o CSI '0' ']'  -- end of the current direction
3754         o CSI '1' ']'  -- start of left-to-right text
3755         o CSI '2' ']'  -- start of right-to-left text
3756    The control character CSI (0x9B: control sequence introducer) is
3757    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3758
3759    Character composition specification takes the following form:
3760         o ESC '0' -- start character composition
3761         o ESC '1' -- end character composition
3762    Since these are not standard escape sequences of any ISO standard,
3763    their use with these meanings is restricted to Emacs only.  */
3764
3765 static void
3766 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3767 {
3768   int i;
3769
3770   for (i = 0; i < 4; i++)
3771     {
3772       if (!NILP (coding_system))
3773         iso->charset[i] =
3774           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3775       else
3776         iso->charset[i] = Qt;
3777       iso->invalid_designated[i] = 0;
3778     }
3779   iso->esc = ISO_ESC_NOTHING;
3780   iso->esc_bytes_index = 0;
3781   iso->register_left = 0;
3782   iso->register_right = 1;
3783   iso->switched_dir_and_no_valid_charset_yet = 0;
3784   iso->invalid_switch_dir = 0;
3785   iso->output_direction_sequence = 0;
3786   iso->output_literally = 0;
3787 #ifdef ENABLE_COMPOSITE_CHARS
3788   if (iso->composite_chars)
3789     Dynarr_reset (iso->composite_chars);
3790 #endif
3791 }
3792
3793 static int
3794 fit_to_be_escape_quoted (unsigned char c)
3795 {
3796   switch (c)
3797     {
3798     case ISO_CODE_ESC:
3799     case ISO_CODE_CSI:
3800     case ISO_CODE_SS2:
3801     case ISO_CODE_SS3:
3802     case ISO_CODE_SO:
3803     case ISO_CODE_SI:
3804       return 1;
3805
3806     default:
3807       return 0;
3808     }
3809 }
3810
3811 /* Parse one byte of an ISO2022 escape sequence.
3812    If the result is an invalid escape sequence, return 0 and
3813    do not change anything in STR.  Otherwise, if the result is
3814    an incomplete escape sequence, update ISO2022.ESC and
3815    ISO2022.ESC_BYTES and return -1.  Otherwise, update
3816    all the state variables (but not ISO2022.ESC_BYTES) and
3817    return 1.
3818
3819    If CHECK_INVALID_CHARSETS is non-zero, check for designation
3820    or invocation of an invalid character set and treat that as
3821    an unrecognized escape sequence. */
3822
3823 static int
3824 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3825                    unsigned char c, unsigned int *flags,
3826                    int check_invalid_charsets)
3827 {
3828   /* (1) If we're at the end of a designation sequence, CS is the
3829      charset being designated and REG is the register to designate
3830      it to.
3831
3832      (2) If we're at the end of a locking-shift sequence, REG is
3833      the register to invoke and HALF (0 == left, 1 == right) is
3834      the half to invoke it into.
3835
3836      (3) If we're at the end of a single-shift sequence, REG is
3837      the register to invoke. */
3838   Lisp_Object cs = Qnil;
3839   int reg, half;
3840
3841   /* NOTE: This code does goto's all over the fucking place.
3842      The reason for this is that we're basically implementing
3843      a state machine here, and hierarchical languages like C
3844      don't really provide a clean way of doing this. */
3845
3846   if (! (*flags & CODING_STATE_ESCAPE))
3847     /* At beginning of escape sequence; we need to reset our
3848        escape-state variables. */
3849     iso->esc = ISO_ESC_NOTHING;
3850
3851   iso->output_literally = 0;
3852   iso->output_direction_sequence = 0;
3853
3854   switch (iso->esc)
3855     {
3856     case ISO_ESC_NOTHING:
3857       iso->esc_bytes_index = 0;
3858       switch (c)
3859         {
3860         case ISO_CODE_ESC:      /* Start escape sequence */
3861           *flags |= CODING_STATE_ESCAPE;
3862           iso->esc = ISO_ESC;
3863           goto not_done;
3864
3865         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
3866           *flags |= CODING_STATE_ESCAPE;
3867           iso->esc = ISO_ESC_5_11;
3868           goto not_done;
3869
3870         case ISO_CODE_SO:       /* locking shift 1 */
3871           reg = 1; half = 0;
3872           goto locking_shift;
3873         case ISO_CODE_SI:       /* locking shift 0 */
3874           reg = 0; half = 0;
3875           goto locking_shift;
3876
3877         case ISO_CODE_SS2:      /* single shift */
3878           reg = 2;
3879           goto single_shift;
3880         case ISO_CODE_SS3:      /* single shift */
3881           reg = 3;
3882           goto single_shift;
3883
3884         default:                        /* Other control characters */
3885           return 0;
3886         }
3887
3888     case ISO_ESC:
3889       switch (c)
3890         {
3891           /**** single shift ****/
3892
3893         case 'N':       /* single shift 2 */
3894           reg = 2;
3895           goto single_shift;
3896         case 'O':       /* single shift 3 */
3897           reg = 3;
3898           goto single_shift;
3899
3900           /**** locking shift ****/
3901
3902         case '~':       /* locking shift 1 right */
3903           reg = 1; half = 1;
3904           goto locking_shift;
3905         case 'n':       /* locking shift 2 */
3906           reg = 2; half = 0;
3907           goto locking_shift;
3908         case '}':       /* locking shift 2 right */
3909           reg = 2; half = 1;
3910           goto locking_shift;
3911         case 'o':       /* locking shift 3 */
3912           reg = 3; half = 0;
3913           goto locking_shift;
3914         case '|':       /* locking shift 3 right */
3915           reg = 3; half = 1;
3916           goto locking_shift;
3917
3918 #ifdef ENABLE_COMPOSITE_CHARS
3919           /**** composite ****/
3920
3921         case '0':
3922           iso->esc = ISO_ESC_START_COMPOSITE;
3923           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3924             CODING_STATE_COMPOSITE;
3925           return 1;
3926
3927         case '1':
3928           iso->esc = ISO_ESC_END_COMPOSITE;
3929           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3930             ~CODING_STATE_COMPOSITE;
3931           return 1;
3932 #endif /* ENABLE_COMPOSITE_CHARS */
3933
3934           /**** directionality ****/
3935
3936         case '[':
3937           iso->esc = ISO_ESC_5_11;
3938           goto not_done;
3939
3940           /**** designation ****/
3941
3942         case '$':       /* multibyte charset prefix */
3943           iso->esc = ISO_ESC_2_4;
3944           goto not_done;
3945
3946         default:
3947           if (0x28 <= c && c <= 0x2F)
3948             {
3949               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3950               goto not_done;
3951             }
3952
3953           /* This function is called with CODESYS equal to nil when
3954              doing coding-system detection. */
3955           if (!NILP (codesys)
3956               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3957               && fit_to_be_escape_quoted (c))
3958             {
3959               iso->esc = ISO_ESC_LITERAL;
3960               *flags &= CODING_STATE_ISO2022_LOCK;
3961               return 1;
3962             }
3963
3964           /* bzzzt! */
3965           return 0;
3966         }
3967
3968
3969
3970       /**** directionality ****/
3971
3972     case ISO_ESC_5_11:          /* ISO6429 direction control */
3973       if (c == ']')
3974         {
3975           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3976           goto directionality;
3977         }
3978       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
3979       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3980       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3981       else               return 0;
3982       goto not_done;
3983
3984     case ISO_ESC_5_11_0:
3985       if (c == ']')
3986         {
3987           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3988           goto directionality;
3989         }
3990       return 0;
3991
3992     case ISO_ESC_5_11_1:
3993       if (c == ']')
3994         {
3995           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3996           goto directionality;
3997         }
3998       return 0;
3999
4000     case ISO_ESC_5_11_2:
4001       if (c == ']')
4002         {
4003           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4004           goto directionality;
4005         }
4006       return 0;
4007
4008     directionality:
4009       iso->esc = ISO_ESC_DIRECTIONALITY;
4010       /* Various junk here to attempt to preserve the direction sequences
4011          literally in the text if they would otherwise be swallowed due
4012          to invalid designations that don't show up as actual charset
4013          changes in the text. */
4014       if (iso->invalid_switch_dir)
4015         {
4016           /* We already inserted a direction switch literally into the
4017              text.  We assume (#### this may not be right) that the
4018              next direction switch is the one going the other way,
4019              and we need to output that literally as well. */
4020           iso->output_literally = 1;
4021           iso->invalid_switch_dir = 0;
4022         }
4023       else
4024         {
4025           int jj;
4026
4027           /* If we are in the thrall of an invalid designation,
4028            then stick the directionality sequence literally into the
4029            output stream so it ends up in the original text again. */
4030           for (jj = 0; jj < 4; jj++)
4031             if (iso->invalid_designated[jj])
4032               break;
4033           if (jj < 4)
4034             {
4035               iso->output_literally = 1;
4036               iso->invalid_switch_dir = 1;
4037             }
4038           else
4039             /* Indicate that we haven't yet seen a valid designation,
4040                so that if a switch-dir is directly followed by an
4041                invalid designation, both get inserted literally. */
4042             iso->switched_dir_and_no_valid_charset_yet = 1;
4043         }
4044       return 1;
4045
4046
4047       /**** designation ****/
4048
4049     case ISO_ESC_2_4:
4050       if (0x28 <= c && c <= 0x2F)
4051         {
4052           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4053           goto not_done;
4054         }
4055       if (0x40 <= c && c <= 0x42)
4056         {
4057           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4058                                       *flags & CODING_STATE_R2L ?
4059                                       CHARSET_RIGHT_TO_LEFT :
4060                                       CHARSET_LEFT_TO_RIGHT);
4061           reg = 0;
4062           goto designated;
4063         }
4064       return 0;
4065
4066     default:
4067       {
4068         int type =-1;
4069
4070         if (c < '0' || c > '~')
4071           return 0; /* bad final byte */
4072
4073         if (iso->esc >= ISO_ESC_2_8 &&
4074             iso->esc <= ISO_ESC_2_15)
4075           {
4076             type = ((iso->esc >= ISO_ESC_2_12) ?
4077                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4078             reg = (iso->esc - ISO_ESC_2_8) & 3;
4079           }
4080         else if (iso->esc >= ISO_ESC_2_4_8 &&
4081                  iso->esc <= ISO_ESC_2_4_15)
4082           {
4083             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4084                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4085             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4086           }
4087         else
4088           {
4089             /* Can this ever be reached? -slb */
4090             abort();
4091           }
4092
4093         cs = CHARSET_BY_ATTRIBUTES (type, c,
4094                                     *flags & CODING_STATE_R2L ?
4095                                     CHARSET_RIGHT_TO_LEFT :
4096                                     CHARSET_LEFT_TO_RIGHT);
4097         goto designated;
4098       }
4099     }
4100
4101  not_done:
4102   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4103   return -1;
4104
4105  single_shift:
4106   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4107     /* can't invoke something that ain't there. */
4108     return 0;
4109   iso->esc = ISO_ESC_SINGLE_SHIFT;
4110   *flags &= CODING_STATE_ISO2022_LOCK;
4111   if (reg == 2)
4112     *flags |= CODING_STATE_SS2;
4113   else
4114     *flags |= CODING_STATE_SS3;
4115   return 1;
4116
4117  locking_shift:
4118   if (check_invalid_charsets &&
4119       !CHARSETP (iso->charset[reg]))
4120     /* can't invoke something that ain't there. */
4121     return 0;
4122   if (half)
4123     iso->register_right = reg;
4124   else
4125     iso->register_left = reg;
4126   *flags &= CODING_STATE_ISO2022_LOCK;
4127   iso->esc = ISO_ESC_LOCKING_SHIFT;
4128   return 1;
4129
4130  designated:
4131   if (NILP (cs) && check_invalid_charsets)
4132     {
4133       iso->invalid_designated[reg] = 1;
4134       iso->charset[reg] = Vcharset_ascii;
4135       iso->esc = ISO_ESC_DESIGNATE;
4136       *flags &= CODING_STATE_ISO2022_LOCK;
4137       iso->output_literally = 1;
4138       if (iso->switched_dir_and_no_valid_charset_yet)
4139         {
4140           /* We encountered a switch-direction followed by an
4141              invalid designation.  Ensure that the switch-direction
4142              gets outputted; otherwise it will probably get eaten
4143              when the text is written out again. */
4144           iso->switched_dir_and_no_valid_charset_yet = 0;
4145           iso->output_direction_sequence = 1;
4146           /* And make sure that the switch-dir going the other
4147              way gets outputted, as well. */
4148           iso->invalid_switch_dir = 1;
4149         }
4150       return 1;
4151     }
4152   /* This function is called with CODESYS equal to nil when
4153      doing coding-system detection. */
4154   if (!NILP (codesys))
4155     {
4156       charset_conversion_spec_dynarr *dyn =
4157         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4158
4159       if (dyn)
4160         {
4161           int i;
4162
4163           for (i = 0; i < Dynarr_length (dyn); i++)
4164             {
4165               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4166               if (EQ (cs, spec->from_charset))
4167                 cs = spec->to_charset;
4168             }
4169         }
4170     }
4171
4172   iso->charset[reg] = cs;
4173   iso->esc = ISO_ESC_DESIGNATE;
4174   *flags &= CODING_STATE_ISO2022_LOCK;
4175   if (iso->invalid_designated[reg])
4176     {
4177       iso->invalid_designated[reg] = 0;
4178       iso->output_literally = 1;
4179     }
4180   if (iso->switched_dir_and_no_valid_charset_yet)
4181     iso->switched_dir_and_no_valid_charset_yet = 0;
4182   return 1;
4183 }
4184
4185 static int
4186 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4187                        unsigned int n)
4188 {
4189   int mask;
4190
4191   /* #### There are serious deficiencies in the recognition mechanism
4192      here.  This needs to be much smarter if it's going to cut it.
4193      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4194      it should be detected as Latin-1.
4195      All the ISO2022 stuff in this file should be synced up with the
4196      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4197      Perhaps we should wait till R2L works in FSF Emacs? */
4198
4199   if (!st->iso2022.initted)
4200     {
4201       reset_iso2022 (Qnil, &st->iso2022.iso);
4202       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4203                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4204                           CODING_CATEGORY_ISO_8_1_MASK |
4205                           CODING_CATEGORY_ISO_8_2_MASK |
4206                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4207       st->iso2022.flags = 0;
4208       st->iso2022.high_byte_count = 0;
4209       st->iso2022.saw_single_shift = 0;
4210       st->iso2022.initted = 1;
4211     }
4212
4213   mask = st->iso2022.mask;
4214
4215   while (n--)
4216     {
4217       int c = *src++;
4218       if (c >= 0xA0)
4219         {
4220           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4221           st->iso2022.high_byte_count++;
4222         }
4223       else
4224         {
4225           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4226             {
4227               if (st->iso2022.high_byte_count & 1)
4228                 /* odd number of high bytes; assume not iso-8-2 */
4229                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4230             }
4231           st->iso2022.high_byte_count = 0;
4232           st->iso2022.saw_single_shift = 0;
4233           if (c > 0x80)
4234             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4235         }
4236       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4237           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4238         { /* control chars */
4239           switch (c)
4240             {
4241               /* Allow and ignore control characters that you might
4242                  reasonably see in a text file */
4243             case '\r':
4244             case '\n':
4245             case '\t':
4246             case  7: /* bell */
4247             case  8: /* backspace */
4248             case 11: /* vertical tab */
4249             case 12: /* form feed */
4250             case 26: /* MS-DOS C-z junk */
4251             case 31: /* '^_' -- for info */
4252               goto label_continue_loop;
4253
4254             default:
4255               break;
4256             }
4257         }
4258
4259       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4260           || BYTE_C1_P (c))
4261         {
4262           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4263                                  &st->iso2022.flags, 0))
4264             {
4265               switch (st->iso2022.iso.esc)
4266                 {
4267                 case ISO_ESC_DESIGNATE:
4268                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4269                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4270                   break;
4271                 case ISO_ESC_LOCKING_SHIFT:
4272                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4273                   goto ran_out_of_chars;
4274                 case ISO_ESC_SINGLE_SHIFT:
4275                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4276                   st->iso2022.saw_single_shift = 1;
4277                   break;
4278                 default:
4279                   break;
4280                 }
4281             }
4282           else
4283             {
4284               mask = 0;
4285               goto ran_out_of_chars;
4286             }
4287         }
4288     label_continue_loop:;
4289     }
4290
4291  ran_out_of_chars:
4292
4293   return mask;
4294 }
4295
4296 static int
4297 postprocess_iso2022_mask (int mask)
4298 {
4299   /* #### kind of cheesy */
4300   /* If seven-bit ISO is allowed, then assume that the encoding is
4301      entirely seven-bit and turn off the eight-bit ones. */
4302   if (mask & CODING_CATEGORY_ISO_7_MASK)
4303     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4304                CODING_CATEGORY_ISO_8_1_MASK |
4305                CODING_CATEGORY_ISO_8_2_MASK);
4306   return mask;
4307 }
4308
4309 /* If FLAGS is a null pointer or specifies right-to-left motion,
4310    output a switch-dir-to-left-to-right sequence to DST.
4311    Also update FLAGS if it is not a null pointer.
4312    If INTERNAL_P is set, we are outputting in internal format and
4313    need to handle the CSI differently. */
4314
4315 static void
4316 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4317                                  unsigned_char_dynarr *dst,
4318                                  unsigned int *flags,
4319                                  int internal_p)
4320 {
4321   if (!flags || (*flags & CODING_STATE_R2L))
4322     {
4323       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4324         {
4325           Dynarr_add (dst, ISO_CODE_ESC);
4326           Dynarr_add (dst, '[');
4327         }
4328       else if (internal_p)
4329         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4330       else
4331         Dynarr_add (dst, ISO_CODE_CSI);
4332       Dynarr_add (dst, '0');
4333       Dynarr_add (dst, ']');
4334       if (flags)
4335         *flags &= ~CODING_STATE_R2L;
4336     }
4337 }
4338
4339 /* If FLAGS is a null pointer or specifies a direction different from
4340    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4341    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4342    sequence to DST.  Also update FLAGS if it is not a null pointer.
4343    If INTERNAL_P is set, we are outputting in internal format and
4344    need to handle the CSI differently. */
4345
4346 static void
4347 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4348                           unsigned_char_dynarr *dst, unsigned int *flags,
4349                           int internal_p)
4350 {
4351   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4352       direction == CHARSET_LEFT_TO_RIGHT)
4353     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4354   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4355            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4356            direction == CHARSET_RIGHT_TO_LEFT)
4357     {
4358       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4359         {
4360           Dynarr_add (dst, ISO_CODE_ESC);
4361           Dynarr_add (dst, '[');
4362         }
4363       else if (internal_p)
4364         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4365       else
4366         Dynarr_add (dst, ISO_CODE_CSI);
4367       Dynarr_add (dst, '2');
4368       Dynarr_add (dst, ']');
4369       if (flags)
4370         *flags |= CODING_STATE_R2L;
4371     }
4372 }
4373
4374 /* Convert ISO2022-format data to internal format. */
4375
4376 static void
4377 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4378                        unsigned_char_dynarr *dst, unsigned int n)
4379 {
4380   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4381   unsigned int flags  = str->flags;
4382   unsigned int ch     = str->ch;
4383   eol_type_t eol_type = str->eol_type;
4384 #ifdef ENABLE_COMPOSITE_CHARS
4385   unsigned_char_dynarr *real_dst = dst;
4386 #endif
4387   Lisp_Object coding_system;
4388
4389   XSETCODING_SYSTEM (coding_system, str->codesys);
4390
4391 #ifdef ENABLE_COMPOSITE_CHARS
4392   if (flags & CODING_STATE_COMPOSITE)
4393     dst = str->iso2022.composite_chars;
4394 #endif /* ENABLE_COMPOSITE_CHARS */
4395
4396   while (n--)
4397     {
4398       unsigned char c = *src++;
4399       if (flags & CODING_STATE_ESCAPE)
4400         {       /* Within ESC sequence */
4401           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4402                                           c, &flags, 1);
4403
4404           if (retval)
4405             {
4406               switch (str->iso2022.esc)
4407                 {
4408 #ifdef ENABLE_COMPOSITE_CHARS
4409                 case ISO_ESC_START_COMPOSITE:
4410                   if (str->iso2022.composite_chars)
4411                     Dynarr_reset (str->iso2022.composite_chars);
4412                   else
4413                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4414                   dst = str->iso2022.composite_chars;
4415                   break;
4416                 case ISO_ESC_END_COMPOSITE:
4417                   {
4418                     Bufbyte comstr[MAX_EMCHAR_LEN];
4419                     Bytecount len;
4420                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4421                                                          Dynarr_length (dst));
4422                     dst = real_dst;
4423                     len = set_charptr_emchar (comstr, emch);
4424                     Dynarr_add_many (dst, comstr, len);
4425                     break;
4426                   }
4427 #endif /* ENABLE_COMPOSITE_CHARS */
4428
4429                 case ISO_ESC_LITERAL:
4430                   DECODE_ADD_BINARY_CHAR (c, dst);
4431                   break;
4432
4433                 default:
4434                   /* Everything else handled already */
4435                   break;
4436                 }
4437             }
4438
4439           /* Attempted error recovery. */
4440           if (str->iso2022.output_direction_sequence)
4441             ensure_correct_direction (flags & CODING_STATE_R2L ?
4442                                       CHARSET_RIGHT_TO_LEFT :
4443                                       CHARSET_LEFT_TO_RIGHT,
4444                                       str->codesys, dst, 0, 1);
4445           /* More error recovery. */
4446           if (!retval || str->iso2022.output_literally)
4447             {
4448               /* Output the (possibly invalid) sequence */
4449               int i;
4450               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4451                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4452               flags &= CODING_STATE_ISO2022_LOCK;
4453               if (!retval)
4454                 n++, src--;/* Repeat the loop with the same character. */
4455               else
4456                 {
4457                   /* No sense in reprocessing the final byte of the
4458                      escape sequence; it could mess things up anyway.
4459                      Just add it now. */
4460                   DECODE_ADD_BINARY_CHAR (c, dst);
4461                 }
4462             }
4463           ch = 0;
4464         }
4465       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4466         { /* Control characters */
4467
4468           /***** Error-handling *****/
4469
4470           /* If we were in the middle of a character, dump out the
4471              partial character. */
4472           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4473
4474           /* If we just saw a single-shift character, dump it out.
4475              This may dump out the wrong sort of single-shift character,
4476              but least it will give an indication that something went
4477              wrong. */
4478           if (flags & CODING_STATE_SS2)
4479             {
4480               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4481               flags &= ~CODING_STATE_SS2;
4482             }
4483           if (flags & CODING_STATE_SS3)
4484             {
4485               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4486               flags &= ~CODING_STATE_SS3;
4487             }
4488
4489           /***** Now handle the control characters. *****/
4490
4491           /* Handle CR/LF */
4492           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4493
4494           flags &= CODING_STATE_ISO2022_LOCK;
4495
4496           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4497             DECODE_ADD_BINARY_CHAR (c, dst);
4498         }
4499       else
4500         {                       /* Graphic characters */
4501           Lisp_Object charset;
4502 #ifndef UTF2000
4503           Charset_ID lb;
4504 #endif
4505           int reg;
4506
4507           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4508
4509           /* Now determine the charset. */
4510           reg = ((flags & CODING_STATE_SS2) ? 2
4511                  : (flags & CODING_STATE_SS3) ? 3
4512                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4513                  : str->iso2022.register_left);
4514           charset = str->iso2022.charset[reg];
4515
4516           /* Error checking: */
4517           if (! CHARSETP (charset)
4518               || str->iso2022.invalid_designated[reg]
4519               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4520                   && XCHARSET_CHARS (charset) == 94))
4521             /* Mrmph.  We are trying to invoke a register that has no
4522                or an invalid charset in it, or trying to add a character
4523                outside the range of the charset.  Insert that char literally
4524                to preserve it for the output. */
4525             {
4526               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4527               DECODE_ADD_BINARY_CHAR (c, dst);
4528             }
4529
4530           else
4531             {
4532               /* Things are probably hunky-dorey. */
4533
4534               /* Fetch reverse charset, maybe. */
4535               if (((flags & CODING_STATE_R2L) &&
4536                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4537                   ||
4538                   (!(flags & CODING_STATE_R2L) &&
4539                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4540                 {
4541                   Lisp_Object new_charset =
4542                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4543                   if (!NILP (new_charset))
4544                     charset = new_charset;
4545                 }
4546
4547 #ifdef UTF2000
4548               if (XCHARSET_DIMENSION (charset) == 1)
4549                 {
4550                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4551                   DECODE_ADD_UCS_CHAR
4552                     (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4553                 }
4554               else if (ch)
4555                 {
4556                   DECODE_ADD_UCS_CHAR
4557                     (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
4558                   ch = 0;
4559                 }
4560               else
4561                 ch = c;
4562 #else
4563               lb = XCHARSET_LEADING_BYTE (charset);
4564               switch (XCHARSET_REP_BYTES (charset))
4565                 {
4566                 case 1: /* ASCII */
4567                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4568                   Dynarr_add (dst, c & 0x7F);
4569                   break;
4570
4571                 case 2: /* one-byte official */
4572                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4573                   Dynarr_add (dst, lb);
4574                   Dynarr_add (dst, c | 0x80);
4575                   break;
4576
4577                 case 3: /* one-byte private or two-byte official */
4578                   if (XCHARSET_PRIVATE_P (charset))
4579                     {
4580                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4581                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4582                       Dynarr_add (dst, lb);
4583                       Dynarr_add (dst, c | 0x80);
4584                     }
4585                   else
4586                     {
4587                       if (ch)
4588                         {
4589                           Dynarr_add (dst, lb);
4590                           Dynarr_add (dst, ch | 0x80);
4591                           Dynarr_add (dst, c | 0x80);
4592                           ch = 0;
4593                         }
4594                       else
4595                         ch = c;
4596                     }
4597                   break;
4598
4599                 default:        /* two-byte private */
4600                   if (ch)
4601                     {
4602                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4603                       Dynarr_add (dst, lb);
4604                       Dynarr_add (dst, ch | 0x80);
4605                       Dynarr_add (dst, c | 0x80);
4606                       ch = 0;
4607                     }
4608                   else
4609                     ch = c;
4610                 }
4611 #endif
4612             }
4613
4614           if (!ch)
4615             flags &= CODING_STATE_ISO2022_LOCK;
4616         }
4617
4618     label_continue_loop:;
4619     }
4620
4621   if (flags & CODING_STATE_END)
4622     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4623
4624   str->flags = flags;
4625   str->ch    = ch;
4626 }
4627
4628
4629 /***** ISO2022 encoder *****/
4630
4631 /* Designate CHARSET into register REG. */
4632
4633 static void
4634 iso2022_designate (Lisp_Object charset, unsigned char reg,
4635                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4636 {
4637   static CONST char inter94[] = "()*+";
4638   static CONST char inter96[] = ",-./";
4639   unsigned int type;
4640   unsigned char final;
4641   Lisp_Object old_charset = str->iso2022.charset[reg];
4642
4643   str->iso2022.charset[reg] = charset;
4644   if (!CHARSETP (charset))
4645     /* charset might be an initial nil or t. */
4646     return;
4647   type = XCHARSET_TYPE (charset);
4648   final = XCHARSET_FINAL (charset);
4649   if (!str->iso2022.force_charset_on_output[reg] &&
4650       CHARSETP (old_charset) &&
4651       XCHARSET_TYPE (old_charset) == type &&
4652       XCHARSET_FINAL (old_charset) == final)
4653     return;
4654
4655   str->iso2022.force_charset_on_output[reg] = 0;
4656
4657   {
4658     charset_conversion_spec_dynarr *dyn =
4659       str->codesys->iso2022.output_conv;
4660
4661     if (dyn)
4662       {
4663         int i;
4664
4665         for (i = 0; i < Dynarr_length (dyn); i++)
4666           {
4667             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4668             if (EQ (charset, spec->from_charset))
4669                 charset = spec->to_charset;
4670           }
4671       }
4672   }
4673
4674   Dynarr_add (dst, ISO_CODE_ESC);
4675   switch (type)
4676     {
4677     case CHARSET_TYPE_94:
4678       Dynarr_add (dst, inter94[reg]);
4679       break;
4680     case CHARSET_TYPE_96:
4681       Dynarr_add (dst, inter96[reg]);
4682       break;
4683     case CHARSET_TYPE_94X94:
4684       Dynarr_add (dst, '$');
4685       if (reg != 0
4686           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4687           || final < '@'
4688           || final > 'B')
4689         Dynarr_add (dst, inter94[reg]);
4690       break;
4691     case CHARSET_TYPE_96X96:
4692       Dynarr_add (dst, '$');
4693       Dynarr_add (dst, inter96[reg]);
4694       break;
4695     }
4696   Dynarr_add (dst, final);
4697 }
4698
4699 static void
4700 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4701 {
4702   if (str->iso2022.register_left != 0)
4703     {
4704       Dynarr_add (dst, ISO_CODE_SI);
4705       str->iso2022.register_left = 0;
4706     }
4707 }
4708
4709 static void
4710 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4711 {
4712   if (str->iso2022.register_left != 1)
4713     {
4714       Dynarr_add (dst, ISO_CODE_SO);
4715       str->iso2022.register_left = 1;
4716     }
4717 }
4718
4719 void
4720 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4721                      unsigned_char_dynarr *dst, unsigned int *flags)
4722 {
4723   unsigned char charmask;
4724   Lisp_Coding_System* codesys = str->codesys;
4725   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4726   int i;
4727   Lisp_Object charset = str->iso2022.current_charset;
4728   int half = str->iso2022.current_half;
4729   unsigned int byte1, byte2;
4730
4731   if (ch <= 0x7F)
4732     {
4733       restore_left_to_right_direction (codesys, dst, flags, 0);
4734
4735       /* Make sure G0 contains ASCII */
4736       if ((ch > ' ' && ch < ISO_CODE_DEL)
4737           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4738         {
4739           ensure_normal_shift (str, dst);
4740           iso2022_designate (Vcharset_ascii, 0, str, dst);
4741         }
4742
4743       /* If necessary, restore everything to the default state
4744          at end-of-line */
4745       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4746         {
4747           restore_left_to_right_direction (codesys, dst, flags, 0);
4748
4749           ensure_normal_shift (str, dst);
4750
4751           for (i = 0; i < 4; i++)
4752             {
4753               Lisp_Object initial_charset =
4754                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4755               iso2022_designate (initial_charset, i, str, dst);
4756             }
4757         }
4758       if (ch == '\n')
4759         {
4760           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4761             Dynarr_add (dst, '\r');
4762           if (eol_type != EOL_CR)
4763             Dynarr_add (dst, ch);
4764         }
4765       else
4766         {
4767           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4768               && fit_to_be_escape_quoted (ch))
4769             Dynarr_add (dst, ISO_CODE_ESC);
4770           Dynarr_add (dst, ch);
4771         }
4772     }
4773   else if ( (0x80 <= ch) && (ch <= 0x9f) )
4774     {
4775       charmask = (half == 0 ? 0x00 : 0x80);
4776
4777       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4778           && fit_to_be_escape_quoted (ch))
4779         Dynarr_add (dst, ISO_CODE_ESC);
4780       /* you asked for it ... */
4781       Dynarr_add (dst, ch);
4782     }
4783   else
4784     {
4785       int reg;
4786
4787       BREAKUP_CHAR (ch, charset, byte1, byte2);
4788       ensure_correct_direction (XCHARSET_DIRECTION (charset),
4789                                 codesys, dst, flags, 0);
4790
4791       /* Now determine which register to use. */
4792       reg = -1;
4793       for (i = 0; i < 4; i++)
4794         {
4795           if (EQ (charset, str->iso2022.charset[i]) ||
4796               EQ (charset,
4797                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4798             {
4799               reg = i;
4800               break;
4801             }
4802         }
4803
4804       if (reg == -1)
4805         {
4806           if (XCHARSET_GRAPHIC (charset) != 0)
4807             {
4808               if (!NILP (str->iso2022.charset[1]) &&
4809                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4810                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4811                 reg = 1;
4812               else if (!NILP (str->iso2022.charset[2]))
4813                 reg = 2;
4814               else if (!NILP (str->iso2022.charset[3]))
4815                 reg = 3;
4816               else
4817                 reg = 0;
4818             }
4819           else
4820             reg = 0;
4821         }
4822
4823       iso2022_designate (charset, reg, str, dst);
4824
4825       /* Now invoke that register. */
4826       switch (reg)
4827         {
4828         case 0:
4829           ensure_normal_shift (str, dst);
4830           half = 0;
4831           break;
4832         case 1:
4833           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4834             {
4835               ensure_shift_out (str, dst);
4836               half = 0;
4837             }
4838           else
4839             half = 1;
4840           break;
4841         case 2:
4842           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4843             {
4844               Dynarr_add (dst, ISO_CODE_ESC);
4845               Dynarr_add (dst, 'N');
4846               half = 0;
4847             }
4848           else
4849             {
4850               Dynarr_add (dst, ISO_CODE_SS2);
4851               half = 1;
4852             }
4853           break;
4854         case 3:
4855           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4856             {
4857               Dynarr_add (dst, ISO_CODE_ESC);
4858               Dynarr_add (dst, 'O');
4859               half = 0;
4860             }
4861           else
4862             {
4863               Dynarr_add (dst, ISO_CODE_SS3);
4864               half = 1;
4865             }
4866           break;
4867         default:
4868           abort ();
4869         }
4870
4871       charmask = (half == 0 ? 0x00 : 0x80);
4872
4873       switch (XCHARSET_DIMENSION (charset))
4874         {
4875         case 1:
4876           Dynarr_add (dst, byte1 | charmask);
4877           break;
4878         case 2:
4879           Dynarr_add (dst, byte1 | charmask);
4880           Dynarr_add (dst, byte2 | charmask);
4881           break;
4882         default:
4883           abort ();
4884         }
4885     }
4886   str->iso2022.current_charset = charset;
4887   str->iso2022.current_half = half;
4888 }
4889
4890 void
4891 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4892                      unsigned int *flags)
4893 {
4894   Lisp_Coding_System* codesys = str->codesys;
4895   int i;
4896
4897   restore_left_to_right_direction (codesys, dst, flags, 0);
4898   ensure_normal_shift (str, dst);
4899   for (i = 0; i < 4; i++)
4900     {
4901       Lisp_Object initial_charset
4902         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4903       iso2022_designate (initial_charset, i, str, dst);
4904     }
4905 }
4906 #endif /* MULE */
4907 \f
4908 /************************************************************************/
4909 /*                     No-conversion methods                            */
4910 /************************************************************************/
4911
4912 /* This is used when reading in "binary" files -- i.e. files that may
4913    contain all 256 possible byte values and that are not to be
4914    interpreted as being in any particular decoding. */
4915 static void
4916 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4917                              unsigned_char_dynarr *dst, unsigned int n)
4918 {
4919   unsigned char c;
4920   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4921   unsigned int flags  = str->flags;
4922   unsigned int ch     = str->ch;
4923   eol_type_t eol_type = str->eol_type;
4924
4925   while (n--)
4926     {
4927       c = *src++;
4928
4929       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4930       DECODE_ADD_BINARY_CHAR (c, dst);
4931     label_continue_loop:;
4932     }
4933
4934   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4935
4936   str->flags = flags;
4937   str->ch    = ch;
4938 }
4939
4940 static void
4941 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4942                              unsigned_char_dynarr *dst, unsigned int n)
4943 {
4944   unsigned char c;
4945   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4946   unsigned int flags  = str->flags;
4947   unsigned int ch     = str->ch;
4948   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4949 #ifdef UTF2000
4950   unsigned char char_boundary = str->iso2022.current_char_boundary;
4951 #endif
4952
4953   while (n--)
4954     {
4955       c = *src++;
4956 #ifdef UTF2000
4957       if (char_boundary == 0)
4958         if ( c >= 0xfc )
4959           {
4960             ch = c & 0x01;
4961             char_boundary = 5;
4962           }
4963         else if ( c >= 0xf8 )
4964           {
4965             ch = c & 0x03;
4966             char_boundary = 4;
4967           }
4968         else if ( c >= 0xf0 )
4969           {
4970             ch = c & 0x07;
4971             char_boundary = 3;
4972           }
4973         else if ( c >= 0xe0 )
4974           {
4975             ch = c & 0x0f;
4976             char_boundary = 2;
4977           }
4978         else if ( c >= 0xc0 )
4979           {
4980             ch = c & 0x1f;
4981             char_boundary = 1;
4982           }
4983         else
4984           {
4985             ch = 0;
4986             if (c == '\n')
4987               {
4988                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4989                   Dynarr_add (dst, '\r');
4990                 if (eol_type != EOL_CR)
4991                   Dynarr_add (dst, c);
4992               }
4993             else
4994               Dynarr_add (dst, c);
4995             char_boundary = 0;
4996           }
4997       else if (char_boundary == 1)
4998         {
4999           ch = ( ch << 6 ) | ( c & 0x3f );
5000           Dynarr_add (dst, ch & 0xff);
5001           char_boundary = 0;
5002         }
5003       else
5004         {
5005           ch = ( ch << 6 ) | ( c & 0x3f );
5006           char_boundary--;
5007         }
5008 #else /* not UTF2000 */
5009       if (c == '\n')
5010         {
5011           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5012             Dynarr_add (dst, '\r');
5013           if (eol_type != EOL_CR)
5014             Dynarr_add (dst, '\n');
5015           ch = 0;
5016         }
5017       else if (BYTE_ASCII_P (c))
5018         {
5019           assert (ch == 0);
5020           Dynarr_add (dst, c);
5021         }
5022       else if (BUFBYTE_LEADING_BYTE_P (c))
5023         {
5024           assert (ch == 0);
5025           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5026               c == LEADING_BYTE_CONTROL_1)
5027             ch = c;
5028           else
5029             Dynarr_add (dst, '~'); /* untranslatable character */
5030         }
5031       else
5032         {
5033           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5034             Dynarr_add (dst, c);
5035           else if (ch == LEADING_BYTE_CONTROL_1)
5036             {
5037               assert (c < 0xC0);
5038               Dynarr_add (dst, c - 0x20);
5039             }
5040           /* else it should be the second or third byte of an
5041              untranslatable character, so ignore it */
5042           ch = 0;
5043         }
5044 #endif /* not UTF2000 */
5045     }
5046
5047   str->flags = flags;
5048   str->ch    = ch;
5049 #ifdef UTF2000
5050   str->iso2022.current_char_boundary = char_boundary;
5051 #endif
5052 }
5053
5054 \f
5055 /************************************************************************/
5056 /*                   Simple internal/external functions                 */
5057 /************************************************************************/
5058
5059 static Extbyte_dynarr *conversion_out_dynarr;
5060 static Bufbyte_dynarr *conversion_in_dynarr;
5061
5062 /* Determine coding system from coding format */
5063
5064 /* #### not correct for all values of `fmt'! */
5065 static Lisp_Object
5066 external_data_format_to_coding_system (enum external_data_format fmt)
5067 {
5068   switch (fmt)
5069     {
5070     case FORMAT_FILENAME:
5071     case FORMAT_TERMINAL:
5072       if (EQ (Vfile_name_coding_system, Qnil) ||
5073           EQ (Vfile_name_coding_system, Qbinary))
5074         return Qnil;
5075       else
5076         return Fget_coding_system (Vfile_name_coding_system);
5077 #ifdef MULE
5078     case FORMAT_CTEXT:
5079       return Fget_coding_system (Qctext);
5080 #endif
5081     default:
5082       return Qnil;
5083     }
5084 }
5085
5086 Extbyte *
5087 convert_to_external_format (CONST Bufbyte *ptr,
5088                             Bytecount len,
5089                             Extcount *len_out,
5090                             enum external_data_format fmt)
5091 {
5092   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5093
5094   if (!conversion_out_dynarr)
5095     conversion_out_dynarr = Dynarr_new (Extbyte);
5096   else
5097     Dynarr_reset (conversion_out_dynarr);
5098
5099   if (NILP (coding_system))
5100     {
5101       CONST Bufbyte *end = ptr + len;
5102
5103       for (; ptr < end;)
5104         {
5105 #ifdef UTF2000
5106           Bufbyte c =
5107             (*ptr < 0xc0) ? *ptr :
5108             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5109 #else
5110           Bufbyte c =
5111             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5112             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5113             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5114             '~';
5115 #endif
5116           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5117           INC_CHARPTR (ptr);
5118         }
5119
5120 #ifdef ERROR_CHECK_BUFPOS
5121       assert (ptr == end);
5122 #endif
5123     }
5124   else
5125     {
5126       Lisp_Object instream, outstream, da_outstream;
5127       Lstream *istr, *ostr;
5128       struct gcpro gcpro1, gcpro2, gcpro3;
5129       char tempbuf[1024]; /* some random amount */
5130
5131       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5132       da_outstream = make_dynarr_output_stream
5133         ((unsigned_char_dynarr *) conversion_out_dynarr);
5134       outstream =
5135         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5136       istr = XLSTREAM (instream);
5137       ostr = XLSTREAM (outstream);
5138       GCPRO3 (instream, outstream, da_outstream);
5139       while (1)
5140         {
5141           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5142           if (!size_in_bytes)
5143             break;
5144           Lstream_write (ostr, tempbuf, size_in_bytes);
5145         }
5146       Lstream_close (istr);
5147       Lstream_close (ostr);
5148       UNGCPRO;
5149       Lstream_delete (istr);
5150       Lstream_delete (ostr);
5151       Lstream_delete (XLSTREAM (da_outstream));
5152     }
5153
5154   *len_out = Dynarr_length (conversion_out_dynarr);
5155   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5156   return Dynarr_atp (conversion_out_dynarr, 0);
5157 }
5158
5159 Bufbyte *
5160 convert_from_external_format (CONST Extbyte *ptr,
5161                               Extcount len,
5162                               Bytecount *len_out,
5163                               enum external_data_format fmt)
5164 {
5165   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5166
5167   if (!conversion_in_dynarr)
5168     conversion_in_dynarr = Dynarr_new (Bufbyte);
5169   else
5170     Dynarr_reset (conversion_in_dynarr);
5171
5172   if (NILP (coding_system))
5173     {
5174       CONST Extbyte *end = ptr + len;
5175       for (; ptr < end; ptr++)
5176         {
5177           Extbyte c = *ptr;
5178           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5179         }
5180     }
5181   else
5182     {
5183       Lisp_Object instream, outstream, da_outstream;
5184       Lstream *istr, *ostr;
5185       struct gcpro gcpro1, gcpro2, gcpro3;
5186       char tempbuf[1024]; /* some random amount */
5187
5188       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5189       da_outstream = make_dynarr_output_stream
5190         ((unsigned_char_dynarr *) conversion_in_dynarr);
5191       outstream =
5192         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5193       istr = XLSTREAM (instream);
5194       ostr = XLSTREAM (outstream);
5195       GCPRO3 (instream, outstream, da_outstream);
5196       while (1)
5197         {
5198           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5199           if (!size_in_bytes)
5200             break;
5201           Lstream_write (ostr, tempbuf, size_in_bytes);
5202         }
5203       Lstream_close (istr);
5204       Lstream_close (ostr);
5205       UNGCPRO;
5206       Lstream_delete (istr);
5207       Lstream_delete (ostr);
5208       Lstream_delete (XLSTREAM (da_outstream));
5209     }
5210
5211   *len_out = Dynarr_length (conversion_in_dynarr);
5212   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5213   return Dynarr_atp (conversion_in_dynarr, 0);
5214 }
5215
5216 \f
5217 /************************************************************************/
5218 /*                             Initialization                           */
5219 /************************************************************************/
5220
5221 void
5222 syms_of_file_coding (void)
5223 {
5224   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5225   deferror (&Qcoding_system_error, "coding-system-error",
5226             "Coding-system error", Qio_error);
5227
5228   DEFSUBR (Fcoding_system_p);
5229   DEFSUBR (Ffind_coding_system);
5230   DEFSUBR (Fget_coding_system);
5231   DEFSUBR (Fcoding_system_list);
5232   DEFSUBR (Fcoding_system_name);
5233   DEFSUBR (Fmake_coding_system);
5234   DEFSUBR (Fcopy_coding_system);
5235   DEFSUBR (Fdefine_coding_system_alias);
5236   DEFSUBR (Fsubsidiary_coding_system);
5237
5238   DEFSUBR (Fcoding_system_type);
5239   DEFSUBR (Fcoding_system_doc_string);
5240 #ifdef MULE
5241   DEFSUBR (Fcoding_system_charset);
5242 #endif
5243   DEFSUBR (Fcoding_system_property);
5244
5245   DEFSUBR (Fcoding_category_list);
5246   DEFSUBR (Fset_coding_priority_list);
5247   DEFSUBR (Fcoding_priority_list);
5248   DEFSUBR (Fset_coding_category_system);
5249   DEFSUBR (Fcoding_category_system);
5250
5251   DEFSUBR (Fdetect_coding_region);
5252   DEFSUBR (Fdecode_coding_region);
5253   DEFSUBR (Fencode_coding_region);
5254 #ifdef MULE
5255   DEFSUBR (Fdecode_shift_jis_char);
5256   DEFSUBR (Fencode_shift_jis_char);
5257   DEFSUBR (Fdecode_big5_char);
5258   DEFSUBR (Fencode_big5_char);
5259 #endif /* MULE */
5260   defsymbol (&Qcoding_system_p, "coding-system-p");
5261   defsymbol (&Qno_conversion, "no-conversion");
5262   defsymbol (&Qraw_text, "raw-text");
5263 #ifdef MULE
5264   defsymbol (&Qbig5, "big5");
5265   defsymbol (&Qshift_jis, "shift-jis");
5266   defsymbol (&Qucs4, "ucs-4");
5267   defsymbol (&Qutf8, "utf-8");
5268   defsymbol (&Qccl, "ccl");
5269   defsymbol (&Qiso2022, "iso2022");
5270 #endif /* MULE */
5271   defsymbol (&Qmnemonic, "mnemonic");
5272   defsymbol (&Qeol_type, "eol-type");
5273   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5274   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5275
5276   defsymbol (&Qcr, "cr");
5277   defsymbol (&Qlf, "lf");
5278   defsymbol (&Qcrlf, "crlf");
5279   defsymbol (&Qeol_cr, "eol-cr");
5280   defsymbol (&Qeol_lf, "eol-lf");
5281   defsymbol (&Qeol_crlf, "eol-crlf");
5282 #ifdef MULE
5283   defsymbol (&Qcharset_g0, "charset-g0");
5284   defsymbol (&Qcharset_g1, "charset-g1");
5285   defsymbol (&Qcharset_g2, "charset-g2");
5286   defsymbol (&Qcharset_g3, "charset-g3");
5287   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5288   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5289   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5290   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5291   defsymbol (&Qno_iso6429, "no-iso6429");
5292   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5293   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5294
5295   defsymbol (&Qshort, "short");
5296   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5297   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5298   defsymbol (&Qseven, "seven");
5299   defsymbol (&Qlock_shift, "lock-shift");
5300   defsymbol (&Qescape_quoted, "escape-quoted");
5301 #endif /* MULE */
5302   defsymbol (&Qencode, "encode");
5303   defsymbol (&Qdecode, "decode");
5304
5305 #ifdef MULE
5306   defsymbol (&Qctext, "ctext");
5307   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5308              "shift-jis");
5309   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5310              "big5");
5311   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5312              "ucs-4");
5313   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5314              "utf-8");
5315   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5316              "iso-7");
5317   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5318              "iso-8-designate");
5319   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5320              "iso-8-1");
5321   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5322              "iso-8-2");
5323   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5324              "iso-lock-shift");
5325 #endif /* MULE */
5326   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5327              "no-conversion");
5328 }
5329
5330 void
5331 lstream_type_create_file_coding (void)
5332 {
5333   LSTREAM_HAS_METHOD (decoding, reader);
5334   LSTREAM_HAS_METHOD (decoding, writer);
5335   LSTREAM_HAS_METHOD (decoding, rewinder);
5336   LSTREAM_HAS_METHOD (decoding, seekable_p);
5337   LSTREAM_HAS_METHOD (decoding, flusher);
5338   LSTREAM_HAS_METHOD (decoding, closer);
5339   LSTREAM_HAS_METHOD (decoding, marker);
5340
5341   LSTREAM_HAS_METHOD (encoding, reader);
5342   LSTREAM_HAS_METHOD (encoding, writer);
5343   LSTREAM_HAS_METHOD (encoding, rewinder);
5344   LSTREAM_HAS_METHOD (encoding, seekable_p);
5345   LSTREAM_HAS_METHOD (encoding, flusher);
5346   LSTREAM_HAS_METHOD (encoding, closer);
5347   LSTREAM_HAS_METHOD (encoding, marker);
5348 }
5349
5350 void
5351 vars_of_file_coding (void)
5352 {
5353   int i;
5354
5355   /* Initialize to something reasonable ... */
5356   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5357     {
5358       coding_category_system[i] = Qnil;
5359       coding_category_by_priority[i] = i;
5360     }
5361
5362   Fprovide (intern ("file-coding"));
5363
5364   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5365 Coding system used for TTY keyboard input.
5366 Not used under a windowing system.
5367 */ );
5368   Vkeyboard_coding_system = Qnil;
5369
5370   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5371 Coding system used for TTY display output.
5372 Not used under a windowing system.
5373 */ );
5374   Vterminal_coding_system = Qnil;
5375
5376   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5377 Overriding coding system used when writing a file or process.
5378 You should *bind* this, not set it.  If this is non-nil, it specifies
5379 the coding system that will be used when a file or process is read
5380 in, and overrides `buffer-file-coding-system-for-read',
5381 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5382 this one for permanent changes to the environment.
5383 */ );
5384   Vcoding_system_for_read = Qnil;
5385
5386   DEFVAR_LISP ("coding-system-for-write",
5387                &Vcoding_system_for_write /*
5388 Overriding coding system used when writing a file or process.
5389 You should *bind* this, not set it.  If this is non-nil, it specifies
5390 the coding system that will be used when a file or process is wrote
5391 in, and overrides `buffer-file-coding-system',
5392 `write-region-pre-hook', etc.  Use those variables instead of this one
5393 for permanent changes to the environment.
5394 */ );
5395   Vcoding_system_for_write = Qnil;
5396
5397   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5398 Coding system used to convert pathnames when accessing files.
5399 */ );
5400   Vfile_name_coding_system = Qnil;
5401
5402   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5403 Non-nil means the buffer contents are regarded as multi-byte form
5404 of characters, not a binary code.  This affects the display, file I/O,
5405 and behaviors of various editing commands.
5406
5407 Setting this to nil does not do anything.
5408 */ );
5409   enable_multibyte_characters = 1;
5410 }
5411
5412 void
5413 complex_vars_of_file_coding (void)
5414 {
5415   staticpro (&Vcoding_system_hash_table);
5416   Vcoding_system_hash_table =
5417     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5418
5419   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5420
5421 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5422 {                                               \
5423   struct codesys_prop csp;                      \
5424   csp.sym = (Sym);                              \
5425   csp.prop_type = (Prop_Type);                  \
5426   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5427 } while (0)
5428
5429   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5430   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5431   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5432   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5433   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5434   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5435   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5436 #ifdef MULE
5437   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5438   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5439   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5440   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5441   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5442   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5443   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5444   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5445   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5446   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5447   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5448   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5449   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5450   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5451   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5452   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5453   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5454
5455   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5456   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5457 #endif /* MULE */
5458   /* Need to create this here or we're really screwed. */
5459   Fmake_coding_system
5460     (Qraw_text, Qno_conversion,
5461      build_string ("Raw text, which means it converts only line-break-codes."),
5462      list2 (Qmnemonic, build_string ("Raw")));
5463
5464   Fmake_coding_system
5465     (Qbinary, Qno_conversion,
5466      build_string ("Binary, which means it does not convert anything."),
5467      list4 (Qeol_type, Qlf,
5468             Qmnemonic, build_string ("Binary")));
5469
5470 #ifdef UTF2000
5471   Fmake_coding_system
5472     (Qutf8, Qutf8,
5473      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5474      list2 (Qmnemonic, build_string ("UTF8")));
5475 #endif
5476
5477   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5478
5479   /* Need this for bootstrapping */
5480   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5481     Fget_coding_system (Qraw_text);
5482
5483 #ifdef UTF2000
5484   coding_category_system[CODING_CATEGORY_UTF8]
5485    = Fget_coding_system (Qutf8);
5486 #endif
5487 }