git.chise.org Git - chise/xemacs-chise.git-/blob - src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #ifdef MULE
  34 #include "mule-ccl.h"
  35 #include "chartab.h"
  36 #endif
  37 #include "file-coding.h"
  38
  39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  40
  41 Lisp_Object Vkeyboard_coding_system;
  42 Lisp_Object Vterminal_coding_system;
  43 Lisp_Object Vcoding_system_for_read;
  44 Lisp_Object Vcoding_system_for_write;
  45 Lisp_Object Vfile_name_coding_system;
  46
  47 /* Table of symbols identifying each coding category. */
  48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  49
  50 /* Coding system currently associated with each coding category. */
  51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  52
  53 /* Table of all coding categories in decreasing order of priority.
  54    This describes a permutation of the possible coding categories. */
  55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  56
  57 Lisp_Object Qcoding_system_p;
  58
  59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  60 /* Qinternal in general.c */
  61
  62 Lisp_Object Qmnemonic, Qeol_type;
  63 Lisp_Object Qcr, Qcrlf, Qlf;
  64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  65 Lisp_Object Qpost_read_conversion;
  66 Lisp_Object Qpre_write_conversion;
  67
  68 #ifdef MULE
  69 Lisp_Object Qucs4, Qutf8;
  70 Lisp_Object Qbig5, Qshift_jis;
  71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  74 Lisp_Object Qno_iso6429;
  75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  76 Lisp_Object Qctext, Qescape_quoted;
  77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  78 #endif
  79 Lisp_Object Qencode, Qdecode;
  80
  81 Lisp_Object Vcoding_system_hash_table;
  82
  83 int enable_multibyte_characters;
  84
  85 #ifdef MULE
  86 /* Additional information used by the ISO2022 decoder and detector. */
  87 struct iso2022_decoder
  88 {
  89   /* CHARSET holds the character sets currently assigned to the G0
  90      through G3 variables.  It is initialized from the array
  91      INITIAL_CHARSET in CODESYS. */
  92   Lisp_Object charset[4];
  93
  94   /* Which registers are currently invoked into the left (GL) and
  95      right (GR) halves of the 8-bit encoding space? */
  96   int register_left, register_right;
  97
  98   /* ISO_ESC holds a value indicating part of an escape sequence
  99      that has already been seen. */
 100   enum iso_esc_flag esc;
 101
 102   /* This records the bytes we've seen so far in an escape sequence,
 103      in case the sequence is invalid (we spit out the bytes unchanged). */
 104   unsigned char esc_bytes[8];
 105
 106   /* Index for next byte to store in ISO escape sequence. */
 107   int esc_bytes_index;
 108
 109 #ifdef ENABLE_COMPOSITE_CHARS
 110   /* Stuff seen so far when composing a string. */
 111   unsigned_char_dynarr *composite_chars;
 112 #endif
 113
 114   /* If we saw an invalid designation sequence for a particular
 115      register, we flag it here and switch to ASCII.  The next time we
 116      see a valid designation for this register, we turn off the flag
 117      and do the designation normally, but pretend the sequence was
 118      invalid.  The effect of all this is that (most of the time) the
 119      escape sequences for both the switch to the unknown charset, and
 120      the switch back to the known charset, get inserted literally into
 121      the buffer and saved out as such.  The hope is that we can
 122      preserve the escape sequences so that the resulting written out
 123      file makes sense.  If we don't do any of this, the designation
 124      to the invalid charset will be preserved but that switch back
 125      to the known charset will probably get eaten because it was
 126      the same charset that was already present in the register. */
 127   unsigned char invalid_designated[4];
 128
 129   /* We try to do similar things as above for direction-switching
 130      sequences.  If we encountered a direction switch while an
 131      invalid designation was present, or an invalid designation
 132      just after a direction switch (i.e. no valid designation
 133      encountered yet), we insert the direction-switch escape
 134      sequence literally into the output stream, and later on
 135      insert the corresponding direction-restoring escape sequence
 136      literally also. */
 137   unsigned int switched_dir_and_no_valid_charset_yet :1;
 138   unsigned int invalid_switch_dir :1;
 139
 140   /* Tells the decoder to output the escape sequence literally
 141      even though it was valid.  Used in the games we play to
 142      avoid lossage when we encounter invalid designations. */
 143   unsigned int output_literally :1;
 144   /* We encountered a direction switch followed by an invalid
 145      designation.  We didn't output the direction switch
 146      literally because we didn't know about the invalid designation;
 147      but we have to do so now. */
 148   unsigned int output_direction_sequence :1;
 149 };
 150 #endif /* MULE */
 151 EXFUN (Fcopy_coding_system, 2);
 152 #ifdef MULE
 153 struct detection_state;
 154
 155 static void
 156 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
 157                      unsigned_char_dynarr *dst, unsigned int n);
 158
 159 static int detect_coding_sjis (struct detection_state *st,
 160                                CONST unsigned char *src,
 161                                unsigned int n);
 162 static void decode_coding_sjis (Lstream *decoding,
 163                                 CONST unsigned char *src,
 164                                 unsigned_char_dynarr *dst,
 165                                 unsigned int n);
 166 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 167                             unsigned_char_dynarr *dst, unsigned int *flags);
 168 void char_finish_shift_jis (struct encoding_stream *str,
 169                             unsigned_char_dynarr *dst, unsigned int *flags);
 170
 171 static int detect_coding_big5 (struct detection_state *st,
 172                                CONST unsigned char *src,
 173                                unsigned int n);
 174 static void decode_coding_big5 (Lstream *decoding,
 175                                 CONST unsigned char *src,
 176                                 unsigned_char_dynarr *dst, unsigned int n);
 177 static void encode_coding_big5 (Lstream *encoding,
 178                                 CONST unsigned char *src,
 179                                 unsigned_char_dynarr *dst, unsigned int n);
 180 static int detect_coding_ucs4 (struct detection_state *st,
 181                                CONST unsigned char *src,
 182                                unsigned int n);
 183 static void decode_coding_ucs4 (Lstream *decoding,
 184                                 CONST unsigned char *src,
 185                                 unsigned_char_dynarr *dst, unsigned int n);
 186 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 187                        unsigned_char_dynarr *dst, unsigned int *flags);
 188 void char_finish_ucs4 (struct encoding_stream *str,
 189                        unsigned_char_dynarr *dst, unsigned int *flags);
 190
 191 static int detect_coding_utf8 (struct detection_state *st,
 192                                CONST unsigned char *src,
 193                                unsigned int n);
 194 static void decode_coding_utf8 (Lstream *decoding,
 195                                 CONST unsigned char *src,
 196                                 unsigned_char_dynarr *dst, unsigned int n);
 197 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 198                        unsigned_char_dynarr *dst, unsigned int *flags);
 199 void char_finish_utf8 (struct encoding_stream *str,
 200                        unsigned_char_dynarr *dst, unsigned int *flags);
 201
 202 static int postprocess_iso2022_mask (int mask);
 203 static void reset_iso2022 (Lisp_Object coding_system,
 204                            struct iso2022_decoder *iso);
 205 static int detect_coding_iso2022 (struct detection_state *st,
 206                                   CONST unsigned char *src,
 207                                   unsigned int n);
 208 static void decode_coding_iso2022 (Lstream *decoding,
 209                                    CONST unsigned char *src,
 210                                    unsigned_char_dynarr *dst, unsigned int n);
 211 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 212                           unsigned_char_dynarr *dst, unsigned int *flags);
 213 void char_finish_iso2022 (struct encoding_stream *str,
 214                           unsigned_char_dynarr *dst, unsigned int *flags);
 215 #endif /* MULE */
 216 static void decode_coding_no_conversion (Lstream *decoding,
 217                                          CONST unsigned char *src,
 218                                          unsigned_char_dynarr *dst,
 219                                          unsigned int n);
 220 static void encode_coding_no_conversion (Lstream *encoding,
 221                                          CONST unsigned char *src,
 222                                          unsigned_char_dynarr *dst,
 223                                          unsigned int n);
 224 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 225                          unsigned_char_dynarr *dst, unsigned int n);
 226 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 227                          unsigned_char_dynarr *dst, unsigned int n);
 228
 229 typedef struct codesys_prop codesys_prop;
 230 struct codesys_prop
 231 {
 232   Lisp_Object sym;
 233   int prop_type;
 234 };
 235
 236 typedef struct
 237 {
 238   Dynarr_declare (codesys_prop);
 239 } codesys_prop_dynarr;
 240
 241 codesys_prop_dynarr *the_codesys_prop_dynarr;
 242
 243 enum codesys_prop_enum
 244 {
 245   CODESYS_PROP_ALL_OK,
 246   CODESYS_PROP_ISO2022,
 247   CODESYS_PROP_CCL
 248 };
 249
 250 \f
 251 /************************************************************************/
 252 /*                       Coding system functions                        */
 253 /************************************************************************/
 254
 255 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 256 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 257 static void finalize_coding_system (void *header, int for_disksave);
 258
 259 #ifdef MULE
 260 static const struct lrecord_description ccs_description_1[] = {
 261   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 262   { XD_END }
 263 };
 264
 265 static const struct struct_description ccs_description = {
 266   sizeof(charset_conversion_spec),
 267   ccs_description_1
 268 };
 269
 270 static const struct lrecord_description ccsd_description_1[] = {
 271   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 272   { XD_END }
 273 };
 274
 275 static const struct struct_description ccsd_description = {
 276   sizeof(charset_conversion_spec_dynarr),
 277   ccsd_description_1
 278 };
 279 #endif
 280
 281 static const struct lrecord_description coding_system_description[] = {
 282   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 283   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 284   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 285 #ifdef MULE
 286   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 287   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 288   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 289   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 290 #endif
 291   { XD_END }
 292 };
 293
 294 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 295                                mark_coding_system, print_coding_system,
 296                                finalize_coding_system,
 297                                0, 0, coding_system_description,
 298                                struct Lisp_Coding_System);
 299
 300 static Lisp_Object
 301 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 302 {
 303   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 304
 305   markobj (CODING_SYSTEM_NAME (codesys));
 306   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 307   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 308   markobj (CODING_SYSTEM_EOL_LF (codesys));
 309   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 310   markobj (CODING_SYSTEM_EOL_CR (codesys));
 311
 312   switch (CODING_SYSTEM_TYPE (codesys))
 313     {
 314 #ifdef MULE
 315       int i;
 316     case CODESYS_ISO2022:
 317       for (i = 0; i < 4; i++)
 318         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 319       if (codesys->iso2022.input_conv)
 320         {
 321           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 322             {
 323               struct charset_conversion_spec *ccs =
 324                 Dynarr_atp (codesys->iso2022.input_conv, i);
 325               markobj (ccs->from_charset);
 326               markobj (ccs->to_charset);
 327             }
 328         }
 329       if (codesys->iso2022.output_conv)
 330         {
 331           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 332             {
 333               struct charset_conversion_spec *ccs =
 334                 Dynarr_atp (codesys->iso2022.output_conv, i);
 335               markobj (ccs->from_charset);
 336               markobj (ccs->to_charset);
 337             }
 338         }
 339       break;
 340
 341     case CODESYS_CCL:
 342       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 343       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 344       break;
 345 #endif /* MULE */
 346     default:
 347       break;
 348     }
 349
 350   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 351   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 352 }
 353
 354 static void
 355 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 356                      int escapeflag)
 357 {
 358   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 359   if (print_readably)
 360     error ("printing unreadable object #<coding_system 0x%x>",
 361            c->header.uid);
 362
 363   write_c_string ("#<coding_system ", printcharfun);
 364   print_internal (c->name, printcharfun, 1);
 365   write_c_string (">", printcharfun);
 366 }
 367
 368 static void
 369 finalize_coding_system (void *header, int for_disksave)
 370 {
 371   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 372   /* Since coding systems never go away, this function is not
 373      necessary.  But it would be necessary if we changed things
 374      so that coding systems could go away. */
 375   if (!for_disksave) /* see comment in lstream.c */
 376     {
 377       switch (CODING_SYSTEM_TYPE (c))
 378         {
 379 #ifdef MULE
 380         case CODESYS_ISO2022:
 381           if (c->iso2022.input_conv)
 382             {
 383               Dynarr_free (c->iso2022.input_conv);
 384               c->iso2022.input_conv = 0;
 385             }
 386           if (c->iso2022.output_conv)
 387             {
 388               Dynarr_free (c->iso2022.output_conv);
 389               c->iso2022.output_conv = 0;
 390             }
 391           break;
 392 #endif /* MULE */
 393         default:
 394           break;
 395         }
 396     }
 397 }
 398
 399 static enum eol_type
 400 symbol_to_eol_type (Lisp_Object symbol)
 401 {
 402   CHECK_SYMBOL (symbol);
 403   if (NILP (symbol))      return EOL_AUTODETECT;
 404   if (EQ (symbol, Qlf))   return EOL_LF;
 405   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 406   if (EQ (symbol, Qcr))   return EOL_CR;
 407
 408   signal_simple_error ("Unrecognized eol type", symbol);
 409   return EOL_AUTODETECT; /* not reached */
 410 }
 411
 412 static Lisp_Object
 413 eol_type_to_symbol (enum eol_type type)
 414 {
 415   switch (type)
 416     {
 417     default: abort ();
 418     case EOL_LF:         return Qlf;
 419     case EOL_CRLF:       return Qcrlf;
 420     case EOL_CR:         return Qcr;
 421     case EOL_AUTODETECT: return Qnil;
 422     }
 423 }
 424
 425 static void
 426 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 427 {
 428   Lisp_Object codesys_obj;
 429   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 430   char *codesys_name = (char *) alloca (len + 7);
 431   int mlen = -1;
 432   char *codesys_mnemonic=0;
 433
 434   Lisp_Object codesys_name_sym, sub_codesys_obj;
 435
 436   /* kludge */
 437
 438   XSETCODING_SYSTEM (codesys_obj, codesys);
 439
 440   memcpy (codesys_name,
 441           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 442
 443   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 444     {
 445       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 446       codesys_mnemonic = (char *) alloca (mlen + 7);
 447       memcpy (codesys_mnemonic,
 448               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 449     }
 450
 451 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 452   strcpy (codesys_name + len, "-" op_sys);                                      \
 453   if (mlen != -1)                                                               \
 454     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 455   codesys_name_sym = intern (codesys_name);                                     \
 456   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 457   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 458   if (mlen != -1)                                                               \
 459     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 460       build_string (codesys_mnemonic);                                          \
 461   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 462 } while (0)
 463
 464   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 465   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 466   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 467 }
 468
 469 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 470 Return t if OBJECT is a coding system.
 471 A coding system is an object that defines how text containing multiple
 472 character sets is encoded into a stream of (typically 8-bit) bytes.
 473 The coding system is used to decode the stream into a series of
 474 characters (which may be from multiple charsets) when the text is read
 475 from a file or process, and is used to encode the text back into the
 476 same format when it is written out to a file or process.
 477
 478 For example, many ISO2022-compliant coding systems (such as Compound
 479 Text, which is used for inter-client data under the X Window System)
 480 use escape sequences to switch between different charsets -- Japanese
 481 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 482 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 483 `make-coding-system' for more information.
 484
 485 Coding systems are normally identified using a symbol, and the
 486 symbol is accepted in place of the actual coding system object whenever
 487 a coding system is called for. (This is similar to how faces work.)
 488 */
 489        (object))
 490 {
 491   return CODING_SYSTEMP (object) ? Qt : Qnil;
 492 }
 493
 494 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 495 Retrieve the coding system of the given name.
 496
 497 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 498 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 499 If there is no such coding system, nil is returned.  Otherwise the
 500 associated coding system object is returned.
 501 */
 502        (coding_system_or_name))
 503 {
 504   if (CODING_SYSTEMP (coding_system_or_name))
 505     return coding_system_or_name;
 506
 507   if (NILP (coding_system_or_name))
 508     coding_system_or_name = Qbinary;
 509   else
 510     CHECK_SYMBOL (coding_system_or_name);
 511
 512   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 513 }
 514
 515 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 516 Retrieve the coding system of the given name.
 517 Same as `find-coding-system' except that if there is no such
 518 coding system, an error is signaled instead of returning nil.
 519 */
 520        (name))
 521 {
 522   Lisp_Object coding_system = Ffind_coding_system (name);
 523
 524   if (NILP (coding_system))
 525     signal_simple_error ("No such coding system", name);
 526   return coding_system;
 527 }
 528
 529 /* We store the coding systems in hash tables with the names as the key and the
 530    actual coding system object as the value.  Occasionally we need to use them
 531    in a list format.  These routines provide us with that. */
 532 struct coding_system_list_closure
 533 {
 534   Lisp_Object *coding_system_list;
 535 };
 536
 537 static int
 538 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 539                                   void *coding_system_list_closure)
 540 {
 541   /* This function can GC */
 542   struct coding_system_list_closure *cscl =
 543     (struct coding_system_list_closure *) coding_system_list_closure;
 544   Lisp_Object *coding_system_list = cscl->coding_system_list;
 545
 546   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 547                                *coding_system_list);
 548   return 0;
 549 }
 550
 551 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 552 Return a list of the names of all defined coding systems.
 553 */
 554        ())
 555 {
 556   Lisp_Object coding_system_list = Qnil;
 557   struct gcpro gcpro1;
 558   struct coding_system_list_closure coding_system_list_closure;
 559
 560   GCPRO1 (coding_system_list);
 561   coding_system_list_closure.coding_system_list = &coding_system_list;
 562   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 563                  &coding_system_list_closure);
 564   UNGCPRO;
 565
 566   return coding_system_list;
 567 }
 568
 569 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 570 Return the name of the given coding system.
 571 */
 572        (coding_system))
 573 {
 574   coding_system = Fget_coding_system (coding_system);
 575   return XCODING_SYSTEM_NAME (coding_system);
 576 }
 577
 578 static Lisp_Coding_System *
 579 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 580 {
 581   Lisp_Coding_System *codesys =
 582     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 583
 584   zero_lcrecord (codesys);
 585   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 586   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 587   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 588   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 589   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 590   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 591   CODING_SYSTEM_TYPE     (codesys) = type;
 592   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 593 #ifdef MULE
 594   if (type == CODESYS_ISO2022)
 595     {
 596       int i;
 597       for (i = 0; i < 4; i++)
 598         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 599     }
 600   else if (type == CODESYS_CCL)
 601     {
 602       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 603       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 604     }
 605 #endif /* MULE */
 606   CODING_SYSTEM_NAME (codesys) = name;
 607
 608   return codesys;
 609 }
 610
 611 #ifdef MULE
 612 /* Given a list of charset conversion specs as specified in a Lisp
 613    program, parse it into STORE_HERE. */
 614
 615 static void
 616 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 617                                 Lisp_Object spec_list)
 618 {
 619   Lisp_Object rest;
 620
 621   EXTERNAL_LIST_LOOP (rest, spec_list)
 622     {
 623       Lisp_Object car = XCAR (rest);
 624       Lisp_Object from, to;
 625       struct charset_conversion_spec spec;
 626
 627       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 628         signal_simple_error ("Invalid charset conversion spec", car);
 629       from = Fget_charset (XCAR (car));
 630       to = Fget_charset (XCAR (XCDR (car)));
 631       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 632         signal_simple_error_2
 633           ("Attempted conversion between different charset types",
 634            from, to);
 635       spec.from_charset = from;
 636       spec.to_charset = to;
 637
 638       Dynarr_add (store_here, spec);
 639     }
 640 }
 641
 642 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 643    specs, return the equivalent as the Lisp programmer would see it.
 644
 645    If LOAD_HERE is 0, return Qnil. */
 646
 647 static Lisp_Object
 648 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 649 {
 650   int i;
 651   Lisp_Object result;
 652
 653   if (!load_here)
 654     return Qnil;
 655   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 656     {
 657       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 658       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 659     }
 660
 661   return Fnreverse (result);
 662 }
 663
 664 #endif /* MULE */
 665
 666 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 667 Register symbol NAME as a coding system.
 668
 669 TYPE describes the conversion method used and should be one of
 670
 671 nil or 'undecided
 672      Automatic conversion.  XEmacs attempts to detect the coding system
 673      used in the file.
 674 'no-conversion
 675      No conversion.  Use this for binary files and such.  On output,
 676      graphic characters that are not in ASCII or Latin-1 will be
 677      replaced by a ?. (For a no-conversion-encoded buffer, these
 678      characters will only be present if you explicitly insert them.)
 679 'shift-jis
 680      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 681 'ucs-4
 682      ISO 10646 UCS-4 encoding.
 683 'utf-8
 684      ISO 10646 UTF-8 encoding.
 685 'iso2022
 686      Any ISO2022-compliant encoding.  Among other things, this includes
 687      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 688      standard Unix encoding for Japanese and other languages), and
 689      Compound Text (the encoding used in X11).  You can specify more
 690      specific information about the conversion with the FLAGS argument.
 691 'big5
 692      Big5 (the encoding commonly used for Taiwanese).
 693 'ccl
 694      The conversion is performed using a user-written pseudo-code
 695      program.  CCL (Code Conversion Language) is the name of this
 696      pseudo-code.
 697 'internal
 698      Write out or read in the raw contents of the memory representing
 699      the buffer's text.  This is primarily useful for debugging
 700      purposes, and is only enabled when XEmacs has been compiled with
 701      DEBUG_XEMACS defined (via the --debug configure option).
 702      WARNING: Reading in a file using 'internal conversion can result
 703      in an internal inconsistency in the memory representing a
 704      buffer's text, which will produce unpredictable results and may
 705      cause XEmacs to crash.  Under normal circumstances you should
 706      never use 'internal conversion.
 707
 708 DOC-STRING is a string describing the coding system.
 709
 710 PROPS is a property list, describing the specific nature of the
 711 character set.  Recognized properties are:
 712
 713 'mnemonic
 714      String to be displayed in the modeline when this coding system is
 715      active.
 716
 717 'eol-type
 718      End-of-line conversion to be used.  It should be one of
 719
 720         nil
 721                 Automatically detect the end-of-line type (LF, CRLF,
 722                 or CR).  Also generate subsidiary coding systems named
 723                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 724                 identical to this coding system but have an EOL-TYPE
 725                 value of 'lf, 'crlf, and 'cr, respectively.
 726         'lf
 727                 The end of a line is marked externally using ASCII LF.
 728                 Since this is also the way that XEmacs represents an
 729                 end-of-line internally, specifying this option results
 730                 in no end-of-line conversion.  This is the standard
 731                 format for Unix text files.
 732         'crlf
 733                 The end of a line is marked externally using ASCII
 734                 CRLF.  This is the standard format for MS-DOS text
 735                 files.
 736         'cr
 737                 The end of a line is marked externally using ASCII CR.
 738                 This is the standard format for Macintosh text files.
 739         t
 740                 Automatically detect the end-of-line type but do not
 741                 generate subsidiary coding systems.  (This value is
 742                 converted to nil when stored internally, and
 743                 `coding-system-property' will return nil.)
 744
 745 'post-read-conversion
 746      Function called after a file has been read in, to perform the
 747      decoding.  Called with two arguments, BEG and END, denoting
 748      a region of the current buffer to be decoded.
 749
 750 'pre-write-conversion
 751      Function called before a file is written out, to perform the
 752      encoding.  Called with two arguments, BEG and END, denoting
 753      a region of the current buffer to be encoded.
 754
 755
 756 The following additional properties are recognized if TYPE is 'iso2022:
 757
 758 'charset-g0
 759 'charset-g1
 760 'charset-g2
 761 'charset-g3
 762      The character set initially designated to the G0 - G3 registers.
 763      The value should be one of
 764
 765           -- A charset object (designate that character set)
 766           -- nil (do not ever use this register)
 767           -- t (no character set is initially designated to
 768                 the register, but may be later on; this automatically
 769                 sets the corresponding `force-g*-on-output' property)
 770
 771 'force-g0-on-output
 772 'force-g1-on-output
 773 'force-g2-on-output
 774 'force-g2-on-output
 775      If non-nil, send an explicit designation sequence on output before
 776      using the specified register.
 777
 778 'short
 779      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 780      "ESC $ B" on output in place of the full designation sequences
 781      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 782
 783 'no-ascii-eol
 784      If non-nil, don't designate ASCII to G0 at each end of line on output.
 785      Setting this to non-nil also suppresses other state-resetting that
 786      normally happens at the end of a line.
 787
 788 'no-ascii-cntl
 789      If non-nil, don't designate ASCII to G0 before control chars on output.
 790
 791 'seven
 792      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 793      environment.
 794
 795 'lock-shift
 796      If non-nil, use locking-shift (SO/SI) instead of single-shift
 797      or designation by escape sequence.
 798
 799 'no-iso6429
 800      If non-nil, don't use ISO6429's direction specification.
 801
 802 'escape-quoted
 803      If non-nil, literal control characters that are the same as
 804      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 805      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 806      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 807      so that they can be properly distinguished from an escape sequence.
 808      (Note that doing this results in a non-portable encoding.) This
 809      encoding flag is used for byte-compiled files.  Note that ESC
 810      is a good choice for a quoting character because there are no
 811      escape sequences whose second byte is a character from the Control-0
 812      or Control-1 character sets; this is explicitly disallowed by the
 813      ISO2022 standard.
 814
 815 'input-charset-conversion
 816      A list of conversion specifications, specifying conversion of
 817      characters in one charset to another when decoding is performed.
 818      Each specification is a list of two elements: the source charset,
 819      and the destination charset.
 820
 821 'output-charset-conversion
 822      A list of conversion specifications, specifying conversion of
 823      characters in one charset to another when encoding is performed.
 824      The form of each specification is the same as for
 825      'input-charset-conversion.
 826
 827
 828 The following additional properties are recognized (and required)
 829 if TYPE is 'ccl:
 830
 831 'decode
 832      CCL program used for decoding (converting to internal format).
 833
 834 'encode
 835      CCL program used for encoding (converting to external format).
 836 */
 837        (name, type, doc_string, props))
 838 {
 839   Lisp_Coding_System *codesys;
 840   Lisp_Object rest, key, value;
 841   enum coding_system_type ty;
 842   int need_to_setup_eol_systems = 1;
 843
 844   /* Convert type to constant */
 845   if (NILP (type) || EQ (type, Qundecided))
 846                                       { ty = CODESYS_AUTODETECT; }
 847 #ifdef MULE
 848   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 849   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 850   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 851   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 852   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 853   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 854 #endif
 855   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 856 #ifdef DEBUG_XEMACS
 857   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 858 #endif
 859   else
 860     signal_simple_error ("Invalid coding system type", type);
 861
 862   CHECK_SYMBOL (name);
 863
 864   codesys = allocate_coding_system (ty, name);
 865
 866   if (NILP (doc_string))
 867     doc_string = build_string ("");
 868   else
 869     CHECK_STRING (doc_string);
 870   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 871
 872   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 873     {
 874       if (EQ (key, Qmnemonic))
 875         {
 876           if (!NILP (value))
 877             CHECK_STRING (value);
 878           CODING_SYSTEM_MNEMONIC (codesys) = value;
 879         }
 880
 881       else if (EQ (key, Qeol_type))
 882         {
 883           need_to_setup_eol_systems = NILP (value);
 884           if (EQ (value, Qt))
 885             value = Qnil;
 886           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 887         }
 888
 889       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 890       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 891 #ifdef MULE
 892       else if (ty == CODESYS_ISO2022)
 893         {
 894 #define FROB_INITIAL_CHARSET(charset_num) \
 895   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 896     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 897
 898           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 899           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 900           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 901           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 902
 903 #define FROB_FORCE_CHARSET(charset_num) \
 904   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 905
 906           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 907           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 908           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 909           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 910
 911 #define FROB_BOOLEAN_PROPERTY(prop) \
 912   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 913
 914           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 915           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 916           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 917           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 918           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 919           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 920           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 921
 922           else if (EQ (key, Qinput_charset_conversion))
 923             {
 924               codesys->iso2022.input_conv =
 925                 Dynarr_new (charset_conversion_spec);
 926               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 927                                               value);
 928             }
 929           else if (EQ (key, Qoutput_charset_conversion))
 930             {
 931               codesys->iso2022.output_conv =
 932                 Dynarr_new (charset_conversion_spec);
 933               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 934                                               value);
 935             }
 936           else
 937             signal_simple_error ("Unrecognized property", key);
 938         }
 939       else if (EQ (type, Qccl))
 940         {
 941           if (EQ (key, Qdecode))
 942             {
 943               CHECK_VECTOR (value);
 944               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 945             }
 946           else if (EQ (key, Qencode))
 947             {
 948               CHECK_VECTOR (value);
 949               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 950             }
 951           else
 952             signal_simple_error ("Unrecognized property", key);
 953         }
 954 #endif /* MULE */
 955       else
 956         signal_simple_error ("Unrecognized property", key);
 957     }
 958
 959   if (need_to_setup_eol_systems)
 960     setup_eol_coding_systems (codesys);
 961
 962   {
 963     Lisp_Object codesys_obj;
 964     XSETCODING_SYSTEM (codesys_obj, codesys);
 965     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 966     return codesys_obj;
 967   }
 968 }
 969
 970 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 971 Copy OLD-CODING-SYSTEM to NEW-NAME.
 972 If NEW-NAME does not name an existing coding system, a new one will
 973 be created.
 974 */
 975        (old_coding_system, new_name))
 976 {
 977   Lisp_Object new_coding_system;
 978   old_coding_system = Fget_coding_system (old_coding_system);
 979   new_coding_system = Ffind_coding_system (new_name);
 980   if (NILP (new_coding_system))
 981     {
 982       XSETCODING_SYSTEM (new_coding_system,
 983                          allocate_coding_system
 984                          (XCODING_SYSTEM_TYPE (old_coding_system),
 985                           new_name));
 986       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 987     }
 988
 989   {
 990     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 991     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 992     memcpy (((char *) to  ) + sizeof (to->header),
 993             ((char *) from) + sizeof (from->header),
 994             sizeof (*from) - sizeof (from->header));
 995     to->name = new_name;
 996   }
 997   return new_coding_system;
 998 }
 999
1000 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1001 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1002 */
1003        (alias, coding_system))
1004 {
1005   CHECK_SYMBOL (alias);
1006   if (!NILP (Ffind_coding_system (alias)))
1007     signal_simple_error ("Symbol already names a coding system", alias);
1008   coding_system = Fget_coding_system (coding_system);
1009   Fputhash (alias, coding_system, Vcoding_system_hash_table);
1010
1011   /* Set up aliases for subsidiaries. */
1012   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1013     {
1014       Lisp_Object str;
1015       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1016 #define FROB(type, name)                                                        \
1017       do {                                                                      \
1018         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1019         if (!NILP (subsidiary))                                                 \
1020           Fdefine_coding_system_alias                                           \
1021             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1022       } while (0)
1023       FROB (LF,   "-unix");
1024       FROB (CRLF, "-dos");
1025       FROB (CR,   "-mac");
1026 #undef FROB
1027     }
1028   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1029      but it doesn't look intentional, so I'd rather return something
1030      meaningful or nothing at all. */
1031   return Qnil;
1032 }
1033
1034 static Lisp_Object
1035 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1036 {
1037   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1038   Lisp_Object new_coding_system;
1039
1040   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1041     return coding_system;
1042
1043   switch (type)
1044     {
1045     case EOL_AUTODETECT: return coding_system;
1046     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1047     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1048     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1049     default:       abort ();
1050     }
1051
1052   return NILP (new_coding_system) ? coding_system : new_coding_system;
1053 }
1054
1055 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1056 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1057 */
1058        (coding_system, eol_type))
1059 {
1060   coding_system = Fget_coding_system (coding_system);
1061
1062   return subsidiary_coding_system (coding_system,
1063                                    symbol_to_eol_type (eol_type));
1064 }
1065
1066 \f
1067 /************************************************************************/
1068 /*                         Coding system accessors                      */
1069 /************************************************************************/
1070
1071 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1072 Return the doc string for CODING-SYSTEM.
1073 */
1074        (coding_system))
1075 {
1076   coding_system = Fget_coding_system (coding_system);
1077   return XCODING_SYSTEM_DOC_STRING (coding_system);
1078 }
1079
1080 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1081 Return the type of CODING-SYSTEM.
1082 */
1083        (coding_system))
1084 {
1085   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1086     {
1087     default: abort ();
1088     case CODESYS_AUTODETECT:    return Qundecided;
1089 #ifdef MULE
1090     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1091     case CODESYS_ISO2022:       return Qiso2022;
1092     case CODESYS_BIG5:          return Qbig5;
1093     case CODESYS_UCS4:          return Qucs4;
1094     case CODESYS_UTF8:          return Qutf8;
1095     case CODESYS_CCL:           return Qccl;
1096 #endif
1097     case CODESYS_NO_CONVERSION: return Qno_conversion;
1098 #ifdef DEBUG_XEMACS
1099     case CODESYS_INTERNAL:      return Qinternal;
1100 #endif
1101     }
1102 }
1103
1104 #ifdef MULE
1105 static
1106 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1107 {
1108   Lisp_Object cs
1109     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1110
1111   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1112 }
1113
1114 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1115 Return initial charset of CODING-SYSTEM designated to GNUM.
1116 GNUM allows 0 .. 3.
1117 */
1118        (coding_system, gnum))
1119 {
1120   coding_system = Fget_coding_system (coding_system);
1121   CHECK_INT (gnum);
1122
1123   return coding_system_charset (coding_system, XINT (gnum));
1124 }
1125 #endif /* MULE */
1126
1127 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1128 Return the PROP property of CODING-SYSTEM.
1129 */
1130        (coding_system, prop))
1131 {
1132   int i, ok = 0;
1133   enum coding_system_type type;
1134
1135   coding_system = Fget_coding_system (coding_system);
1136   CHECK_SYMBOL (prop);
1137   type = XCODING_SYSTEM_TYPE (coding_system);
1138
1139   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1140     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1141       {
1142         ok = 1;
1143         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1144           {
1145           case CODESYS_PROP_ALL_OK:
1146             break;
1147 #ifdef MULE
1148           case CODESYS_PROP_ISO2022:
1149             if (type != CODESYS_ISO2022)
1150               signal_simple_error
1151                 ("Property only valid in ISO2022 coding systems",
1152                  prop);
1153             break;
1154
1155           case CODESYS_PROP_CCL:
1156             if (type != CODESYS_CCL)
1157               signal_simple_error
1158                 ("Property only valid in CCL coding systems",
1159                  prop);
1160             break;
1161 #endif /* MULE */
1162           default:
1163             abort ();
1164           }
1165       }
1166
1167   if (!ok)
1168     signal_simple_error ("Unrecognized property", prop);
1169
1170   if (EQ (prop, Qname))
1171     return XCODING_SYSTEM_NAME (coding_system);
1172   else if (EQ (prop, Qtype))
1173     return Fcoding_system_type (coding_system);
1174   else if (EQ (prop, Qdoc_string))
1175     return XCODING_SYSTEM_DOC_STRING (coding_system);
1176   else if (EQ (prop, Qmnemonic))
1177     return XCODING_SYSTEM_MNEMONIC (coding_system);
1178   else if (EQ (prop, Qeol_type))
1179     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1180   else if (EQ (prop, Qeol_lf))
1181     return XCODING_SYSTEM_EOL_LF (coding_system);
1182   else if (EQ (prop, Qeol_crlf))
1183     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1184   else if (EQ (prop, Qeol_cr))
1185     return XCODING_SYSTEM_EOL_CR (coding_system);
1186   else if (EQ (prop, Qpost_read_conversion))
1187     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1188   else if (EQ (prop, Qpre_write_conversion))
1189     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1190 #ifdef MULE
1191   else if (type == CODESYS_ISO2022)
1192     {
1193       if (EQ (prop, Qcharset_g0))
1194         return coding_system_charset (coding_system, 0);
1195       else if (EQ (prop, Qcharset_g1))
1196         return coding_system_charset (coding_system, 1);
1197       else if (EQ (prop, Qcharset_g2))
1198         return coding_system_charset (coding_system, 2);
1199       else if (EQ (prop, Qcharset_g3))
1200         return coding_system_charset (coding_system, 3);
1201
1202 #define FORCE_CHARSET(charset_num) \
1203   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1204    (coding_system, charset_num) ? Qt : Qnil)
1205
1206       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1207       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1208       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1209       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1210
1211 #define LISP_BOOLEAN(prop) \
1212   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1213
1214       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1215       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1216       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1217       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1218       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1219       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1220       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1221
1222       else if (EQ (prop, Qinput_charset_conversion))
1223         return
1224           unparse_charset_conversion_specs
1225             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1226       else if (EQ (prop, Qoutput_charset_conversion))
1227         return
1228           unparse_charset_conversion_specs
1229             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1230       else
1231         abort ();
1232     }
1233   else if (type == CODESYS_CCL)
1234     {
1235       if (EQ (prop, Qdecode))
1236         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1237       else if (EQ (prop, Qencode))
1238         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1239       else
1240         abort ();
1241     }
1242 #endif /* MULE */
1243   else
1244     abort ();
1245
1246   return Qnil; /* not reached */
1247 }
1248
1249 \f
1250 /************************************************************************/
1251 /*                       Coding category functions                      */
1252 /************************************************************************/
1253
1254 static int
1255 decode_coding_category (Lisp_Object symbol)
1256 {
1257   int i;
1258
1259   CHECK_SYMBOL (symbol);
1260   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1261     if (EQ (coding_category_symbol[i], symbol))
1262       return i;
1263
1264   signal_simple_error ("Unrecognized coding category", symbol);
1265   return 0; /* not reached */
1266 }
1267
1268 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1269 Return a list of all recognized coding categories.
1270 */
1271        ())
1272 {
1273   int i;
1274   Lisp_Object list = Qnil;
1275
1276   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1277     list = Fcons (coding_category_symbol[i], list);
1278   return list;
1279 }
1280
1281 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1282 Change the priority order of the coding categories.
1283 LIST should be list of coding categories, in descending order of
1284 priority.  Unspecified coding categories will be lower in priority
1285 than all specified ones, in the same relative order they were in
1286 previously.
1287 */
1288        (list))
1289 {
1290   int category_to_priority[CODING_CATEGORY_LAST + 1];
1291   int i, j;
1292   Lisp_Object rest;
1293
1294   /* First generate a list that maps coding categories to priorities. */
1295
1296   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1297     category_to_priority[i] = -1;
1298
1299   /* Highest priority comes from the specified list. */
1300   i = 0;
1301   EXTERNAL_LIST_LOOP (rest, list)
1302     {
1303       int cat = decode_coding_category (XCAR (rest));
1304
1305       if (category_to_priority[cat] >= 0)
1306         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1307       category_to_priority[cat] = i++;
1308     }
1309
1310   /* Now go through the existing categories by priority to retrieve
1311      the categories not yet specified and preserve their priority
1312      order. */
1313   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1314     {
1315       int cat = coding_category_by_priority[j];
1316       if (category_to_priority[cat] < 0)
1317         category_to_priority[cat] = i++;
1318     }
1319
1320   /* Now we need to construct the inverse of the mapping we just
1321      constructed. */
1322
1323   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1324     coding_category_by_priority[category_to_priority[i]] = i;
1325
1326   /* Phew!  That was confusing. */
1327   return Qnil;
1328 }
1329
1330 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1331 Return a list of coding categories in descending order of priority.
1332 */
1333        ())
1334 {
1335   int i;
1336   Lisp_Object list = Qnil;
1337
1338   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1339     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1340                   list);
1341   return list;
1342 }
1343
1344 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1345 Change the coding system associated with a coding category.
1346 */
1347        (coding_category, coding_system))
1348 {
1349   int cat = decode_coding_category (coding_category);
1350
1351   coding_system = Fget_coding_system (coding_system);
1352   coding_category_system[cat] = coding_system;
1353   return Qnil;
1354 }
1355
1356 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1357 Return the coding system associated with a coding category.
1358 */
1359        (coding_category))
1360 {
1361   int cat = decode_coding_category (coding_category);
1362   Lisp_Object sys = coding_category_system[cat];
1363
1364   if (!NILP (sys))
1365     return XCODING_SYSTEM_NAME (sys);
1366   return Qnil;
1367 }
1368
1369 \f
1370 /************************************************************************/
1371 /*                     Detecting the encoding of data                   */
1372 /************************************************************************/
1373
1374 struct detection_state
1375 {
1376   enum eol_type eol_type;
1377   int seen_non_ascii;
1378   int mask;
1379 #ifdef MULE
1380   struct
1381     {
1382       int mask;
1383       int in_second_byte;
1384     }
1385   big5;
1386
1387   struct
1388     {
1389       int mask;
1390       int in_second_byte;
1391     }
1392   shift_jis;
1393
1394   struct
1395     {
1396       int mask;
1397       int in_byte;
1398   }
1399   ucs4;
1400
1401   struct
1402     {
1403       int mask;
1404       int in_byte;
1405     }
1406   utf8;
1407
1408   struct
1409     {
1410       int mask;
1411       int initted;
1412       struct iso2022_decoder iso;
1413       unsigned int flags;
1414       int high_byte_count;
1415       unsigned int saw_single_shift:1;
1416     }
1417   iso2022;
1418 #endif
1419   struct
1420     {
1421       int seen_anything;
1422       int just_saw_cr;
1423     }
1424   eol;
1425 };
1426
1427 static int
1428 acceptable_control_char_p (int c)
1429 {
1430   switch (c)
1431     {
1432       /* Allow and ignore control characters that you might
1433          reasonably see in a text file */
1434     case '\r':
1435     case '\n':
1436     case '\t':
1437     case  7: /* bell */
1438     case  8: /* backspace */
1439     case 11: /* vertical tab */
1440     case 12: /* form feed */
1441     case 26: /* MS-DOS C-z junk */
1442     case 31: /* '^_' -- for info */
1443       return 1;
1444     default:
1445       return 0;
1446     }
1447 }
1448
1449 static int
1450 mask_has_at_most_one_bit_p (int mask)
1451 {
1452   /* Perhaps the only thing useful you learn from intensive Microsoft
1453      technical interviews */
1454   return (mask & (mask - 1)) == 0;
1455 }
1456
1457 static enum eol_type
1458 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1459                  unsigned int n)
1460 {
1461   int c;
1462
1463   while (n--)
1464     {
1465       c = *src++;
1466       if (c == '\r')
1467         st->eol.just_saw_cr = 1;
1468       else
1469         {
1470           if (c == '\n')
1471             {
1472               if (st->eol.just_saw_cr)
1473                 return EOL_CRLF;
1474               else if (st->eol.seen_anything)
1475                 return EOL_LF;
1476             }
1477           else if (st->eol.just_saw_cr)
1478             return EOL_CR;
1479           st->eol.just_saw_cr = 0;
1480         }
1481       st->eol.seen_anything = 1;
1482     }
1483
1484   return EOL_AUTODETECT;
1485 }
1486
1487 /* Attempt to determine the encoding and EOL type of the given text.
1488    Before calling this function for the first type, you must initialize
1489    st->eol_type as appropriate and initialize st->mask to ~0.
1490
1491    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1492    not yet known.
1493
1494    st->mask holds the determined coding category mask, or ~0 if only
1495    ASCII has been seen so far.
1496
1497    Returns:
1498
1499    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1500         is present in st->mask
1501    1 == definitive answers are here for both st->eol_type and st->mask
1502 */
1503
1504 static int
1505 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1506                     unsigned int n, int just_do_eol)
1507 {
1508   int c;
1509
1510   if (st->eol_type == EOL_AUTODETECT)
1511     st->eol_type = detect_eol_type (st, src, n);
1512
1513   if (just_do_eol)
1514     return st->eol_type != EOL_AUTODETECT;
1515
1516   if (!st->seen_non_ascii)
1517     {
1518       for (; n; n--, src++)
1519         {
1520           c = *src;
1521           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1522             {
1523               st->seen_non_ascii = 1;
1524 #ifdef MULE
1525               st->shift_jis.mask = ~0;
1526               st->big5.mask = ~0;
1527               st->ucs4.mask = ~0;
1528               st->utf8.mask = ~0;
1529               st->iso2022.mask = ~0;
1530 #endif
1531               break;
1532             }
1533         }
1534     }
1535
1536   if (!n)
1537     return 0;
1538 #ifdef MULE
1539   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1540     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1541   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1542     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1543   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1544     st->big5.mask = detect_coding_big5 (st, src, n);
1545   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1546     st->utf8.mask = detect_coding_utf8 (st, src, n);
1547   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1548     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1549
1550   st->mask
1551     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1552     | st->utf8.mask | st->ucs4.mask;
1553 #endif
1554   {
1555     int retval = mask_has_at_most_one_bit_p (st->mask);
1556     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1557     return retval && st->eol_type != EOL_AUTODETECT;
1558   }
1559 }
1560
1561 static Lisp_Object
1562 coding_system_from_mask (int mask)
1563 {
1564   if (mask == ~0)
1565     {
1566       /* If the file was entirely or basically ASCII, use the
1567          default value of `buffer-file-coding-system'. */
1568       Lisp_Object retval =
1569         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1570       if (!NILP (retval))
1571         {
1572           retval = Ffind_coding_system (retval);
1573           if (NILP (retval))
1574             {
1575               warn_when_safe
1576                 (Qbad_variable, Qwarning,
1577                  "Invalid `default-buffer-file-coding-system', set to nil");
1578               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1579             }
1580         }
1581       if (NILP (retval))
1582         retval = Fget_coding_system (Qraw_text);
1583       return retval;
1584     }
1585   else
1586     {
1587       int i;
1588       int cat = -1;
1589 #ifdef MULE
1590       mask = postprocess_iso2022_mask (mask);
1591 #endif
1592       /* Look through the coding categories by priority and find
1593          the first one that is allowed. */
1594       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1595         {
1596           cat = coding_category_by_priority[i];
1597           if ((mask & (1 << cat)) &&
1598               !NILP (coding_category_system[cat]))
1599             break;
1600         }
1601       if (cat >= 0)
1602         return coding_category_system[cat];
1603       else
1604         return Fget_coding_system (Qraw_text);
1605     }
1606 }
1607
1608 /* Given a seekable read stream and potential coding system and EOL type
1609    as specified, do any autodetection that is called for.  If the
1610    coding system and/or EOL type are not autodetect, they will be left
1611    alone; but this function will never return an autodetect coding system
1612    or EOL type.
1613
1614    This function does not automatically fetch subsidiary coding systems;
1615    that should be unnecessary with the explicit eol-type argument. */
1616
1617 void
1618 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1619                               enum eol_type *eol_type_in_out)
1620 {
1621   struct detection_state decst;
1622
1623   if (*eol_type_in_out == EOL_AUTODETECT)
1624     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1625
1626   xzero (decst);
1627   decst.eol_type = *eol_type_in_out;
1628   decst.mask = ~0;
1629
1630   /* If autodetection is called for, do it now. */
1631   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1632       *eol_type_in_out == EOL_AUTODETECT)
1633     {
1634       unsigned char random_buffer[4096];
1635       int nread;
1636       Lisp_Object coding_system = Qnil;
1637
1638       nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1639       if (nread)
1640         {
1641           unsigned char *cp = random_buffer;
1642
1643           while (cp < random_buffer + nread)
1644             {
1645               if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1646                   (*cp++ == 'o') && (cp < random_buffer + nread) &&
1647                   (*cp++ == 'd') && (cp < random_buffer + nread) &&
1648                   (*cp++ == 'i') && (cp < random_buffer + nread) &&
1649                   (*cp++ == 'n') && (cp < random_buffer + nread) &&
1650                   (*cp++ == 'g') && (cp < random_buffer + nread) &&
1651                   (*cp++ == ':') && (cp < random_buffer + nread))
1652                 {
1653                   unsigned char coding_system_name[4096 - 6];
1654                   unsigned char *np = coding_system_name;
1655
1656                   while ( (cp < random_buffer + nread)
1657                           && ((*cp == ' ') || (*cp == '\t')) )
1658                     {
1659                       cp++;
1660                     }
1661                   while ( (cp < random_buffer + nread) &&
1662                           (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1663                     {
1664                       *np++ = *cp++;
1665                     }
1666                   *np = 0;
1667                   coding_system
1668                     = Ffind_coding_system (intern (coding_system_name));
1669                   break;
1670                 }
1671             }
1672           if (EQ(coding_system, Qnil))
1673             do{
1674               if (detect_coding_type (&decst, random_buffer, nread,
1675                                       XCODING_SYSTEM_TYPE (*codesys_in_out)
1676                                       != CODESYS_AUTODETECT))
1677                 break;
1678               nread = Lstream_read (stream,
1679                                     random_buffer, sizeof (random_buffer));
1680               if (!nread)
1681                 break;
1682             } while(1);
1683         }
1684       *eol_type_in_out = decst.eol_type;
1685       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1686         {
1687           if (EQ(coding_system, Qnil))
1688             *codesys_in_out = coding_system_from_mask (decst.mask);
1689           else
1690             *codesys_in_out = coding_system;
1691         }
1692     }
1693   /* If we absolutely can't determine the EOL type, just assume LF. */
1694   if (*eol_type_in_out == EOL_AUTODETECT)
1695     *eol_type_in_out = EOL_LF;
1696
1697   Lstream_rewind (stream);
1698 }
1699
1700 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1701 Detect coding system of the text in the region between START and END.
1702 Returned a list of possible coding systems ordered by priority.
1703 If only ASCII characters are found, it returns 'undecided or one of
1704 its subsidiary coding systems according to a detected end-of-line
1705 type.  Optional arg BUFFER defaults to the current buffer.
1706 */
1707        (start, end, buffer))
1708 {
1709   Lisp_Object val = Qnil;
1710   struct buffer *buf = decode_buffer (buffer, 0);
1711   Bufpos b, e;
1712   Lisp_Object instream, lb_instream;
1713   Lstream *istr, *lb_istr;
1714   struct detection_state decst;
1715   struct gcpro gcpro1, gcpro2;
1716
1717   get_buffer_range_char (buf, start, end, &b, &e, 0);
1718   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1719   lb_istr = XLSTREAM (lb_instream);
1720   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1721   istr = XLSTREAM (instream);
1722   GCPRO2 (instream, lb_instream);
1723   xzero (decst);
1724   decst.eol_type = EOL_AUTODETECT;
1725   decst.mask = ~0;
1726   while (1)
1727     {
1728       unsigned char random_buffer[4096];
1729       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1730
1731       if (!nread)
1732         break;
1733       if (detect_coding_type (&decst, random_buffer, nread, 0))
1734         break;
1735     }
1736
1737   if (decst.mask == ~0)
1738     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1739                                     decst.eol_type);
1740   else
1741     {
1742       int i;
1743
1744       val = Qnil;
1745 #ifdef MULE
1746       decst.mask = postprocess_iso2022_mask (decst.mask);
1747 #endif
1748       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1749         {
1750           int sys = coding_category_by_priority[i];
1751           if (decst.mask & (1 << sys))
1752             {
1753               Lisp_Object codesys = coding_category_system[sys];
1754               if (!NILP (codesys))
1755                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1756               val = Fcons (codesys, val);
1757             }
1758         }
1759     }
1760   Lstream_close (istr);
1761   UNGCPRO;
1762   Lstream_delete (istr);
1763   Lstream_delete (lb_istr);
1764   return val;
1765 }
1766
1767 \f
1768 /************************************************************************/
1769 /*           Converting to internal Mule format ("decoding")            */
1770 /************************************************************************/
1771
1772 /* A decoding stream is a stream used for decoding text (i.e.
1773    converting from some external format to internal format).
1774    The decoding-stream object keeps track of the actual coding
1775    stream, the stream that is at the other end, and data that
1776    needs to be persistent across the lifetime of the stream. */
1777
1778 /* Handle the EOL stuff related to just-read-in character C.
1779    EOL_TYPE is the EOL type of the coding stream.
1780    FLAGS is the current value of FLAGS in the coding stream, and may
1781    be modified by this macro.  (The macro only looks at the
1782    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1783    bytes are to be written.  You need to also define a local goto
1784    label "label_continue_loop" that is at the end of the main
1785    character-reading loop.
1786
1787    If C is a CR character, then this macro handles it entirely and
1788    jumps to label_continue_loop.  Otherwise, this macro does not add
1789    anything to DST, and continues normally.  You should continue
1790    processing C normally after this macro. */
1791
1792 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1793 do {                                                            \
1794   if (c == '\r')                                                \
1795     {                                                           \
1796       if (eol_type == EOL_CR)                                   \
1797         Dynarr_add (dst, '\n');                                 \
1798       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1799         Dynarr_add (dst, c);                                    \
1800       else                                                      \
1801         flags |= CODING_STATE_CR;                               \
1802       goto label_continue_loop;                                 \
1803     }                                                           \
1804   else if (flags & CODING_STATE_CR)                             \
1805     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1806       if (c != '\n')                                            \
1807         Dynarr_add (dst, '\r');                                 \
1808       flags &= ~CODING_STATE_CR;                                \
1809     }                                                           \
1810 } while (0)
1811
1812 /* C should be a binary character in the range 0 - 255; convert
1813    to internal format and add to Dynarr DST. */
1814
1815 #ifdef UTF2000
1816 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1817 do {                                            \
1818   if (BYTE_ASCII_P (c))                         \
1819     Dynarr_add (dst, c);                        \
1820   else                                          \
1821     {                                           \
1822       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1823       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1824     }                                           \
1825 } while (0)
1826
1827 INLINE void
1828 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1829 {
1830   if ( c <= 0x7f )
1831     {
1832       Dynarr_add (dst, c);
1833     }
1834   else if ( c <= 0x7ff )
1835     {
1836       Dynarr_add (dst, (c >> 6) | 0xc0);
1837       Dynarr_add (dst, (c & 0x3f) | 0x80);
1838     }
1839   else if ( c <= 0xffff )
1840     {
1841       Dynarr_add (dst,  (c >> 12) | 0xe0);
1842       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1843       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1844     }
1845   else if ( c <= 0x1fffff )
1846     {
1847       Dynarr_add (dst,  (c >> 18) | 0xf0);
1848       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1849       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1850       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1851     }
1852   else if ( c <= 0x3ffffff )
1853     {
1854       Dynarr_add (dst,  (c >> 24) | 0xf8);
1855       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1856       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1857       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1858       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1859     }
1860   else
1861     {
1862       Dynarr_add (dst,  (c >> 30) | 0xfc);
1863       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1864       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1865       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1866       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1867       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1868     }
1869 }
1870 #else
1871 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1872 do {                                            \
1873   if (BYTE_ASCII_P (c))                         \
1874     Dynarr_add (dst, c);                        \
1875   else if (BYTE_C1_P (c))                       \
1876     {                                           \
1877       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1878       Dynarr_add (dst, c + 0x20);               \
1879     }                                           \
1880   else                                          \
1881     {                                           \
1882       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1883       Dynarr_add (dst, c);                      \
1884     }                                           \
1885 } while (0)
1886 #endif
1887
1888 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1889 do {                                    \
1890   if (ch)                               \
1891     {                                   \
1892       DECODE_ADD_BINARY_CHAR (ch, dst); \
1893       ch = 0;                           \
1894     }                                   \
1895 } while (0)
1896
1897 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1898 do {                                    \
1899   if (flags & CODING_STATE_END)         \
1900     {                                   \
1901       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1902       if (flags & CODING_STATE_CR)      \
1903         Dynarr_add (dst, '\r');         \
1904     }                                   \
1905 } while (0)
1906
1907 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1908
1909 struct decoding_stream
1910 {
1911   /* Coding system that governs the conversion. */
1912   Lisp_Coding_System *codesys;
1913
1914   /* Stream that we read the encoded data from or
1915      write the decoded data to. */
1916   Lstream *other_end;
1917
1918   /* If we are reading, then we can return only a fixed amount of
1919      data, so if the conversion resulted in too much data, we store it
1920      here for retrieval the next time around. */
1921   unsigned_char_dynarr *runoff;
1922
1923   /* FLAGS holds flags indicating the current state of the decoding.
1924      Some of these flags are dependent on the coding system. */
1925   unsigned int flags;
1926
1927   /* CH holds a partially built-up character.  Since we only deal
1928      with one- and two-byte characters at the moment, we only use
1929      this to store the first byte of a two-byte character. */
1930   unsigned int ch;
1931
1932   /* EOL_TYPE specifies the type of end-of-line conversion that
1933      currently applies.  We need to keep this separate from the
1934      EOL type stored in CODESYS because the latter might indicate
1935      automatic EOL-type detection while the former will always
1936      indicate a particular EOL type. */
1937   enum eol_type eol_type;
1938 #ifdef MULE
1939   /* Additional ISO2022 information.  We define the structure above
1940      because it's also needed by the detection routines. */
1941   struct iso2022_decoder iso2022;
1942
1943   /* Additional information (the state of the running CCL program)
1944      used by the CCL decoder. */
1945   struct ccl_program ccl;
1946
1947   /* counter for UTF-8 or UCS-4 */
1948   unsigned char counter;
1949 #endif
1950   struct detection_state decst;
1951 };
1952
1953 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1954 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1955 static int decoding_rewinder   (Lstream *stream);
1956 static int decoding_seekable_p (Lstream *stream);
1957 static int decoding_flusher    (Lstream *stream);
1958 static int decoding_closer     (Lstream *stream);
1959
1960 static Lisp_Object decoding_marker (Lisp_Object stream,
1961                                     void (*markobj) (Lisp_Object));
1962
1963 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1964                                sizeof (struct decoding_stream));
1965
1966 static Lisp_Object
1967 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1968 {
1969   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1970   Lisp_Object str_obj;
1971
1972   /* We do not need to mark the coding systems or charsets stored
1973      within the stream because they are stored in a global list
1974      and automatically marked. */
1975
1976   XSETLSTREAM (str_obj, str);
1977   markobj (str_obj);
1978   if (str->imp->marker)
1979     return (str->imp->marker) (str_obj, markobj);
1980   else
1981     return Qnil;
1982 }
1983
1984 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1985    so we read data from the other end, decode it, and store it into DATA. */
1986
1987 static int
1988 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1989 {
1990   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1991   unsigned char *orig_data = data;
1992   int read_size;
1993   int error_occurred = 0;
1994
1995   /* We need to interface to mule_decode(), which expects to take some
1996      amount of data and store the result into a Dynarr.  We have
1997      mule_decode() store into str->runoff, and take data from there
1998      as necessary. */
1999
2000   /* We loop until we have enough data, reading chunks from the other
2001      end and decoding it. */
2002   while (1)
2003     {
2004       /* Take data from the runoff if we can.  Make sure to take at
2005          most SIZE bytes, and delete the data from the runoff. */
2006       if (Dynarr_length (str->runoff) > 0)
2007         {
2008           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2009           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2010           Dynarr_delete_many (str->runoff, 0, chunk);
2011           data += chunk;
2012           size -= chunk;
2013         }
2014
2015       if (size == 0)
2016         break; /* No more room for data */
2017
2018       if (str->flags & CODING_STATE_END)
2019         /* This means that on the previous iteration, we hit the EOF on
2020            the other end.  We loop once more so that mule_decode() can
2021            output any final stuff it may be holding, or any "go back
2022            to a sane state" escape sequences. (This latter makes sense
2023            during encoding.) */
2024         break;
2025
2026       /* Exhausted the runoff, so get some more.  DATA has at least
2027          SIZE bytes left of storage in it, so it's OK to read directly
2028          into it.  (We'll be overwriting above, after we've decoded it
2029          into the runoff.) */
2030       read_size = Lstream_read (str->other_end, data, size);
2031       if (read_size < 0)
2032         {
2033           error_occurred = 1;
2034           break;
2035         }
2036       if (read_size == 0)
2037         /* There might be some more end data produced in the translation.
2038            See the comment above. */
2039         str->flags |= CODING_STATE_END;
2040       mule_decode (stream, data, str->runoff, read_size);
2041     }
2042
2043   if (data - orig_data == 0)
2044     return error_occurred ? -1 : 0;
2045   else
2046     return data - orig_data;
2047 }
2048
2049 static int
2050 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2051 {
2052   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2053   int retval;
2054
2055   /* Decode all our data into the runoff, and then attempt to write
2056      it all out to the other end.  Remove whatever chunk we succeeded
2057      in writing. */
2058   mule_decode (stream, data, str->runoff, size);
2059   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2060                           Dynarr_length (str->runoff));
2061   if (retval > 0)
2062     Dynarr_delete_many (str->runoff, 0, retval);
2063   /* Do NOT return retval.  The return value indicates how much
2064      of the incoming data was written, not how many bytes were
2065      written. */
2066   return size;
2067 }
2068
2069 static void
2070 reset_decoding_stream (struct decoding_stream *str)
2071 {
2072 #ifdef MULE
2073   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2074     {
2075       Lisp_Object coding_system;
2076       XSETCODING_SYSTEM (coding_system, str->codesys);
2077       reset_iso2022 (coding_system, &str->iso2022);
2078     }
2079   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2080     {
2081       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2082     }
2083   str->counter = 0;
2084 #endif /* MULE */
2085   str->flags = str->ch = 0;
2086 }
2087
2088 static int
2089 decoding_rewinder (Lstream *stream)
2090 {
2091   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2092   reset_decoding_stream (str);
2093   Dynarr_reset (str->runoff);
2094   return Lstream_rewind (str->other_end);
2095 }
2096
2097 static int
2098 decoding_seekable_p (Lstream *stream)
2099 {
2100   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2101   return Lstream_seekable_p (str->other_end);
2102 }
2103
2104 static int
2105 decoding_flusher (Lstream *stream)
2106 {
2107   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2108   return Lstream_flush (str->other_end);
2109 }
2110
2111 static int
2112 decoding_closer (Lstream *stream)
2113 {
2114   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2115   if (stream->flags & LSTREAM_FL_WRITE)
2116     {
2117       str->flags |= CODING_STATE_END;
2118       decoding_writer (stream, 0, 0);
2119     }
2120   Dynarr_free (str->runoff);
2121 #ifdef MULE
2122 #ifdef ENABLE_COMPOSITE_CHARS
2123   if (str->iso2022.composite_chars)
2124     Dynarr_free (str->iso2022.composite_chars);
2125 #endif
2126 #endif
2127   return Lstream_close (str->other_end);
2128 }
2129
2130 Lisp_Object
2131 decoding_stream_coding_system (Lstream *stream)
2132 {
2133   Lisp_Object coding_system;
2134   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2135
2136   XSETCODING_SYSTEM (coding_system, str->codesys);
2137   return subsidiary_coding_system (coding_system, str->eol_type);
2138 }
2139
2140 void
2141 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2142 {
2143   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2144   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2145   str->codesys = cs;
2146   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2147     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2148   reset_decoding_stream (str);
2149 }
2150
2151 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2152    stream for writing, no automatic code detection will be performed.
2153    The reason for this is that automatic code detection requires a
2154    seekable input.  Things will also fail if you open a decoding
2155    stream for reading using a non-fully-specified coding system and
2156    a non-seekable input stream. */
2157
2158 static Lisp_Object
2159 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2160                         CONST char *mode)
2161 {
2162   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2163   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2164   Lisp_Object obj;
2165
2166   xzero (*str);
2167   str->other_end = stream;
2168   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2169   str->eol_type = EOL_AUTODETECT;
2170   if (!strcmp (mode, "r")
2171       && Lstream_seekable_p (stream))
2172     /* We can determine the coding system now. */
2173     determine_real_coding_system (stream, &codesys, &str->eol_type);
2174   set_decoding_stream_coding_system (lstr, codesys);
2175   str->decst.eol_type = str->eol_type;
2176   str->decst.mask = ~0;
2177   XSETLSTREAM (obj, lstr);
2178   return obj;
2179 }
2180
2181 Lisp_Object
2182 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2183 {
2184   return make_decoding_stream_1 (stream, codesys, "r");
2185 }
2186
2187 Lisp_Object
2188 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2189 {
2190   return make_decoding_stream_1 (stream, codesys, "w");
2191 }
2192
2193 /* Note: the decode_coding_* functions all take the same
2194    arguments as mule_decode(), which is to say some SRC data of
2195    size N, which is to be stored into dynamic array DST.
2196    DECODING is the stream within which the decoding is
2197    taking place, but no data is actually read from or
2198    written to that stream; that is handled in decoding_reader()
2199    or decoding_writer().  This allows the same functions to
2200    be used for both reading and writing. */
2201
2202 static void
2203 mule_decode (Lstream *decoding, CONST unsigned char *src,
2204              unsigned_char_dynarr *dst, unsigned int n)
2205 {
2206   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2207
2208   /* If necessary, do encoding-detection now.  We do this when
2209      we're a writing stream or a non-seekable reading stream,
2210      meaning that we can't just process the whole input,
2211      rewind, and start over. */
2212
2213   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2214       str->eol_type == EOL_AUTODETECT)
2215     {
2216       Lisp_Object codesys;
2217
2218       XSETCODING_SYSTEM (codesys, str->codesys);
2219       detect_coding_type (&str->decst, src, n,
2220                           CODING_SYSTEM_TYPE (str->codesys) !=
2221                           CODESYS_AUTODETECT);
2222       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2223           str->decst.mask != ~0)
2224         /* #### This is cheesy.  What we really ought to do is
2225            buffer up a certain amount of data so as to get a
2226            less random result. */
2227         codesys = coding_system_from_mask (str->decst.mask);
2228       str->eol_type = str->decst.eol_type;
2229       if (XCODING_SYSTEM (codesys) != str->codesys)
2230         {
2231           /* Preserve the CODING_STATE_END flag in case it was set.
2232              If we erase it, bad things might happen. */
2233           int was_end = str->flags & CODING_STATE_END;
2234           set_decoding_stream_coding_system (decoding, codesys);
2235           if (was_end)
2236             str->flags |= CODING_STATE_END;
2237         }
2238     }
2239
2240   switch (CODING_SYSTEM_TYPE (str->codesys))
2241     {
2242 #ifdef DEBUG_XEMACS
2243     case CODESYS_INTERNAL:
2244       Dynarr_add_many (dst, src, n);
2245       break;
2246 #endif
2247     case CODESYS_AUTODETECT:
2248       /* If we got this far and still haven't decided on the coding
2249          system, then do no conversion. */
2250     case CODESYS_NO_CONVERSION:
2251       decode_coding_no_conversion (decoding, src, dst, n);
2252       break;
2253 #ifdef MULE
2254     case CODESYS_SHIFT_JIS:
2255       decode_coding_sjis (decoding, src, dst, n);
2256       break;
2257     case CODESYS_BIG5:
2258       decode_coding_big5 (decoding, src, dst, n);
2259       break;
2260     case CODESYS_UCS4:
2261       decode_coding_ucs4 (decoding, src, dst, n);
2262       break;
2263     case CODESYS_UTF8:
2264       decode_coding_utf8 (decoding, src, dst, n);
2265       break;
2266     case CODESYS_CCL:
2267       str->ccl.last_block = str->flags & CODING_STATE_END;
2268       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2269       break;
2270     case CODESYS_ISO2022:
2271       decode_coding_iso2022 (decoding, src, dst, n);
2272       break;
2273 #endif /* MULE */
2274     default:
2275       abort ();
2276     }
2277 }
2278
2279 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2280 Decode the text between START and END which is encoded in CODING-SYSTEM.
2281 This is useful if you've read in encoded text from a file without decoding
2282 it (e.g. you read in a JIS-formatted file but used the `binary' or
2283 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2284 Return length of decoded text.
2285 BUFFER defaults to the current buffer if unspecified.
2286 */
2287        (start, end, coding_system, buffer))
2288 {
2289   Bufpos b, e;
2290   struct buffer *buf = decode_buffer (buffer, 0);
2291   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2292   Lstream *istr, *ostr;
2293   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2294
2295   get_buffer_range_char (buf, start, end, &b, &e, 0);
2296
2297   barf_if_buffer_read_only (buf, b, e);
2298
2299   coding_system = Fget_coding_system (coding_system);
2300   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2301   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2302   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2303                                               coding_system);
2304   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2305                                            Fget_coding_system (Qbinary));
2306   istr = XLSTREAM (instream);
2307   ostr = XLSTREAM (outstream);
2308   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2309
2310   /* The chain of streams looks like this:
2311
2312      [BUFFER] <----- send through
2313                      ------> [ENCODE AS BINARY]
2314                              ------> [DECODE AS SPECIFIED]
2315                                      ------> [BUFFER]
2316    */
2317
2318   while (1)
2319     {
2320       char tempbuf[1024]; /* some random amount */
2321       Bufpos newpos, even_newer_pos;
2322       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2323       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2324
2325       if (!size_in_bytes)
2326         break;
2327       newpos = lisp_buffer_stream_startpos (istr);
2328       Lstream_write (ostr, tempbuf, size_in_bytes);
2329       even_newer_pos = lisp_buffer_stream_startpos (istr);
2330       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2331                            even_newer_pos, 0);
2332     }
2333   Lstream_close (istr);
2334   Lstream_close (ostr);
2335   UNGCPRO;
2336   Lstream_delete (istr);
2337   Lstream_delete (ostr);
2338   Lstream_delete (XLSTREAM (de_outstream));
2339   Lstream_delete (XLSTREAM (lb_outstream));
2340   return Qnil;
2341 }
2342
2343 \f
2344 /************************************************************************/
2345 /*           Converting to an external encoding ("encoding")            */
2346 /************************************************************************/
2347
2348 /* An encoding stream is an output stream.  When you create the
2349    stream, you specify the coding system that governs the encoding
2350    and another stream that the resulting encoded data is to be
2351    sent to, and then start sending data to it. */
2352
2353 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2354
2355 struct encoding_stream
2356 {
2357   /* Coding system that governs the conversion. */
2358   Lisp_Coding_System *codesys;
2359
2360   /* Stream that we read the encoded data from or
2361      write the decoded data to. */
2362   Lstream *other_end;
2363
2364   /* If we are reading, then we can return only a fixed amount of
2365      data, so if the conversion resulted in too much data, we store it
2366      here for retrieval the next time around. */
2367   unsigned_char_dynarr *runoff;
2368
2369   /* FLAGS holds flags indicating the current state of the encoding.
2370      Some of these flags are dependent on the coding system. */
2371   unsigned int flags;
2372
2373   /* CH holds a partially built-up character.  Since we only deal
2374      with one- and two-byte characters at the moment, we only use
2375      this to store the first byte of a two-byte character. */
2376   unsigned int ch;
2377 #ifdef MULE
2378   /* Additional information used by the ISO2022 encoder. */
2379   struct
2380     {
2381       /* CHARSET holds the character sets currently assigned to the G0
2382          through G3 registers.  It is initialized from the array
2383          INITIAL_CHARSET in CODESYS. */
2384       Lisp_Object charset[4];
2385
2386       /* Which registers are currently invoked into the left (GL) and
2387          right (GR) halves of the 8-bit encoding space? */
2388       int register_left, register_right;
2389
2390       /* Whether we need to explicitly designate the charset in the
2391          G? register before using it.  It is initialized from the
2392          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2393       unsigned char force_charset_on_output[4];
2394
2395       /* Other state variables that need to be preserved across
2396          invocations. */
2397       Lisp_Object current_charset;
2398       int current_half;
2399       int current_char_boundary;
2400     } iso2022;
2401
2402   void (*encode_char) (struct encoding_stream *str, Emchar c,
2403                        unsigned_char_dynarr *dst, unsigned int *flags);
2404   void (*finish) (struct encoding_stream *str,
2405                   unsigned_char_dynarr *dst, unsigned int *flags);
2406
2407   /* Additional information (the state of the running CCL program)
2408      used by the CCL encoder. */
2409   struct ccl_program ccl;
2410 #endif /* MULE */
2411 };
2412
2413 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2414 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2415                             size_t size);
2416 static int encoding_rewinder   (Lstream *stream);
2417 static int encoding_seekable_p (Lstream *stream);
2418 static int encoding_flusher    (Lstream *stream);
2419 static int encoding_closer     (Lstream *stream);
2420
2421 static Lisp_Object encoding_marker (Lisp_Object stream,
2422                                     void (*markobj) (Lisp_Object));
2423
2424 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2425                                sizeof (struct encoding_stream));
2426
2427 static Lisp_Object
2428 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2429 {
2430   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2431   Lisp_Object str_obj;
2432
2433   /* We do not need to mark the coding systems or charsets stored
2434      within the stream because they are stored in a global list
2435      and automatically marked. */
2436
2437   XSETLSTREAM (str_obj, str);
2438   markobj (str_obj);
2439   if (str->imp->marker)
2440     return (str->imp->marker) (str_obj, markobj);
2441   else
2442     return Qnil;
2443 }
2444
2445 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2446    so we read data from the other end, encode it, and store it into DATA. */
2447
2448 static int
2449 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2450 {
2451   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2452   unsigned char *orig_data = data;
2453   int read_size;
2454   int error_occurred = 0;
2455
2456   /* We need to interface to mule_encode(), which expects to take some
2457      amount of data and store the result into a Dynarr.  We have
2458      mule_encode() store into str->runoff, and take data from there
2459      as necessary. */
2460
2461   /* We loop until we have enough data, reading chunks from the other
2462      end and encoding it. */
2463   while (1)
2464     {
2465       /* Take data from the runoff if we can.  Make sure to take at
2466          most SIZE bytes, and delete the data from the runoff. */
2467       if (Dynarr_length (str->runoff) > 0)
2468         {
2469           int chunk = min ((int) size, Dynarr_length (str->runoff));
2470           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2471           Dynarr_delete_many (str->runoff, 0, chunk);
2472           data += chunk;
2473           size -= chunk;
2474         }
2475
2476       if (size == 0)
2477         break; /* No more room for data */
2478
2479       if (str->flags & CODING_STATE_END)
2480         /* This means that on the previous iteration, we hit the EOF on
2481            the other end.  We loop once more so that mule_encode() can
2482            output any final stuff it may be holding, or any "go back
2483            to a sane state" escape sequences. (This latter makes sense
2484            during encoding.) */
2485         break;
2486
2487       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2488          left of storage in it, so it's OK to read directly into it.
2489          (We'll be overwriting above, after we've encoded it into the
2490          runoff.) */
2491       read_size = Lstream_read (str->other_end, data, size);
2492       if (read_size < 0)
2493         {
2494           error_occurred = 1;
2495           break;
2496         }
2497       if (read_size == 0)
2498         /* There might be some more end data produced in the translation.
2499            See the comment above. */
2500         str->flags |= CODING_STATE_END;
2501       mule_encode (stream, data, str->runoff, read_size);
2502     }
2503
2504   if (data == orig_data)
2505     return error_occurred ? -1 : 0;
2506   else
2507     return data - orig_data;
2508 }
2509
2510 static int
2511 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2512 {
2513   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2514   int retval;
2515
2516   /* Encode all our data into the runoff, and then attempt to write
2517      it all out to the other end.  Remove whatever chunk we succeeded
2518      in writing. */
2519   mule_encode (stream, data, str->runoff, size);
2520   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2521                           Dynarr_length (str->runoff));
2522   if (retval > 0)
2523     Dynarr_delete_many (str->runoff, 0, retval);
2524   /* Do NOT return retval.  The return value indicates how much
2525      of the incoming data was written, not how many bytes were
2526      written. */
2527   return size;
2528 }
2529
2530 static void
2531 reset_encoding_stream (struct encoding_stream *str)
2532 {
2533 #ifdef MULE
2534   switch (CODING_SYSTEM_TYPE (str->codesys))
2535     {
2536     case CODESYS_ISO2022:
2537       {
2538         int i;
2539
2540         str->encode_char = &char_encode_iso2022;
2541         str->finish = &char_finish_iso2022;
2542         for (i = 0; i < 4; i++)
2543           {
2544             str->iso2022.charset[i] =
2545               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2546             str->iso2022.force_charset_on_output[i] =
2547               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2548           }
2549         str->iso2022.register_left = 0;
2550         str->iso2022.register_right = 1;
2551         str->iso2022.current_charset = Qnil;
2552         str->iso2022.current_half = 0;
2553         break;
2554       }
2555     case CODESYS_CCL:
2556       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2557       break;
2558     case CODESYS_UTF8:
2559       str->encode_char = &char_encode_utf8;
2560       str->finish = &char_finish_utf8;
2561       break;
2562     case CODESYS_UCS4:
2563       str->encode_char = &char_encode_ucs4;
2564       str->finish = &char_finish_ucs4;
2565       break;
2566     case CODESYS_SHIFT_JIS:
2567       str->encode_char = &char_encode_shift_jis;
2568       str->finish = &char_finish_shift_jis;
2569       break;
2570     default:
2571       break;
2572     }
2573 #endif /* MULE */
2574   str->iso2022.current_char_boundary = 0;
2575   str->flags = str->ch = 0;
2576 }
2577
2578 static int
2579 encoding_rewinder (Lstream *stream)
2580 {
2581   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2582   reset_encoding_stream (str);
2583   Dynarr_reset (str->runoff);
2584   return Lstream_rewind (str->other_end);
2585 }
2586
2587 static int
2588 encoding_seekable_p (Lstream *stream)
2589 {
2590   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2591   return Lstream_seekable_p (str->other_end);
2592 }
2593
2594 static int
2595 encoding_flusher (Lstream *stream)
2596 {
2597   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2598   return Lstream_flush (str->other_end);
2599 }
2600
2601 static int
2602 encoding_closer (Lstream *stream)
2603 {
2604   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2605   if (stream->flags & LSTREAM_FL_WRITE)
2606     {
2607       str->flags |= CODING_STATE_END;
2608       encoding_writer (stream, 0, 0);
2609     }
2610   Dynarr_free (str->runoff);
2611   return Lstream_close (str->other_end);
2612 }
2613
2614 Lisp_Object
2615 encoding_stream_coding_system (Lstream *stream)
2616 {
2617   Lisp_Object coding_system;
2618   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2619
2620   XSETCODING_SYSTEM (coding_system, str->codesys);
2621   return coding_system;
2622 }
2623
2624 void
2625 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2626 {
2627   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2628   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2629   str->codesys = cs;
2630   reset_encoding_stream (str);
2631 }
2632
2633 static Lisp_Object
2634 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2635                         CONST char *mode)
2636 {
2637   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2638   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2639   Lisp_Object obj;
2640
2641   xzero (*str);
2642   str->runoff = Dynarr_new (unsigned_char);
2643   str->other_end = stream;
2644   set_encoding_stream_coding_system (lstr, codesys);
2645   XSETLSTREAM (obj, lstr);
2646   return obj;
2647 }
2648
2649 Lisp_Object
2650 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2651 {
2652   return make_encoding_stream_1 (stream, codesys, "r");
2653 }
2654
2655 Lisp_Object
2656 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2657 {
2658   return make_encoding_stream_1 (stream, codesys, "w");
2659 }
2660
2661 /* Convert N bytes of internally-formatted data stored in SRC to an
2662    external format, according to the encoding stream ENCODING.
2663    Store the encoded data into DST. */
2664
2665 static void
2666 mule_encode (Lstream *encoding, CONST unsigned char *src,
2667              unsigned_char_dynarr *dst, unsigned int n)
2668 {
2669   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2670
2671   switch (CODING_SYSTEM_TYPE (str->codesys))
2672     {
2673 #ifdef DEBUG_XEMACS
2674     case CODESYS_INTERNAL:
2675       Dynarr_add_many (dst, src, n);
2676       break;
2677 #endif
2678     case CODESYS_AUTODETECT:
2679       /* If we got this far and still haven't decided on the coding
2680          system, then do no conversion. */
2681     case CODESYS_NO_CONVERSION:
2682       encode_coding_no_conversion (encoding, src, dst, n);
2683       break;
2684 #ifdef MULE
2685     case CODESYS_BIG5:
2686       encode_coding_big5 (encoding, src, dst, n);
2687       break;
2688     case CODESYS_CCL:
2689       str->ccl.last_block = str->flags & CODING_STATE_END;
2690       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2691       break;
2692 #endif /* MULE */
2693     default:
2694       text_encode_generic (encoding, src, dst, n);
2695     }
2696 }
2697
2698 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2699 Encode the text between START and END using CODING-SYSTEM.
2700 This will, for example, convert Japanese characters into stuff such as
2701 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2702 text.  BUFFER defaults to the current buffer if unspecified.
2703 */
2704        (start, end, coding_system, buffer))
2705 {
2706   Bufpos b, e;
2707   struct buffer *buf = decode_buffer (buffer, 0);
2708   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2709   Lstream *istr, *ostr;
2710   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2711
2712   get_buffer_range_char (buf, start, end, &b, &e, 0);
2713
2714   barf_if_buffer_read_only (buf, b, e);
2715
2716   coding_system = Fget_coding_system (coding_system);
2717   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2718   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2719   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2720                                               Fget_coding_system (Qbinary));
2721   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2722                                            coding_system);
2723   istr = XLSTREAM (instream);
2724   ostr = XLSTREAM (outstream);
2725   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2726   /* The chain of streams looks like this:
2727
2728      [BUFFER] <----- send through
2729                      ------> [ENCODE AS SPECIFIED]
2730                              ------> [DECODE AS BINARY]
2731                                      ------> [BUFFER]
2732    */
2733   while (1)
2734     {
2735       char tempbuf[1024]; /* some random amount */
2736       Bufpos newpos, even_newer_pos;
2737       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2738       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2739
2740       if (!size_in_bytes)
2741         break;
2742       newpos = lisp_buffer_stream_startpos (istr);
2743       Lstream_write (ostr, tempbuf, size_in_bytes);
2744       even_newer_pos = lisp_buffer_stream_startpos (istr);
2745       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2746                            even_newer_pos, 0);
2747     }
2748
2749   {
2750     Charcount retlen =
2751       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2752     Lstream_close (istr);
2753     Lstream_close (ostr);
2754     UNGCPRO;
2755     Lstream_delete (istr);
2756     Lstream_delete (ostr);
2757     Lstream_delete (XLSTREAM (de_outstream));
2758     Lstream_delete (XLSTREAM (lb_outstream));
2759     return make_int (retlen);
2760   }
2761 }
2762
2763 #ifdef MULE
2764 \f
2765 static void
2766 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2767                      unsigned_char_dynarr *dst, unsigned int n)
2768 {
2769   unsigned char c;
2770   unsigned char char_boundary;
2771   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2772   unsigned int flags          = str->flags;
2773   Emchar ch                   = str->ch;
2774
2775   char_boundary = str->iso2022.current_char_boundary;
2776
2777   while (n--)
2778     {
2779       c = *src++;
2780
2781       if (char_boundary == 0)
2782         {
2783           if (c >= 0xfc)
2784             {
2785               ch = c & 0x01;
2786               char_boundary = 5;
2787             }
2788           else if (c >= 0xf8)
2789             {
2790               ch = c & 0x03;
2791               char_boundary = 4;
2792             }
2793           else if (c >= 0xf0)
2794             {
2795               ch = c & 0x07;
2796               char_boundary = 3;
2797             }
2798           else if (c >= 0xe0)
2799             {
2800               ch = c & 0x0f;
2801               char_boundary = 2;
2802             }
2803           else if (c >= 0xc0)
2804             {
2805               ch = c & 0x1f;
2806               char_boundary = 1;
2807             }
2808           else
2809             (*str->encode_char) (str, c, dst, &flags);
2810         }
2811       else if (char_boundary == 1)
2812         {
2813           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2814           ch =0;
2815           char_boundary = 0;
2816         }
2817       else
2818         {
2819           ch = (ch << 6) | (c & 0x3f);
2820           char_boundary--;
2821         }
2822     }
2823
2824   if ((char_boundary == 0) && (flags & CODING_STATE_END))
2825     {
2826       (*str->finish) (str, dst, &flags);
2827     }
2828
2829   str->flags = flags;
2830   str->ch    = ch;
2831   str->iso2022.current_char_boundary = char_boundary;
2832 }
2833
2834 \f
2835 /************************************************************************/
2836 /*                          Shift-JIS methods                           */
2837 /************************************************************************/
2838
2839 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2840    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2841    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2842    encoded by "position-code + 0x80".  A character of JISX0208
2843    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2844    position-codes are divided and shifted so that it fit in the range
2845    below.
2846
2847    --- CODE RANGE of Shift-JIS ---
2848    (character set)      (range)
2849    ASCII                0x00 .. 0x7F
2850    JISX0201-Kana        0xA0 .. 0xDF
2851    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2852             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2853    -------------------------------
2854
2855 */
2856
2857 /* Is this the first byte of a Shift-JIS two-byte char? */
2858
2859 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2860   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2861
2862 /* Is this the second byte of a Shift-JIS two-byte char? */
2863
2864 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2865   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2866
2867 #define BYTE_SJIS_KATAKANA_P(c) \
2868   ((c) >= 0xA1 && (c) <= 0xDF)
2869
2870 static int
2871 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2872                     unsigned int n)
2873 {
2874   int c;
2875
2876   while (n--)
2877     {
2878       c = *src++;
2879       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2880         return 0;
2881       if (st->shift_jis.in_second_byte)
2882         {
2883           st->shift_jis.in_second_byte = 0;
2884           if (c < 0x40)
2885             return 0;
2886         }
2887       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2888         st->shift_jis.in_second_byte = 1;
2889     }
2890   return CODING_CATEGORY_SHIFT_JIS_MASK;
2891 }
2892
2893 /* Convert Shift-JIS data to internal format. */
2894
2895 static void
2896 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2897                     unsigned_char_dynarr *dst, unsigned int n)
2898 {
2899   unsigned char c;
2900   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2901   unsigned int flags  = str->flags;
2902   unsigned int ch     = str->ch;
2903   eol_type_t eol_type = str->eol_type;
2904
2905   while (n--)
2906     {
2907       c = *src++;
2908
2909       if (ch)
2910         {
2911           /* Previous character was first byte of Shift-JIS Kanji char. */
2912           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2913             {
2914               unsigned char e1, e2;
2915
2916               DECODE_SJIS (ch, c, e1, e2);
2917 #ifdef UTF2000
2918               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2919                                             e1 & 0x7F,
2920                                             e2 & 0x7F), dst);
2921 #else
2922               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2923               Dynarr_add (dst, e1);
2924               Dynarr_add (dst, e2);
2925 #endif
2926             }
2927           else
2928             {
2929               DECODE_ADD_BINARY_CHAR (ch, dst);
2930               DECODE_ADD_BINARY_CHAR (c, dst);
2931             }
2932           ch = 0;
2933         }
2934       else
2935         {
2936           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2937           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2938             ch = c;
2939           else if (BYTE_SJIS_KATAKANA_P (c))
2940             {
2941 #ifdef UTF2000
2942               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2943                                             c & 0x7F, 0), dst);
2944 #else
2945               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2946               Dynarr_add (dst, c);
2947 #endif
2948             }
2949 #ifdef UTF2000
2950           else if (c > 32)
2951             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
2952                                           c, 0), dst);
2953 #endif
2954           else
2955             DECODE_ADD_BINARY_CHAR (c, dst);
2956         }
2957     label_continue_loop:;
2958     }
2959
2960   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2961
2962   str->flags = flags;
2963   str->ch    = ch;
2964 }
2965
2966 /* Convert internal character representation to Shift_JIS. */
2967
2968 void
2969 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
2970                        unsigned_char_dynarr *dst, unsigned int *flags)
2971 {
2972   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2973
2974   if (ch == '\n')
2975     {
2976       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2977         Dynarr_add (dst, '\r');
2978       if (eol_type != EOL_CR)
2979         Dynarr_add (dst, ch);
2980     }
2981   else
2982     {
2983       Lisp_Object charset;
2984       unsigned int c1, c2, s1, s2;
2985
2986 #ifdef UTF2000
2987       if ( (c1 =
2988             get_byte_from_character_table
2989             (ch, XCHARSET_TO_BYTE1_TABLE (Vcharset_latin_jisx0201))) )
2990         {
2991           charset = Vcharset_latin_jisx0201;
2992           c2 = 0;
2993         }
2994       else
2995 #endif
2996         BREAKUP_CHAR (ch, charset, c1, c2);
2997
2998       if (EQ(charset, Vcharset_katakana_jisx0201))
2999         {
3000           Dynarr_add (dst, c1 | 0x80);
3001         }
3002       else if (c2 == 0)
3003         {
3004           Dynarr_add (dst, c1);
3005         }
3006       else if (EQ(charset, Vcharset_japanese_jisx0208))
3007         {
3008           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3009           Dynarr_add (dst, s1);
3010           Dynarr_add (dst, s2);
3011         }
3012       else
3013         Dynarr_add (dst, '?');
3014     }
3015 }
3016
3017 void
3018 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3019                        unsigned int *flags)
3020 {
3021 }
3022
3023 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3024 Decode a JISX0208 character of Shift-JIS coding-system.
3025 CODE is the character code in Shift-JIS as a cons of type bytes.
3026 Return the corresponding character.
3027 */
3028        (code))
3029 {
3030   unsigned char c1, c2, s1, s2;
3031
3032   CHECK_CONS (code);
3033   CHECK_INT (XCAR (code));
3034   CHECK_INT (XCDR (code));
3035   s1 = XINT (XCAR (code));
3036   s2 = XINT (XCDR (code));
3037   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3038       BYTE_SJIS_TWO_BYTE_2_P (s2))
3039     {
3040       DECODE_SJIS (s1, s2, c1, c2);
3041       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3042                                    c1 & 0x7F, c2 & 0x7F));
3043     }
3044   else
3045     return Qnil;
3046 }
3047
3048 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3049 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3050 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3051 */
3052        (ch))
3053 {
3054   Lisp_Object charset;
3055   int c1, c2, s1, s2;
3056
3057   CHECK_CHAR_COERCE_INT (ch);
3058   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3059   if (EQ (charset, Vcharset_japanese_jisx0208))
3060     {
3061       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3062       return Fcons (make_int (s1), make_int (s2));
3063     }
3064   else
3065     return Qnil;
3066 }
3067
3068 \f
3069 /************************************************************************/
3070 /*                            Big5 methods                              */
3071 /************************************************************************/
3072
3073 /* BIG5 is a coding system encoding two character sets: ASCII and
3074    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3075    character set and is encoded in two-byte.
3076
3077    --- CODE RANGE of BIG5 ---
3078    (character set)      (range)
3079    ASCII                0x00 .. 0x7F
3080    Big5 (1st byte)      0xA1 .. 0xFE
3081         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3082    --------------------------
3083
3084    Since the number of characters in Big5 is larger than maximum
3085    characters in Emacs' charset (96x96), it can't be handled as one
3086    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3087    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3088    contains frequently used characters and the latter contains less
3089    frequently used characters.  */
3090
3091 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3092   ((c) >= 0xA1 && (c) <= 0xFE)
3093
3094 /* Is this the second byte of a Shift-JIS two-byte char? */
3095
3096 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3097   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3098
3099 /* Number of Big5 characters which have the same code in 1st byte.  */
3100
3101 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3102
3103 /* Code conversion macros.  These are macros because they are used in
3104    inner loops during code conversion.
3105
3106    Note that temporary variables in macros introduce the classic
3107    dynamic-scoping problems with variable names.  We use capital-
3108    lettered variables in the assumption that XEmacs does not use
3109    capital letters in variables except in a very formalized way
3110    (e.g. Qstring). */
3111
3112 /* Convert Big5 code (b1, b2) into its internal string representation
3113    (lb, c1, c2). */
3114
3115 /* There is a much simpler way to split the Big5 charset into two.
3116    For the moment I'm going to leave the algorithm as-is because it
3117    claims to separate out the most-used characters into a single
3118    charset, which perhaps will lead to optimizations in various
3119    places.
3120
3121    The way the algorithm works is something like this:
3122
3123    Big5 can be viewed as a 94x157 charset, where the row is
3124    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3125    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3126    the split between low and high column numbers is apparently
3127    meaningless; ascending rows produce less and less frequent chars.
3128    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3129    the first charset, and the upper half (0xC9 .. 0xFE) to the
3130    second.  To do the conversion, we convert the character into
3131    a single number where 0 .. 156 is the first row, 157 .. 313
3132    is the second, etc.  That way, the characters are ordered by
3133    decreasing frequency.  Then we just chop the space in two
3134    and coerce the result into a 94x94 space.
3135    */
3136
3137 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3138 {                                                                       \
3139   int B1 = b1, B2 = b2;                                                 \
3140   unsigned int I                                                        \
3141     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3142                                                                         \
3143   if (B1 < 0xC9)                                                        \
3144     {                                                                   \
3145       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3146     }                                                                   \
3147   else                                                                  \
3148     {                                                                   \
3149       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3150       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3151     }                                                                   \
3152   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3153   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3154 } while (0)
3155
3156 /* Convert the internal string representation of a Big5 character
3157    (lb, c1, c2) into Big5 code (b1, b2). */
3158
3159 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3160 {                                                                       \
3161   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3162                                                                         \
3163   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3164     {                                                                   \
3165       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3166     }                                                                   \
3167   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3168   b2 = I % BIG5_SAME_ROW;                                               \
3169   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3170 } while (0)
3171
3172 static int
3173 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3174                     unsigned int n)
3175 {
3176   int c;
3177
3178   while (n--)
3179     {
3180       c = *src++;
3181       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3182           (c >= 0x80 && c <= 0xA0))
3183         return 0;
3184       if (st->big5.in_second_byte)
3185         {
3186           st->big5.in_second_byte = 0;
3187           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3188             return 0;
3189         }
3190       else if (c >= 0xA1)
3191         st->big5.in_second_byte = 1;
3192     }
3193   return CODING_CATEGORY_BIG5_MASK;
3194 }
3195
3196 /* Convert Big5 data to internal format. */
3197
3198 static void
3199 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3200                     unsigned_char_dynarr *dst, unsigned int n)
3201 {
3202   unsigned char c;
3203   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3204   unsigned int flags  = str->flags;
3205   unsigned int ch     = str->ch;
3206   eol_type_t eol_type = str->eol_type;
3207
3208   while (n--)
3209     {
3210       c = *src++;
3211       if (ch)
3212         {
3213           /* Previous character was first byte of Big5 char. */
3214           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3215             {
3216               unsigned char b1, b2, b3;
3217               DECODE_BIG5 (ch, c, b1, b2, b3);
3218               Dynarr_add (dst, b1);
3219               Dynarr_add (dst, b2);
3220               Dynarr_add (dst, b3);
3221             }
3222           else
3223             {
3224               DECODE_ADD_BINARY_CHAR (ch, dst);
3225               DECODE_ADD_BINARY_CHAR (c, dst);
3226             }
3227           ch = 0;
3228         }
3229       else
3230         {
3231           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3232           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3233             ch = c;
3234           else
3235             DECODE_ADD_BINARY_CHAR (c, dst);
3236         }
3237     label_continue_loop:;
3238     }
3239
3240   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3241
3242   str->flags = flags;
3243   str->ch    = ch;
3244 }
3245
3246 /* Convert internally-formatted data to Big5. */
3247
3248 static void
3249 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3250                     unsigned_char_dynarr *dst, unsigned int n)
3251 {
3252 #ifndef UTF2000
3253   unsigned char c;
3254   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3255   unsigned int flags  = str->flags;
3256   unsigned int ch     = str->ch;
3257   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3258
3259   while (n--)
3260     {
3261       c = *src++;
3262       if (c == '\n')
3263         {
3264           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3265             Dynarr_add (dst, '\r');
3266           if (eol_type != EOL_CR)
3267             Dynarr_add (dst, '\n');
3268         }
3269       else if (BYTE_ASCII_P (c))
3270         {
3271           /* ASCII. */
3272           Dynarr_add (dst, c);
3273         }
3274       else if (BUFBYTE_LEADING_BYTE_P (c))
3275         {
3276           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3277               c == LEADING_BYTE_CHINESE_BIG5_2)
3278             {
3279               /* A recognized leading byte. */
3280               ch = c;
3281               continue; /* not done with this character. */
3282             }
3283           /* otherwise just ignore this character. */
3284         }
3285       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3286                ch == LEADING_BYTE_CHINESE_BIG5_2)
3287         {
3288           /* Previous char was a recognized leading byte. */
3289           ch = (ch << 8) | c;
3290           continue; /* not done with this character. */
3291         }
3292       else if (ch)
3293         {
3294           /* Encountering second byte of a Big5 character. */
3295           unsigned char b1, b2;
3296
3297           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3298           Dynarr_add (dst, b1);
3299           Dynarr_add (dst, b2);
3300         }
3301
3302       ch = 0;
3303     }
3304
3305   str->flags = flags;
3306   str->ch    = ch;
3307 #endif
3308 }
3309
3310
3311 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3312 Decode a Big5 character CODE of BIG5 coding-system.
3313 CODE is the character code in BIG5, a cons of two integers.
3314 Return the corresponding character.
3315 */
3316        (code))
3317 {
3318   unsigned char c1, c2, b1, b2;
3319
3320   CHECK_CONS (code);
3321   CHECK_INT (XCAR (code));
3322   CHECK_INT (XCDR (code));
3323   b1 = XINT (XCAR (code));
3324   b2 = XINT (XCDR (code));
3325   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3326       BYTE_BIG5_TWO_BYTE_2_P (b2))
3327     {
3328       Charset_ID leading_byte;
3329       Lisp_Object charset;
3330       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3331       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3332       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3333     }
3334   else
3335     return Qnil;
3336 }
3337
3338 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3339 Encode the Big5 character CH to BIG5 coding-system.
3340 Return the corresponding character code in Big5.
3341 */
3342        (ch))
3343 {
3344   Lisp_Object charset;
3345   int c1, c2, b1, b2;
3346
3347   CHECK_CHAR_COERCE_INT (ch);
3348   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3349   if (EQ (charset, Vcharset_chinese_big5_1) ||
3350       EQ (charset, Vcharset_chinese_big5_2))
3351     {
3352       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3353                    b1, b2);
3354       return Fcons (make_int (b1), make_int (b2));
3355     }
3356   else
3357     return Qnil;
3358 }
3359
3360 \f
3361 /************************************************************************/
3362 /*                           UCS-4 methods                              */
3363 /************************************************************************/
3364
3365 static int
3366 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3367                     unsigned int n)
3368 {
3369   while (n--)
3370     {
3371       int c = *src++;
3372       switch (st->ucs4.in_byte)
3373         {
3374         case 0:
3375           if (c >= 128)
3376             return 0;
3377           else
3378             st->ucs4.in_byte++;
3379           break;
3380         case 3:
3381           st->ucs4.in_byte = 0;
3382           break;
3383         default:
3384           st->ucs4.in_byte++;
3385         }
3386     }
3387   return CODING_CATEGORY_UCS4_MASK;
3388 }
3389
3390 static void
3391 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3392                     unsigned_char_dynarr *dst, unsigned int n)
3393 {
3394   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3395   unsigned int flags = str->flags;
3396   unsigned int ch    = str->ch;
3397   unsigned char counter = str->counter;
3398
3399   while (n--)
3400     {
3401       unsigned char c = *src++;
3402       switch (counter)
3403         {
3404         case 0:
3405           ch = c;
3406           counter = 3;
3407           break;
3408         case 1:
3409           DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3410           ch = 0;
3411           counter = 0;
3412           break;
3413         default:
3414           ch = ( ch << 8 ) | c;
3415           counter--;
3416         }
3417     }
3418   if (counter & CODING_STATE_END)
3419     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3420
3421   str->flags = flags;
3422   str->ch    = ch;
3423   str->counter = counter;
3424 }
3425
3426 void
3427 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3428                   unsigned_char_dynarr *dst, unsigned int *flags)
3429 {
3430   Dynarr_add (dst, ch >> 24);
3431   Dynarr_add (dst, ch >> 16);
3432   Dynarr_add (dst, ch >>  8);
3433   Dynarr_add (dst, ch      );
3434 }
3435
3436 void
3437 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3438                   unsigned int *flags)
3439 {
3440 }
3441
3442 \f
3443 /************************************************************************/
3444 /*                           UTF-8 methods                              */
3445 /************************************************************************/
3446
3447 static int
3448 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3449                     unsigned int n)
3450 {
3451   while (n--)
3452     {
3453       unsigned char c = *src++;
3454       switch (st->utf8.in_byte)
3455         {
3456         case 0:
3457           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3458             return 0;
3459           else if (c >= 0xfc)
3460             st->utf8.in_byte = 5;
3461           else if (c >= 0xf8)
3462             st->utf8.in_byte = 4;
3463           else if (c >= 0xf0)
3464             st->utf8.in_byte = 3;
3465           else if (c >= 0xe0)
3466             st->utf8.in_byte = 2;
3467           else if (c >= 0xc0)
3468             st->utf8.in_byte = 1;
3469           else if (c >= 0x80)
3470             return 0;
3471           break;
3472         default:
3473           if ((c & 0xc0) != 0x80)
3474             return 0;
3475           else
3476             st->utf8.in_byte--;
3477         }
3478     }
3479   return CODING_CATEGORY_UTF8_MASK;
3480 }
3481
3482 static void
3483 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3484                     unsigned_char_dynarr *dst, unsigned int n)
3485 {
3486   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3487   unsigned int flags  = str->flags;
3488   unsigned int ch     = str->ch;
3489   eol_type_t eol_type = str->eol_type;
3490   unsigned char counter = str->counter;
3491
3492   while (n--)
3493     {
3494       unsigned char c = *src++;
3495       switch (counter)
3496         {
3497         case 0:
3498           if ( c >= 0xfc )
3499             {
3500               ch = c & 0x01;
3501               counter = 5;
3502             }
3503           else if ( c >= 0xf8 )
3504             {
3505               ch = c & 0x03;
3506               counter = 4;
3507             }
3508           else if ( c >= 0xf0 )
3509             {
3510               ch = c & 0x07;
3511               counter = 3;
3512             }
3513           else if ( c >= 0xe0 )
3514             {
3515               ch = c & 0x0f;
3516               counter = 2;
3517             }
3518           else if ( c >= 0xc0 )
3519             {
3520               ch = c & 0x1f;
3521               counter = 1;
3522             }
3523           else
3524             {
3525               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3526               DECODE_ADD_UCS_CHAR (c, dst);
3527             }
3528           break;
3529         case 1:
3530           ch = ( ch << 6 ) | ( c & 0x3f );
3531           DECODE_ADD_UCS_CHAR (ch, dst);
3532           ch = 0;
3533           counter = 0;
3534           break;
3535         default:
3536           ch = ( ch << 6 ) | ( c & 0x3f );
3537           counter--;
3538         }
3539     label_continue_loop:;
3540     }
3541
3542   if (flags & CODING_STATE_END)
3543     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3544
3545   str->flags = flags;
3546   str->ch    = ch;
3547   str->counter = counter;
3548 }
3549
3550 void
3551 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3552                   unsigned_char_dynarr *dst, unsigned int *flags)
3553 {
3554   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3555
3556   if (ch == '\n')
3557     {
3558       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3559         Dynarr_add (dst, '\r');
3560       if (eol_type != EOL_CR)
3561         Dynarr_add (dst, ch);
3562     }
3563   else if (ch <= 0x7f)
3564     {
3565       Dynarr_add (dst, ch);
3566     }
3567   else if (ch <= 0x7ff)
3568     {
3569       Dynarr_add (dst, (ch >> 6) | 0xc0);
3570       Dynarr_add (dst, (ch & 0x3f) | 0x80);
3571     }
3572   else if (ch <= 0xffff)
3573     {
3574       Dynarr_add (dst,  (ch >> 12) | 0xe0);
3575       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3576       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3577     }
3578   else if (ch <= 0x1fffff)
3579     {
3580       Dynarr_add (dst,  (ch >> 18) | 0xf0);
3581       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3582       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3583       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3584     }
3585   else if (ch <= 0x3ffffff)
3586     {
3587       Dynarr_add (dst,  (ch >> 24) | 0xf8);
3588       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3589       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3590       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3591       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3592     }
3593   else
3594     {
3595       Dynarr_add (dst,  (ch >> 30) | 0xfc);
3596       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3597       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3598       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3599       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3600       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3601     }
3602 }
3603
3604 void
3605 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3606                   unsigned int *flags)
3607 {
3608 }
3609
3610 \f
3611 /************************************************************************/
3612 /*                           ISO2022 methods                            */
3613 /************************************************************************/
3614
3615 /* The following note describes the coding system ISO2022 briefly.
3616    Since the intention of this note is to help understand the
3617    functions in this file, some parts are NOT ACCURATE or OVERLY
3618    SIMPLIFIED.  For thorough understanding, please refer to the
3619    original document of ISO2022.
3620
3621    ISO2022 provides many mechanisms to encode several character sets
3622    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3623    is encoded using bytes less than 128.  This may make the encoded
3624    text a little bit longer, but the text passes more easily through
3625    several gateways, some of which strip off MSB (Most Signigant Bit).
3626
3627    There are two kinds of character sets: control character set and
3628    graphic character set.  The former contains control characters such
3629    as `newline' and `escape' to provide control functions (control
3630    functions are also provided by escape sequences).  The latter
3631    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3632    two control character sets and many graphic character sets.
3633
3634    Graphic character sets are classified into one of the following
3635    four classes, according to the number of bytes (DIMENSION) and
3636    number of characters in one dimension (CHARS) of the set:
3637    - DIMENSION1_CHARS94
3638    - DIMENSION1_CHARS96
3639    - DIMENSION2_CHARS94
3640    - DIMENSION2_CHARS96
3641
3642    In addition, each character set is assigned an identification tag,
3643    unique for each set, called "final character" (denoted as <F>
3644    hereafter).  The <F> of each character set is decided by ECMA(*)
3645    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3646    (0x30..0x3F are for private use only).
3647
3648    Note (*): ECMA = European Computer Manufacturers Association
3649
3650    Here are examples of graphic character set [NAME(<F>)]:
3651         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3652         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3653         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3654         o DIMENSION2_CHARS96 -- none for the moment
3655
3656    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3657         C0 [0x00..0x1F] -- control character plane 0
3658         GL [0x20..0x7F] -- graphic character plane 0
3659         C1 [0x80..0x9F] -- control character plane 1
3660         GR [0xA0..0xFF] -- graphic character plane 1
3661
3662    A control character set is directly designated and invoked to C0 or
3663    C1 by an escape sequence.  The most common case is that:
3664    - ISO646's  control character set is designated/invoked to C0, and
3665    - ISO6429's control character set is designated/invoked to C1,
3666    and usually these designations/invocations are omitted in encoded
3667    text.  In a 7-bit environment, only C0 can be used, and a control
3668    character for C1 is encoded by an appropriate escape sequence to
3669    fit into the environment.  All control characters for C1 are
3670    defined to have corresponding escape sequences.
3671
3672    A graphic character set is at first designated to one of four
3673    graphic registers (G0 through G3), then these graphic registers are
3674    invoked to GL or GR.  These designations and invocations can be
3675    done independently.  The most common case is that G0 is invoked to
3676    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3677    these invocations and designations are omitted in encoded text.
3678    In a 7-bit environment, only GL can be used.
3679
3680    When a graphic character set of CHARS94 is invoked to GL, codes
3681    0x20 and 0x7F of the GL area work as control characters SPACE and
3682    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3683    be used.
3684
3685    There are two ways of invocation: locking-shift and single-shift.
3686    With locking-shift, the invocation lasts until the next different
3687    invocation, whereas with single-shift, the invocation affects the
3688    following character only and doesn't affect the locking-shift
3689    state.  Invocations are done by the following control characters or
3690    escape sequences:
3691
3692    ----------------------------------------------------------------------
3693    abbrev  function                  cntrl escape seq   description
3694    ----------------------------------------------------------------------
3695    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3696    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3697    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3698    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3699    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3700    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3701    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3702    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3703    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3704    ----------------------------------------------------------------------
3705    (*) These are not used by any known coding system.
3706
3707    Control characters for these functions are defined by macros
3708    ISO_CODE_XXX in `coding.h'.
3709
3710    Designations are done by the following escape sequences:
3711    ----------------------------------------------------------------------
3712    escape sequence      description
3713    ----------------------------------------------------------------------
3714    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3715    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3716    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3717    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3718    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3719    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3720    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3721    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3722    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3723    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3724    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3725    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3726    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3727    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3728    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3729    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3730    ----------------------------------------------------------------------
3731
3732    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3733    of dimension 1, chars 94, and final character <F>, etc...
3734
3735    Note (*): Although these designations are not allowed in ISO2022,
3736    Emacs accepts them on decoding, and produces them on encoding
3737    CHARS96 character sets in a coding system which is characterized as
3738    7-bit environment, non-locking-shift, and non-single-shift.
3739
3740    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3741    '(' can be omitted.  We refer to this as "short-form" hereafter.
3742
3743    Now you may notice that there are a lot of ways for encoding the
3744    same multilingual text in ISO2022.  Actually, there exist many
3745    coding systems such as Compound Text (used in X11's inter client
3746    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3747    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3748    localized platforms), and all of these are variants of ISO2022.
3749
3750    In addition to the above, Emacs handles two more kinds of escape
3751    sequences: ISO6429's direction specification and Emacs' private
3752    sequence for specifying character composition.
3753
3754    ISO6429's direction specification takes the following form:
3755         o CSI ']'      -- end of the current direction
3756         o CSI '0' ']'  -- end of the current direction
3757         o CSI '1' ']'  -- start of left-to-right text
3758         o CSI '2' ']'  -- start of right-to-left text
3759    The control character CSI (0x9B: control sequence introducer) is
3760    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3761
3762    Character composition specification takes the following form:
3763         o ESC '0' -- start character composition
3764         o ESC '1' -- end character composition
3765    Since these are not standard escape sequences of any ISO standard,
3766    their use with these meanings is restricted to Emacs only.  */
3767
3768 static void
3769 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3770 {
3771   int i;
3772
3773   for (i = 0; i < 4; i++)
3774     {
3775       if (!NILP (coding_system))
3776         iso->charset[i] =
3777           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3778       else
3779         iso->charset[i] = Qt;
3780       iso->invalid_designated[i] = 0;
3781     }
3782   iso->esc = ISO_ESC_NOTHING;
3783   iso->esc_bytes_index = 0;
3784   iso->register_left = 0;
3785   iso->register_right = 1;
3786   iso->switched_dir_and_no_valid_charset_yet = 0;
3787   iso->invalid_switch_dir = 0;
3788   iso->output_direction_sequence = 0;
3789   iso->output_literally = 0;
3790 #ifdef ENABLE_COMPOSITE_CHARS
3791   if (iso->composite_chars)
3792     Dynarr_reset (iso->composite_chars);
3793 #endif
3794 }
3795
3796 static int
3797 fit_to_be_escape_quoted (unsigned char c)
3798 {
3799   switch (c)
3800     {
3801     case ISO_CODE_ESC:
3802     case ISO_CODE_CSI:
3803     case ISO_CODE_SS2:
3804     case ISO_CODE_SS3:
3805     case ISO_CODE_SO:
3806     case ISO_CODE_SI:
3807       return 1;
3808
3809     default:
3810       return 0;
3811     }
3812 }
3813
3814 /* Parse one byte of an ISO2022 escape sequence.
3815    If the result is an invalid escape sequence, return 0 and
3816    do not change anything in STR.  Otherwise, if the result is
3817    an incomplete escape sequence, update ISO2022.ESC and
3818    ISO2022.ESC_BYTES and return -1.  Otherwise, update
3819    all the state variables (but not ISO2022.ESC_BYTES) and
3820    return 1.
3821
3822    If CHECK_INVALID_CHARSETS is non-zero, check for designation
3823    or invocation of an invalid character set and treat that as
3824    an unrecognized escape sequence. */
3825
3826 static int
3827 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3828                    unsigned char c, unsigned int *flags,
3829                    int check_invalid_charsets)
3830 {
3831   /* (1) If we're at the end of a designation sequence, CS is the
3832      charset being designated and REG is the register to designate
3833      it to.
3834
3835      (2) If we're at the end of a locking-shift sequence, REG is
3836      the register to invoke and HALF (0 == left, 1 == right) is
3837      the half to invoke it into.
3838
3839      (3) If we're at the end of a single-shift sequence, REG is
3840      the register to invoke. */
3841   Lisp_Object cs = Qnil;
3842   int reg, half;
3843
3844   /* NOTE: This code does goto's all over the fucking place.
3845      The reason for this is that we're basically implementing
3846      a state machine here, and hierarchical languages like C
3847      don't really provide a clean way of doing this. */
3848
3849   if (! (*flags & CODING_STATE_ESCAPE))
3850     /* At beginning of escape sequence; we need to reset our
3851        escape-state variables. */
3852     iso->esc = ISO_ESC_NOTHING;
3853
3854   iso->output_literally = 0;
3855   iso->output_direction_sequence = 0;
3856
3857   switch (iso->esc)
3858     {
3859     case ISO_ESC_NOTHING:
3860       iso->esc_bytes_index = 0;
3861       switch (c)
3862         {
3863         case ISO_CODE_ESC:      /* Start escape sequence */
3864           *flags |= CODING_STATE_ESCAPE;
3865           iso->esc = ISO_ESC;
3866           goto not_done;
3867
3868         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
3869           *flags |= CODING_STATE_ESCAPE;
3870           iso->esc = ISO_ESC_5_11;
3871           goto not_done;
3872
3873         case ISO_CODE_SO:       /* locking shift 1 */
3874           reg = 1; half = 0;
3875           goto locking_shift;
3876         case ISO_CODE_SI:       /* locking shift 0 */
3877           reg = 0; half = 0;
3878           goto locking_shift;
3879
3880         case ISO_CODE_SS2:      /* single shift */
3881           reg = 2;
3882           goto single_shift;
3883         case ISO_CODE_SS3:      /* single shift */
3884           reg = 3;
3885           goto single_shift;
3886
3887         default:                        /* Other control characters */
3888           return 0;
3889         }
3890
3891     case ISO_ESC:
3892       switch (c)
3893         {
3894           /**** single shift ****/
3895
3896         case 'N':       /* single shift 2 */
3897           reg = 2;
3898           goto single_shift;
3899         case 'O':       /* single shift 3 */
3900           reg = 3;
3901           goto single_shift;
3902
3903           /**** locking shift ****/
3904
3905         case '~':       /* locking shift 1 right */
3906           reg = 1; half = 1;
3907           goto locking_shift;
3908         case 'n':       /* locking shift 2 */
3909           reg = 2; half = 0;
3910           goto locking_shift;
3911         case '}':       /* locking shift 2 right */
3912           reg = 2; half = 1;
3913           goto locking_shift;
3914         case 'o':       /* locking shift 3 */
3915           reg = 3; half = 0;
3916           goto locking_shift;
3917         case '|':       /* locking shift 3 right */
3918           reg = 3; half = 1;
3919           goto locking_shift;
3920
3921 #ifdef ENABLE_COMPOSITE_CHARS
3922           /**** composite ****/
3923
3924         case '0':
3925           iso->esc = ISO_ESC_START_COMPOSITE;
3926           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3927             CODING_STATE_COMPOSITE;
3928           return 1;
3929
3930         case '1':
3931           iso->esc = ISO_ESC_END_COMPOSITE;
3932           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3933             ~CODING_STATE_COMPOSITE;
3934           return 1;
3935 #endif /* ENABLE_COMPOSITE_CHARS */
3936
3937           /**** directionality ****/
3938
3939         case '[':
3940           iso->esc = ISO_ESC_5_11;
3941           goto not_done;
3942
3943           /**** designation ****/
3944
3945         case '$':       /* multibyte charset prefix */
3946           iso->esc = ISO_ESC_2_4;
3947           goto not_done;
3948
3949         default:
3950           if (0x28 <= c && c <= 0x2F)
3951             {
3952               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3953               goto not_done;
3954             }
3955
3956           /* This function is called with CODESYS equal to nil when
3957              doing coding-system detection. */
3958           if (!NILP (codesys)
3959               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3960               && fit_to_be_escape_quoted (c))
3961             {
3962               iso->esc = ISO_ESC_LITERAL;
3963               *flags &= CODING_STATE_ISO2022_LOCK;
3964               return 1;
3965             }
3966
3967           /* bzzzt! */
3968           return 0;
3969         }
3970
3971
3972
3973       /**** directionality ****/
3974
3975     case ISO_ESC_5_11:          /* ISO6429 direction control */
3976       if (c == ']')
3977         {
3978           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3979           goto directionality;
3980         }
3981       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
3982       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3983       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3984       else               return 0;
3985       goto not_done;
3986
3987     case ISO_ESC_5_11_0:
3988       if (c == ']')
3989         {
3990           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3991           goto directionality;
3992         }
3993       return 0;
3994
3995     case ISO_ESC_5_11_1:
3996       if (c == ']')
3997         {
3998           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3999           goto directionality;
4000         }
4001       return 0;
4002
4003     case ISO_ESC_5_11_2:
4004       if (c == ']')
4005         {
4006           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4007           goto directionality;
4008         }
4009       return 0;
4010
4011     directionality:
4012       iso->esc = ISO_ESC_DIRECTIONALITY;
4013       /* Various junk here to attempt to preserve the direction sequences
4014          literally in the text if they would otherwise be swallowed due
4015          to invalid designations that don't show up as actual charset
4016          changes in the text. */
4017       if (iso->invalid_switch_dir)
4018         {
4019           /* We already inserted a direction switch literally into the
4020              text.  We assume (#### this may not be right) that the
4021              next direction switch is the one going the other way,
4022              and we need to output that literally as well. */
4023           iso->output_literally = 1;
4024           iso->invalid_switch_dir = 0;
4025         }
4026       else
4027         {
4028           int jj;
4029
4030           /* If we are in the thrall of an invalid designation,
4031            then stick the directionality sequence literally into the
4032            output stream so it ends up in the original text again. */
4033           for (jj = 0; jj < 4; jj++)
4034             if (iso->invalid_designated[jj])
4035               break;
4036           if (jj < 4)
4037             {
4038               iso->output_literally = 1;
4039               iso->invalid_switch_dir = 1;
4040             }
4041           else
4042             /* Indicate that we haven't yet seen a valid designation,
4043                so that if a switch-dir is directly followed by an
4044                invalid designation, both get inserted literally. */
4045             iso->switched_dir_and_no_valid_charset_yet = 1;
4046         }
4047       return 1;
4048
4049
4050       /**** designation ****/
4051
4052     case ISO_ESC_2_4:
4053       if (0x28 <= c && c <= 0x2F)
4054         {
4055           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4056           goto not_done;
4057         }
4058       if (0x40 <= c && c <= 0x42)
4059         {
4060           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4061                                       *flags & CODING_STATE_R2L ?
4062                                       CHARSET_RIGHT_TO_LEFT :
4063                                       CHARSET_LEFT_TO_RIGHT);
4064           reg = 0;
4065           goto designated;
4066         }
4067       return 0;
4068
4069     default:
4070       {
4071         int type =-1;
4072
4073         if (c < '0' || c > '~')
4074           return 0; /* bad final byte */
4075
4076         if (iso->esc >= ISO_ESC_2_8 &&
4077             iso->esc <= ISO_ESC_2_15)
4078           {
4079             type = ((iso->esc >= ISO_ESC_2_12) ?
4080                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4081             reg = (iso->esc - ISO_ESC_2_8) & 3;
4082           }
4083         else if (iso->esc >= ISO_ESC_2_4_8 &&
4084                  iso->esc <= ISO_ESC_2_4_15)
4085           {
4086             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4087                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4088             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4089           }
4090         else
4091           {
4092             /* Can this ever be reached? -slb */
4093             abort();
4094           }
4095
4096         cs = CHARSET_BY_ATTRIBUTES (type, c,
4097                                     *flags & CODING_STATE_R2L ?
4098                                     CHARSET_RIGHT_TO_LEFT :
4099                                     CHARSET_LEFT_TO_RIGHT);
4100         goto designated;
4101       }
4102     }
4103
4104  not_done:
4105   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4106   return -1;
4107
4108  single_shift:
4109   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4110     /* can't invoke something that ain't there. */
4111     return 0;
4112   iso->esc = ISO_ESC_SINGLE_SHIFT;
4113   *flags &= CODING_STATE_ISO2022_LOCK;
4114   if (reg == 2)
4115     *flags |= CODING_STATE_SS2;
4116   else
4117     *flags |= CODING_STATE_SS3;
4118   return 1;
4119
4120  locking_shift:
4121   if (check_invalid_charsets &&
4122       !CHARSETP (iso->charset[reg]))
4123     /* can't invoke something that ain't there. */
4124     return 0;
4125   if (half)
4126     iso->register_right = reg;
4127   else
4128     iso->register_left = reg;
4129   *flags &= CODING_STATE_ISO2022_LOCK;
4130   iso->esc = ISO_ESC_LOCKING_SHIFT;
4131   return 1;
4132
4133  designated:
4134   if (NILP (cs) && check_invalid_charsets)
4135     {
4136       iso->invalid_designated[reg] = 1;
4137       iso->charset[reg] = Vcharset_ascii;
4138       iso->esc = ISO_ESC_DESIGNATE;
4139       *flags &= CODING_STATE_ISO2022_LOCK;
4140       iso->output_literally = 1;
4141       if (iso->switched_dir_and_no_valid_charset_yet)
4142         {
4143           /* We encountered a switch-direction followed by an
4144              invalid designation.  Ensure that the switch-direction
4145              gets outputted; otherwise it will probably get eaten
4146              when the text is written out again. */
4147           iso->switched_dir_and_no_valid_charset_yet = 0;
4148           iso->output_direction_sequence = 1;
4149           /* And make sure that the switch-dir going the other
4150              way gets outputted, as well. */
4151           iso->invalid_switch_dir = 1;
4152         }
4153       return 1;
4154     }
4155   /* This function is called with CODESYS equal to nil when
4156      doing coding-system detection. */
4157   if (!NILP (codesys))
4158     {
4159       charset_conversion_spec_dynarr *dyn =
4160         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4161
4162       if (dyn)
4163         {
4164           int i;
4165
4166           for (i = 0; i < Dynarr_length (dyn); i++)
4167             {
4168               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4169               if (EQ (cs, spec->from_charset))
4170                 cs = spec->to_charset;
4171             }
4172         }
4173     }
4174
4175   iso->charset[reg] = cs;
4176   iso->esc = ISO_ESC_DESIGNATE;
4177   *flags &= CODING_STATE_ISO2022_LOCK;
4178   if (iso->invalid_designated[reg])
4179     {
4180       iso->invalid_designated[reg] = 0;
4181       iso->output_literally = 1;
4182     }
4183   if (iso->switched_dir_and_no_valid_charset_yet)
4184     iso->switched_dir_and_no_valid_charset_yet = 0;
4185   return 1;
4186 }
4187
4188 static int
4189 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4190                        unsigned int n)
4191 {
4192   int mask;
4193
4194   /* #### There are serious deficiencies in the recognition mechanism
4195      here.  This needs to be much smarter if it's going to cut it.
4196      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4197      it should be detected as Latin-1.
4198      All the ISO2022 stuff in this file should be synced up with the
4199      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4200      Perhaps we should wait till R2L works in FSF Emacs? */
4201
4202   if (!st->iso2022.initted)
4203     {
4204       reset_iso2022 (Qnil, &st->iso2022.iso);
4205       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4206                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4207                           CODING_CATEGORY_ISO_8_1_MASK |
4208                           CODING_CATEGORY_ISO_8_2_MASK |
4209                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4210       st->iso2022.flags = 0;
4211       st->iso2022.high_byte_count = 0;
4212       st->iso2022.saw_single_shift = 0;
4213       st->iso2022.initted = 1;
4214     }
4215
4216   mask = st->iso2022.mask;
4217
4218   while (n--)
4219     {
4220       int c = *src++;
4221       if (c >= 0xA0)
4222         {
4223           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4224           st->iso2022.high_byte_count++;
4225         }
4226       else
4227         {
4228           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4229             {
4230               if (st->iso2022.high_byte_count & 1)
4231                 /* odd number of high bytes; assume not iso-8-2 */
4232                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4233             }
4234           st->iso2022.high_byte_count = 0;
4235           st->iso2022.saw_single_shift = 0;
4236           if (c > 0x80)
4237             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4238         }
4239       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4240           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4241         { /* control chars */
4242           switch (c)
4243             {
4244               /* Allow and ignore control characters that you might
4245                  reasonably see in a text file */
4246             case '\r':
4247             case '\n':
4248             case '\t':
4249             case  7: /* bell */
4250             case  8: /* backspace */
4251             case 11: /* vertical tab */
4252             case 12: /* form feed */
4253             case 26: /* MS-DOS C-z junk */
4254             case 31: /* '^_' -- for info */
4255               goto label_continue_loop;
4256
4257             default:
4258               break;
4259             }
4260         }
4261
4262       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4263           || BYTE_C1_P (c))
4264         {
4265           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4266                                  &st->iso2022.flags, 0))
4267             {
4268               switch (st->iso2022.iso.esc)
4269                 {
4270                 case ISO_ESC_DESIGNATE:
4271                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4272                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4273                   break;
4274                 case ISO_ESC_LOCKING_SHIFT:
4275                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4276                   goto ran_out_of_chars;
4277                 case ISO_ESC_SINGLE_SHIFT:
4278                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4279                   st->iso2022.saw_single_shift = 1;
4280                   break;
4281                 default:
4282                   break;
4283                 }
4284             }
4285           else
4286             {
4287               mask = 0;
4288               goto ran_out_of_chars;
4289             }
4290         }
4291     label_continue_loop:;
4292     }
4293
4294  ran_out_of_chars:
4295
4296   return mask;
4297 }
4298
4299 static int
4300 postprocess_iso2022_mask (int mask)
4301 {
4302   /* #### kind of cheesy */
4303   /* If seven-bit ISO is allowed, then assume that the encoding is
4304      entirely seven-bit and turn off the eight-bit ones. */
4305   if (mask & CODING_CATEGORY_ISO_7_MASK)
4306     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4307                CODING_CATEGORY_ISO_8_1_MASK |
4308                CODING_CATEGORY_ISO_8_2_MASK);
4309   return mask;
4310 }
4311
4312 /* If FLAGS is a null pointer or specifies right-to-left motion,
4313    output a switch-dir-to-left-to-right sequence to DST.
4314    Also update FLAGS if it is not a null pointer.
4315    If INTERNAL_P is set, we are outputting in internal format and
4316    need to handle the CSI differently. */
4317
4318 static void
4319 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4320                                  unsigned_char_dynarr *dst,
4321                                  unsigned int *flags,
4322                                  int internal_p)
4323 {
4324   if (!flags || (*flags & CODING_STATE_R2L))
4325     {
4326       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4327         {
4328           Dynarr_add (dst, ISO_CODE_ESC);
4329           Dynarr_add (dst, '[');
4330         }
4331       else if (internal_p)
4332         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4333       else
4334         Dynarr_add (dst, ISO_CODE_CSI);
4335       Dynarr_add (dst, '0');
4336       Dynarr_add (dst, ']');
4337       if (flags)
4338         *flags &= ~CODING_STATE_R2L;
4339     }
4340 }
4341
4342 /* If FLAGS is a null pointer or specifies a direction different from
4343    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4344    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4345    sequence to DST.  Also update FLAGS if it is not a null pointer.
4346    If INTERNAL_P is set, we are outputting in internal format and
4347    need to handle the CSI differently. */
4348
4349 static void
4350 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4351                           unsigned_char_dynarr *dst, unsigned int *flags,
4352                           int internal_p)
4353 {
4354   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4355       direction == CHARSET_LEFT_TO_RIGHT)
4356     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4357   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4358            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4359            direction == CHARSET_RIGHT_TO_LEFT)
4360     {
4361       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4362         {
4363           Dynarr_add (dst, ISO_CODE_ESC);
4364           Dynarr_add (dst, '[');
4365         }
4366       else if (internal_p)
4367         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4368       else
4369         Dynarr_add (dst, ISO_CODE_CSI);
4370       Dynarr_add (dst, '2');
4371       Dynarr_add (dst, ']');
4372       if (flags)
4373         *flags |= CODING_STATE_R2L;
4374     }
4375 }
4376
4377 /* Convert ISO2022-format data to internal format. */
4378
4379 static void
4380 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4381                        unsigned_char_dynarr *dst, unsigned int n)
4382 {
4383   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4384   unsigned int flags  = str->flags;
4385   unsigned int ch     = str->ch;
4386   eol_type_t eol_type = str->eol_type;
4387 #ifdef ENABLE_COMPOSITE_CHARS
4388   unsigned_char_dynarr *real_dst = dst;
4389 #endif
4390   Lisp_Object coding_system;
4391
4392   XSETCODING_SYSTEM (coding_system, str->codesys);
4393
4394 #ifdef ENABLE_COMPOSITE_CHARS
4395   if (flags & CODING_STATE_COMPOSITE)
4396     dst = str->iso2022.composite_chars;
4397 #endif /* ENABLE_COMPOSITE_CHARS */
4398
4399   while (n--)
4400     {
4401       unsigned char c = *src++;
4402       if (flags & CODING_STATE_ESCAPE)
4403         {       /* Within ESC sequence */
4404           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4405                                           c, &flags, 1);
4406
4407           if (retval)
4408             {
4409               switch (str->iso2022.esc)
4410                 {
4411 #ifdef ENABLE_COMPOSITE_CHARS
4412                 case ISO_ESC_START_COMPOSITE:
4413                   if (str->iso2022.composite_chars)
4414                     Dynarr_reset (str->iso2022.composite_chars);
4415                   else
4416                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4417                   dst = str->iso2022.composite_chars;
4418                   break;
4419                 case ISO_ESC_END_COMPOSITE:
4420                   {
4421                     Bufbyte comstr[MAX_EMCHAR_LEN];
4422                     Bytecount len;
4423                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4424                                                          Dynarr_length (dst));
4425                     dst = real_dst;
4426                     len = set_charptr_emchar (comstr, emch);
4427                     Dynarr_add_many (dst, comstr, len);
4428                     break;
4429                   }
4430 #endif /* ENABLE_COMPOSITE_CHARS */
4431
4432                 case ISO_ESC_LITERAL:
4433                   DECODE_ADD_BINARY_CHAR (c, dst);
4434                   break;
4435
4436                 default:
4437                   /* Everything else handled already */
4438                   break;
4439                 }
4440             }
4441
4442           /* Attempted error recovery. */
4443           if (str->iso2022.output_direction_sequence)
4444             ensure_correct_direction (flags & CODING_STATE_R2L ?
4445                                       CHARSET_RIGHT_TO_LEFT :
4446                                       CHARSET_LEFT_TO_RIGHT,
4447                                       str->codesys, dst, 0, 1);
4448           /* More error recovery. */
4449           if (!retval || str->iso2022.output_literally)
4450             {
4451               /* Output the (possibly invalid) sequence */
4452               int i;
4453               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4454                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4455               flags &= CODING_STATE_ISO2022_LOCK;
4456               if (!retval)
4457                 n++, src--;/* Repeat the loop with the same character. */
4458               else
4459                 {
4460                   /* No sense in reprocessing the final byte of the
4461                      escape sequence; it could mess things up anyway.
4462                      Just add it now. */
4463                   DECODE_ADD_BINARY_CHAR (c, dst);
4464                 }
4465             }
4466           ch = 0;
4467         }
4468       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4469         { /* Control characters */
4470
4471           /***** Error-handling *****/
4472
4473           /* If we were in the middle of a character, dump out the
4474              partial character. */
4475           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4476
4477           /* If we just saw a single-shift character, dump it out.
4478              This may dump out the wrong sort of single-shift character,
4479              but least it will give an indication that something went
4480              wrong. */
4481           if (flags & CODING_STATE_SS2)
4482             {
4483               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4484               flags &= ~CODING_STATE_SS2;
4485             }
4486           if (flags & CODING_STATE_SS3)
4487             {
4488               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4489               flags &= ~CODING_STATE_SS3;
4490             }
4491
4492           /***** Now handle the control characters. *****/
4493
4494           /* Handle CR/LF */
4495           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4496
4497           flags &= CODING_STATE_ISO2022_LOCK;
4498
4499           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4500             DECODE_ADD_BINARY_CHAR (c, dst);
4501         }
4502       else
4503         {                       /* Graphic characters */
4504           Lisp_Object charset;
4505 #ifndef UTF2000
4506           Charset_ID lb;
4507 #endif
4508           int reg;
4509
4510           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4511
4512           /* Now determine the charset. */
4513           reg = ((flags & CODING_STATE_SS2) ? 2
4514                  : (flags & CODING_STATE_SS3) ? 3
4515                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4516                  : str->iso2022.register_left);
4517           charset = str->iso2022.charset[reg];
4518
4519           /* Error checking: */
4520           if (! CHARSETP (charset)
4521               || str->iso2022.invalid_designated[reg]
4522               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4523                   && XCHARSET_CHARS (charset) == 94))
4524             /* Mrmph.  We are trying to invoke a register that has no
4525                or an invalid charset in it, or trying to add a character
4526                outside the range of the charset.  Insert that char literally
4527                to preserve it for the output. */
4528             {
4529               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4530               DECODE_ADD_BINARY_CHAR (c, dst);
4531             }
4532
4533           else
4534             {
4535               /* Things are probably hunky-dorey. */
4536
4537               /* Fetch reverse charset, maybe. */
4538               if (((flags & CODING_STATE_R2L) &&
4539                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4540                   ||
4541                   (!(flags & CODING_STATE_R2L) &&
4542                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4543                 {
4544                   Lisp_Object new_charset =
4545                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4546                   if (!NILP (new_charset))
4547                     charset = new_charset;
4548                 }
4549
4550 #ifdef UTF2000
4551               if (XCHARSET_DIMENSION (charset) == 1)
4552                 {
4553                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4554                   DECODE_ADD_UCS_CHAR
4555                     (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4556                 }
4557               else if (ch)
4558                 {
4559                   DECODE_ADD_UCS_CHAR
4560                     (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
4561                   ch = 0;
4562                 }
4563               else
4564                 ch = c;
4565 #else
4566               lb = XCHARSET_LEADING_BYTE (charset);
4567               switch (XCHARSET_REP_BYTES (charset))
4568                 {
4569                 case 1: /* ASCII */
4570                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4571                   Dynarr_add (dst, c & 0x7F);
4572                   break;
4573
4574                 case 2: /* one-byte official */
4575                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4576                   Dynarr_add (dst, lb);
4577                   Dynarr_add (dst, c | 0x80);
4578                   break;
4579
4580                 case 3: /* one-byte private or two-byte official */
4581                   if (XCHARSET_PRIVATE_P (charset))
4582                     {
4583                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4584                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4585                       Dynarr_add (dst, lb);
4586                       Dynarr_add (dst, c | 0x80);
4587                     }
4588                   else
4589                     {
4590                       if (ch)
4591                         {
4592                           Dynarr_add (dst, lb);
4593                           Dynarr_add (dst, ch | 0x80);
4594                           Dynarr_add (dst, c | 0x80);
4595                           ch = 0;
4596                         }
4597                       else
4598                         ch = c;
4599                     }
4600                   break;
4601
4602                 default:        /* two-byte private */
4603                   if (ch)
4604                     {
4605                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4606                       Dynarr_add (dst, lb);
4607                       Dynarr_add (dst, ch | 0x80);
4608                       Dynarr_add (dst, c | 0x80);
4609                       ch = 0;
4610                     }
4611                   else
4612                     ch = c;
4613                 }
4614 #endif
4615             }
4616
4617           if (!ch)
4618             flags &= CODING_STATE_ISO2022_LOCK;
4619         }
4620
4621     label_continue_loop:;
4622     }
4623
4624   if (flags & CODING_STATE_END)
4625     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4626
4627   str->flags = flags;
4628   str->ch    = ch;
4629 }
4630
4631
4632 /***** ISO2022 encoder *****/
4633
4634 /* Designate CHARSET into register REG. */
4635
4636 static void
4637 iso2022_designate (Lisp_Object charset, unsigned char reg,
4638                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4639 {
4640   static CONST char inter94[] = "()*+";
4641   static CONST char inter96[] = ",-./";
4642   unsigned int type;
4643   unsigned char final;
4644   Lisp_Object old_charset = str->iso2022.charset[reg];
4645
4646   str->iso2022.charset[reg] = charset;
4647   if (!CHARSETP (charset))
4648     /* charset might be an initial nil or t. */
4649     return;
4650   type = XCHARSET_TYPE (charset);
4651   final = XCHARSET_FINAL (charset);
4652   if (!str->iso2022.force_charset_on_output[reg] &&
4653       CHARSETP (old_charset) &&
4654       XCHARSET_TYPE (old_charset) == type &&
4655       XCHARSET_FINAL (old_charset) == final)
4656     return;
4657
4658   str->iso2022.force_charset_on_output[reg] = 0;
4659
4660   {
4661     charset_conversion_spec_dynarr *dyn =
4662       str->codesys->iso2022.output_conv;
4663
4664     if (dyn)
4665       {
4666         int i;
4667
4668         for (i = 0; i < Dynarr_length (dyn); i++)
4669           {
4670             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4671             if (EQ (charset, spec->from_charset))
4672                 charset = spec->to_charset;
4673           }
4674       }
4675   }
4676
4677   Dynarr_add (dst, ISO_CODE_ESC);
4678   switch (type)
4679     {
4680     case CHARSET_TYPE_94:
4681       Dynarr_add (dst, inter94[reg]);
4682       break;
4683     case CHARSET_TYPE_96:
4684       Dynarr_add (dst, inter96[reg]);
4685       break;
4686     case CHARSET_TYPE_94X94:
4687       Dynarr_add (dst, '$');
4688       if (reg != 0
4689           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4690           || final < '@'
4691           || final > 'B')
4692         Dynarr_add (dst, inter94[reg]);
4693       break;
4694     case CHARSET_TYPE_96X96:
4695       Dynarr_add (dst, '$');
4696       Dynarr_add (dst, inter96[reg]);
4697       break;
4698     }
4699   Dynarr_add (dst, final);
4700 }
4701
4702 static void
4703 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4704 {
4705   if (str->iso2022.register_left != 0)
4706     {
4707       Dynarr_add (dst, ISO_CODE_SI);
4708       str->iso2022.register_left = 0;
4709     }
4710 }
4711
4712 static void
4713 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4714 {
4715   if (str->iso2022.register_left != 1)
4716     {
4717       Dynarr_add (dst, ISO_CODE_SO);
4718       str->iso2022.register_left = 1;
4719     }
4720 }
4721
4722 void
4723 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4724                      unsigned_char_dynarr *dst, unsigned int *flags)
4725 {
4726   unsigned char charmask;
4727   Lisp_Coding_System* codesys = str->codesys;
4728   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4729   int i;
4730   Lisp_Object charset = str->iso2022.current_charset;
4731   int half = str->iso2022.current_half;
4732   unsigned int byte1, byte2;
4733
4734   if (ch <= 0x7F)
4735     {
4736       restore_left_to_right_direction (codesys, dst, flags, 0);
4737
4738       /* Make sure G0 contains ASCII */
4739       if ((ch > ' ' && ch < ISO_CODE_DEL)
4740           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4741         {
4742           ensure_normal_shift (str, dst);
4743           iso2022_designate (Vcharset_ascii, 0, str, dst);
4744         }
4745
4746       /* If necessary, restore everything to the default state
4747          at end-of-line */
4748       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4749         {
4750           restore_left_to_right_direction (codesys, dst, flags, 0);
4751
4752           ensure_normal_shift (str, dst);
4753
4754           for (i = 0; i < 4; i++)
4755             {
4756               Lisp_Object initial_charset =
4757                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4758               iso2022_designate (initial_charset, i, str, dst);
4759             }
4760         }
4761       if (ch == '\n')
4762         {
4763           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4764             Dynarr_add (dst, '\r');
4765           if (eol_type != EOL_CR)
4766             Dynarr_add (dst, ch);
4767         }
4768       else
4769         {
4770           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4771               && fit_to_be_escape_quoted (ch))
4772             Dynarr_add (dst, ISO_CODE_ESC);
4773           Dynarr_add (dst, ch);
4774         }
4775     }
4776   else if ( (0x80 <= ch) && (ch <= 0x9f) )
4777     {
4778       charmask = (half == 0 ? 0x00 : 0x80);
4779
4780       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4781           && fit_to_be_escape_quoted (ch))
4782         Dynarr_add (dst, ISO_CODE_ESC);
4783       /* you asked for it ... */
4784       Dynarr_add (dst, ch);
4785     }
4786   else
4787     {
4788       int reg;
4789
4790       /* Now determine which register to use. */
4791       reg = -1;
4792       for (i = 0; i < 4; i++)
4793         {
4794           if ((CHARSETP (charset = str->iso2022.charset[i])
4795                && (byte1 = charset_get_byte1 (charset, ch))) ||
4796               (CHARSETP
4797                (charset
4798                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
4799                && (byte1 = charset_get_byte1 (charset, ch))))
4800             {
4801               reg = i;
4802               byte2 = charset_get_byte2 (charset, ch);
4803               break;
4804             }
4805         }
4806       if (reg == -1)
4807         BREAKUP_CHAR (ch, charset, byte1, byte2);
4808       ensure_correct_direction (XCHARSET_DIRECTION (charset),
4809                                 codesys, dst, flags, 0);
4810
4811       if (reg == -1)
4812         {
4813           if (XCHARSET_GRAPHIC (charset) != 0)
4814             {
4815               if (!NILP (str->iso2022.charset[1]) &&
4816                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4817                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4818                 reg = 1;
4819               else if (!NILP (str->iso2022.charset[2]))
4820                 reg = 2;
4821               else if (!NILP (str->iso2022.charset[3]))
4822                 reg = 3;
4823               else
4824                 reg = 0;
4825             }
4826           else
4827             reg = 0;
4828         }
4829
4830       iso2022_designate (charset, reg, str, dst);
4831
4832       /* Now invoke that register. */
4833       switch (reg)
4834         {
4835         case 0:
4836           ensure_normal_shift (str, dst);
4837           half = 0;
4838           break;
4839         case 1:
4840           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4841             {
4842               ensure_shift_out (str, dst);
4843               half = 0;
4844             }
4845           else
4846             half = 1;
4847           break;
4848         case 2:
4849           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4850             {
4851               Dynarr_add (dst, ISO_CODE_ESC);
4852               Dynarr_add (dst, 'N');
4853               half = 0;
4854             }
4855           else
4856             {
4857               Dynarr_add (dst, ISO_CODE_SS2);
4858               half = 1;
4859             }
4860           break;
4861         case 3:
4862           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4863             {
4864               Dynarr_add (dst, ISO_CODE_ESC);
4865               Dynarr_add (dst, 'O');
4866               half = 0;
4867             }
4868           else
4869             {
4870               Dynarr_add (dst, ISO_CODE_SS3);
4871               half = 1;
4872             }
4873           break;
4874         default:
4875           abort ();
4876         }
4877
4878       charmask = (half == 0 ? 0x00 : 0x80);
4879
4880       switch (XCHARSET_DIMENSION (charset))
4881         {
4882         case 1:
4883           Dynarr_add (dst, byte1 | charmask);
4884           break;
4885         case 2:
4886           Dynarr_add (dst, byte1 | charmask);
4887           Dynarr_add (dst, byte2 | charmask);
4888           break;
4889         default:
4890           abort ();
4891         }
4892     }
4893   str->iso2022.current_charset = charset;
4894   str->iso2022.current_half = half;
4895 }
4896
4897 void
4898 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4899                      unsigned int *flags)
4900 {
4901   Lisp_Coding_System* codesys = str->codesys;
4902   int i;
4903
4904   restore_left_to_right_direction (codesys, dst, flags, 0);
4905   ensure_normal_shift (str, dst);
4906   for (i = 0; i < 4; i++)
4907     {
4908       Lisp_Object initial_charset
4909         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4910       iso2022_designate (initial_charset, i, str, dst);
4911     }
4912 }
4913 #endif /* MULE */
4914 \f
4915 /************************************************************************/
4916 /*                     No-conversion methods                            */
4917 /************************************************************************/
4918
4919 /* This is used when reading in "binary" files -- i.e. files that may
4920    contain all 256 possible byte values and that are not to be
4921    interpreted as being in any particular decoding. */
4922 static void
4923 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4924                              unsigned_char_dynarr *dst, unsigned int n)
4925 {
4926   unsigned char c;
4927   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4928   unsigned int flags  = str->flags;
4929   unsigned int ch     = str->ch;
4930   eol_type_t eol_type = str->eol_type;
4931
4932   while (n--)
4933     {
4934       c = *src++;
4935
4936       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4937       DECODE_ADD_BINARY_CHAR (c, dst);
4938     label_continue_loop:;
4939     }
4940
4941   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4942
4943   str->flags = flags;
4944   str->ch    = ch;
4945 }
4946
4947 static void
4948 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4949                              unsigned_char_dynarr *dst, unsigned int n)
4950 {
4951   unsigned char c;
4952   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4953   unsigned int flags  = str->flags;
4954   unsigned int ch     = str->ch;
4955   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4956 #ifdef UTF2000
4957   unsigned char char_boundary = str->iso2022.current_char_boundary;
4958 #endif
4959
4960   while (n--)
4961     {
4962       c = *src++;
4963 #ifdef UTF2000
4964       if (char_boundary == 0)
4965         if ( c >= 0xfc )
4966           {
4967             ch = c & 0x01;
4968             char_boundary = 5;
4969           }
4970         else if ( c >= 0xf8 )
4971           {
4972             ch = c & 0x03;
4973             char_boundary = 4;
4974           }
4975         else if ( c >= 0xf0 )
4976           {
4977             ch = c & 0x07;
4978             char_boundary = 3;
4979           }
4980         else if ( c >= 0xe0 )
4981           {
4982             ch = c & 0x0f;
4983             char_boundary = 2;
4984           }
4985         else if ( c >= 0xc0 )
4986           {
4987             ch = c & 0x1f;
4988             char_boundary = 1;
4989           }
4990         else
4991           {
4992             ch = 0;
4993             if (c == '\n')
4994               {
4995                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4996                   Dynarr_add (dst, '\r');
4997                 if (eol_type != EOL_CR)
4998                   Dynarr_add (dst, c);
4999               }
5000             else
5001               Dynarr_add (dst, c);
5002             char_boundary = 0;
5003           }
5004       else if (char_boundary == 1)
5005         {
5006           ch = ( ch << 6 ) | ( c & 0x3f );
5007           Dynarr_add (dst, ch & 0xff);
5008           char_boundary = 0;
5009         }
5010       else
5011         {
5012           ch = ( ch << 6 ) | ( c & 0x3f );
5013           char_boundary--;
5014         }
5015 #else /* not UTF2000 */
5016       if (c == '\n')
5017         {
5018           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5019             Dynarr_add (dst, '\r');
5020           if (eol_type != EOL_CR)
5021             Dynarr_add (dst, '\n');
5022           ch = 0;
5023         }
5024       else if (BYTE_ASCII_P (c))
5025         {
5026           assert (ch == 0);
5027           Dynarr_add (dst, c);
5028         }
5029       else if (BUFBYTE_LEADING_BYTE_P (c))
5030         {
5031           assert (ch == 0);
5032           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5033               c == LEADING_BYTE_CONTROL_1)
5034             ch = c;
5035           else
5036             Dynarr_add (dst, '~'); /* untranslatable character */
5037         }
5038       else
5039         {
5040           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5041             Dynarr_add (dst, c);
5042           else if (ch == LEADING_BYTE_CONTROL_1)
5043             {
5044               assert (c < 0xC0);
5045               Dynarr_add (dst, c - 0x20);
5046             }
5047           /* else it should be the second or third byte of an
5048              untranslatable character, so ignore it */
5049           ch = 0;
5050         }
5051 #endif /* not UTF2000 */
5052     }
5053
5054   str->flags = flags;
5055   str->ch    = ch;
5056 #ifdef UTF2000
5057   str->iso2022.current_char_boundary = char_boundary;
5058 #endif
5059 }
5060
5061 \f
5062 /************************************************************************/
5063 /*                   Simple internal/external functions                 */
5064 /************************************************************************/
5065
5066 static Extbyte_dynarr *conversion_out_dynarr;
5067 static Bufbyte_dynarr *conversion_in_dynarr;
5068
5069 /* Determine coding system from coding format */
5070
5071 /* #### not correct for all values of `fmt'! */
5072 static Lisp_Object
5073 external_data_format_to_coding_system (enum external_data_format fmt)
5074 {
5075   switch (fmt)
5076     {
5077     case FORMAT_FILENAME:
5078     case FORMAT_TERMINAL:
5079       if (EQ (Vfile_name_coding_system, Qnil) ||
5080           EQ (Vfile_name_coding_system, Qbinary))
5081         return Qnil;
5082       else
5083         return Fget_coding_system (Vfile_name_coding_system);
5084 #ifdef MULE
5085     case FORMAT_CTEXT:
5086       return Fget_coding_system (Qctext);
5087 #endif
5088     default:
5089       return Qnil;
5090     }
5091 }
5092
5093 Extbyte *
5094 convert_to_external_format (CONST Bufbyte *ptr,
5095                             Bytecount len,
5096                             Extcount *len_out,
5097                             enum external_data_format fmt)
5098 {
5099   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5100
5101   if (!conversion_out_dynarr)
5102     conversion_out_dynarr = Dynarr_new (Extbyte);
5103   else
5104     Dynarr_reset (conversion_out_dynarr);
5105
5106   if (NILP (coding_system))
5107     {
5108       CONST Bufbyte *end = ptr + len;
5109
5110       for (; ptr < end;)
5111         {
5112 #ifdef UTF2000
5113           Bufbyte c =
5114             (*ptr < 0xc0) ? *ptr :
5115             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5116 #else
5117           Bufbyte c =
5118             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5119             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5120             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5121             '~';
5122 #endif
5123           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5124           INC_CHARPTR (ptr);
5125         }
5126
5127 #ifdef ERROR_CHECK_BUFPOS
5128       assert (ptr == end);
5129 #endif
5130     }
5131   else
5132     {
5133       Lisp_Object instream, outstream, da_outstream;
5134       Lstream *istr, *ostr;
5135       struct gcpro gcpro1, gcpro2, gcpro3;
5136       char tempbuf[1024]; /* some random amount */
5137
5138       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5139       da_outstream = make_dynarr_output_stream
5140         ((unsigned_char_dynarr *) conversion_out_dynarr);
5141       outstream =
5142         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5143       istr = XLSTREAM (instream);
5144       ostr = XLSTREAM (outstream);
5145       GCPRO3 (instream, outstream, da_outstream);
5146       while (1)
5147         {
5148           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5149           if (!size_in_bytes)
5150             break;
5151           Lstream_write (ostr, tempbuf, size_in_bytes);
5152         }
5153       Lstream_close (istr);
5154       Lstream_close (ostr);
5155       UNGCPRO;
5156       Lstream_delete (istr);
5157       Lstream_delete (ostr);
5158       Lstream_delete (XLSTREAM (da_outstream));
5159     }
5160
5161   *len_out = Dynarr_length (conversion_out_dynarr);
5162   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5163   return Dynarr_atp (conversion_out_dynarr, 0);
5164 }
5165
5166 Bufbyte *
5167 convert_from_external_format (CONST Extbyte *ptr,
5168                               Extcount len,
5169                               Bytecount *len_out,
5170                               enum external_data_format fmt)
5171 {
5172   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5173
5174   if (!conversion_in_dynarr)
5175     conversion_in_dynarr = Dynarr_new (Bufbyte);
5176   else
5177     Dynarr_reset (conversion_in_dynarr);
5178
5179   if (NILP (coding_system))
5180     {
5181       CONST Extbyte *end = ptr + len;
5182       for (; ptr < end; ptr++)
5183         {
5184           Extbyte c = *ptr;
5185           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5186         }
5187     }
5188   else
5189     {
5190       Lisp_Object instream, outstream, da_outstream;
5191       Lstream *istr, *ostr;
5192       struct gcpro gcpro1, gcpro2, gcpro3;
5193       char tempbuf[1024]; /* some random amount */
5194
5195       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5196       da_outstream = make_dynarr_output_stream
5197         ((unsigned_char_dynarr *) conversion_in_dynarr);
5198       outstream =
5199         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5200       istr = XLSTREAM (instream);
5201       ostr = XLSTREAM (outstream);
5202       GCPRO3 (instream, outstream, da_outstream);
5203       while (1)
5204         {
5205           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5206           if (!size_in_bytes)
5207             break;
5208           Lstream_write (ostr, tempbuf, size_in_bytes);
5209         }
5210       Lstream_close (istr);
5211       Lstream_close (ostr);
5212       UNGCPRO;
5213       Lstream_delete (istr);
5214       Lstream_delete (ostr);
5215       Lstream_delete (XLSTREAM (da_outstream));
5216     }
5217
5218   *len_out = Dynarr_length (conversion_in_dynarr);
5219   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5220   return Dynarr_atp (conversion_in_dynarr, 0);
5221 }
5222
5223 \f
5224 /************************************************************************/
5225 /*                             Initialization                           */
5226 /************************************************************************/
5227
5228 void
5229 syms_of_file_coding (void)
5230 {
5231   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5232   deferror (&Qcoding_system_error, "coding-system-error",
5233             "Coding-system error", Qio_error);
5234
5235   DEFSUBR (Fcoding_system_p);
5236   DEFSUBR (Ffind_coding_system);
5237   DEFSUBR (Fget_coding_system);
5238   DEFSUBR (Fcoding_system_list);
5239   DEFSUBR (Fcoding_system_name);
5240   DEFSUBR (Fmake_coding_system);
5241   DEFSUBR (Fcopy_coding_system);
5242   DEFSUBR (Fdefine_coding_system_alias);
5243   DEFSUBR (Fsubsidiary_coding_system);
5244
5245   DEFSUBR (Fcoding_system_type);
5246   DEFSUBR (Fcoding_system_doc_string);
5247 #ifdef MULE
5248   DEFSUBR (Fcoding_system_charset);
5249 #endif
5250   DEFSUBR (Fcoding_system_property);
5251
5252   DEFSUBR (Fcoding_category_list);
5253   DEFSUBR (Fset_coding_priority_list);
5254   DEFSUBR (Fcoding_priority_list);
5255   DEFSUBR (Fset_coding_category_system);
5256   DEFSUBR (Fcoding_category_system);
5257
5258   DEFSUBR (Fdetect_coding_region);
5259   DEFSUBR (Fdecode_coding_region);
5260   DEFSUBR (Fencode_coding_region);
5261 #ifdef MULE
5262   DEFSUBR (Fdecode_shift_jis_char);
5263   DEFSUBR (Fencode_shift_jis_char);
5264   DEFSUBR (Fdecode_big5_char);
5265   DEFSUBR (Fencode_big5_char);
5266 #endif /* MULE */
5267   defsymbol (&Qcoding_system_p, "coding-system-p");
5268   defsymbol (&Qno_conversion, "no-conversion");
5269   defsymbol (&Qraw_text, "raw-text");
5270 #ifdef MULE
5271   defsymbol (&Qbig5, "big5");
5272   defsymbol (&Qshift_jis, "shift-jis");
5273   defsymbol (&Qucs4, "ucs-4");
5274   defsymbol (&Qutf8, "utf-8");
5275   defsymbol (&Qccl, "ccl");
5276   defsymbol (&Qiso2022, "iso2022");
5277 #endif /* MULE */
5278   defsymbol (&Qmnemonic, "mnemonic");
5279   defsymbol (&Qeol_type, "eol-type");
5280   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5281   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5282
5283   defsymbol (&Qcr, "cr");
5284   defsymbol (&Qlf, "lf");
5285   defsymbol (&Qcrlf, "crlf");
5286   defsymbol (&Qeol_cr, "eol-cr");
5287   defsymbol (&Qeol_lf, "eol-lf");
5288   defsymbol (&Qeol_crlf, "eol-crlf");
5289 #ifdef MULE
5290   defsymbol (&Qcharset_g0, "charset-g0");
5291   defsymbol (&Qcharset_g1, "charset-g1");
5292   defsymbol (&Qcharset_g2, "charset-g2");
5293   defsymbol (&Qcharset_g3, "charset-g3");
5294   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5295   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5296   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5297   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5298   defsymbol (&Qno_iso6429, "no-iso6429");
5299   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5300   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5301
5302   defsymbol (&Qshort, "short");
5303   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5304   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5305   defsymbol (&Qseven, "seven");
5306   defsymbol (&Qlock_shift, "lock-shift");
5307   defsymbol (&Qescape_quoted, "escape-quoted");
5308 #endif /* MULE */
5309   defsymbol (&Qencode, "encode");
5310   defsymbol (&Qdecode, "decode");
5311
5312 #ifdef MULE
5313   defsymbol (&Qctext, "ctext");
5314   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5315              "shift-jis");
5316   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5317              "big5");
5318   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5319              "ucs-4");
5320   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5321              "utf-8");
5322   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5323              "iso-7");
5324   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5325              "iso-8-designate");
5326   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5327              "iso-8-1");
5328   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5329              "iso-8-2");
5330   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5331              "iso-lock-shift");
5332 #endif /* MULE */
5333   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5334              "no-conversion");
5335 }
5336
5337 void
5338 lstream_type_create_file_coding (void)
5339 {
5340   LSTREAM_HAS_METHOD (decoding, reader);
5341   LSTREAM_HAS_METHOD (decoding, writer);
5342   LSTREAM_HAS_METHOD (decoding, rewinder);
5343   LSTREAM_HAS_METHOD (decoding, seekable_p);
5344   LSTREAM_HAS_METHOD (decoding, flusher);
5345   LSTREAM_HAS_METHOD (decoding, closer);
5346   LSTREAM_HAS_METHOD (decoding, marker);
5347
5348   LSTREAM_HAS_METHOD (encoding, reader);
5349   LSTREAM_HAS_METHOD (encoding, writer);
5350   LSTREAM_HAS_METHOD (encoding, rewinder);
5351   LSTREAM_HAS_METHOD (encoding, seekable_p);
5352   LSTREAM_HAS_METHOD (encoding, flusher);
5353   LSTREAM_HAS_METHOD (encoding, closer);
5354   LSTREAM_HAS_METHOD (encoding, marker);
5355 }
5356
5357 void
5358 vars_of_file_coding (void)
5359 {
5360   int i;
5361
5362   /* Initialize to something reasonable ... */
5363   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5364     {
5365       coding_category_system[i] = Qnil;
5366       coding_category_by_priority[i] = i;
5367     }
5368
5369   Fprovide (intern ("file-coding"));
5370
5371   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5372 Coding system used for TTY keyboard input.
5373 Not used under a windowing system.
5374 */ );
5375   Vkeyboard_coding_system = Qnil;
5376
5377   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5378 Coding system used for TTY display output.
5379 Not used under a windowing system.
5380 */ );
5381   Vterminal_coding_system = Qnil;
5382
5383   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5384 Overriding coding system used when writing a file or process.
5385 You should *bind* this, not set it.  If this is non-nil, it specifies
5386 the coding system that will be used when a file or process is read
5387 in, and overrides `buffer-file-coding-system-for-read',
5388 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5389 this one for permanent changes to the environment.
5390 */ );
5391   Vcoding_system_for_read = Qnil;
5392
5393   DEFVAR_LISP ("coding-system-for-write",
5394                &Vcoding_system_for_write /*
5395 Overriding coding system used when writing a file or process.
5396 You should *bind* this, not set it.  If this is non-nil, it specifies
5397 the coding system that will be used when a file or process is wrote
5398 in, and overrides `buffer-file-coding-system',
5399 `write-region-pre-hook', etc.  Use those variables instead of this one
5400 for permanent changes to the environment.
5401 */ );
5402   Vcoding_system_for_write = Qnil;
5403
5404   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5405 Coding system used to convert pathnames when accessing files.
5406 */ );
5407   Vfile_name_coding_system = Qnil;
5408
5409   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5410 Non-nil means the buffer contents are regarded as multi-byte form
5411 of characters, not a binary code.  This affects the display, file I/O,
5412 and behaviors of various editing commands.
5413
5414 Setting this to nil does not do anything.
5415 */ );
5416   enable_multibyte_characters = 1;
5417 }
5418
5419 void
5420 complex_vars_of_file_coding (void)
5421 {
5422   staticpro (&Vcoding_system_hash_table);
5423   Vcoding_system_hash_table =
5424     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5425
5426   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5427
5428 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5429 {                                               \
5430   struct codesys_prop csp;                      \
5431   csp.sym = (Sym);                              \
5432   csp.prop_type = (Prop_Type);                  \
5433   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5434 } while (0)
5435
5436   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5437   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5438   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5439   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5440   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5441   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5442   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5443 #ifdef MULE
5444   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5445   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5446   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5447   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5448   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5449   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5450   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5451   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5452   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5453   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5454   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5455   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5456   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5457   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5458   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5459   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5460   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5461
5462   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5463   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5464 #endif /* MULE */
5465   /* Need to create this here or we're really screwed. */
5466   Fmake_coding_system
5467     (Qraw_text, Qno_conversion,
5468      build_string ("Raw text, which means it converts only line-break-codes."),
5469      list2 (Qmnemonic, build_string ("Raw")));
5470
5471   Fmake_coding_system
5472     (Qbinary, Qno_conversion,
5473      build_string ("Binary, which means it does not convert anything."),
5474      list4 (Qeol_type, Qlf,
5475             Qmnemonic, build_string ("Binary")));
5476
5477 #ifdef UTF2000
5478   Fmake_coding_system
5479     (Qutf8, Qutf8,
5480      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5481      list2 (Qmnemonic, build_string ("UTF8")));
5482 #endif
5483
5484   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5485
5486   /* Need this for bootstrapping */
5487   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5488     Fget_coding_system (Qraw_text);
5489
5490 #ifdef UTF2000
5491   coding_category_system[CODING_CATEGORY_UTF8]
5492    = Fget_coding_system (Qutf8);
5493 #endif
5494 }